In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [8]:
import dataiku
from dataiku import pandasutils as pdu
import pandas as pd
import numpy as np

In [3]:
# Read the dataset as a Pandas dataframe in memory
# Note: here, we only read the first 100K rows. Other sampling options are available
dataset_hotel_reviews = dataiku.Dataset("hotel_reviews")
df = dataset_hotel_reviews.get_dataframe(limit=100000)

In [4]:
# Get some simple descriptive statistics
pdu.audit(df)

  def _ipython_display_formatter_default(self):
  def _singleton_printers_default(self):


Unnamed: 0,_a_variable,_b_data_type,_c_cardinality,_d_missings,_e_sample_values
0,col_1,object,1596,0,[My $200 Gucci sunglasses were stolen out of m...
1,col_2,bool,2,0,"[True, False]"


In [10]:
from sklearn.cross_validation import train_test_split

Split data into train and test data in ratio 80:20

In [119]:
X_train, X_test, y_train, y_test = train_test_split(df.col_1, df.col_2, test_size = 0.2, random_state = 1337)

In [17]:
np.bincount(y_train)

array([638, 642])

In [157]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_vect = count_vect.fit_transform(X_train) # Default bag-of-words

In [158]:
X_train_vect.shape

(1280, 8650)

So X_train_vect has 8650 features.

In [159]:
X_test_vect = count_vect.transform(X_test)

In [33]:
feature_names = count_vect.get_feature_names()
print feature_names[-20:-1]

[u'yougurt', u'young', u'younger', u'your', u'youre', u'yours', u'yourself', u'yr', u'yrs', u'yuck', u'yucky', u'yum', u'yummo', u'yummy', u'yunan', u'yup', u'zest', u'zipped', u'zone']


In [24]:
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(random_state=17)

Accuracy for simple Decision Tree with simple bag-of-words

In [160]:
dtree = dtree.fit(X_train_vect, y_train)
dtree.score(X_test_vect, y_test) # accuracy for simple Decision Tree

0.65937500000000004

Using Grid Search and Logistics Regression

In [35]:
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression

In [161]:
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10]}
grid = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid.fit(X_train_vect, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [40]:
grid.best_score_

0.84843749999999996

In [162]:
grid.score(X_test_vect, y_test) # accuracy for grid search logistic regression

0.875

Using Random Forest with default bag-of-words

In [122]:
from sklearn.ensemble import RandomForestClassifier
rdf = RandomForestClassifier(random_state = 17)

In [163]:
rdf1 = rdf.fit(X_train_vect, y_train)
rdf1.score(X_test_vect, y_test) # accuracy for random forest

0.703125

Using SVM

In [164]:
from sklearn.svm import SVC
grid_svc = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_svc.fit(X_train_vect, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [165]:
grid_svc.best_score_

0.84843749999999996

In [166]:
grid_svc.score(X_test_vect, y_test) # accuracy for grid search support vector machines

0.875

So far the best model is grid search Logistics Regression and Support Vector Machines

### Trying to config the bag-of-words: Firstly I tried to include only tokens that appear in at least 3 reviews

In [167]:
vect_min3 = CountVectorizer(min_df=3).fit(X_train) # tokens need to appear in at least 3 reviews
X_train_vect_min3 = vect_min3.transform(X_train)

In [168]:
X_train_vect_min3.shape

(1280, 3449)

X_train_vec_min3 has gone down to 3449 features.

In [169]:
X_test_vect_min3 = vect_3.transform(X_test)

Checking Grid Search Logistic Regression again, showing same accuracy

In [170]:
grid_v3 = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_v3.fit(X_train_vect_min3, y_train)
grid_v3.score(X_test_vect_min3, y_test)

0.875

Random Forest with tokens that appear in at least 3 reviews, accuracy is improved a bit

In [171]:
rdf_3 = rdf.fit(X_train_vect_min3, y_train) # random forest with token in >= 3 reviews
rdf_3.score(X_test_vect_min3, y_test)

0.71250000000000002

### Trying tokens that appear in at least 10% of reviews

In [173]:
vect_01 = CountVectorizer(min_df=.1).fit(X_train) # tokens need to appear in at least 10% of reviews
X_train_vect_01 = vect_01.transform(X_train)

In [175]:
X_test_vect_01 = vect_01.transform(X_test)

In [176]:
rdf_01 = rdf.fit(X_train_vect_01, y_train) # random forest with token in >= 10% reviews
rdf_01.score(X_test_vect_01, y_test)

0.66249999999999998

Result is much worst off

## Trying Tfidf

In [217]:
from sklearn.feature_extraction.text import TfidfVectorizer # term frequency-inverse document frequency

In [218]:
vect_tf = TfidfVectorizer(min_df = 5).fit(X_train) # token needs to appear in at least 5 reviews
X_train_vect_tf = vect_tf.transform(X_train)

In [219]:
X_train_vect_tf.shape

(1280, 2404)

In [220]:
X_test_vect_tf = vect_tf.transform(X_test)

Grid Search Logistics Regression with tfidf shows same result as default bag-of-words

In [221]:
grid_tf = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_tf.fit(X_train_vect_tf, y_train)
grid_tf.score(X_test_vect_tf, y_test)

0.875

Random Forest, this result is the best so far

In [222]:
rdf_tf = rdf.fit(X_train_vect_tf, y_train) # random forest with tfidf
rdf_tf.score(X_test_vect_tf, y_test)

0.77500000000000002

## Trying n-gram bag-of-words

In [178]:
vect_ng3 = CountVectorizer(ngram_range = (1,3)).fit(X_train) # bag-of-words combo fo 1, 2, or 3 words
X_train_vect_ng3 = vect_ng3.transform(X_train)

In [179]:
X_train_vect_ng3.shape

(1280, 214911)

The number of features has exploded.

In [180]:
X_test_vect_ng3 = vect_ng3.transform(X_test)

Random Forest, actually worse off

In [181]:
rdf_ng3 = rdf.fit(X_train_vect_ng3, y_train)
rdf_ng3.score(X_test_vect_ng3, y_test)

0.70625000000000004

## Combining Tfidf and n-gram

In [87]:
vect_tf2 = TfidfVectorizer(ngram_range = (1,3), min_df = 5).fit(X_train)
X_train_vect_tf2 = vect_tf2.transform(X_train)

In [88]:
X_train_vect_tf2.shape

(1280, 10156)

Now number of features is 10156

In [89]:
X_test_vect_tf2 = vect_tf2.transform(X_test)

Random forest shows some improvement, but not as good as default tfidf

In [91]:
rdf_tf2 = rdf.fit(X_train_vect_tf2, y_train)
rdf_tf2.score(X_test_vect_tf2, y_test)

0.72812500000000002

## Trying lemmatization

In [102]:
from nltk import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [106]:
class LemmaTokenziner(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [110]:
vect_lema = TfidfVectorizer(ngram_range = (1,3), tokenizer = LemmaTokenziner(), min_df = 5)

In [225]:
nltk.download('all')

[nltk_data] Downloading collection u'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/dataiku/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/dataiku/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /home/dataiku/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to
[nltk_data]    |     /home/dataiku/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to
[nltk_data]    |     /home/dataiku/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to
[nltk_data]    |     /home/dataiku/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to
[nltk_data]    |     /home/d

True

In [253]:
X_train_lema = vect_lema.fit_transform(X_train)

In [249]:
X_train_lema.shape

(1280, 12811)

In [254]:
X_test_lema = vect_lema.transform(X_test)

Random forest perform quite well with lemmatization, just below tfidf

In [256]:
rdf_lema = rdf.fit(X_train_lema, y_train) # random forest
rdf_lema.score(X_test_lema, y_test)

0.765625

In [144]:
class StemTokenziner(object):
    def __init__(self):
        self.wnl = PorterStemmer()
    def __call__(self, doc):
        return [self.wnl.stem(t) for t in word_tokenize(doc)]

In [145]:
vect_stem = TfidfVectorizer(ngram_range = (1,3), tokenizer = StemTokenziner(), min_df = 5)

In [257]:
X_train_stem = vect_stem.fit_transform(X_train)

In [258]:
X_train_stem.shape

(1280, 12820)

In [259]:
X_test_stem = vect_stem.transform(X_test)

Random forest result is as good as tfidf

In [260]:
rdf_stem = rdf.fit(X_train_stem, y_train) # random forest
rdf_stem.score(X_test_stem, y_test)

0.77500000000000002

### Stemming or lemmatization actually doesn't work because dataiku doesn't fully include the punkt folder of nltk

# Topic Modelling

In [112]:
vect_lda = CountVectorizer(max_features = 10000, max_df=.20)
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_topics = 20, learning_method = "batch", max_iter=25, random_state=17)
X_train_lda = lda.fit_transform(vect_lda.fit_transform(X_train))

In [113]:
X_train_lda.shape

(1280, 20)

In [115]:
X_test_lda = lda.transform(vect_lda.transform(X_test))

In [215]:
rdf_lda = rdf.fit(X_train_lda, y_train) # random forest
rdf_lda.score(X_test_lda, y_test)

0.58750000000000002

In [199]:
from scipy.sparse import hstack

### Combining topic modelling and default bag-of-words

In [205]:
X_train_2 = hstack([X_train_vect, X_train_lda])

In [204]:
X_train_2.shape

(1280, 8670)

In [209]:
X_test_2 = hstack([X_test_vect, X_test_lda])

Random forest shows improvement from just topic modelling or bag-of-words

In [210]:
rdf_tm = rdf.fit(X_train_2, y_train)
rdf_tm.score(X_test_2, y_test)

0.73750000000000004

### Combining topic modelling and tfidf

In [212]:
X_train_3 = hstack([X_train_vect_tf, X_train_lda])

In [213]:
X_train_3.shape

(1280, 2424)

In [214]:
X_test_3 = hstack([X_test_vect_tf, X_test_lda])

Random forest is not better than just tfidf

In [263]:
rdf_tm3 = rdf.fit(X_train_3, y_train)
rdf_tm3.score(X_test_3, y_test)

0.734375

Grid Search Logistics Regression not better than default tfidf

In [269]:
grid_3 = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_3.fit(X_train_3, y_train)
grid_3.score(X_test_3, y_test)

0.87187499999999996

### Combining topic modelling and stemming

In [261]:
X_train_4 = hstack([X_train_stem, X_train_lda])

In [266]:
X_train_4.shape

(1280, 12840)

In [262]:
X_test_4 = hstack([X_test_stem, X_test_lda])

Random forest result is as good as tfidf. It seems to be strange that result can't get better than this

In [265]:
rdf_tm4 = rdf.fit(X_train_stem, y_train)
rdf_tm4.score(X_test_stem, y_test)

0.77500000000000002

Grid Search Logistics Regression. This is the highest accuracy out of all models.

In [268]:
grid_4 = GridSearchCV(LogisticRegression(), param_grid, cv=5)
grid_4.fit(X_train_4, y_train)
grid_4.score(X_test_4, y_test)

0.90937500000000004