In [0]:
import pandas as pd
import numpy as np

import re

np.random.seed(2)

from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [0]:
cd /content/drive/My\ Drive/Colab\ Notebooks/dataset

/content/drive/My Drive/Colab Notebooks/dataset


## Load Data

In [0]:
data = pd.read_csv('yelp2013.csv')

In [0]:
msk = np.random.rand(len(data)) < 0.9
train = data[msk]
test = data[~msk]

In [0]:
Y_train = train['stars'][:].values
X_train = train['text'][:].values
Y_test = test['stars'][:].values
X_test = test['text'][:].values
X_train.shape

(301465,)

In [0]:
import time

from scipy.stats import uniform

parameters_nbsvm = {
    'vect__ngram_range': [(1,2), (1,3)],
    'vect__min_df': [2],
    'vect__max_df': [1.0],
    'clf__beta': uniform(0,1),
    'clf__alpha': uniform(.5,1),
    'clf__C': uniform(30, 40)
}
pipeline_nbsvm = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', NBSVM())
])

gs_nbsvm = RandomizedSearchCV(pipeline_nbsvm, parameters_nbsvm, 
                                   cv=5, scoring='accuracy', n_jobs=-1, verbose=50,
                                 return_train_score=True, n_iter=20, random_state=62)
start = time.time()
gs_nbsvm.fit(X_train, Y_train)
time.time()-start, gs_nbsvm.best_params_, gs_nbsvm.best_score_

### Split into train/validation sets

In [0]:
random_seed = 2
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.1, random_state=random_seed)

In [0]:
del data
del train
del test

### SVM

In [0]:
lsvm = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))),
    #('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm.fit(X_train, Y_train)
y_pred = lsvm.predict(X_test)
print('SVM-unigram Accuracy', accuracy_score(Y_test, y_pred))

SVM-unigram Accuracy 0.6734517494188472


In [0]:
lsvm2 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(2,2))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm2.fit(X_train, Y_train)
y_pred = lsvm2.predict(X_test)
print('SVM-bigram Accuracy', accuracy_score(Y_test, y_pred))

SVM-bigram Accuracy 0.6742564224831614


In [0]:
lsvm3 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm3.fit(X_train, Y_train)
y_pred = lsvm3.predict(X_test)
print('SVM-[uni,bi]gram Accuracy', accuracy_score(Y_test, y_pred))

SVM-[uni,bi]gram Accuracy 0.6913035703641891


In [0]:
lsvm4 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,3))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm4.fit(X_train, Y_train)
y_pred = lsvm4.predict(X_test)
print('SVM-[uni,bi,tri]gram Accuracy', accuracy_score(Y_test, y_pred))

SVM-[uni,bi,tri]gram Accuracy 0.6941348274423318


In [0]:
lsvm4 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(3,3))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm4.fit(X_train, Y_train)
y_pred = lsvm4.predict(X_test)
print('SVM-trigram Accuracy', accuracy_score(Y_test, y_pred))

SVM-trigram Accuracy 0.6554211122369911


In [0]:
lsvm5 = Pipeline([
    ('vect', CountVectorizer(2,2)),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm5.fit(X_train, Y_train)
y_pred = lsvm5.predict(X_test)
print('TF-IDF Accuracy', accuracy_score(Y_test, y_pred))

TF-IDF Accuracy 0.6646301484174763


In [0]:
lsvm6 = Pipeline([
    ('vect', CountVectorizer(1,2)),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm6.fit(X_train, Y_train)
y_pred = lsvm6.predict(X_test)
print('SVM-[uni,bi]-gram,TFIDF Accuracy', accuracy_score(Y_test, y_pred))

SVM-[uni,bi]-gram,TFIDF Accuracy 0.6646301484174763


### Multinomial Naive Bayes

In [0]:
mnb = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2))),
    ('norm', Normalizer()),
    ('clf', MultinomialNB(alpha=0.01)),
])

mnb.fit(X_train, Y_train)
y_pred = mnb.predict(X_test)
print('Multinomial NB Accuracy', accuracy_score(Y_test, y_pred))

Multinomial NB Accuracy 0.6479406330094772


In [0]:
mnb2 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2))),
    ('tfidf', TfidfTransformer(use_idf=1,smooth_idf=1,sublinear_tf=1)),
    ('norm', Normalizer()),
    ('clf', MultinomialNB(alpha=0.01)),
])

mnb2.fit(X_train, Y_train)
y_pred = mnb2.predict(X_test)
print('Multinomial NB Accuracy', accuracy_score(Y_test, y_pred))

Multinomial NB Accuracy 0.6368540263455922


In [0]:
import time 

parameters_mnv_bow = {
    'vec__min_df': [1, 3],
    'vec__max_df': [290000],
    'vec__ngram_range':[(1, 2), (2, 2)],
    'clf__alpha': [.5, .1, 2], 
}

pipeline_mnv_bow = Pipeline([
    ('vec', CountVectorizer()),
    ('clf', MultinomialNB())
])
                  
rs_mnv_bow = GridSearchCV(pipeline_mnv_bow, parameters_mnv_bow,
                                   cv=5, scoring='accuracy', n_jobs=-1, verbose=50, return_train_score=True)
start = time.time()
rs_mnv_bow.fit(X_train, Y_train)
time.time() - start, rs_mnv_bow.best_params_, rs_mnv_bow.best_score_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:  8.4min




[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed: 11.6min
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed: 14.8min
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 17.9min
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed: 18.8min
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed: 22.0min
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed: 22.9min
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed: 26.0min
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed: 27.0min
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed: 30.0min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed: 30.1min
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed: 33.0min
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed: 33.1min
[Parallel(n_jobs=-1)]: Done  19 tasks      | elapsed: 36.1min
[Paralle

(6578.482490301132,
 {'clf__alpha': 2,
  'vec__max_df': 290000,
  'vec__min_df': 3,
  'vec__ngram_range': (2, 2)},
 0.6480992783375965)

### NBSVM

In [0]:
from scipy.sparse import spmatrix, coo_matrix
from sklearn.base import BaseEstimator
from sklearn.linear_model.base import LinearClassifierMixin, SparseCoefMixin
from sklearn.svm import LinearSVC
import numpy as np

'''
Implementation NBSVM from baselines and bigrams [1]. Code taken from [2].
[1] https://nlp.stanford.edu/pubs/sidaw12_simple_sentiment.pdf
[2] https://github.com/Joshua-Chin/nbsvm
'''
class NBSVM(BaseEstimator, LinearClassifierMixin, SparseCoefMixin):

    def __init__(self, alpha=1, C=1, beta=0.25, fit_intercept=False):
        self.alpha = alpha
        self.C = C
        self.beta = beta
        self.fit_intercept = fit_intercept

    def fit(self, X, y):
        self.classes_ = np.unique(y)
        if len(self.classes_) == 2:
            coef_, intercept_ = self._fit_binary(X, y)
            self.coef_ = coef_
            self.intercept_ = intercept_
        else:
            coef_, intercept_ = zip(*[
                self._fit_binary(X, y == class_)
                for class_ in self.classes_
            ])
            self.coef_ = np.concatenate(coef_)
            self.intercept_ = np.array(intercept_).flatten()
        return self

    def _fit_binary(self, X, y):
        p = np.asarray(self.alpha + X[y == 1].sum(axis=0)).flatten()
        q = np.asarray(self.alpha + X[y == 0].sum(axis=0)).flatten()
        r = np.log(p / np.abs(p).sum()) - np.log(q / np.abs(q).sum())
        b = np.log((y == 1).sum()) - np.log((y == 0).sum())

        if isinstance(X, spmatrix):
            indices = np.arange(len(r))
            r_sparse = coo_matrix(
                (r, (indices, indices)),
                shape=(len(r), len(r))
            )
            X_scaled = X * r_sparse
        else:
            X_scaled = X * r

        lsvc = LinearSVC(
            C=self.C,
            fit_intercept=self.fit_intercept,
            max_iter=10000
        ).fit(X_scaled, y)

        mean_mag = np.abs(lsvc.coef_).mean()
        coef_ = (1 - self.beta) * mean_mag * r + self.beta * (r * lsvc.coef_)
        intercept_ = (1 - self.beta) * mean_mag * b + self.beta * lsvc.intercept_

        return coef_, intercept_

In [0]:
parameters_nbsvm = {
    'vect__ngram_range': [(1,2), (1,3)],
    'vect__min_df': [2],
    'vect__max_df': [1.0],
    'clf__beta': uniform(0,1),
    'clf__alpha': uniform(.5,1),
    'clf__C': uniform(30, 40)
}
pipeline_nbsvm = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', NBSVM())
])

gs_nbsvm = RandomizedSearchCV(pipeline_nbsvm, parameters_nbsvm, 
                                   cv=5, scoring='accuracy', n_jobs=-1, verbose=3,
                                 return_train_score=True, n_iter=20, random_state=62)
start = time.time()
gs_nbsvm.fit(X_train, Y_train)
time.time()-start, gs_nbsvm.best_params_, gs_nbsvm.best_score_

## Pre-processing Data



In [0]:
data = pd.read_csv('yelp2013.csv')
msk = np.random.rand(len(data)) < 0.9
train = data[msk]
test = data[~msk]

In [0]:
Y_train = train['stars']
X_train_raw = train['text']
Y_test = test['stars']
X_test_raw = test['text']
X_train_raw.shape

(301465,)

In [0]:
del data
del train
del test

#### remove HTML tags, remove stop words, stemming, lemmatization, one-hot encoding





#### Lowercase

In [0]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.corpus import stopwords
stop = stopwords.words('english')

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
lm = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
def preprocess(dataset):
    dataset.apply(lambda x: " ".join(x.lower() for x in x.split()))               # lowercase
    dataset.str.replace('[^\w\s]','')                                             # remove punctuations
    dataset.apply(lambda x: " ".join(x for x in x.split() if x not in stop))      # remove stopwords
    dataset.apply(lambda x: " ".join([lm.lemmatize(word) for word in x.split()])) # lemmatization
    
    return dataset

In [0]:
X_train = preprocess(X_train_raw)
X_test = preprocess(X_test_raw)

del X_train_raw
del X_test_raw

In [0]:
import time
start = time.time()

nbsvm = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2))),
    ('clf', NBSVM(C=41.29497015311559, alpha=0.7914671389502778, beta=0.5176297089108057))
])
nbsvm.fit(X_train, Y_train)
y_pred = nbsvm.predict(X_test)
print('NBSVM Accuracy', accuracy_score(Y_test, y_pred))
print('Time Elapsed', time.time() - start)

### Most frequent words

In [0]:
freq = pd.Series(' '.join(X_train).split()).value_counts()
freq[:10]

food       154101
good       145263
place      144654
great      129009
service    107483
like       104810
time       102913
get         97905
one         96406
would       90572
dtype: int64

### Least frequent words

In [0]:
freq[-10:]

gljust              1
fasttothetable      1
chowfreakingmein    1
shortversion        1
helpanswer          1
hudsonjust          1
brazilparkersons    1
runof               1
godhow              1
lukewarmwe          1
dtype: int64

## Train with processed data

### SVM

In [0]:
lsvm7 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm7.fit(X_train, Y_train)
y_pred = lsvm7.predict(X_test)
print('SVM-TFIDF Accuracy', accuracy_score(Y_test, y_pred))

SVM-TFIDF Accuracy 0.66762773831109


In [0]:
lsvm8 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(3,3))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm8.fit(X_train, Y_train)
y_pred = lsvm8.predict(X_test)
print('SVM-trigram Accuracy', accuracy_score(Y_test, y_pred))

SVM-trigram Accuracy 0.6592954640281338


In [0]:
lsvm8 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm8.fit(X_train, Y_train)
y_pred = lsvm8.predict(X_test)
print('SVM-[uni,bi]-gram Accuracy', accuracy_score(Y_test, y_pred))

SVM-[uni,bi]-gram Accuracy 0.693963082959308


In [0]:
lsvm9 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm9.fit(X_train, Y_train)
y_pred = lsvm9.predict(X_test)
print('SVM-unigram Accuracy', accuracy_score(Y_test, y_pred))

SVM-unigram Accuracy 0.6758612489968195


In [0]:
lsvm = Pipeline([
    ('vect', CountVectorizer(ngram_range=(2,2))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm.fit(X_train, Y_train)
y_pred = lsvm.predict(X_test)
print('SVM-bigram Accuracy', accuracy_score(Y_test, y_pred))

SVM-bigram Accuracy 0.6773474422613917


In [0]:
lsvm2 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,3))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm2.fit(X_train, Y_train)
y_pred = lsvm2.predict(X_test)
print('SVM-[uni,bi,tri]gram Accuracy', accuracy_score(Y_test, y_pred))

SVM-[uni,bi,tri]gram Accuracy 0.6976488422554469


### Multinomial Naive Bayes

In [0]:
mnb = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2), min_df=7)),
    ('norm', Normalizer()),
    ('clf', MultinomialNB(alpha=0.01)),
])

mnb.fit(X_train, Y_train)
y_pred = mnb.predict(X_test)
print('Multinomial NB Accuracy', accuracy_score(Y_test, y_pred))

Multinomial NB Accuracy 0.6563623933656333


In [0]:
mnb = Pipeline([
    ('vect', CountVectorizer(ngram_range=(2,2), min_df=10)),
    ('norm', Normalizer()),
    ('clf', MultinomialNB(alpha=0.01)),
])

mnb.fit(X_train, Y_train)
y_pred = mnb.predict(X_test)
print('Multinomial NB Accuracy', accuracy_score(Y_test, y_pred))

Multinomial NB Accuracy 0.6573730047855423
