In [0]:
import pandas as pd
import numpy as np

import re

np.random.seed(2)

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [0]:
cd /content/drive/My\ Drive/Colab\ Notebooks/dataset

/content/drive/My Drive/Colab Notebooks/dataset


## Load Data

In [0]:
data = pd.read_csv('yelp2015.csv')

In [0]:
msk = np.random.rand(len(data)) < 0.9
train = data[msk]
test = data[~msk]

In [0]:
Y_train = train['stars'][:].values
X_train = train['text'][:].values
Y_test = test['stars'][:].values
X_test = test['text'][:].values

### Split into train/validation sets

In [0]:
random_seed = 2
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.1, random_state=random_seed)

In [0]:
del data
del train
del test

## SVM

In [0]:
lsvm = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,1))),
    #('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm.fit(X_train, Y_train)
y_pred = lsvm.predict(X_test)
print('SVM-unigram Accuracy', accuracy_score(Y_test, y_pred))

SVM-unigram Accuracy 0.681401234116889


In [0]:
lsvm2 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(2,2))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm2.fit(X_train, Y_train)
y_pred = lsvm2.predict(X_test)
print('SVM-bigram Accuracy', accuracy_score(Y_test, y_pred))

SVM-bigram Accuracy 0.6930051211201371


In [0]:
lsvm3 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm3.fit(X_train, Y_train)
y_pred = lsvm3.predict(X_test)
print('SVM-unigram-bigram Accuracy', accuracy_score(Y_test, y_pred))

SVM-unigram-bigram Accuracy 0.7053795009066575


In [0]:
lsvm4 = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,3))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm4.fit(X_train, Y_train)
y_pred = lsvm4.predict(X_test)
print('SVM-ngram Accuracy', accuracy_score(Y_test, y_pred))

In [0]:
lsvm5 = Pipeline([
    ('vect', CountVectorizer(2,2)),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm5.fit(X_train, Y_train)
y_pred = lsvm5.predict(X_test)
print('TF-IDF Accuracy', accuracy_score(Y_test, y_pred))

TF-IDF Accuracy 0.6756690334965095


In [0]:
lsvm6 = Pipeline([
    ('vect', CountVectorizer(1,2)),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', LinearSVC(random_state = 0, tol = 1e-7)),
])

lsvm6.fit(X_train, Y_train)
y_pred = lsvm6.predict(X_test)
print('SVM-[uni,bi]-gram,TFIDF Accuracy', accuracy_score(Y_test, y_pred))

SVM-[uni,bi]-gram,TFIDF Accuracy 0.6756690334965095


## Multinomial NB

In [0]:
mnb = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1,2))),
    ('norm', Normalizer()),
    ('clf', MultinomialNB(alpha=0.01)),
])

mnb.fit(X_train, Y_train)
y_pred = mnb.predict(X_test)
print('Multinomial NB Accuracy', accuracy_score(Y_test, y_pred))

Multinomial NB Accuracy 0.6537963375024078
