In [1]:
import pandas as pd
import numpy as np

import re

# Seed NumPy's global RNG so the np.random.rand-based train/test mask drawn
# later in the notebook is repeatable on a fresh top-to-bottom run.
np.random.seed(2)

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC

# Colab-specific: mount Google Drive at /content/drive (prompts for an
# authorization code) so the dataset directory can be reached.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [2]:
cd /content/drive/My\ Drive/Colab\ Notebooks/dataset

/content/drive/My Drive/Colab Notebooks/dataset


## Load Data

In [0]:
# Read the review CSV from the current working directory; later cells use its
# 'stars' and 'text' columns.
data = pd.read_csv('yelp2014.csv')

In [0]:
# Each row is kept for training with probability 0.9; the remaining rows
# form the held-out test set.
msk = np.random.rand(len(data)) < 0.9
train, test = data[msk], data[~msk]

In [5]:
# Pull the review text (inputs) and star ratings (labels) out of each split
# as plain arrays via .values.
X_train, Y_train = train['text'].values, train['stars'].values
X_test, Y_test = test['text'].values, test['stars'].values
X_train.shape

(1012527,)

### Split into train/validation sets

In [6]:
# Hold out 10% of the current training rows as a validation set.
# NOTE(review): X_train/Y_train are rebound here, so re-running this cell
# without re-running the one above shrinks the training set again.
random_seed = 2
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.1,
    random_state=random_seed,
)
X_train.shape

(911274,)

In [0]:
# Drop the DataFrame and its two splits now that the arrays have been extracted.
del data, train, test

## SVM

In [0]:
# Pipeline: unigram counts -> Normalizer -> LinearSVC.
# NOTE(review): accuracy is reported on X_test; the X_val/Y_val split created
# earlier is not used in this cell -- confirm that is intended.
lsvm = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(tol=1e-7, random_state=0)),
])

y_pred = lsvm.fit(X_train, Y_train).predict(X_test)
print('SVM-unigram Accuracy', accuracy_score(Y_test, y_pred))

In [0]:
# Pipeline: bigram-only counts -> Normalizer -> LinearSVC.
lsvm2 = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(2, 2))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(tol=1e-7, random_state=0)),
])

y_pred = lsvm2.fit(X_train, Y_train).predict(X_test)
print('SVM-bigram Accuracy', accuracy_score(Y_test, y_pred))

In [0]:
# Pipeline: unigram + bigram counts -> Normalizer -> LinearSVC.
lsvm3 = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(tol=1e-7, random_state=0)),
])

y_pred = lsvm3.fit(X_train, Y_train).predict(X_test)
print('SVM-[uni,bi]gram Accuracy', accuracy_score(Y_test, y_pred))

In [0]:
# Pipeline: unigram + bigram + trigram counts -> Normalizer -> LinearSVC.
lsvm4 = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 3))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(tol=1e-7, random_state=0)),
])

y_pred = lsvm4.fit(X_train, Y_train).predict(X_test)
print('SVM-[uni,bi,tri]gram Accuracy', accuracy_score(Y_test, y_pred))

In [0]:
# Pipeline: trigram-only counts -> Normalizer -> LinearSVC.
lsvm5 = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(3, 3))),
    ('norm', Normalizer()),
    ('clf', LinearSVC(tol=1e-7, random_state=0)),
])

y_pred = lsvm5.fit(X_train, Y_train).predict(X_test)
print('SVM-trigram Accuracy', accuracy_score(Y_test, y_pred))

In [0]:
# Pipeline: default CountVectorizer -> TF-IDF weighting -> Normalizer -> LinearSVC.
# NOTE(review): TfidfTransformer normalises its output by default, so the
# separate Normalizer step looks redundant -- confirm before removing.
lsvm6 = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('norm', Normalizer()),
    ('clf', LinearSVC(tol=1e-7, random_state=0)),
])

y_pred = lsvm6.fit(X_train, Y_train).predict(X_test)
print('TF-IDF Accuracy', accuracy_score(Y_test, y_pred))

## Multinomial Naive Bayes

In [0]:
# Pipeline: unigram + bigram counts -> Normalizer -> Multinomial Naive Bayes
# with smoothing parameter alpha=0.01.
mnb = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('norm', Normalizer()),
    ('clf', MultinomialNB(alpha=0.01)),
])

y_pred = mnb.fit(X_train, Y_train).predict(X_test)
print('Multinomial NB Accuracy', accuracy_score(Y_test, y_pred))

## Data preprocessing

In [0]:
# Reload the reviews and redo the random ~90/10 split, this time keeping the
# columns as pandas Series (no .values) so Series methods such as .str and
# .apply can be used on them by the preprocessing step.
# NOTE(review): np.random.rand is called again here, so the rows in this
# train/test split are not the same as in the split used for the models above.
data = pd.read_csv('yelp2014.csv')
msk = np.random.rand(len(data)) < 0.9
train = data[msk]
test = data[~msk]

Y_train = train['stars']
X_train = train['text']
Y_test = test['stars']
X_test = test['text']

del train
del test

# Kept as the final expression so the notebook actually displays it; a bare
# expression in the middle of a cell is evaluated and silently discarded.
X_train.shape

In [0]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below; each one is requested exactly once.
nltk.download('stopwords')
nltk.download('wordnet')

# English stop-word list and lemmatizer consumed by preprocess().
stop = stopwords.words('english')
lm = WordNetLemmatizer()

In [0]:
def preprocess(dataset, stop_words=None, lemmatizer=None):
    """Return a cleaned copy of a Series of text documents.

    Steps, in order: lowercase, strip punctuation, drop stop words,
    lemmatize each remaining token. Tokens are re-joined with single spaces.

    The previous version computed each step but threw the result away, so it
    returned its input untouched; here every step feeds the next one.

    Parameters
    ----------
    dataset : pandas.Series
        Series of strings. It is not modified.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the notebook-level ``stop`` list.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to the notebook-level ``lm`` lemmatizer.

    Returns
    -------
    pandas.Series
        New Series with the same index as ``dataset``.
    """
    # A set makes the per-token membership test O(1) instead of scanning a list.
    stop_set = set(stop if stop_words is None else stop_words)
    lemmatize = (lm if lemmatizer is None else lemmatizer).lemmatize

    def _filter_and_lemmatize(text):
        # Stop-word removal and lemmatization share one pass over the tokens.
        return " ".join(
            lemmatize(token) for token in text.split() if token not in stop_set
        )

    return (
        dataset
        .str.lower()
        # regex=True is explicit: pandas >= 2.0 treats the pattern as a
        # literal string by default. Raw string avoids invalid-escape warnings.
        .str.replace(r'[^\w\s]', '', regex=True)
        .apply(_filter_and_lemmatize)
    )

In [0]:
# Pass both text splits through preprocess() and rebind each name to the
# value it returns.
X_train = preprocess(X_train)
X_test = preprocess(X_test)

## Data Exploration

### Most frequent words

In [0]:
# Join all training documents, split on whitespace, and count each token.
freq = (
    pd.Series(' '.join(X_train).split())
    .value_counts()
)
freq.head(10)

### Least frequent words

In [0]:
# Last ten entries of the token-count Series.
freq.tail(10)