# Setting up

In [0]:
# Mount Google Drive inside the Colab runtime so the notebook can read and
# write files under /content/drive. The call is interactive: it asks for an
# authorization code before the mount completes.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import warnings
warnings.filterwarnings('ignore')

import pickle
import re
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

## Data Cleaning


In [0]:
cd /content/drive/My\ Drive/Colab\ Notebooks

/content/drive/My Drive/Colab Notebooks


In [0]:
from data_cleaning import clean_text

## Data Loading

In [0]:
cd /content/drive/My\ Drive/Colab\ Notebooks/data

/content/drive/My Drive/Colab Notebooks/data


In [0]:
# Load the labelled reviews. Each non-empty line of the file is expected to be
# "<label>\t<review text>"; the text is cleaned before being stored.
X_train, y_train = [], []
with open("reviews_8_categories.txt", "r") as reviews_file:
    for line_number, line in enumerate(reviews_file, start=1):
        # Ignore blank lines (for example a trailing empty line at end of file)
        if not line.rstrip("\r\n"):
            continue

        # Split on the FIRST tab only, so tabs inside the review text do not
        # break the two-field unpacking.
        label, separator, text = line.partition("\t")
        if not separator:
            raise ValueError(
                "Line {} of reviews_8_categories.txt has no tab separator".format(line_number)
            )

        X_train.append(clean_text(text))
        y_train.append(label)

## Define Models

In [0]:
# Each candidate is a two-step pipeline: a text vectorizer registered as "vec"
# followed by a classifier registered as "clf". Every classifier is paired once
# with raw term counts (BoW) and once with TF-IDF weights.

# Linear Support Vector Classifier
linear_SVC_BoW = Pipeline(steps=[
    ("vec", CountVectorizer()),
    ("clf", LinearSVC()),
])
linear_SVC_tfidf = Pipeline(steps=[
    ("vec", TfidfVectorizer()),
    ("clf", LinearSVC()),
])

# Naive Bayes
naive_bayes_BoW = Pipeline(steps=[
    ("vec", CountVectorizer()),
    ("clf", MultinomialNB()),
])
naive_bayes_tfidf = Pipeline(steps=[
    ("vec", TfidfVectorizer()),
    ("clf", MultinomialNB()),
])

# k-Nearest Neighbors (neighbours weighted by distance)
kNN_BoW = Pipeline(steps=[
    ("vec", CountVectorizer()),
    ("clf", KNeighborsClassifier(weights='distance')),
])
kNN_tfidf = Pipeline(steps=[
    ("vec", TfidfVectorizer()),
    ("clf", KNeighborsClassifier(weights='distance')),
])

# Logistic Regression
log_reg_BoW = Pipeline(steps=[
    ("vec", CountVectorizer()),
    ("clf", LogisticRegression(solver='lbfgs', multi_class='auto')),
])
log_reg_tfidf = Pipeline(steps=[
    ("vec", TfidfVectorizer()),
    ("clf", LogisticRegression(solver='lbfgs', multi_class='auto')),
])

## Cross Validation

In [0]:
# Map a display name to every candidate pipeline defined above
all_models = {
    'Naive Bayes BoW': naive_bayes_BoW, 'Naive Bayes TFIDF': naive_bayes_tfidf,
    'Logistic Reg BoW': log_reg_BoW, 'Logistic Reg TFIDF': log_reg_tfidf,
    'LinearSVC BoW': linear_SVC_BoW, 'LinearSVC TFIDF': linear_SVC_tfidf,
    'kNN BoW': kNN_BoW, 'kNN TFIDF': kNN_tfidf
}

# Mean 4-fold cross-validated accuracy for each model, as (name, score) pairs
cv_scores = [
    (
        name,
        cross_val_score(
            model, X_train, y_train,
            cv=4, scoring='accuracy', n_jobs=-1, verbose=1,
        ).mean(),
    )
    for name, model in all_models.items()
]

# Sort ascending by mean accuracy so the best model is printed last.
# sorted() is stable, so models with equal scores keep their dictionary order.
sorted_scores = sorted(cv_scores, key=lambda item: item[1])

# Print the accuracy score per model, from lowest to highest
for score in sorted_scores:
    print(score)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   18.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.5min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  1.1min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   26.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4

('kNN TFIDF', 0.20732499999999998)
('kNN BoW', 0.463025)
('Naive Bayes TFIDF', 0.8186)
('Naive Bayes BoW', 0.836375)
('LinearSVC BoW', 0.847925)
('Logistic Reg BoW', 0.85385)
('Logistic Reg TFIDF', 0.8634375000000001)
('LinearSVC TFIDF', 0.87085)


[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:  3.3min finished


## Hyperparameter Tuning

In [0]:
# Candidate hyperparameters for the TF-IDF + Logistic Regression pipeline:
# the vectorizer's document-frequency cut-offs and the classifier's C.
params_log_reg = {
    'vec__min_df': [1, 0.0001, 0.0005],
    'vec__max_df': [0.75, 0.85, 1.0],
    'clf__C':  [0.1, 1, 10, 50]
}

# Exhaustive search over the grid above, scored by 4-fold CV accuracy
gsearch_log_reg = GridSearchCV(
    estimator=log_reg_tfidf,
    param_grid=params_log_reg,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)
gsearch_log_reg.fit(X_train, y_train)

# Keep the winning score and the best Logistic Regression model (used for stacking later)
log_reg_score = gsearch_log_reg.best_score_
log_reg_tuned = gsearch_log_reg.best_estimator_

print(
    "Best score: {}\nBest params: {}".format(
        log_reg_score,
        gsearch_log_reg.best_params_,
    )
)

Fitting 4 folds for each of 36 candidates, totalling 144 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 14.5min
[Parallel(n_jobs=-1)]: Done 144 out of 144 | elapsed: 43.5min finished


Best score: 0.8663125
Best params: {'clf__C': 10, 'vec__max_df': 0.75, 'vec__min_df': 0.0001}


In [0]:
# Candidate hyperparameters for the TF-IDF + Naive Bayes pipeline; only the
# vectorizer is tuned (n-gram range and document-frequency cut-offs).
params_naive_bayes = {
    'vec__ngram_range': [(1, 1), (1, 2)],
    'vec__max_df': [0.7, 0.8, 1.0],
    'vec__min_df': [1, 0.0001, 0.0005],
}

# Exhaustive search over the grid above, scored by 4-fold CV accuracy
gsearch_naive_bayes = GridSearchCV(
    estimator=naive_bayes_tfidf,
    param_grid=params_naive_bayes,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)
gsearch_naive_bayes.fit(X_train, y_train)

# Keep the winning score and the best Naive Bayes model (used for stacking later)
naive_bayes_score = gsearch_naive_bayes.best_score_
naive_bayes_tuned = gsearch_naive_bayes.best_estimator_

print(
    "Best score: {}\nBest params: {}".format(
        naive_bayes_score,
        gsearch_naive_bayes.best_params_,
    )
)

Fitting 4 folds for each of 18 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 15.7min finished


Best score: 0.855925
Best params: {'vec__max_df': 0.7, 'vec__min_df': 0.0001, 'vec__ngram_range': (1, 1)}


In [0]:
# Candidate hyperparameters for the TF-IDF + Linear SVC pipeline: the
# vectorizer's max_df plus the classifier's dual flag and C.
params_linear_SVC = {
    'vec__max_df': [0.75, 0.85, 1.0],
    'clf__dual': [True, False],
    'clf__C': [0.1, 1, 10]
}

# Exhaustive search over the grid above, scored by 4-fold CV accuracy
gsearch_linear_SVC = GridSearchCV(
    estimator=linear_SVC_tfidf,
    param_grid=params_linear_SVC,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)
gsearch_linear_SVC.fit(X_train, y_train)

# Keep the best Linear SVC model found by the search together with its score
linear_SVC_tuned = gsearch_linear_SVC.best_estimator_
linear_SVC_score = gsearch_linear_SVC.best_score_

print(
    "Best score: {}\nBest params: {}".format(
        linear_SVC_score,
        gsearch_linear_SVC.best_params_,
    )
)

Fitting 4 folds for each of 18 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 15.1min finished


Best score: 0.8708625000000001
Best params: {'clf__C': 1, 'clf__dual': True, 'vec__max_df': 1.0}


In [0]:
# Persist the tuned Logistic Regression pipeline to LR_model.pkl. The context
# manager closes the file handle as soon as the dump finishes, so the pickle
# is fully written to disk before any later cell reads it.
with open('LR_model.pkl', 'wb') as model_file:
    pickle.dump(log_reg_tuned, model_file)

In [0]:
# Reload the pickled model from LR_model.pkl, closing the file handle afterwards.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# load pickle files that come from a trusted source.
with open('LR_model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)