In [260]:
import pandas as pd
import numpy as np
import pickle
from time import time

# utility imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Classifier imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# "gensim" modules
from gensim.sklearn_api import W2VTransformer

from nltk.tokenize import word_tokenize

In [124]:
with open('../01_data_preparation/pickle_cleaned_data', 'rb') as data:
    df = pickle.load(data)

In [125]:
df.head()

Unnamed: 0,review_cleaned,sentiment_category
0,great working environment good support co work...,2
1,enjoyed tough job but loved camaraderie within...,1
2,working staff everyday ability work different ...,2
3,great opportunity career advancement right peo...,2
4,peek sale period casuals get great hour manage...,1


In [126]:
reviews_train, reviews_test, rating_train_target, rating_test_target = train_test_split(
    df['review_cleaned'], 
    df['sentiment_category'], 
    test_size=0.4, 
    random_state=69, 
    shuffle=True)

following model `parameters` have been manually altered one by one or two by two and run through `GridSearchCV()` for optimized parameters (Running grid search on all parameters option is highly time and resource consuming for a mid-level PC)

# 1. Random Forest

In [127]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('RFclf', RandomForestClassifier(random_state=69))
])

parameters = {
    'tfidf__lowercase': (False,),
    'tfidf__ngram_range': ((1, 2),), 
    'tfidf__max_df': (0.5,), 
    'tfidf__min_df': (1,), 
    'tfidf__max_features': (5000,), 
    'tfidf__norm': ('l2',), 
    'tfidf__use_idf': (True,), 
    'RFclf__n_estimators': (100,), 
    'RFclf__max_depth': (100,), 
    'RFclf__min_samples_split': (5,),
    'RFclf__min_samples_leaf': (1,), 
    'RFclf__max_features': ('auto',), 
    'RFclf__bootstrap': (True,)
}

In [128]:
if __name__ == "__main__":
    #rand_search = RandomizedSearchCV(pipeline, parameters, n_iter=100, n_jobs=-1, verbose=2)
    grid_search = GridSearchCV(pipeline, parameters, scoring='accuracy', n_jobs=-1, verbose=2)
    t0 = time()
    #rand_search.fit(reviews_train, rating_train_target)
    grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.9s finished


done in 15.715s


In [129]:
print("{:0.3f}".format(grid_search.best_score_))

0.627


In [130]:
grid_search.best_params_

{'RFclf__bootstrap': True,
 'RFclf__max_depth': 100,
 'RFclf__max_features': 'auto',
 'RFclf__min_samples_leaf': 1,
 'RFclf__min_samples_split': 5,
 'RFclf__n_estimators': 100,
 'tfidf__lowercase': False,
 'tfidf__max_df': 0.5,
 'tfidf__max_features': 5000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True}

In [132]:
grid_search.score(reviews_test, rating_test_target)

0.6083123425692695

# 2. Logistic Regression

In [170]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('LRclf', LogisticRegression(random_state=69))
])

parameters = {
    'tfidf__lowercase': (False,),
    'tfidf__ngram_range': ((1, 2),), 
    'tfidf__max_df': (0.5,), 
    'tfidf__min_df': (1,), 
    'tfidf__max_features': (5000,), 
    'tfidf__norm': ('l2',), 
    'tfidf__use_idf': (True,), 
    'LRclf__penalty': ('l2',),
    'LRclf__C': (1,), 
    'LRclf__class_weight': (None,), 
    'LRclf__solver': ('lbfgs',), 
    'LRclf__multi_class': ('auto',),
}

In [166]:
if __name__ == "__main__":
    grid_search = GridSearchCV(pipeline, parameters, scoring='accuracy', n_jobs=-1, verbose=2)
    t0 = time()
    grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.8s finished


done in 4.408s


In [167]:
print("{:0.3f}".format(grid_search.best_score_))

0.631


In [168]:
grid_search.best_params_

{'LRclf__C': 1,
 'LRclf__class_weight': None,
 'LRclf__multi_class': 'auto',
 'LRclf__penalty': 'l2',
 'LRclf__solver': 'lbfgs',
 'tfidf__lowercase': False,
 'tfidf__max_df': 0.5,
 'tfidf__max_features': 5000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True}

In [169]:
grid_search.score(reviews_test, rating_test_target)

0.6385390428211587

# 3. Spport Vector Machine

In [192]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('SVMclf', svm.SVC(random_state=69))
])

parameters = {
    'tfidf__lowercase': (False,),
    'tfidf__ngram_range': ((1, 2),), 
    'tfidf__max_df': (0.5,), 
    'tfidf__min_df': (1,), 
    'tfidf__max_features': (5000,), 
    'tfidf__norm': ('l2',), 
    'tfidf__use_idf': (True,), 
    'SVMclf__C': (1,), 
    'SVMclf__kernel': ('rbf',), 
    'SVMclf__degree': (1,), 
    'SVMclf__gamma': ('scale',), 
}

In [188]:
if __name__ == "__main__":
    grid_search = GridSearchCV(pipeline, parameters, scoring='accuracy', n_jobs=-1, verbose=2)
    t0 = time()
    grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   22.5s finished


done in 24.620s


In [189]:
print("{:0.3f}".format(grid_search.best_score_))

0.634


In [190]:
grid_search.best_params_

{'SVMclf__C': 1,
 'SVMclf__degree': 1,
 'SVMclf__gamma': 'scale',
 'SVMclf__kernel': 'rbf',
 'tfidf__lowercase': False,
 'tfidf__max_df': 0.5,
 'tfidf__max_features': 5000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True}

In [191]:
grid_search.score(reviews_test, rating_test_target)

0.6353904282115869

# 4. Multinomial Naïve Bayes

In [205]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('NBclf', MultinomialNB())
])

parameters = {
    'tfidf__lowercase': (False,),
    'tfidf__ngram_range': ((1, 2),), 
    'tfidf__max_df': (0.5,), 
    'tfidf__min_df': (1,), 
    'tfidf__max_features': (5000,), 
    'tfidf__norm': ('l2',), 
    'tfidf__use_idf': (True,), 
    'NBclf__alpha': (1,), 
    'NBclf__fit_prior': (False,), 
}

In [206]:
if __name__ == "__main__":
    grid_search = GridSearchCV(pipeline, parameters, scoring='accuracy', n_jobs=-1, verbose=2)
    t0 = time()
    grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


done in 0.880s


[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.7s finished


In [207]:
print("{:0.3f}".format(grid_search.best_score_))

0.629


In [208]:
grid_search.best_params_

{'NBclf__alpha': 1,
 'NBclf__fit_prior': False,
 'tfidf__lowercase': False,
 'tfidf__max_df': 0.5,
 'tfidf__max_features': 5000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True}

In [209]:
grid_search.score(reviews_test, rating_test_target)

0.6265743073047859

# 5. K Nearest Neighbors

In [258]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('KNNclf', KNeighborsClassifier())
])

parameters = {
    'tfidf__lowercase': (False,),
    'tfidf__ngram_range': ((1, 2),), 
    'tfidf__max_df': (0.5,), 
    'tfidf__min_df': (1,), 
    'tfidf__max_features': (5000,), 
    'tfidf__norm': ('l2',), 
    'tfidf__use_idf': (True,), 
    'KNNclf__n_neighbors': (30,), 
    'KNNclf__weights': ('distance',),
    'KNNclf__algorithm': ('auto',), 
    'KNNclf__leaf_size': (30,), 
    'KNNclf__p': (2,),
}

In [254]:
if __name__ == "__main__":
    grid_search = GridSearchCV(pipeline, parameters, scoring='accuracy', n_jobs=-1, verbose=2)
    t0 = time()
    grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


done in 1.757s


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.6s finished


In [255]:
print("{:0.3f}".format(grid_search.best_score_))

0.615


In [256]:
grid_search.best_params_

{'KNNclf__algorithm': 'auto',
 'KNNclf__leaf_size': 30,
 'KNNclf__n_neighbors': 30,
 'KNNclf__p': 2,
 'KNNclf__weights': 'distance',
 'tfidf__lowercase': False,
 'tfidf__max_df': 0.5,
 'tfidf__max_features': 5000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True}

In [257]:
grid_search.score(reviews_test, rating_test_target)

0.6001259445843828

# 6. Gradient Boosting

In [298]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('GBclf', GradientBoostingClassifier(random_state=69))
])

parameters = {
    'tfidf__lowercase': (False,),
    'tfidf__ngram_range': ((1, 2),), 
    'tfidf__max_df': (0.5,), 
    'tfidf__min_df': (1,), 
    'tfidf__max_features': (5000,), 
    'tfidf__norm': ('l2',), 
    'tfidf__use_idf': (True,), 
    'GBclf__loss': ('deviance',), 
    'GBclf__learning_rate': (0.1,), 
    'GBclf__n_estimators': (200,), 
    'GBclf__min_samples_split': (50, 100), 
    'GBclf__min_samples_leaf': (2,),
    'GBclf__max_depth': (1, 3, 5), 
    'GBclf__max_features': ('sqrt',)
    
}

In [299]:
if __name__ == "__main__":
    grid_search = GridSearchCV(pipeline, parameters, scoring='accuracy', n_jobs=-1, verbose=2)
    t0 = time()
    grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   32.8s finished


done in 34.957s


In [300]:
print("{:0.3f}".format(grid_search.best_score_))

0.606


In [301]:
grid_search.best_params_

{'GBclf__learning_rate': 0.1,
 'GBclf__loss': 'deviance',
 'GBclf__max_depth': 3,
 'GBclf__max_features': 'sqrt',
 'GBclf__min_samples_leaf': 2,
 'GBclf__min_samples_split': 100,
 'GBclf__n_estimators': 200,
 'tfidf__lowercase': False,
 'tfidf__max_df': 0.5,
 'tfidf__max_features': 5000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True}

In [302]:
grid_search.score(reviews_test, rating_test_target)

0.6051637279596978