In [37]:
import pandas as pd
import numpy as np
import pickle
from time import time

# sklearn utility imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import BaseEstimator, TransformerMixin

# sklearn classifier imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

# "gensim" modules
from gensim.models.word2vec import Word2Vec
from gensim.sklearn_api import W2VTransformer

from nltk.tokenize import word_tokenize

In [44]:
# Load the cleaned reviews (the recorded preview shows a DataFrame with
# `review_cleaned` and `sentiment_category` columns).
# NOTE(review): pickle.load can execute arbitrary code while deserializing —
# only load this file when it comes from a trusted source.
with open('../01_data_preparation/pickle_cleaned_data', 'rb') as data:
    df = pickle.load(data)

In [45]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,review_cleaned,sentiment_category
0,-Great working environment with very good supp...,2
1,"I enjoyed what I am doing, it's a tough job, b...",1
2,Working with staff everyday. The ability to wo...,2
3,Great opportunities for career advancement for...,2
4,During peek sales periods; casuals get great h...,1


In [46]:
# Hold out 20% of the reviews as a test set; the fixed seed keeps the shuffled
# split identical from run to run.
split_options = {'test_size': 0.2, 'random_state': 69, 'shuffle': True}
(reviews_train, reviews_test,
 rating_train_target, rating_test_target) = train_test_split(
    df['review_cleaned'], df['sentiment_category'], **split_options)

The model `parameters` below were tuned manually: values were varied one or two parameters at a time and run through `GridSearchCV()` to find optimized settings. (Running a grid search over all parameter options at once is highly time- and resource-consuming for a mid-level PC.)

In [5]:
# Collects one entry per model (name, CV score, test score) for the final comparison.
result = {column: [] for column in ('model', 'train_score', 'test_score')}
result

{'model': [], 'train_score': [], 'test_score': []}

# 1. Random Forest

In [6]:
# TF-IDF text features followed by a random forest (seeded for repeatable fits).
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('RFclf', RandomForestClassifier(random_state=69)),
])

# Search grid keyed as <step name>__<parameter>. Every entry holds a single
# candidate value, so the search evaluates exactly one configuration.
# NOTE(review): 'auto' for max_features is rejected by recent scikit-learn
# releases ('sqrt' is the classifier equivalent) — confirm against the
# installed version.
parameters = dict(
    tfidf__lowercase=(False,),
    tfidf__ngram_range=((1, 2),),
    tfidf__max_df=(0.5,),
    tfidf__min_df=(1,),
    tfidf__max_features=(5000,),
    tfidf__norm=('l2',),
    tfidf__use_idf=(True,),
    RFclf__n_estimators=(100,),
    RFclf__max_depth=(100,),
    RFclf__min_samples_split=(5,),
    RFclf__min_samples_leaf=(1,),
    RFclf__max_features=('auto',),
    RFclf__bootstrap=(True,),
)

In [7]:
if __name__ == "__main__":
    # Cross-validated search over `parameters`, scored by accuracy, using all
    # available cores. RandomizedSearchCV(pipeline, parameters, n_iter=100,
    # n_jobs=-1, verbose=2) was the alternative tried here for larger grids.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        scoring='accuracy',
        n_jobs=-1,
        verbose=2,
    )
    # Time only the fit itself.
    t0 = time()
    grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.8s finished


done in 9.080s


In [8]:
# Best mean cross-validated accuracy found by the search.
print("{:0.3f}".format(grid_search.best_score_))

0.626


In [9]:
# Parameter combination that achieved the best cross-validated accuracy.
grid_search.best_params_

{'RFclf__bootstrap': True,
 'RFclf__max_depth': 100,
 'RFclf__max_features': 'auto',
 'RFclf__min_samples_leaf': 1,
 'RFclf__min_samples_split': 5,
 'RFclf__n_estimators': 100,
 'tfidf__lowercase': False,
 'tfidf__max_df': 0.5,
 'tfidf__max_features': 5000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True}

In [10]:
# Accuracy of the best pipeline on the held-out test reviews.
grid_search.score(reviews_test, rating_test_target)

0.6134800550206327

In [11]:
# Store this model's scores for the final comparison table.
# Note: 'train_score' holds the best mean cross-validation accuracy
# (best_score_), not an accuracy measured on the full training set.
# Re-running this cell appends a duplicate entry.
three_decimals = "{:0.3f}".format
result['model'].append('Random Forest')
result['train_score'].append(three_decimals(grid_search.best_score_))
result['test_score'].append(
    three_decimals(grid_search.score(reviews_test, rating_test_target)))

# 2. Logistic Regression

In [12]:
# TF-IDF text features followed by a (seeded) logistic regression.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('LRclf', LogisticRegression(random_state=69)),
])

# Search grid keyed as <step name>__<parameter>. Every entry holds a single
# candidate value, so the search evaluates exactly one configuration.
parameters = dict(
    tfidf__lowercase=(False,),
    tfidf__ngram_range=((1, 2),),
    tfidf__max_df=(0.5,),
    tfidf__min_df=(1,),
    tfidf__max_features=(5000,),
    tfidf__norm=('l2',),
    tfidf__use_idf=(True,),
    LRclf__penalty=('l2',),
    LRclf__C=(1,),
    LRclf__class_weight=(None,),
    LRclf__solver=('lbfgs',),
    LRclf__multi_class=('auto',),
)

In [13]:
if __name__ == "__main__":
    # Cross-validated search over `parameters`, scored by accuracy, using all
    # available cores.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        scoring='accuracy',
        n_jobs=-1,
        verbose=2,
    )
    # Time only the fit itself.
    t0 = time()
    grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.3s finished


done in 3.065s


In [14]:
# Best mean cross-validated accuracy found by the search.
print("{:0.3f}".format(grid_search.best_score_))

0.635


In [15]:
# Parameter combination that achieved the best cross-validated accuracy.
grid_search.best_params_

{'LRclf__C': 1,
 'LRclf__class_weight': None,
 'LRclf__multi_class': 'auto',
 'LRclf__penalty': 'l2',
 'LRclf__solver': 'lbfgs',
 'tfidf__lowercase': False,
 'tfidf__max_df': 0.5,
 'tfidf__max_features': 5000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True}

In [16]:
# Accuracy of the best pipeline on the held-out test reviews.
grid_search.score(reviews_test, rating_test_target)

0.6327372764786795

In [17]:
# Store this model's scores for the final comparison table.
# Note: 'train_score' holds the best mean cross-validation accuracy
# (best_score_), not an accuracy measured on the full training set.
# Re-running this cell appends a duplicate entry.
three_decimals = "{:0.3f}".format
result['model'].append('Logistic Regression')
result['train_score'].append(three_decimals(grid_search.best_score_))
result['test_score'].append(
    three_decimals(grid_search.score(reviews_test, rating_test_target)))

# 3. Support Vector Machine

In [18]:
# TF-IDF text features followed by a (seeded) support vector classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('SVMclf', svm.SVC(random_state=69)),
])

# Search grid keyed as <step name>__<parameter>. Every entry holds a single
# candidate value, so the search evaluates exactly one configuration.
# NOTE(review): per the scikit-learn docs `degree` only affects the 'poly'
# kernel, so it should have no effect with 'rbf' — confirm.
parameters = dict(
    tfidf__lowercase=(False,),
    tfidf__ngram_range=((1, 2),),
    tfidf__max_df=(0.5,),
    tfidf__min_df=(1,),
    tfidf__max_features=(5000,),
    tfidf__norm=('l2',),
    tfidf__use_idf=(True,),
    SVMclf__C=(1,),
    SVMclf__kernel=('rbf',),
    SVMclf__degree=(1,),
    SVMclf__gamma=('scale',),
)

In [19]:
if __name__ == "__main__":
    # Cross-validated search over `parameters`, scored by accuracy, using all
    # available cores.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        scoring='accuracy',
        n_jobs=-1,
        verbose=2,
    )
    # Time only the fit itself.
    t0 = time()
    grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.4s finished


done in 10.362s


In [20]:
# Best mean cross-validated accuracy found by the search.
print("{:0.3f}".format(grid_search.best_score_))

0.645


In [21]:
# Parameter combination that achieved the best cross-validated accuracy.
grid_search.best_params_

{'SVMclf__C': 1,
 'SVMclf__degree': 1,
 'SVMclf__gamma': 'scale',
 'SVMclf__kernel': 'rbf',
 'tfidf__lowercase': False,
 'tfidf__max_df': 0.5,
 'tfidf__max_features': 5000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True}

In [22]:
# Accuracy of the best pipeline on the held-out test reviews.
grid_search.score(reviews_test, rating_test_target)

0.6341127922971114

In [23]:
# Store this model's scores for the final comparison table.
# Note: 'train_score' holds the best mean cross-validation accuracy
# (best_score_), not an accuracy measured on the full training set.
# Re-running this cell appends a duplicate entry.
three_decimals = "{:0.3f}".format
result['model'].append('Support Vector')
result['train_score'].append(three_decimals(grid_search.best_score_))
result['test_score'].append(
    three_decimals(grid_search.score(reviews_test, rating_test_target)))

# 4. Multinomial Naïve Bayes

In [24]:
# TF-IDF text features followed by a multinomial naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('NBclf', MultinomialNB()),
])

# Search grid keyed as <step name>__<parameter>. Every entry holds a single
# candidate value, so the search evaluates exactly one configuration.
parameters = dict(
    tfidf__lowercase=(False,),
    tfidf__ngram_range=((1, 2),),
    tfidf__max_df=(0.5,),
    tfidf__min_df=(1,),
    tfidf__max_features=(5000,),
    tfidf__norm=('l2',),
    tfidf__use_idf=(True,),
    NBclf__alpha=(1,),
    NBclf__fit_prior=(False,),
)

In [25]:
if __name__ == "__main__":
    # Cross-validated search over `parameters`, scored by accuracy, using all
    # available cores.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        scoring='accuracy',
        n_jobs=-1,
        verbose=2,
    )
    # Time only the fit itself.
    t0 = time()
    grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s finished


done in 1.077s


In [26]:
# Best mean cross-validated accuracy found by the search.
print("{:0.3f}".format(grid_search.best_score_))

0.631


In [27]:
# Parameter combination that achieved the best cross-validated accuracy.
grid_search.best_params_

{'NBclf__alpha': 1,
 'NBclf__fit_prior': False,
 'tfidf__lowercase': False,
 'tfidf__max_df': 0.5,
 'tfidf__max_features': 5000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True}

In [28]:
# Accuracy of the best pipeline on the held-out test reviews.
grid_search.score(reviews_test, rating_test_target)

0.6368638239339752

In [29]:
# Store this model's scores for the final comparison table.
# Note: 'train_score' holds the best mean cross-validation accuracy
# (best_score_), not an accuracy measured on the full training set.
# Re-running this cell appends a duplicate entry.
three_decimals = "{:0.3f}".format
result['model'].append('M Naive Bayes')
result['train_score'].append(three_decimals(grid_search.best_score_))
result['test_score'].append(
    three_decimals(grid_search.score(reviews_test, rating_test_target)))

# 5. K Nearest Neighbors

In [30]:
# TF-IDF text features followed by a k-nearest-neighbours classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('KNNclf', KNeighborsClassifier()),
])

# Search grid keyed as <step name>__<parameter>. Every entry holds a single
# candidate value, so the search evaluates exactly one configuration.
parameters = dict(
    tfidf__lowercase=(False,),
    tfidf__ngram_range=((1, 2),),
    tfidf__max_df=(0.5,),
    tfidf__min_df=(1,),
    tfidf__max_features=(5000,),
    tfidf__norm=('l2',),
    tfidf__use_idf=(True,),
    KNNclf__n_neighbors=(30,),
    KNNclf__weights=('distance',),
    KNNclf__algorithm=('auto',),
    KNNclf__leaf_size=(30,),
    KNNclf__p=(2,),
)

In [31]:
if __name__ == "__main__":
    # Cross-validated search over `parameters`, scored by accuracy, using all
    # available cores.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        scoring='accuracy',
        n_jobs=-1,
        verbose=2,
    )
    # Time only the fit itself.
    t0 = time()
    grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.2s finished


done in 1.442s


In [32]:
# Best mean cross-validated accuracy found by the search.
print("{:0.3f}".format(grid_search.best_score_))

0.606


In [33]:
# Parameter combination that achieved the best cross-validated accuracy.
grid_search.best_params_

{'KNNclf__algorithm': 'auto',
 'KNNclf__leaf_size': 30,
 'KNNclf__n_neighbors': 30,
 'KNNclf__p': 2,
 'KNNclf__weights': 'distance',
 'tfidf__lowercase': False,
 'tfidf__max_df': 0.5,
 'tfidf__max_features': 5000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True}

In [34]:
# Accuracy of the best pipeline on the held-out test reviews.
grid_search.score(reviews_test, rating_test_target)

0.6217331499312242

In [35]:
# Store this model's scores for the final comparison table.
# Note: 'train_score' holds the best mean cross-validation accuracy
# (best_score_), not an accuracy measured on the full training set.
# Re-running this cell appends a duplicate entry.
three_decimals = "{:0.3f}".format
result['model'].append('K Nearest Neighbors')
result['train_score'].append(three_decimals(grid_search.best_score_))
result['test_score'].append(
    three_decimals(grid_search.score(reviews_test, rating_test_target)))

# 6. Gradient Boosting

In [36]:
# TF-IDF text features followed by (seeded) gradient-boosted trees.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('GBclf', GradientBoostingClassifier(random_state=69)),
])

# Search grid keyed as <step name>__<parameter>. Only min_samples_split
# (2 values) and max_depth (3 values) vary, giving 6 candidate configurations.
# NOTE(review): the 'deviance' loss was renamed 'log_loss' in recent
# scikit-learn releases — confirm against the installed version.
parameters = dict(
    tfidf__lowercase=(False,),
    tfidf__ngram_range=((1, 2),),
    tfidf__max_df=(0.5,),
    tfidf__min_df=(1,),
    tfidf__max_features=(5000,),
    tfidf__norm=('l2',),
    tfidf__use_idf=(True,),
    GBclf__loss=('deviance',),
    GBclf__learning_rate=(0.1,),
    GBclf__n_estimators=(200,),
    GBclf__min_samples_split=(50, 100),
    GBclf__min_samples_leaf=(2,),
    GBclf__max_depth=(1, 3, 5),
    GBclf__max_features=('sqrt',),
)

In [37]:
if __name__ == "__main__":
    # Cross-validated search over `parameters`, scored by accuracy, using all
    # available cores.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        scoring='accuracy',
        n_jobs=-1,
        verbose=2,
    )
    # Time only the fit itself.
    t0 = time()
    grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   46.6s finished


done in 48.937s


In [38]:
# Best mean cross-validated accuracy found by the search.
print("{:0.3f}".format(grid_search.best_score_))

0.614


In [39]:
# Parameter combination that achieved the best cross-validated accuracy.
grid_search.best_params_

{'GBclf__learning_rate': 0.1,
 'GBclf__loss': 'deviance',
 'GBclf__max_depth': 3,
 'GBclf__max_features': 'sqrt',
 'GBclf__min_samples_leaf': 2,
 'GBclf__min_samples_split': 50,
 'GBclf__n_estimators': 200,
 'tfidf__lowercase': False,
 'tfidf__max_df': 0.5,
 'tfidf__max_features': 5000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True}

In [40]:
# Accuracy of the best pipeline on the held-out test reviews.
grid_search.score(reviews_test, rating_test_target)

0.5997248968363136

In [41]:
# Store this model's scores for the final comparison table.
# Note: 'train_score' holds the best mean cross-validation accuracy
# (best_score_), not an accuracy measured on the full training set.
# Re-running this cell appends a duplicate entry.
three_decimals = "{:0.3f}".format
result['model'].append('Gradient Boosting')
result['train_score'].append(three_decimals(grid_search.best_score_))
result['test_score'].append(
    three_decimals(grid_search.score(reviews_test, rating_test_target)))

# Comparing Models' Performance

In [42]:
# Side-by-side comparison of every recorded model; the scores were stored as
# strings formatted to three decimals, so the columns are text, not floats.
pd.DataFrame.from_dict(result)

Unnamed: 0,model,train_score,test_score
0,Random Forest,0.626,0.613
1,Logistic Regression,0.635,0.633
2,Support Vector,0.645,0.634
3,M Naive Bayes,0.631,0.637
4,K Nearest Neighbors,0.606,0.622
5,Gradient Boosting,0.614,0.6


# Word2Vec

To apply `Gensim`'s `word2vec` model, we need to create two customized `sklearn`-compatible transformers: one that tokenizes the input text for the `gensim` vectorizer, and another that reduces the dimension of the output by averaging the word vectors of each review before feeding the model.

Reference: [GitHub](https://github.com/nadbordrozd/blog_stuff/blob/master/classification_w2v/benchmarking_python3.ipynb)

In [47]:
# sklearn compatible tokenizer class

class custom_tokenizer(BaseEstimator, TransformerMixin):
    '''Stateless sklearn-style transformer that turns raw texts into token lists.

    Parameters
    ----------
    arg : object, optional
        Stored unchanged on the instance; not read by ``fit`` or ``transform``.
    '''

    def __init__(self, arg=None):
        self.arg = arg

    def fit(self, X, y=None):
        '''Nothing is learned; return the transformer itself so calls can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Tokenize every text in ``X`` with ``word_tokenize``.

        Returns a list holding one list of tokens per input text, in input order.
        '''
        return list(map(word_tokenize, X))
        

In [48]:
class custom_array_mean(BaseEstimator, TransformerMixin):
    '''Average the word vectors of each tokenized document into a single vector.

    Parameters
    ----------
    wordvecs : mapping of word -> 1-D vector
        Lookup table of word vectors. It must contain at least one entry; the
        length of the first entry is taken as the vector dimension.

    Attributes
    ----------
    dim : int
        Length of the word vectors (read from the first entry of ``wordvecs``).

    Raises
    ------
    ValueError
        If ``wordvecs`` is empty, since no vector dimension can be derived.
    '''

    def __init__(self, wordvecs):
        self.wordvecs = wordvecs
        # Check for an empty lookup explicitly: a bare next(iter(...)) would
        # leak a StopIteration out of the constructor, which is hard to diagnose.
        missing = object()
        first_word = next(iter(wordvecs), missing)
        if first_word is missing:
            raise ValueError("wordvecs must contain at least one word vector")
        self.dim = len(wordvecs[first_word])

    def fit(self, X, y=None):
        '''Nothing is learned; return the transformer itself so calls can be chained.'''
        return self

    def transform(self, X, y=None):
        '''Map each token list in ``X`` to the mean of its known word vectors.

        Words missing from ``wordvecs`` are skipped. A document with no known
        word at all is represented by a zero vector of length ``dim``.

        Returns
        -------
        numpy.ndarray
            One row per document in ``X``.
        '''
        averaged = []
        for words in X:
            known = [self.wordvecs[word] for word in words if word in self.wordvecs]
            # Fall back to a single zero vector so the mean is always defined.
            averaged.append(np.mean(known or [np.zeros(self.dim)], axis=0))
        return np.array(averaged)

In [49]:
# Create a list of token lists (one per review) to train gensim's word2vec model.
# Note: this tokenizes every review in `df`, i.e. rows that ended up in the
# train split as well as rows in the test split.
corpus = custom_tokenizer().fit_transform(df['review_cleaned'])

In [50]:
# Train gensim's word2vec model on the tokenized reviews (50-dimensional
# vectors, context window of 5, words seen fewer than 3 times are dropped).
w2v_model = Word2Vec(corpus, size=50, window=5, min_count=3, workers=2)
# Build the word -> vector lookup through item access on the KeyedVectors
# instead of zipping with `wv.syn0`, which gensim has deprecated in favour of
# `wv.vectors`; the looked-up vectors are the same rows.
wordvecs = {word: w2v_model.wv[word] for word in w2v_model.wv.index2word}

  This is separate from the ipykernel package so we can avoid doing imports until


In [67]:
# Optional alternative: use pre-trained GloVe vectors instead of the
# word2vec vectors trained on the reviews.
# see https://nlp.stanford.edu/projects/glove/
#
# NOTE: running this cell replaces the `wordvecs` built by the word2vec cell.

wordvecs = {}

all_words = [word for sentence in corpus for word in sentence]
# Membership is tested once per GloVe line; a set makes each test O(1)
# instead of scanning the full token list every time.
vocabulary = set(all_words)

# The file is read as bytes and only the word itself is decoded, so the
# numeric fields are handed to numpy exactly as stored.
with open("glove.6B.50d.txt", "rb") as model:
    for line in model:
        tokenized_line = line.split()
        if not tokenized_line:
            # Skip blank lines rather than failing on tokenized_line[0].
            continue
        word = tokenized_line[0].decode('utf8')
        # Keep only vectors for words that actually occur in the reviews.
        if word in vocabulary:
            vectors = np.array(tokenized_line[1:], dtype=np.float32)

            wordvecs[word] = vectors

In [69]:
# Tokenize each review, average its word vectors from `wordvecs`, then
# classify the averaged vector with a (seeded) support vector classifier.
pipeline = Pipeline(steps=[
    ('tknz', custom_tokenizer()),
    ('w2vec', custom_array_mean(wordvecs)),
    ('SVMclf', svm.SVC(random_state=69)),
])

# Search grid keyed as <step name>__<parameter>. Every entry holds a single
# candidate value, so the search evaluates exactly one configuration.
parameters = dict(
    SVMclf__C=(1,),
    SVMclf__kernel=('rbf',),
    SVMclf__degree=(1,),
    SVMclf__gamma=('scale',),
)

In [70]:
if __name__ == "__main__":
    # Cross-validated search over `parameters`, scored by accuracy, using all
    # available cores.
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=parameters,
        scoring='accuracy',
        n_jobs=-1,
        verbose=2,
    )
    # Time only the fit itself.
    t0 = time()
    grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   31.4s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   31.4s finished


done in 34.586s


In [71]:
# Best mean cross-validated accuracy found by the search.
print("{:0.3f}".format(grid_search.best_score_))

0.565


In [72]:
# Accuracy of the best pipeline on the held-out test reviews.
grid_search.score(reviews_test, rating_test_target)

0.579092159559835