In [23]:
import pandas as pd
import pickle
from time import time

# utility imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Classifier imports
from sklearn.ensemble import RandomForestClassifier

In [127]:
with open('../01_data_preparation/pickle_cleaned_data', 'rb') as data:
    df = pickle.load(data)

In [128]:
df.head()

Unnamed: 0,review_cleaned,sentiment_category
0,great working environment good support co work...,2
1,enjoyed tough job but loved camaraderie within...,1
2,working staff everyday ability work different ...,2
3,great opportunity career advancement right peo...,2
4,peek sale period casuals get great hour manage...,1


In [129]:
reviews_train, reviews_test, rating_train_target, rating_test_target = train_test_split(
    df['review_cleaned'], 
    df['sentiment_category'], 
    test_size=0.4, 
    random_state=69, 
    shuffle=True)

Below `parameters` have been manually altered one by one or two by two and run through `GridSearchCV()` for optimized parameters (Running grid search on all parameters option is highly time and resource consuming for a mid-level PC)

In [130]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('RFclf', RandomForestClassifier(random_state=69))
])

parameters = {
    'tfidf__lowercase': (False,),
    'tfidf__ngram_range': ((1, 2),), 
    'tfidf__max_df': (0.5,), 
    'tfidf__min_df': (1,), 
    'tfidf__max_features': (5000,), 
    'tfidf__norm': ('l2',), 
    'tfidf__use_idf': (True,), 
    'RFclf__n_estimators': (100,), 
    'RFclf__max_depth': (100,), 
    'RFclf__min_samples_split': (5,),
    'RFclf__min_samples_leaf': (1,), 
    'RFclf__max_features': ('auto',), 
    'RFclf__bootstrap': (True,)
}

In [131]:
if __name__ == "__main__":
    #rand_search = RandomizedSearchCV(pipeline, parameters, n_iter=100, n_jobs=-1, verbose=2)
    grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=2)
    t0 = time()
    #rand_search.fit(reviews_train, rating_train_target)
    grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.9s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.9s finished


done in 7.674s


In [132]:
print("{:0.3f}".format(grid_search.best_score_))

0.627


In [133]:
grid_search.best_params_

{'RFclf__bootstrap': True,
 'RFclf__max_depth': 100,
 'RFclf__max_features': 'auto',
 'RFclf__min_samples_leaf': 1,
 'RFclf__min_samples_split': 5,
 'RFclf__n_estimators': 100,
 'tfidf__lowercase': False,
 'tfidf__max_df': 0.5,
 'tfidf__max_features': 5000,
 'tfidf__min_df': 1,
 'tfidf__ngram_range': (1, 2),
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True}