In [4]:
import pandas as pd
import pickle
from time import time

# utility imports
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Classifier imports
from sklearn.ensemble import RandomForestClassifier

In [5]:
with open('../01_data_preparation/pickle_cleaned_data', 'rb') as data:
    df = pickle.load(data)

In [6]:
df.head()

Unnamed: 0,review_cleaned,rating
0,great working environment good support co work...,5.0
1,enjoyed tough job but loved camaraderie within...,3.0
2,working staff everyday ability work different ...,4.0
3,great opportunity career advancement right peo...,4.0
4,peek sale period casuals get great hour manage...,3.0


In [7]:
reviews_train, reviews_test, rating_train_target, rating_test_target = train_test_split(
    df['review_cleaned'], 
    df['rating'], 
    test_size=0.4, 
    random_state=69, 
    shuffle=True)

In [19]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()), 
    ('RFclf', RandomForestClassifier(random_state=69))
])

parameters = {
    'tfidf__lowercase': (False,),
    'tfidf__ngram_range': ((1, 1), (1, 2)), 
    #'tfidf__max_df': (0.5, 0.75, 1), 
    #'tfidf__min_df': (1, 10, 50), 
    #'tfidf__max_features': (None, 300, 600, 1000), 
    'tfidf__norm': ('l1', 'l2'), 
    'tfidf__use_idf': (True, False), 
    'RFclf__n_estimators': (100, 500, 1000), 
    'RFclf__max_depth': (None, 2, 10, 50, 100), 
    'RFclf__min_samples_split': (2, 5, 10),
    'RFclf__min_samples_leaf': (1, 2, 4), 
    'RFclf__max_features': ('auto', 'sqrt'), 
    'RFclf__bootstrap': (True, False)
}

In [20]:
if __name__ == "__main__":
    rand_search = RandomizedSearchCV(pipeline, parameters, n_iter=100, n_jobs=-1, verbose=2)
    #grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=2)
    t0 = time()
    rand_search.fit(reviews_train, rating_train_target)
    #grid_search.fit(reviews_train, rating_train_target)
    print("done in %0.3fs" % (time() - t0))

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 11.3min
[Parallel(n_jobs=-1)]: Done 361 tasks      | elapsed: 22.1min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 31.9min finished


done in 1917.874s


In [21]:
rand_search.best_score_

0.38749625636418095

In [22]:
rand_search.best_params_

{'tfidf__use_idf': False,
 'tfidf__norm': 'l1',
 'tfidf__ngram_range': (1, 1),
 'tfidf__lowercase': False,
 'RFclf__n_estimators': 500,
 'RFclf__min_samples_split': 2,
 'RFclf__min_samples_leaf': 4,
 'RFclf__max_features': 'auto',
 'RFclf__max_depth': 100,
 'RFclf__bootstrap': False}