In [1]:
import pandas as pd 
import numpy  as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import pickle 

In [19]:
def build_search_model(search_model):
    """this function takes the search model then train and test the model 
    and return predicted labels,confusion_matrix,classification_report, accuracy_score"""
    
    print("search_model---> Done")
    search_model.fit(X_train,y_train)
    print("fitting the model--> Done")
    best_search = search_model.best_estimator_
    print("choosing the best values--->Done")
    y_pred_search=best_search.predict(X_test)
    print("predicting labels---> Done")
    
    V1=y_pred_search
    V2=confusion_matrix(y_test,y_pred_search)
    V3=classification_report(y_test,y_pred_search)
    V4=accuracy_score(y_test, y_pred_search)*100
    V5= best_search
    print("Done")
    #evaluating the grid search best model 
    #return y_pred_search, confusion_matrix(y_test,y_pred_search),classification_report(y_test,y_pred_search), accuracy_score(y_test, y_pred_search)*100
    return  V1, V2, V3, V4, V5


In [2]:
clnd_data=pd.read_csv("correct_labels_.csv")

In [3]:
clnd_data.head()

Unnamed: 0,Reviews,Label
0,sweet smooth taste every drop,1
1,terrific product drops food straight mixed aga...,1
2,best food market mans food leaves hoppy aftert...,1
3,great refreshing taste weak food safe ingredie...,1
4,purchased item make mimosas left mimosas delic...,1


# word embedding 

In [4]:
tfidf_vctrr= TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
tfidf_features = tfidf_vctrr.fit_transform(clnd_data['Reviews']).toarray()

In [5]:
X=tfidf_features
Y=clnd_data["Label"]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(tfidf_features, Y, test_size=0.2, random_state=0)


In [9]:
#y_train

# Building the model

In [10]:
#training the model on 80% of the datset 
rand_forest = RandomForestClassifier(n_estimators=200, random_state=0)
rand_forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [13]:
#testing the model on 20% of the dataset 
#predicting the labels for the test set 
y_pred = rand_forest.predict(X_test)

In [14]:
#evaluating the model 
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print("Accuracy: ",accuracy_score(y_test, y_pred)*100)


[[  6   7  19]
 [  0  32  18]
 [  2  18 382]]
              precision    recall  f1-score   support

          -1       0.75      0.19      0.30        32
           0       0.56      0.64      0.60        50
           1       0.91      0.95      0.93       402

    accuracy                           0.87       484
   macro avg       0.74      0.59      0.61       484
weighted avg       0.86      0.87      0.85       484

Accuracy:  86.77685950413223


# the model performs better without tuning so we will save this random forest model 

# Saving the model 

In [91]:
#saving the model to pickle file 
with open("model_random_forest.pickle","wb") as f:
    pickle.dump(rand_forest,f)

In [92]:
#open the model 
with open("model_random_forest.pickle","rb") as f:
    md=pickle.load(f)

In [93]:
#use the imported model to predict the labels for the test set 
md.predict(X_test)

array([ 1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  1,  1,  0,  1,  1,  1,  1,
       -1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  0,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  1,  0,  0,
        1,  1,  1,  1,  0,  1, -1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  0,  1,  1,  1,
        1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  0,  1,  0,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,  0,  1,
        1,  1,  0,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  0,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1

# searching for the best parameters 

In [16]:
#setting parameters for random search with 3 folds cross validation 
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [17]:
#create instance of Random forest classifier 
rf = RandomForestClassifier()
#random search with 3 folds cross validation & 100 iterations 
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)


In [39]:
pred_rand_search_labels,conf_mtrx_rand_search,class_report_rand_search,acc_score_rand_search,model_rand_search = build_search_model(rf_random)

search_model---> Done
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed: 15.7min
[Parallel(n_jobs=-1)]: Done 158 tasks      | elapsed: 74.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 130.4min finished


fitting the model--> Done
choosing the best values--->Done
predicting labels---> Done
Done


In [49]:
print(conf_mtrx_rand_search,class_report_rand_search,"random search Accuracy: ",acc_score_rand_search)

[[  4   1  27]
 [  0  18  32]
 [  2   3 397]]               precision    recall  f1-score   support

          -1       0.67      0.12      0.21        32
           0       0.82      0.36      0.50        50
           1       0.87      0.99      0.93       402

   micro avg       0.87      0.87      0.87       484
   macro avg       0.79      0.49      0.55       484
weighted avg       0.85      0.87      0.83       484
 random search Accuracy:  86.57024793388429


In [18]:
#grid search parameters 
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [10,100, 200, 300, 1000]
}

# Create an instance of random forest classifier model
rf_G_search = RandomForestClassifier(random_state = 42)

# Create an instance of the grid search model 
grid_search = GridSearchCV(estimator = rf_G_search, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, return_train_score=True)

In [21]:
pred_grid_search_labels,conf_mtrx_grid_search,class_report_grid_search,acc_score_grid_search,model_grid_search = build_search_model(grid_search)

search_model---> Done
Fitting 3 folds for each of 360 candidates, totalling 1080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   29.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 1005 tasks      | elapsed:  9.4min
[Parallel(n_jobs=-1)]: Done 1080 out of 1080 | elapsed:  9.9min finished


fitting the model--> Done
choosing the best values--->Done
predicting labels---> Done
Done


In [24]:
print(conf_mtrx_grid_search,class_report_grid_search,"Grid search Accuracy: ",acc_score_grid_search )

[[  1   0  31]
 [  0   6  44]
 [  0   0 402]]               precision    recall  f1-score   support

          -1       1.00      0.03      0.06        32
           0       1.00      0.12      0.21        50
           1       0.84      1.00      0.91       402

    accuracy                           0.85       484
   macro avg       0.95      0.38      0.40       484
weighted avg       0.87      0.85      0.79       484
 Grid search Accuracy:  84.50413223140497


In [22]:
#saving the model to pickle file 
with open("model_grid_search.pickle","wb") as f:
    pickle.dump(model_grid_search,f)

In [23]:
model_grid_search

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=100, max_features=3, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [37]:
#open the model 
with open("model_grid_search.pickle","rb") as f:
    md2=pickle.load(f)

In [38]:
md2.predict(X_test)

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  0,
        1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1

In [39]:
y_test

2328    1
1742    1
810     1
2014    1
618    -1
1300    1
1454    1
2410    1
2394    0
743    -1
1091    1
646     1
793    -1
2028    1
1815    1
2101    1
667     1
1919    1
1452    1
1205    1
806    -1
1023    1
882    -1
651     0
1248    1
2037    1
1448    1
390     1
1393    1
1476    1
       ..
1607    0
418     1
2408    1
1630    0
1467    1
1724    1
254     1
1397    1
1493    1
978     1
1330    1
634     1
300     1
1430    1
1712    1
259     1
1674    1
30      1
1450    1
1799   -1
574     1
1074    0
963     1
1051    1
2086    1
1814    1
1388    1
2270    1
794     1
682     1
Name: Label, Length: 484, dtype: int64