<a href="https://colab.research.google.com/github/lupis30puc/BERT_interpretation_with_RF/blob/main/RF_mimic_BERT_grid.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Set Up

In [None]:
# Mount Google Drive so the data and saved models under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pickle
import time

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.model_selection import cross_val_score, GridSearchCV

%matplotlib inline


## Initializing the values for the model
x_train, x_test, y_train, y_test

In [None]:
# Load the train/test feature tables from Drive.
# NOTE(review): the file names suggest binary word-presence features — confirm
# against the notebook that produced them.
# pd.read_pickle unpickles arbitrary objects; only load files you created yourself.
x_train = pd.read_pickle('/content/drive/MyDrive/Yelp/model_128_/binary_tr_words')
x_test = pd.read_pickle('/content/drive/MyDrive/Yelp/model_128_/binary_ts_words')

In [None]:
# Load the train/test labels from Drive.
# NOTE(review): "pred_labels" suggests these are labels predicted by another model
# (the notebook name mentions mimicking BERT) rather than ground truth — confirm.
y_train = torch.load('/content/drive/MyDrive/Yelp/model_128_/pred_labels_train')
y_test = torch.load('/content/drive/MyDrive/Yelp/model_128_/pred_labels_test')

## Grid search
I performed a grid search to determine the best parameters for the Random Forest model, using 10-fold cross-validation.

Only two parameters were tuned: the number of estimators (trees) and the minimum number of samples per leaf.

It takes approximately 5 h 40 min.

In [None]:
# Hyper-parameter grid to explore: each key is an estimator parameter and each
# value is the sequence of candidate settings to try for it.
param_grid = {
    'min_samples_leaf': [2, 3],
    'n_estimators': [600, 700, 800, 900],
}

In [None]:
# Base estimator; the fixed seed makes every fit in the search reproducible.
rmfr = RandomForestClassifier(random_state=42)

# Exhaustive search over `param_grid`, scoring each candidate with 10-fold
# cross-validation. n_jobs=-1 runs the independent candidate/fold fits in
# parallel on all available cores; because the seed is fixed this only shortens
# the wall time of the multi-hour search, it does not change the scores.
grid_search = GridSearchCV(estimator=rmfr, param_grid=param_grid, cv=10, n_jobs=-1)

In [None]:
%%time
# Fit every parameter combination in `param_grid` under cross-validation.
# This is the long-running step of the notebook (see the timing output).
grid_search.fit(x_train, y_train) # Run fit with all sets of parameters.
# A plain fit of the un-tuned forest was left disabled here:
#rmfr.fit(x_train,y_train)

CPU times: user 3h 59min 58s, sys: 3.46 s, total: 4h 1s
Wall time: 4h 37s


GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=42,
                                 

In [None]:
# NOTE: despite its name, `best_model` holds the parameter dict of the best
# candidate, not a fitted estimator.
best_model = grid_search.best_params_  # Parameter setting that gave the best results on the hold out data.
print(best_model)

{'min_samples_leaf': 2, 'n_estimators': 700}


In [None]:
# Persist the whole fitted GridSearchCV object (compressed) to Drive so the
# multi-hour search does not have to be re-run.
joblib.dump(grid_search, "/content/drive/MyDrive/Yelp/model_128_/rf_grid_above_400.joblib", compress=3)

['/content/drive/MyDrive/Yelp/model_128_/rf_grid_above_400.joblib']

In [None]:
# Reload the search that was just saved. `rf_700` is the GridSearchCV object
# itself (not a bare forest); it is used through `.predict` in the next cell.
rf_700 = joblib.load("/content/drive/MyDrive/Yelp/model_128_/rf_grid_above_400.joblib")

In [None]:
# Evaluate the reloaded grid search on the test split: confusion matrix,
# accuracy (in percent) and the per-class classification report.
print("Random Forest 700 estimators, min leaf 2")
predrf_grid = rf_700.predict(x_test)

print("Confusion Matrix for Random Forest Classifier:")
print(confusion_matrix(y_true=y_test, y_pred=predrf_grid))

print("Score:", round(accuracy_score(y_true=y_test, y_pred=predrf_grid) * 100, 2))

print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=predrf_grid))

Random Forest 700 estimators, min leaf 2
Confusion Matrix for Random Forest Classifier:
[[1163  113]
 [ 163 1160]]
Score: 89.38
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      1276
           1       0.91      0.88      0.89      1323

    accuracy                           0.89      2599
   macro avg       0.89      0.89      0.89      2599
weighted avg       0.89      0.89      0.89      2599



In [None]:
# Keep a handle on the estimator the search selected as best.
random_f = grid_search.best_estimator_ 

In [None]:
# Display the cross-validated score of the best parameter combination.
grid_search.best_score_ 

0.8961906789072334

In [None]:
# Accuracy of the best estimator from the grid search on both splits;
# a large gap between the two numbers hints at overfitting.
print("Test  Accuracy : %.2f" % random_f.score(X=x_test, y=y_test))
print("Train Accuracy : %.2f" % random_f.score(X=x_train, y=y_train))

Test  Accuracy : 0.89
Train Accuracy : 0.98


In [None]:
# Save the current `grid_search` object under the Feb 03 file name.
# NOTE(review): the notes at the bottom of this cell describe a different grid
# than the `param_grid` defined earlier in this notebook — confirm which search
# this file is meant to hold before overwriting it.
joblib.dump(grid_search, "/content/drive/MyDrive/Yelp/model_128_/rf_grid_search_feb_03.joblib", compress=3)


# Record of another search (grid and its best parameters):
#param_grid = {'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9, 10], 'n_estimators': [200, 300, 1000] }

# best_params = {'min_samples_leaf': 3, 'n_estimators': 1000}

['/content/drive/MyDrive/Yelp/model_128_/rf_grid_search_feb_03.joblib']

In [None]:
# TODO: plot train vs. test accuracy across the grid to find the settings where the two curves are very close to each other or cross.

## Initializing the models

In [None]:
# Load three previously saved grid-search objects from Drive. They are already
# fitted, so nothing has to be re-initialised or re-trained here.
# joblib.load unpickles arbitrary objects; only load files you created yourself.
rf_grid1 = joblib.load("/content/drive/MyDrive/Yelp/random_forest/rf_grid_search_feb_03.joblib")
rf_grid2 = joblib.load("/content/drive/MyDrive/Yelp/random_forest/rf_grid_not_1000.joblib")
rf_grid3 = joblib.load("/content/drive/MyDrive/Yelp/model_128_/rf_grid_above_400.joblib")

In [None]:
# Best parameter combination found by the first saved search.
print(rf_grid1.best_params_)

{'min_samples_leaf': 3, 'n_estimators': 1000}


In [None]:
# Show the configuration of the first saved search as a dict. `get_params` is a
# method and must be called; the bare attribute only displays the bound-method repr.
rf_grid1.get_params()

<bound method BaseEstimator.get_params of GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_s

In [None]:
# Summarise the cross-validation results of the first search as a compact table,
# best candidate first, instead of dumping the raw dict of arrays.
pd.DataFrame(rf_grid1.cv_results_).loc[
    :,
    [
        'param_min_samples_leaf',
        'param_n_estimators',
        'mean_test_score',
        'std_test_score',
        'rank_test_score',
        'mean_fit_time',
    ],
].sort_values('rank_test_score')

{'mean_fit_time': array([ 55.54081461,  82.90926821, 273.24089348,  44.40179167,
         65.78138323, 218.3534724 ,  38.71027176,  57.88587589,
        192.15202942,  35.31426017,  52.37427707, 172.94383399,
         32.9204145 ,  49.10529478, 159.11007044,  30.55402312,
         45.27990081, 149.44918005,  28.86918001,  42.85899184,
        141.48494241,  27.5933332 ,  41.09448581, 135.22420564,
         26.13492577,  38.93934672, 128.21310306]),
 'mean_score_time': array([0.19013646, 0.26091881, 0.73954639, 0.16870031, 0.22845831,
        0.60385084, 0.15839217, 0.20378625, 0.54536035, 0.14600222,
        0.19062035, 0.51145914, 0.14231076, 0.1949301 , 0.47538004,
        0.13539629, 0.18288703, 0.46241152, 0.13231564, 0.17350538,
        0.44282076, 0.12728198, 0.16603389, 0.43349113, 0.12460036,
        0.16023617, 0.41210139]),
 'mean_test_score': array([0.89224643, 0.89234203, 0.89397775, 0.89292034, 0.89407437,
        0.89503656, 0.8900335 , 0.89272701, 0.89234258, 0.89022553,

In [None]:
# Best parameter combination found by the second saved search.
print(rf_grid2.best_params_)

{'min_samples_leaf': 2, 'n_estimators': 400}


In [None]:
# Best parameter combination found by the third saved search.
print(rf_grid3.best_params_)

{'min_samples_leaf': 2, 'n_estimators': 700}


## Comparing the models

In [None]:
# Best estimator of the first search, evaluated on the test split:
# confusion matrix, accuracy (in percent) and per-class report.
random_f1 = rf_grid1.best_estimator_
print("Random Forest 1000 estimators, min leaf 3")
predrf_grid1 = random_f1.predict(x_test)

print("Confusion Matrix for Random Forest Classifier:")
print(confusion_matrix(y_true=y_test, y_pred=predrf_grid1))

print("Score:", round(accuracy_score(y_true=y_test, y_pred=predrf_grid1) * 100, 2))

print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=predrf_grid1))

Random Forest 1000 estimators, min leaf 3
Confusion Matrix for Random Forest Classifier:
[[1162  114]
 [ 157 1166]]
Score: 89.57
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.91      0.90      1276
           1       0.91      0.88      0.90      1323

    accuracy                           0.90      2599
   macro avg       0.90      0.90      0.90      2599
weighted avg       0.90      0.90      0.90      2599



In [None]:
# Best estimator of the second search, evaluated on the test split:
# confusion matrix, accuracy (in percent) and per-class report.
random_f2 = rf_grid2.best_estimator_
print("Random Forest 400 estimators, min leaf 2")
predrf_grid2 = random_f2.predict(x_test)

print("Confusion Matrix for Random Forest Classifier:")
print(confusion_matrix(y_true=y_test, y_pred=predrf_grid2))

print("Score:", round(accuracy_score(y_true=y_test, y_pred=predrf_grid2) * 100, 2))

print("Classification Report:")
print(classification_report(y_true=y_test, y_pred=predrf_grid2))

Random Forest 400 estimators, min leaf 2
Confusion Matrix for Random Forest Classifier:
[[1161  115]
 [ 158 1165]]
Score: 89.5
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      1276
           1       0.91      0.88      0.90      1323

    accuracy                           0.89      2599
   macro avg       0.90      0.90      0.89      2599
weighted avg       0.90      0.89      0.89      2599



In [None]:
 random_f3 = rf_grid3.best_estimator_ 
print("Random Forest 700 estimators, min leaf 2")
predrf_grid3 = random_f3.predict(x_test)
print("Confusion Matrix for Random Forest Classifier:")
print(confusion_matrix(y_test,predrf_grid3))
print("Score:",round(accuracy_score(y_test,predrf_grid3)*100,2))
print("Classification Report:")
print(classification_report(y_test,predrf_grid3))

Random Forest 700 estimators, min leaf 2
Confusion Matrix for Random Forest Classifier:
[[1163  113]
 [ 163 1160]]
Score: 89.38
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.91      0.89      1276
           1       0.91      0.88      0.89      1323

    accuracy                           0.89      2599
   macro avg       0.89      0.89      0.89      2599
weighted avg       0.89      0.89      0.89      2599



In [None]:
# Train vs. test accuracy for the best estimator of the first search.
print("Random Forest 1000 estimators, min leaf 3")
print("Test  Accuracy : %f" % random_f1.score(X=x_test, y=y_test))
print("Train Accuracy : %f" % random_f1.score(X=x_train, y=y_train))

Random Forest 1000 estimators, min leaf 3
Test  Accuracy : 0.895729
Train Accuracy : 0.951703


In [None]:
# Train vs. test accuracy for the best estimator of the second search.
print("Random Forest 400 estimators, min leaf 2")
print("Test  Accuracy : %f" % random_f2.score(X=x_test, y=y_test))
print("Train Accuracy : %f" % random_f2.score(X=x_train, y=y_train))

Random Forest 400 estimators, min leaf 2
Test  Accuracy : 0.894960
Train Accuracy : 0.976910


In [None]:
# Train vs. test accuracy for the best estimator of the third search.
print("Random Forest 700 estimators, min leaf 2")
print("Test  Accuracy : %f" % random_f3.score(X=x_test, y=y_test))
print("Train Accuracy : %f" % random_f3.score(X=x_train, y=y_train))

Random Forest 700 estimators, min leaf 2
Test  Accuracy : 0.893805
Train Accuracy : 0.977391


## Comparing feature importances

In [None]:
# Load the saved feature names; they are paired with the forests' feature
# importances further down. `pickle` is imported with the other imports in the
# first code cell rather than mid-notebook.
# Only unpickle files you produced yourself: pickle can execute arbitrary code on load.
with open('/content/drive/MyDrive/Yelp/model_128_/feature_names_feb_03.pkl', 'rb') as f:
    feature_names = pickle.load(f)

In [None]:
# Load the saved ids that accompany the feature names.
# Only unpickle files you produced yourself: pickle can execute arbitrary code on load.
with open('/content/drive/MyDrive/Yelp/model_128_/unique_ids_feb_03.pkl', 'rb') as ids_file:
    unique_ids = pickle.load(ids_file)

In [None]:
# Feature-importance vectors of the three best estimators, in the order
# 1000 / 400 / 700 trees.
importance_rf1000, importance_rf400, importance_rf700 = (
    forest.feature_importances_
    for forest in (random_f1, random_f2, random_f3)
)

In [None]:
# One table per forest pairing every feature's id and word with its importance.
feature_importance_1000, feature_importance_400, feature_importance_700 = (
    pd.DataFrame({'ids': unique_ids, 'words': feature_names, 'imp': importances})
    for importances in (importance_rf1000, importance_rf400, importance_rf700)
)

In [None]:
# top 15 words
# The 15 most important words for the 1000-tree forest, highest importance first.
feature_importance_1000.sort_values(by='imp', ascending=False).head(15)

Unnamed: 0,ids,words,imp
288,2307,great,0.042505
8020,12090,delicious,0.020511
3802,6429,amazing,0.018595
2978,5409,worst,0.016944
279,2293,love,0.015892
2954,5379,friendly,0.015123
373,2409,told,0.014258
5957,9202,horrible,0.014049
327,2356,asked,0.013634
3991,6659,terrible,0.013311


In [None]:
# top 15 words
# The 15 most important words for the 400-tree forest, highest importance first.
feature_importance_400.sort_values(by='imp', ascending=False).head(15)

Unnamed: 0,ids,words,imp
288,2307,great,0.035311
8020,12090,delicious,0.018375
3802,6429,amazing,0.016065
2978,5409,worst,0.01447
2954,5379,friendly,0.012964
279,2293,love,0.012546
5957,9202,horrible,0.012401
3991,6659,terrible,0.012306
327,2356,asked,0.012191
373,2409,told,0.011806


In [None]:
# top 15 words
# The 15 most important words for the 700-tree forest, highest importance first.
feature_importance_700.sort_values(by='imp', ascending=False).head(15)

Unnamed: 0,ids,words,imp
288,2307,great,0.036491
8020,12090,delicious,0.01718
3802,6429,amazing,0.01599
2978,5409,worst,0.01407
279,2293,love,0.013364
2954,5379,friendly,0.012707
5957,9202,horrible,0.011838
327,2356,asked,0.011753
3991,6659,terrible,0.011505
373,2409,told,0.011433


## Saving contributions

In [None]:
# saving the features data frame for the best random forest
# Save the feature-importance table of the 1000-tree forest.
# The previous name `ft_imp` is not defined anywhere in this notebook and raised
# NameError on a fresh kernel; the table built above is `feature_importance_1000`.
feature_importance_1000.to_pickle('/content/drive/MyDrive/Yelp/random_forest/feature_importance_1000.pkl')

In [None]:
%%time
# Run the tree-interpreter prediction for the 1000-tree forest on the training
# set; it returns three values, unpacked here as predictions, bias and contributions.
# NOTE(review): `ti` is not imported anywhere in the visible notebook — presumably
# `from treeinterpreter import treeinterpreter as ti`. Add that import to the
# import cell, otherwise this cell raises NameError on a fresh kernel.
preds, bias, contributions = ti.predict(random_f1, x_train)

CPU times: user 49min 5s, sys: 1min 36s, total: 50min 42s
Wall time: 50min 53s


In [None]:
# Persist the three training-set outputs, one pickle file each.
with open('/content/drive/MyDrive/Yelp/random_forest/pred_tr.pkl', 'wb') as preds_file:
    pickle.dump(preds, preds_file)

with open('/content/drive/MyDrive/Yelp/random_forest/bias_tr.pkl', 'wb') as bias_file:
    pickle.dump(bias, bias_file)

with open('/content/drive/MyDrive/Yelp/random_forest/contribs_tr.pkl', 'wb') as contribs_file:
    pickle.dump(contributions, contribs_file)

In [None]:
%%time
# Same tree-interpreter call as for the training set, now on the test set.
# NOTE(review): relies on `ti`, which is not imported in the visible notebook —
# presumably `from treeinterpreter import treeinterpreter as ti`; confirm.
preds_ts, bias_ts, contributions_ts = ti.predict(random_f1, x_test)

In [None]:
# Persist the three test-set outputs, one pickle file each.
# The third `with` used to be nested inside the second one with an unindented
# body, which is an IndentationError; all three blocks are now siblings.
# `pickle` is already imported with the other imports in the first code cell.
with open('/content/drive/MyDrive/Yelp/random_forest/pred_ts.pkl', 'wb') as f:
    pickle.dump(preds_ts, f)
with open('/content/drive/MyDrive/Yelp/random_forest/bias_ts.pkl', 'wb') as f:
    pickle.dump(bias_ts, f)
with open('/content/drive/MyDrive/Yelp/random_forest/contribs_ts.pkl', 'wb') as f:
    pickle.dump(contributions_ts, f)

In [None]:
# Inspect the contributions of the first training sample.
# NOTE(review): the displayed rows have two columns — presumably one per class; confirm.
contributions[0]

array([[-0.00046256,  0.00046256],
       [ 0.        ,  0.        ],
       [ 0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ],
       [ 0.        ,  0.        ],
       [ 0.        ,  0.        ]])

In [None]:
# Sanity check: show the container type of the contributions.
type(contributions)

numpy.ndarray