In [None]:
#Description of notebook: 
#1. We first tune the hyperparameters of our models on a validation set 
#2. We then make predictions on the test data set

In [1]:
#This notebook is based on all features, i.e. we also consider rank related data compared to previously. 
#Make sure that all the columns are included when constructing X_train and X_test

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
import joblib
from sklearn.metrics import r2_score, f1_score, roc_auc_score, precision_score, recall_score, roc_curve
import json

In [2]:
finalx_trainval_df = pd.read_pickle('./X_trainval.pickle')
finalx_val_df = pd.read_pickle('./X_val.pickle')
finalx_train_df = pd.read_pickle('./X_train.pickle')
finalx_test_df = pd.read_pickle('./X_test.pickle')

finaly_trainval_df = pd.read_pickle('./y_trainval.pickle')
finaly_val_df = pd.read_pickle('./y_val.pickle')
finaly_train_df = pd.read_pickle('./y_train.pickle')
finaly_test_df = pd.read_pickle('./y_test.pickle')
del finalx_trainval_df['asin']
del finalx_val_df['asin']
del finalx_train_df['asin']
del finalx_test_df['asin']

del finaly_trainval_df['asin']
del finaly_val_df['asin']
del finaly_train_df['asin']
del finaly_test_df['asin']

In [3]:
#Create numpy array out of them
X_trainval=finalx_trainval_df.to_numpy()
X_val=finalx_val_df.to_numpy()
X_train=finalx_train_df.to_numpy()
X_test=finalx_test_df.to_numpy()


y_trainval=finaly_trainval_df.to_numpy()
y_trainval=y_trainval[:,0]

y_val=finaly_val_df.to_numpy()
y_val=y_val[:,0]

y_train=finaly_train_df.to_numpy()
y_train=y_train[:,0]

y_test=finaly_test_df.to_numpy()
y_test=y_test[:,0]

In [4]:
#Logistic Regression
regressor=LogisticRegression()
regressor.fit(X_train,y_train)

LogisticRegression()

In [5]:
#Saving the model
#filename_regression = './LogReg.sav'
#joblib.dump(regressor, filename_regression)

In [6]:
#Metrics
y_pred = regressor.predict(X_test)
probas = regressor.predict_proba(X=X_test)[:,1]
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, probas)
print('\tf1:', f1)
print('\tprecision:', precision)
print('\trecall:', recall)
print('\tauc:', auc)

	f1: 0.012121212121212121
	precision: 1.0
	recall: 0.006097560975609756
	auc: 0.7224605546450723


In [45]:
#XGBOOST
xgboost=GradientBoostingClassifier()
xgboost.fit(X_trainval,y_trainval)

GradientBoostingClassifier()

In [46]:
#Metrics
y_pred = xgboost.predict(X_val)
probas = xgboost.predict_proba(X=X_val)[:,1]
f1 = f1_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
auc = roc_auc_score(y_val, probas)
print('\tf1:', f1)
print('\tprecision:', precision)
print('\trecall:', recall)
print('\tauc:', auc)

	f1: 0.4104803493449781
	precision: 0.5802469135802469
	recall: 0.31756756756756754
	auc: 0.7731186098121583


In [47]:
#XgboostRes_dict={'f1': f1,
            # 'precision': precision, 
             #'recall': recall, 
            # 'auc': auc}

In [48]:
XgboostRes_dict={'f1': f1}

In [49]:
with open('XgboostMetrics_FullFeatures_Val.json', 'w') as f:
    json.dump(XgboostRes_dict, f)

In [50]:
#Xgboost optimization
nestim = [50,100,150,200,500]
learning_rate = [0.2,0.1,0.05,0.01]
for estim in nestim:
  for lr in learning_rate:
    xgboost=GradientBoostingClassifier(learning_rate=lr, n_estimators=estim)
    xgboost.fit(X_trainval,y_trainval)
    y_pred = xgboost.predict(X_val)
    probas = xgboost.predict_proba(X=X_val)[:,1]
    #Saving the model
    #filename_xgboost_opt = 'Xgboost_model_'+str(lr)+ '_'+str(estim)+'.sav'
    #joblib.dump(xgboost, filename_xgboost_opt)
    #Compute metrics, add to dictionary, and store in JSON file
    f1 = f1_score(y_val, y_pred)
    print(f1)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    auc = roc_auc_score(y_val, probas)
    #XgboostRes_dict.update({'f1 '+str(lr) + 'n_estim '+str(estim): f1,
             #'precision '+str(lr) + 'n_estim '+str(estim): precision, 
             #'recall '+str(lr) + 'n_estim '+str(estim): recall, 
             #'auc'+str(lr) + 'n_estim '+str(estim): auc})
    XgboostRes_dict.update({'f1 '+str(lr) + 'n_estim '+str(estim): f1})
    with open('XgboostMetrics_FullFeatures_Val.json', 'w') as f:
        json.dump(XgboostRes_dict, f)

0.4085106382978724
0.3926940639269407
0.4205607476635514
0.0


  _warn_prf(average, modifier, msg_start, len(result))


0.43548387096774194
0.4052863436123348
0.4234234234234234
0.23728813559322035
0.4081632653061224
0.3893805309734513
0.4052863436123348
0.3434343434343434
0.40163934426229514
0.4069264069264069
0.39647577092511016
0.4
0.40157480314960636
0.4100418410041841
0.39484978540772525
0.4200913242009132


In [51]:
#RANDOM FOREST
forest=RandomForestClassifier()
forest.fit(X_trainval,y_trainval)

RandomForestClassifier()

In [52]:
#Metrics
y_pred = forest.predict(X_val)
probas = forest.predict_proba(X=X_val)[:,1]
f1 = f1_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
recall = recall_score(y_val, y_pred)
auc = roc_auc_score(y_val, probas)
print('\tf1:', f1)
print('\tprecision:', precision)
print('\trecall:', recall)
print('\tauc:', auc)

	f1: 0.4070796460176991
	precision: 0.5897435897435898
	recall: 0.3108108108108108
	auc: 0.7668720773559483


In [53]:
#ForestRes_dict={'f1': f1,
#               'precision': precision, 
#             'recall': recall, 
#             'auc': auc

In [54]:
ForestRes_dict={'f1': f1}

In [55]:
with open('ForestMetrics_FullFeatures_Val.json', 'w') as f:
    json.dump(ForestRes_dict, f)

In [56]:
#Random forest trying to optimize
ntrees = [50,100,150,200,500]
tree_depth = [3,5,8,None]
for tree in ntrees:
  for depth in tree_depth:
    forest=RandomForestClassifier(max_depth=depth, n_estimators=tree)
    forest.fit(X_trainval,y_trainval)
    y_pred = forest.predict(X_val)
    probas = forest.predict_proba(X=X_val)[:,1]
    #Saving the model
    #filename_forest_opt = 'Forest_model_'+str(depth)+ '_'+str(tree)+'.sav'
    #joblib.dump(forest, filename_forest_opt)
    #Compute metrics, add to dictionary, and store in JSON file
    f1 = f1_score(y_val, y_pred)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    auc = roc_auc_score(y_val, probas)
    #ForestRes_dict.update({'f1 '+str(depth) + 'n_estim '+str(tree): f1,
    #       'precision '+str(depth) + 'n_estim '+str(tree): precision, 
    #       'recall '+str(depth) + 'n_estim '+str(tree): recall, 
    #       'auc '+str(depth) + 'n_estim '+str(tree): auc})
    ForestRes_dict.update({'f1 '+str(depth) + 'n_estim '+str(tree): f1})
    with open('ForestMetrics_FullFeatures_Val.json', 'w') as f:
        json.dump(ForestRes_dict, f)
    

In [7]:
#Xgboost Best model
xgboost=GradientBoostingClassifier(learning_rate=0.2, n_estimators=100)
xgboost.fit(X_train,y_train)

GradientBoostingClassifier(learning_rate=0.2)

In [8]:
#Metrics
y_pred = xgboost.predict(X_test)
probas = xgboost.predict_proba(X=X_test)[:,1]
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, probas)
print('\tf1:', f1)
print('\tprecision:', precision)
print('\trecall:', recall)
print('\tauc:', auc)

	f1: 0.36885245901639346
	precision: 0.5625
	recall: 0.27439024390243905
	auc: 0.7455694270381439


In [9]:
#Saving the model
filename_regression = './XgboostBestModel_FullFeatures_Val.sav'
joblib.dump(xgboost, filename_regression)

['./XgboostBestModel_FullFeatures_Val.sav']

In [10]:
#RANDOM FOREST Best Model
forest=RandomForestClassifier(max_depth=5, n_estimators=200)
forest.fit(X_train,y_train)

RandomForestClassifier(max_depth=5, n_estimators=200)

In [11]:
#Metrics
y_pred = forest.predict(X_test)
probas = forest.predict_proba(X=X_test)[:,1]
f1 = f1_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, probas)
print('\tf1:', f1)
print('\tprecision:', precision)
print('\trecall:', recall)
print('\tauc:', auc)

	f1: 0.3436123348017621
	precision: 0.6190476190476191
	recall: 0.23780487804878048
	auc: 0.7638339920948617


In [12]:
#Saving the model
filename_regression = './RandomForestBestModel_FullFeatures_Val.sav'
joblib.dump(forest, filename_regression)

['./RandomForestBestModel_FullFeatures_Val.sav']