In [1]:
# ---------------------------------------------------------------------------
# Imports
# ---------------------------------------------------------------------------
# Every import comes first so that the configuration statements further down
# (which need `pd`, `display`, ...) work after "Restart Kernel -> Run All".
import math
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from IPython.display import display, HTML
from scipy.stats import uniform

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, cross_val_predict, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, validation_curve
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.utils import resample
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel

from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE, RandomOverSampler, SMOTENC
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.pipeline import Pipeline as Imbpipeline
from imblearn.pipeline import make_pipeline as Imb_make_pipeline
from imblearn.combine import SMOTETomek, SMOTEENN

# ---------------------------------------------------------------------------
# Display configuration
# ---------------------------------------------------------------------------
# Widen the classic-notebook page containers so wide outputs are not clipped.
display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 85%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

# Show floats with two decimals in every DataFrame / Series repr.
# This call used to sit above `import pandas as pd` and raised NameError on a
# fresh kernel.
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Folder that holds the yearly extracts. It is spelled out once so the notebook
# can be pointed at another copy of the data by editing a single line.
DATA_DIR = r'C:\Users\moham\Dropbox\QSE\Thesis\Geopattern\My data'

# One CSV per period, in chronological order.
DATA_FILES = ['df_data_set_Europe_North_America_2003_2010.csv',
              'df_data_set_Europe_North_America_2011_2015.csv',
              'df_data_set_Europe_North_America_2016_2017.csv',
              'df_data_set_Europe_North_America_2018.csv']

df_data_set_2010, df_data_set_2015, df_data_set_2017, df_data_set_2018 = (
    pd.read_csv(DATA_DIR + '\\' + file_name) for file_name in DATA_FILES
)

frames = [df_data_set_2010,
          df_data_set_2015,
          df_data_set_2017,
          df_data_set_2018]

# ignore_index=True: each frame is read with its own default 0..n-1 index, so a
# plain concat would repeat row labels across periods. A fresh unique index
# keeps later label-based operations (.loc, index-aligned assignment)
# unambiguous.
df_data_set = pd.concat(frames, ignore_index=True)

# Derived regressors: log / squared transforms and interaction terms.
# 'Log_Geo_Dist', 'Geo_Dist', 'TENB', 'Diff_Country' and 'Diff_Continent' are
# read here without being created in this notebook, so they have to be present
# in the loaded CSVs.
#
# np.log1p works on the whole column at once instead of calling math.log1p row
# by row. For inputs <= -1 it yields nan/-inf with a RuntimeWarning where
# math.log1p would raise ValueError.
df_data_set['Log_TENB'] = np.log1p(df_data_set['TENB'])
df_data_set['Sq_Log_Geo_Dist'] = df_data_set['Log_Geo_Dist'] * df_data_set['Log_Geo_Dist']
df_data_set['Log_Geo_Dist X Log_TENB'] = df_data_set['Log_Geo_Dist'] * df_data_set['Log_TENB']
df_data_set['Sq_Log_Geo_Dist X Log_TENB'] = df_data_set['Sq_Log_Geo_Dist'] * df_data_set['Log_TENB']
df_data_set['Sq_Geo_Dist'] = df_data_set['Geo_Dist'] * df_data_set['Geo_Dist']
df_data_set['Log_Sq_Geo_Dist'] = np.log1p(df_data_set['Sq_Geo_Dist'])
df_data_set['Log_Sq_Geo_Dist X Log_TENB'] = df_data_set['Log_Sq_Geo_Dist'] * df_data_set['Log_TENB']
df_data_set['Diff_Country X Log_TENB'] = df_data_set['Diff_Country'] * df_data_set['Log_TENB']
df_data_set['Diff_Continent X Log_TENB'] = df_data_set['Diff_Continent'] * df_data_set['Log_TENB']


In [29]:
# Columns that are not created by the feature-engineering statements of this
# notebook.
_input_columns = ['TENB', 'Geo_Dist', 'Cog_Dist', 'Top_regions',
                  'Diff_Country', 'Diff_Continent', 'Not_Contig',
                  'Log_Geo_Dist']

# Columns produced by the feature-engineering statements (transforms and
# interaction terms).
_engineered_columns = ['Log_TENB',
                       'Sq_Log_Geo_Dist',
                       'Log_Sq_Geo_Dist',
                       'Log_Geo_Dist X Log_TENB',
                       'Diff_Country X Log_TENB',
                       'Diff_Continent X Log_TENB',
                       'Sq_Log_Geo_Dist X Log_TENB',
                       'Log_Sq_Geo_Dist X Log_TENB',
                       'Sq_Geo_Dist']

# Full list of numeric / indicator columns, in the original order.
columns = _input_columns + _engineered_columns

# Country dummy names. Within every group all 'Country_1_*' names come first,
# followed by the matching 'Country_2_*' names; the group order is kept as is.
_country_groups = [
    ['Canada', 'France', 'Germany', 'Greece', 'Ireland', 'Italy',
     'Portugal', 'Spain', 'Switzerland', 'United Kingdom', 'United States'],
    ['Austria', 'Netherlands', 'Russia', 'Slovenia', 'Sweden'],
    ['Denmark'],
    ['Belgium', 'Finland', 'Norway'],
    ['Croatia', 'Hungary', 'Poland'],
    ['Cyprus', 'Slovakia'],
    ['Romania', 'Serbia'],
    ['Luxembourg'],
    ['Iceland'],
    ['Estonia'],
    ['Bulgaria'],
    ['Lithuania'],
    ['Bosnia and Herzegovina', 'Ukraine'],
]

regional_dummies = [
    f'{side}_{country}'
    for group in _country_groups
    for side in ('Country_1', 'Country_2')
    for country in group
]

# Predictors handed to the classifiers. The column order matters: positional
# indices into this list are used elsewhere (SMOTENC categorical_features).
features = ['Log_Geo_Dist', 'Log_TENB', 'Cog_Dist',
            'Diff_Country', 'Diff_Continent', 'Not_Contig', 'Top_regions']

# Binary outcome column, wrapped in a list so it can be concatenated with
# `features`.
target = ['collaboration_binary']

# Label vector and design matrix for the classifiers; both are taken from the
# same frame, so their row indices line up.
y = df_data_set['collaboration_binary']
X_clf = df_data_set.loc[:, features]

In [33]:
# Class balance of the binary target.
class_counts = df_data_set[features+target].collaboration_binary.value_counts()
display(class_counts)

# Positives per negative, computed from the counts rather than typed in by
# hand, so the figure cannot drift from the data.
class_counts.loc[1] / class_counts.loc[0]

0.002283473915461475

In [34]:
def _label_counts(labels):
    """Return a {label: number of rows} dict for the given label vector."""
    values, frequencies = np.unique(labels, return_counts=True)
    return dict(zip(values, frequencies))


# Class counts of the full data set.
y_value_count = _label_counts(y)
print('Dataset', y_value_count)

# Stratified 85/15 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_clf,
    y,
    test_size=0.15,
    random_state=123,
    stratify=y,
)

# Class counts of the hold-out part.
y_value_count = _label_counts(y_test)
print('Dataset Test', y_value_count)

Dataset {0: 2240446, 1: 5116}
Dataset Test {0: 336068, 1: 767}


In [None]:
# --- Classifiers -------------------------------------------------------------
logreg = LogisticRegression(max_iter = 500)
rnd = RandomForestClassifier(random_state = 123, n_jobs = -1)
gbc = GradientBoostingClassifier(random_state = 123)
xgb = XGBClassifier(use_label_encoder=False, eval_metric = 'logloss', random_state = 123, n_jobs = -1)
knn = KNeighborsClassifier(n_jobs = -1)

# --- Resamplers --------------------------------------------------------------
# All resamplers draw random samples; they are seeded like the classifiers so
# that repeated runs of the notebook give the same resampled training sets.
rus = RandomUnderSampler(random_state = 123)
ros = RandomOverSampler(random_state = 123)

# SMOTENC needs the positions of the categorical columns inside `features`.
# They are looked up by name (-> [3, 4, 5, 6] for the current list) so the
# indices stay correct if `features` is reordered.
smotenc_categorical = [features.index(name)
                       for name in ('Diff_Country', 'Diff_Continent', 'Not_Contig', 'Top_regions')]
smt = SMOTENC(categorical_features = smotenc_categorical, random_state = 123, n_jobs = -1)
smtk = SMOTETomek(random_state = 123, n_jobs = 6)
smnn = SMOTEENN(random_state = 123, n_jobs = 6)

# --- Pre-processing / feature selection --------------------------------------
scl = StandardScaler()
# k=7 equals the number of entries in `features`, i.e. applied to X_clf this
# selector keeps every column.
feature_selection_selbest = SelectKBest(chi2, k=7)
feature_selection_selmodel = SelectFromModel(rnd)

In [None]:
# baseline selection
#
# Fit every candidate classifier with default settings on the training split
# and report its hold-out metrics.

clfs = [logreg, rnd, gbc, xgb, knn]

for clf in clfs:

    print(f'The result for {clf} is:')

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # ROC AUC is a ranking metric: it must be fed scores (here the predicted
    # probability of class 1), not hard 0/1 labels, otherwise the curve
    # collapses to a single operating point.
    y_score = clf.predict_proba(X_test)[:, 1]

    print('F1_Score:', f1_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('roc_auc_score:', roc_auc_score(y_test, y_score))
    print('confusion_matrix:')
    print(confusion_matrix(y_test, y_pred))
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    print('\n')

In [None]:
# baseline (RandomForest)
#
# Compare the plain random forest with variants that add one pre-processing
# step each: model-based feature selection, scaling, or a resampler.
# Note: every pipeline wraps the same `rnd` object, so each fit in the loop
# below refits that one forest.

pipe_fs = Pipeline(steps = [['feature_selection', feature_selection_selmodel], ['RandomForestClassifier', rnd]])
pipe_scl = Pipeline(steps = [['SCL', scl], ['RandomForestClassifier', rnd]])
pipe_smt = Imbpipeline(steps = [['SMOTENC', smt], ['RandomForestClassifier', rnd]])
pipe_ros = Imbpipeline(steps = [['RandomOverSampler', ros], ['RandomForestClassifier', rnd]])
pipe_rus = Imbpipeline(steps = [['RandomUnderSampler', rus], ['RandomForestClassifier', rnd]])


clfs = [rnd, pipe_fs, pipe_scl, pipe_smt, pipe_ros, pipe_rus]

for clf in clfs:

    print(f'The result for {clf} is:')

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    # ROC AUC needs scores (probability of class 1), not hard 0/1 labels.
    y_score = clf.predict_proba(X_test)[:, 1]

    print('F1_Score:', f1_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('Accuracy:', accuracy_score(y_test, y_pred))
    print('roc_auc_score:', roc_auc_score(y_test, y_score))
    print('confusion_matrix:')
    print(confusion_matrix(y_test, y_pred))
    print('Classification Report:')
    print(classification_report(y_test, y_pred))
    print('\n')
    

In [None]:
# Hyperparameter tuning - Randomforestclassifier - RandomSearch
#
# Randomised search over the random-forest hyper-parameters, scored by F1 with
# 5-fold stratified cross-validation on the training split only.

n_estimators = [int(x) for x in np.linspace(start = 50, stop = 300, num = 26)]

criterion = ['gini', 'entropy']

# 'auto' is deliberately not part of the space: for a classifier it is only an
# alias of 'sqrt' (so it duplicated that candidate) and recent scikit-learn
# releases reject it, which makes every such candidate fail to fit.
max_features = ['sqrt', .1, .2, .3, .4, .5]

max_depth = [int(x) for x in np.linspace(40, 60, num = 20)]
max_depth.append(None)  # None = grow trees without a depth limit

min_samples_split = [2,3,4,5,6,7,8,9]

min_samples_leaf = [1,2]

bootstrap = [True, False]

param_dist = { 'n_estimators': n_estimators,
               'criterion': criterion,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

skf = StratifiedKFold(n_splits = 5)

rnd_srch_model = RandomizedSearchCV(estimator = rnd,
                           param_distributions = param_dist,
                           cv = skf, n_jobs = -1,
                           verbose = 1,
                           n_iter = 200,
                           scoring = 'f1',
                           random_state = 123)

rnd_srch_model.fit(X_train, y_train)
print ('Best Parameters', rnd_srch_model.best_params_)
print ('Best Score', rnd_srch_model.best_score_)


# Result recorded from an earlier run (the search space then still contained
# the 'auto' alias, so the sampled candidates may differ slightly now):
# Best Parameters {'n_estimators': 170, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 55, 'criterion': 'gini', 'bootstrap': True}
# Best Score 0.8852592313348533

In [None]:
# Random Search Best parameters result (test data)
#
# Refit a forest with the best parameters recorded from the randomised search
# and evaluate it on the hold-out split.

rnd_best = RandomForestClassifier(random_state = 123, n_jobs = -1,
                                       n_estimators = 170,
                                       min_samples_split = 5,
                                       min_samples_leaf = 1,
                                       max_features = 'sqrt',
                                       criterion = 'gini',
                                       max_depth = 55,
                                       bootstrap = True)

rnd_best.fit(X_train, y_train)
y_pred_best = rnd_best.predict(X_test)
# Probability of class 1; ROC AUC is computed from scores, not hard labels.
y_score_best = rnd_best.predict_proba(X_test)[:, 1]

print ('F1_Score:', f1_score(y_test, y_pred_best))
print ('Precision:' , precision_score(y_test, y_pred_best))
print ('Recall:' , recall_score(y_test, y_pred_best))
print ('Accuracy:', accuracy_score(y_test, y_pred_best))
# Reported here as well so this cell is comparable with the other evaluation
# cells of the notebook.
print ('roc_auc_score:', roc_auc_score(y_test, y_score_best))
print ('confusion_matrix:')
print (confusion_matrix(y_test, y_pred_best))
print ('Classification Report:')
print (classification_report(y_test, y_pred_best))

# Optional cross-validated check on the training split:
# rfc_cv_score = cross_val_score(rnd_best, X_train, y_train, cv = skf, scoring = 'f1')

# print ('F1_Score_cv_score:', rfc_cv_score.mean())


In [None]:
# Hyperparameter tuning - Randomforestclassifier - Grid search
#
# Exhaustive search in a narrow window around the best randomised-search
# result, scored by F1 with 5-fold stratified cross-validation.

n_estimators = [int(x) for x in np.linspace(start = 160, stop = 180, num = 3)]

# Only 'sqrt': for a classifier 'auto' is an alias of 'sqrt', so listing both
# doubled the grid for identical models (the recorded run below shows the same
# best score as the 'sqrt' model from the randomised search), and recent
# scikit-learn releases reject 'auto' altogether.
max_features = ['sqrt']

max_depth = [int(x) for x in np.linspace(53, 57, num = 5)]
max_depth.append(None)  # None = grow trees without a depth limit

min_samples_split = [4,5,6]

min_samples_leaf = [1]

bootstrap = [True, False]

criterion = ['gini', 'entropy']


param_grid = { 'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion' : criterion}

print('Parameters used in the grid:\n')
pprint(param_grid)

skf = StratifiedKFold(n_splits = 5)

grid_model = GridSearchCV(estimator = rnd,
                         param_grid = param_grid,
                         cv = skf, n_jobs = -1,
                         verbose = 1,
                         scoring = 'f1')

grid_model.fit(X_train, y_train)

print ('Best Parameters', grid_model.best_params_)
print ('Best Score', grid_model.best_score_)


# Result recorded from an earlier run (grid then still contained 'auto'):
# Best Parameters {'bootstrap': True, 'criterion': 'gini', 'max_depth': 53, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 170}
# Best Score 0.8852592313348533

In [None]:
# grid Search Best parameters result (test data)
#
# Refit a forest with the best grid-search parameters and evaluate it on the
# hold-out split.

grd_best = RandomForestClassifier(random_state = 123, n_jobs = -1,
                                       criterion = 'gini',
                                       n_estimators = 170,
                                       min_samples_split = 5,
                                       min_samples_leaf = 1,
                                       # 'sqrt' is what the recorded 'auto'
                                       # meant for a classifier; 'auto' itself
                                       # is rejected by recent scikit-learn.
                                       max_features = 'sqrt',
                                       max_depth = 53,
                                       bootstrap = True)

grd_best.fit(X_train, y_train)
y_pred_best_grd = grd_best.predict(X_test)
# Probability of class 1; ROC AUC is computed from scores, not hard labels.
y_score_best_grd = grd_best.predict_proba(X_test)[:, 1]

print ('F1_Score:', f1_score(y_test, y_pred_best_grd))
print ('Precision:' , precision_score(y_test, y_pred_best_grd))
print ('Recall:' , recall_score(y_test, y_pred_best_grd))
print ('Accuracy:', accuracy_score(y_test, y_pred_best_grd))
print ('roc_auc_score:', roc_auc_score(y_test, y_score_best_grd))
print ('confusion_matrix:')
print (confusion_matrix(y_test, y_pred_best_grd))
print ('Classification Report:')
print (classification_report(y_test, y_pred_best_grd))

# Optional cross-validated check on the training split:
# rfc_cv_score = cross_val_score(grd_best, X_train, y_train, cv = skf, scoring = 'f1')

# print ('F1_Score_cv_score:', rfc_cv_score.mean())


In [None]:
# Feature importance - mean decrease in impurity
#
# Impurity-based importances of the tuned forest, expressed in percent.

importances = grd_best.feature_importances_ * 100

# grd_best was fitted on X_train, so X_train's columns are the names that
# belong to the importance values, in the same order. (The loop that used to
# build this frame iterated over `columns_clf`, a name that is never defined
# in this notebook, and raised NameError.)
df_importance = pd.DataFrame({'Feature': list(X_train.columns),
                              'importance': importances})

df_importance = df_importance.sort_values(by=['importance'], ascending=False)

# Bar chart of the (at most) ten most important features.
df_importance[:10].plot(x = 'Feature', y = 'importance', kind = 'bar', rot = 90, ylabel = 'mean decrease in impurity (%)')

In [None]:
# Logit regression
#
# Several nested logit specifications of the collaboration outcome `y`.
#
# NOTE(review): 'Prov_Border', 'Country_Border', 'NotContig' and the
# '... X Cog_Dist' / 'Prov_Border X ...' / 'Country_Border X ...' interaction
# columns are not created anywhere in this notebook (the classifier features
# use 'Not_Contig', 'Diff_Country', 'Diff_Continent'). Confirm they exist in
# the loaded CSVs, otherwise the column selections below raise KeyError.

# Geography only.
columns_reg0 = [ 'Log_Geo_Dist',
                 'Prov_Border',
                 'Country_Border',
                 'NotContig',
                 'Top_regions']

# + size (Log_TENB) and cognitive distance.
columns_reg1 = [ 'Log_Geo_Dist',
                 'Log_TENB',
                 'Cog_Dist',
                 'Prov_Border',
                 'Country_Border',
                 'NotContig',
                 'Top_regions']

# + distance x size interaction.
columns_reg2 = [ 'Log_Geo_Dist',
                 'Log_TENB',
                 'Log_Geo_Dist X Log_TENB',
                 'Cog_Dist',
                 'Prov_Border',
                 'Country_Border',
                 'NotContig',
                 'Top_regions']


# Further interaction terms considered:
#              'Prov_Border X Cog_Dist',
#              'Country_Border X Log_TENB',
#              'Country_Border X Cog_Dist',

columns_reg21 = ['Log_Geo_Dist',
                 'Log_TENB',
                 'Prov_Border X Log_TENB',
                 'Cog_Dist',
                 'Prov_Border',
                 'Country_Border',
                 'NotContig',
                 'Top_regions']

columns_reg22 = ['Log_Geo_Dist',
                 'Log_TENB',
                 'Country_Border X Log_TENB',
                 'Cog_Dist',
                 'Prov_Border',
                 'Country_Border',
                 'NotContig',
                 'Top_regions']

# Specifications 3-7 drop 'Top_regions' and add one interaction term each.
columns_reg3 = [ 'Log_Geo_Dist',
                 'Log_TENB',
                 'Log_Geo_Dist X Cog_Dist',
                 'Cog_Dist',
                 'Prov_Border',
                 'Country_Border',
                 'NotContig']

columns_reg4 = [ 'Log_Geo_Dist',
                 'Log_TENB',
                 'Cog_Dist',
                 'Prov_Border',
                 'Prov_Border X Log_TENB',
                 'Country_Border',
                 'NotContig']

columns_reg5 = [ 'Log_Geo_Dist',
                 'Log_TENB',
                 'Cog_Dist',
                 'Prov_Border',
                 'Prov_Border X Cog_Dist',
                 'Country_Border',
                 'NotContig']

columns_reg6 = [ 'Log_Geo_Dist',
                 'Log_TENB',
                 'Cog_Dist',
                 'Prov_Border',
                 'Country_Border',
                 'Country_Border X Log_TENB',
                 'NotContig']

columns_reg7 = [ 'Log_Geo_Dist',
                 'Log_TENB',
                 'Cog_Dist',
                 'Prov_Border',
                 'Country_Border',
                 'Country_Border X Cog_Dist',
                 'NotContig']

# Design matrices WITHOUT intercept (X_regN) ...
X_reg0 = df_data_set[columns_reg0]
X_reg1 = df_data_set[columns_reg1]
X_reg2 = df_data_set[columns_reg2]
X_reg21 = df_data_set[columns_reg21]
X_reg22 = df_data_set[columns_reg22]
X_reg3 = df_data_set[columns_reg3]
X_reg4 = df_data_set[columns_reg4]
X_reg5 = df_data_set[columns_reg5]
X_reg6 = df_data_set[columns_reg6]
X_reg7 = df_data_set[columns_reg7]

# ... and WITH a prepended constant column (X_reg_N). statsmodels does not add
# an intercept on its own.
X_reg_0 = sm.tools.tools.add_constant(X_reg0, prepend=True, has_constant='add')
X_reg_1 = sm.tools.tools.add_constant(X_reg1, prepend=True, has_constant='add')
X_reg_2 = sm.tools.tools.add_constant(X_reg2, prepend=True, has_constant='add')
X_reg_3 = sm.tools.tools.add_constant(X_reg3, prepend=True, has_constant='add')
X_reg_4 = sm.tools.tools.add_constant(X_reg4, prepend=True, has_constant='add')
X_reg_5 = sm.tools.tools.add_constant(X_reg5, prepend=True, has_constant='add')
X_reg_6 = sm.tools.tools.add_constant(X_reg6, prepend=True, has_constant='add')
X_reg_7 = sm.tools.tools.add_constant(X_reg7, prepend=True, has_constant='add')


# Specifications to estimate. The constant-augmented matrices are used: the
# loop previously received X_reg3 / X_reg5 / X_reg7, i.e. the versions without
# intercept, which left every X_reg_N above unused and fitted the models
# through the origin.
xx = [X_reg_3, X_reg_5, X_reg_7]

# A separate loop variable keeps the list `xx` intact (it used to be
# overwritten by `for xx in xx`).
for X_design in xx:
    logit_model=sm.Logit(y, X_design)
    result=logit_model.fit(method_kwargs={"warn_convergence": False})
    print(result.summary2())

In [None]:
# Validation curve - n_estimators
#
# F1 on the training folds vs. the validation folds (3-fold stratified CV)
# as the number of trees grows.

param_range = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]

skf = StratifiedKFold(n_splits = 3)

train_scores, test_scores = validation_curve(rnd, X_train, y_train,
                           param_name = 'n_estimators',
                           param_range = param_range,
                           cv = skf, n_jobs = -1, verbose = 30, scoring = 'f1')

# One row per parameter value, one column per fold -> aggregate over folds.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with RandomForest")
# The axis shows the tuned parameter (the label used to be a leftover
# "$\gamma$").
plt.xlabel("n_estimators")
plt.ylabel("Score")
plt.ylim(.85, 1.05)

lw = 2
# The parameter values are evenly spaced, so a linear x axis is used.
plt.plot(param_range, train_scores_mean, label="Training score",
         color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.plot(param_range, test_scores_mean, label="Cross-validation score",
         color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()

test_scores_mean

In [None]:
# Validation curve - max_depth
#
# F1 on the training folds vs. the validation folds (3-fold stratified CV)
# for increasing tree depth; None means unlimited depth.

# Starts at 5, not 0: max_depth must be a positive integer (or None), so the
# former first value 0 could never be fitted.
max_depth = [int(x) for x in np.linspace(5, 150, num = 30)]
max_depth.append(None)

param_range = max_depth

skf = StratifiedKFold(n_splits = 3)

train_scores, test_scores = validation_curve(rnd, X_train, y_train,
                           param_name = 'max_depth',
                           param_range = param_range,
                           cv = skf, n_jobs = -1, verbose = 45, scoring = 'f1')

# One row per parameter value, one column per fold -> aggregate over folds.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with RandomForest")
plt.xlabel("max_depth")
plt.ylabel("Score")
plt.ylim(.8, 1)

lw = 2
# `None` cannot be placed on a numeric (let alone logarithmic) axis, so the
# values are drawn at consecutive positions and labelled with their text.
positions = np.arange(len(param_range))
plt.plot(positions, train_scores_mean, label="Training score",
         color="darkorange", lw=lw)
plt.plot(positions, test_scores_mean, label="Cross-validation score",
         color="navy", lw=lw)
plt.xticks(positions, [str(value) for value in param_range], rotation=90)
plt.legend(loc="best")
plt.show()

test_scores_mean


In [None]:
# Validation curve - min_sample_split
#
# F1 on the training folds vs. the validation folds (3-fold stratified CV)
# for different minimum node sizes required to split.

# Starts at 2: an integer min_samples_split has to be at least 2, so the
# former first value 1 could never be fitted.
min_sample_split = [2,3,4,5,6,7,8,9,10]

param_range = min_sample_split

skf = StratifiedKFold(n_splits = 3)

train_scores, test_scores = validation_curve(rnd, X_train, y_train,
                           param_name = 'min_samples_split',
                           param_range = param_range,
                           cv = skf, n_jobs = -1, verbose = 30, scoring = 'f1')

# One row per parameter value, one column per fold -> aggregate over folds.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with RandomForest")
plt.xlabel("min_samples_split")
plt.ylabel("Score")
plt.ylim(.8, 1)

lw = 2
# Small consecutive integers -> linear x axis.
plt.plot(param_range, train_scores_mean, label="Training score",
         color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.plot(param_range, test_scores_mean, label="Cross-validation score",
         color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()


test_scores_mean

In [None]:
# Validation curve - min_sample_leaf
#
# F1 on the training folds vs. the validation folds (3-fold stratified CV)
# for different minimum leaf sizes.

# Starts at 1: an integer min_samples_leaf has to be at least 1, so the former
# first value 0 could never be fitted (and has no position on a log axis).
min_sample_leaf = [1,2,3,4,5,6,7,8,9,10]

param_range = min_sample_leaf

skf = StratifiedKFold(n_splits = 3)

train_scores, test_scores = validation_curve(rnd, X_train, y_train,
                           param_name = 'min_samples_leaf',
                           param_range = param_range,
                           cv = skf, n_jobs = -1, verbose = 50, scoring = 'f1')

# One row per parameter value, one column per fold -> aggregate over folds.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with RandomForest")
plt.xlabel("min_samples_leaf")
plt.ylabel("Score")
plt.ylim(.8, 1)

lw = 2
# Small consecutive integers -> linear x axis.
plt.plot(param_range, train_scores_mean, label="Training score",
         color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.plot(param_range, test_scores_mean, label="Cross-validation score",
         color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()


test_scores_mean

In [None]:
# Validation curve - bootstrap
#
# F1 on the training folds vs. the validation folds (3-fold stratified CV)
# with and without bootstrap sampling.

bootstrap = [True, False]

param_range = bootstrap

skf = StratifiedKFold(n_splits = 3)

train_scores, test_scores = validation_curve(rnd, X_train, y_train,
                           param_name = 'bootstrap',
                           param_range = param_range,
                           cv = skf, n_jobs = -1, verbose = 6, scoring = 'f1')

# One row per parameter value, one column per fold -> aggregate over folds.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with RandomForest")
plt.xlabel("bootstrap")
plt.ylabel("Score")
plt.ylim(.8, 1)

lw = 2
# The parameter is a boolean: on a log axis False (= 0) has no position, so
# the two settings are drawn at consecutive positions and labelled by name.
positions = np.arange(len(param_range))
plt.plot(positions, train_scores_mean, label="Training score",
         color="darkorange", lw=lw, marker="o")
plt.fill_between(positions, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.plot(positions, test_scores_mean, label="Cross-validation score",
         color="navy", lw=lw, marker="o")
plt.fill_between(positions, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.xticks(positions, [str(value) for value in param_range])
plt.legend(loc="best")
plt.show()


test_scores_mean

In [None]:
# Validation curve - criterion
#
# F1 on the training folds vs. the validation folds (3-fold stratified CV)
# for the two split criteria.

criterion = ['gini', 'entropy']

param_range = criterion

skf = StratifiedKFold(n_splits = 3)

train_scores, test_scores = validation_curve(rnd, X_train, y_train,
                           param_name = 'criterion',
                           param_range = param_range,
                           cv = skf, n_jobs = -1, verbose = 6, scoring = 'f1')

# One row per parameter value, one column per fold -> aggregate over folds.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with RandomForest")
plt.xlabel("criterion")
plt.ylabel("Score")
plt.ylim(.8, 1)

lw = 2
# The parameter values are names, not numbers, so they are drawn at
# consecutive positions on a linear axis and labelled with their text.
positions = np.arange(len(param_range))
plt.plot(positions, train_scores_mean, label="Training score",
         color="darkorange", lw=lw, marker="o")
plt.fill_between(positions, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.plot(positions, test_scores_mean, label="Cross-validation score",
         color="navy", lw=lw, marker="o")
plt.fill_between(positions, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.xticks(positions, [str(value) for value in param_range])
plt.legend(loc="best")
plt.show()


test_scores_mean

In [None]:
# Validation curve - max_features
#
# F1 on the training folds vs. the validation folds (3-fold stratified CV)
# for different numbers / fractions of features tried per split.

# 'auto' is left out: for a classifier it is only an alias of 'sqrt' and
# recent scikit-learn releases reject it.
max_features = ['sqrt', .1, .2, .3, .4, .5, .6, .7, .8, .9]

param_range = max_features

skf = StratifiedKFold(n_splits = 3)

train_scores, test_scores = validation_curve(rnd, X_train, y_train,
                           param_name = 'max_features',
                           param_range = param_range,
                           cv = skf, n_jobs = -1, verbose = 6, scoring = 'f1')

# One row per parameter value, one column per fold -> aggregate over folds.
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.title("Validation Curve with RandomForest")
plt.xlabel("max_features")
plt.ylabel("Score")
plt.ylim(.8, 1)

lw = 2
# The range mixes a name with fractions, which cannot share a numeric axis;
# the values are drawn at consecutive positions and labelled with their text.
positions = np.arange(len(param_range))
plt.plot(positions, train_scores_mean, label="Training score",
         color="darkorange", lw=lw)
plt.fill_between(positions, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.plot(positions, test_scores_mean, label="Cross-validation score",
         color="navy", lw=lw)
plt.fill_between(positions, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.xticks(positions, [str(value) for value in param_range])
plt.legend(loc="best")
plt.show()


test_scores_mean

In [None]:
# Feature selection
#
# Three filters applied to the wide feature set: low-variance removal,
# chi-squared top-k, and model-based selection with a logistic regression.

# NOTE(review): `X` was used here without ever being defined in this notebook
# (NameError on a fresh kernel). k=40 below needs more than 40 candidate
# columns, and `columns` + `regional_dummies` (85 names) are defined earlier
# but used nowhere else, so they are assumed to be the intended input.
# Confirm, and confirm that the dummy columns exist in the loaded data.
X = df_data_set[columns + regional_dummies]

print ('Original data size: ', X.shape)

# Drop columns whose variance is below that of a 90/10 binary indicator.
var = VarianceThreshold(threshold=(.9 * (1 - .9)))
var_fit = var.fit_transform(X)

print ('Low variance removal: ', var_fit.shape)

feature_idx = var.get_support()
my_features_var = X.columns[feature_idx].tolist()
print ('Variance threshold list of features:', my_features_var)

# Keep the 40 columns with the highest chi-squared statistic against y.
selbest = SelectKBest(chi2, k=40)
selbest_fit = selbest.fit_transform(X, y)
print ('SelectKBest: ', selbest_fit.shape)
feature_idx = selbest.get_support()
my_features_selbest = X.columns[feature_idx].tolist()
print ('SelectKBest list of features:', my_features_selbest)


# Separate estimator for the selector, so the `logreg` classifier configured
# earlier in the notebook (max_iter = 500) is not silently replaced.
selector_logreg = LogisticRegression()


selmodel = SelectFromModel(selector_logreg).fit(X, y)
X_new = selmodel.transform(X)
print ('SelectFromModel', X_new.shape)

feature_idx = selmodel.get_support()
my_features_selmodel = X.columns[feature_idx].tolist()
# Label corrected: this list comes from SelectFromModel, not SelectKBest.
print ('SelectFromModel list of features:', my_features_selmodel)
