### Loading data and libraries

In [1]:
from IPython.display import display, HTML

# Widen the notebook container, menu bar and toolbar so wide tables/figures fit.
_WIDE_LAYOUT_CSS = """
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 85%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""

display(HTML(data=_WIDE_LAYOUT_CSS))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
import seaborn as sns
from pprint import pprint
from scipy.stats import uniform
import statsmodels.api as sm
import math
import shap 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, cross_val_predict, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, validation_curve
from sklearn.pipeline import Pipeline as Pipeline, make_pipeline as make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from catboost import CatBoostClassifier, Pool, cv
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.svm import SVC, LinearSVC
from sklearn.utils import resample
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler, SMOTENC
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.pipeline import Pipeline as Imbpipeline
from imblearn.pipeline import make_pipeline as Imb_make_pipeline
from imblearn.combine import SMOTETomek, SMOTEENN
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel

import os
from pathlib import Path

# Display options: cap cell text at 100 characters and show floats with 2 decimals.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Folder that holds the input CSV files. Set the GEOPATTERN_DATA_DIR environment
# variable to run the notebook on another machine; the default is the original
# author's local folder.
DATA_DIR = Path(os.environ.get('GEOPATTERN_DATA_DIR',
                               r'C:\Users\moham\Dropbox\QSE\Thesis\Geopattern\My data'))

# The first dataset includes authors with at least 0 articles to form cognitive proximity.
# df_data_set0 = pd.read_csv(DATA_DIR / 'df_data_set_Canada_0.csv')

df_data_set = pd.read_csv(DATA_DIR / 'df_data_set_Canada_1a.csv')


In [None]:
# Summary statistics for the outcome and the raw (untransformed) explanatory variables.
descriptive_columns = ['collaboration_binary', 'Geo_Dist', 'TENB', 'Cog_Dist', 'Prov_Border', 'NotContig']
df_descriptive = df_data_set[descriptive_columns]

df_descriptive.describe()

In [None]:
# Pearson correlation matrix annotated with significance stars
# (*** p <= 0.01, ** p <= 0.05, * p <= 0.1)

from scipy.stats import pearsonr

df_descriptive = df_data_set[['collaboration_binary', 'Geo_Dist', 'TENB', 'Cog_Dist', 'Prov_Border', 'NotContig']]

rho = df_descriptive.corr()

# With a callable, corr() fills the diagonal with 1; subtracting the identity
# turns those entries into p-values of 0.
pval = df_descriptive.corr(method=lambda x, y: pearsonr(x, y)[1]) - np.eye(*rho.shape)


def _stars(p_value):
    """Return one '*' for every significance threshold that p_value satisfies."""
    return '*' * sum(p_value <= threshold for threshold in (0.01, 0.05, 0.1))


p = pval.applymap(_stars)
rho.round(2).astype(str) + p

### Regression analysis

In [None]:
# Logit regressions of collaboration on the proximity measures.

# Every regressor used by any specification below.
columns = [  'Log_Geo_Dist',
             'Log_TENB',
             'Cog_Dist',
             'Prov_Border',
             'NotContig',
             'Log_Geo_Dist X Log_TENB',
             'Log_Geo_Dist_Sq X Log_TENB']


Ind_v = df_data_set[columns]
Dep_v = df_data_set.collaboration_binary

# Specification 1: main effects only.
columns_reg1 = [ 'Log_Geo_Dist',
                 'Log_TENB',
                 'Cog_Dist',
                 'Prov_Border',
                 'NotContig']

# Specification 2: adds the distance x TENB interaction.
columns_reg2 = [ 'Log_Geo_Dist',
                 'Log_TENB',
                 'Log_Geo_Dist X Log_TENB',
                 'Cog_Dist',
                 'Prov_Border',
                 'NotContig']

X_reg1 = Ind_v[columns_reg1]
X_reg2 = Ind_v[columns_reg2]

# sm.Logit does not add an intercept by itself, so prepend a constant column.
X_reg_1 = sm.tools.tools.add_constant(X_reg1, prepend=True, has_constant='add')
X_reg_2 = sm.tools.tools.add_constant(X_reg2, prepend=True, has_constant='add')

# Fit the constant-augmented design matrices. The previous list held X_reg1/X_reg2,
# which silently estimated both models without an intercept.
models = [X_reg_1, X_reg_2]

for design_matrix in models:
    logit_model = sm.Logit(Dep_v, design_matrix)
    # `warn_convergence` is a direct keyword of fit(); wrapped in `method_kwargs`
    # it was ignored and the warning was still emitted.
    result = logit_model.fit(warn_convergence=False)
    print(result.summary2())

In [None]:
# Geographic distance (km) that maximizes the TENB elasticity.
#
# The elasticity curve used in this notebook is 2.0578*x - 0.2179*x**2, whose
# maximum is at x* = 2.0578 / (2 * 0.2179). The neighbouring cells build x as
# log1p(distance), so the distance is recovered with expm1 (the inverse of log1p);
# plain exp() overstates it by exactly 1 km.
# NOTE(review): assumes Log_Geo_Dist in the data set was also built with log1p - confirm.
peak_log_dist = 2.0578 / (2 * 0.2179)
peak_geo_dist_km = np.expm1(peak_log_dist)

peak_geo_dist_km

In [None]:
# TENB elasticity evaluated at a distance of 112.38 km.
log_dist_at_peak = math.log1p(112.38)

2.0578 * log_dist_at_peak - 0.2179 * log_dist_at_peak * log_dist_at_peak

In [None]:
# Elasticity of TENB as a function of physical distance, on a 0-1000 km grid.
dis = np.linspace(0, 1000, num=5000)

# log(1 + distance) for every grid point
X_dis = [math.log1p(km) for km in dis]

Y_tenb = []
# quadratic in log-distance: 2.0578 * x - 0.2179 * x^2
Y_tenb_ = [2.0578 * log_km - 0.2179 * log_km * log_km for log_km in X_dis]

plt.figure(figsize=(20, 10))
plt.plot(dis, Y_tenb_, color='red')
plt.xlabel('Physical distance (km)', size=20)
plt.ylabel('Elasticity of TENB (network proximity)', size=20)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)

### Machine Learning

In [2]:
# Raw (untransformed) predictors and the binary target used by the classifiers.
features = ['Geo_Dist', 'TENB', 'Cog_Dist', 'Prov_Border', 'NotContig']

X = df_data_set.loc[:, features]

y = df_data_set['collaboration_binary']

In [3]:
def _class_counts(labels):
    """Return (distinct labels, their counts, {label: count}) for `labels`."""
    values, counts = np.unique(labels, return_counts = True)
    return values, counts, dict(zip(values, counts))


# Class balance of the full data set
unique, count, y_value_count = _class_counts(y)

print ('Dataset', y_value_count)
print("\n")

# Stratified 80/20 split so both parts keep the class ratio of the full data set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1, stratify = y)

unique_train, count_train, y_train_value_count = _class_counts(y_train)

print ('Dataset Train', y_train_value_count)
print("\n")

unique_test, count_test, y_test_value_count = _class_counts(y_test)

print ('Dataset Test', y_test_value_count)
print("\n")


Dataset {0: 2621, 1: 81}


Dataset Train {0: 2096, 1: 65}


Dataset Test {0: 525, 1: 16}




In [4]:
# Baseline classifiers compared in the cross-validation cell.
logreg = LogisticRegression(max_iter = 500)
gnb = GaussianNB()
knn = KNeighborsClassifier(n_jobs = -1)
# LinearSVC's solver draws random numbers, so it is seeded like the other models.
svm = LinearSVC(loss = 'hinge', random_state = 123)
rnd = RandomForestClassifier(random_state = 123, n_jobs = -1)
xgb = XGBClassifier(use_label_encoder=False, eval_metric = 'logloss', random_state = 123, n_jobs = -1)

# Resamplers for the class imbalance. The random samplers are seeded too;
# without a seed every run resampled different rows and the CV scores were not
# reproducible.
rus = RandomUnderSampler(random_state = 123)
ros = RandomOverSampler(random_state = 123)
# categorical_features are column positions in `features`: 3 = Prov_Border, 4 = NotContig.
smt = SMOTENC(categorical_features = [3,4], random_state = 123, n_jobs = -1)


In [9]:
%%time
# Cross Validation

skf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 123)

logreg_pipe_smt = Imbpipeline(steps = [['SMOTENC', smt], ['LogisticRegression', logreg]])
logreg_pipe_ros = Imbpipeline(steps = [['RandomOverSampler', ros], ['LogisticRegression', logreg]])
logreg_pipe_rus = Imbpipeline(steps = [['RandomUnderSampler', rus], ['LogisticRegression', logreg]])

gnb_pipe_smt = Imbpipeline(steps = [['SMOTENC', smt], ['GaussianNB', gnb]])
gnb_pipe_ros = Imbpipeline(steps = [['RandomOverSampler', ros], ['GaussianNB', gnb]])
gnb_pipe_rus = Imbpipeline(steps = [['RandomUnderSampler', rus], ['GaussianNB', gnb]])

knn_pipe_smt = Imbpipeline(steps = [['SMOTENC', smt], ['KNeighborsClassifier', knn]])
knn_pipe_ros = Imbpipeline(steps = [['RandomOverSampler', ros], ['KNeighborsClassifier', knn]])
knn_pipe_rus = Imbpipeline(steps = [['RandomUnderSampler', rus], ['KNeighborsClassifier', knn]])

svm_pipe_smt = Imbpipeline(steps = [['SMOTENC', smt], ['svm', svm]])
svm_pipe_ros = Imbpipeline(steps = [['RandomOverSampler', ros], ['svm', svm]])
svm_pipe_rus = Imbpipeline(steps = [['RandomUnderSampler', rus], ['svm', svm]])

rnd_pipe_smt = Imbpipeline(steps = [['SMOTENC', smt], ['Random Forest', rnd]])
rnd_pipe_ros = Imbpipeline(steps = [['RandomOverSampler', ros], ['Random Forest', rnd]])
rnd_pipe_rus = Imbpipeline(steps = [['RandomUnderSampler', rus], ['Random Forest', rnd]])

xgb_pipe_smt = Imbpipeline(steps = [['SMOTENC', smt], ['XGBoost', xgb]])
xgb_pipe_ros = Imbpipeline(steps = [['RandomOverSampler', ros], ['XGBoost', xgb]])
xgb_pipe_rus = Imbpipeline(steps = [['RandomUnderSampler', rus], ['XGBoost', xgb]])

clfs = [logreg, logreg_pipe_smt, logreg_pipe_ros, logreg_pipe_rus,
        gnb, gnb_pipe_smt, gnb_pipe_ros, gnb_pipe_rus,
        knn, knn_pipe_smt, knn_pipe_ros, knn_pipe_rus, 
        svm, svm_pipe_smt, svm_pipe_ros, svm_pipe_rus,
        rnd, rnd_pipe_smt, rnd_pipe_ros, rnd_pipe_rus,
        xgb, xgb_pipe_smt, xgb_pipe_ros, xgb_pipe_rus]

labels = ['logreg', 'logreg_pipe_smt', 'logreg_pipe_ros', 'logreg_pipe_rus',
          'gnb', 'gnb_pipe_smt', 'gnb_pipe_ros', 'gnb_pipe_rus',
          'knn', 'knn_pipe_smt', 'knn_pipe_ros', 'knn_pipe_rus', 
          'svm', 'svm_pipe_smt', 'svm_pipe_ros', 'svm_pipe_rus',
          'rnd', 'rnd_pipe_smt', 'rnd_pipe_ros', 'rnd_pipe_rus', 
          'xgb', 'xgb_pipe_smt', 'xgb_pipe_ros', 'xgb_pipe_rus']

       
clfs = zip(clfs,labels)

for clf, label in clfs:
    scores_F1 = cross_val_score(clf, X_train, y_train, scoring='f1', cv=skf, n_jobs=-1)
    print("F1: %0.2f (+/- %0.2f) [%s]" % (scores_F1.mean(), scores_F1.std(), label))


F1: 0.79 (+/- 0.10) [logreg]
F1: 0.71 (+/- 0.07) [logreg_pipe_smt]
F1: 0.64 (+/- 0.08) [logreg_pipe_ros]
F1: 0.59 (+/- 0.11) [logreg_pipe_rus]
F1: 0.72 (+/- 0.06) [gnb]
F1: 0.53 (+/- 0.08) [gnb_pipe_smt]
F1: 0.56 (+/- 0.04) [gnb_pipe_ros]
F1: 0.59 (+/- 0.07) [gnb_pipe_rus]
F1: 0.70 (+/- 0.16) [knn]
F1: 0.74 (+/- 0.03) [knn_pipe_smt]
F1: 0.81 (+/- 0.08) [knn_pipe_ros]
F1: 0.22 (+/- 0.04) [knn_pipe_rus]
F1: 0.79 (+/- 0.09) [svm]
F1: 0.57 (+/- 0.29) [svm_pipe_smt]
F1: 0.25 (+/- 0.27) [svm_pipe_ros]
F1: 0.48 (+/- 0.20) [svm_pipe_rus]
F1: 0.91 (+/- 0.03) [rnd]
F1: 0.88 (+/- 0.04) [rnd_pipe_smt]
F1: 0.92 (+/- 0.02) [rnd_pipe_ros]
F1: 0.67 (+/- 0.07) [rnd_pipe_rus]
F1: 0.93 (+/- 0.04) [xgb]
F1: 0.90 (+/- 0.05) [xgb_pipe_smt]
F1: 0.93 (+/- 0.03) [xgb_pipe_ros]
F1: 0.64 (+/- 0.07) [xgb_pipe_rus]
Wall time: 54.4 s


In [10]:
%%time
# Hyperparameter tuning - RandomSearch

param_dist = { 'max_depth': [3, 5, 6, 10, 15, 20],
           'learning_rate': np.random.uniform(size=10),
           'subsample': np.random.uniform(size=10),
           'colsample_bytree': np.random.uniform(size=10),
           'colsample_bylevel': np.random.uniform(size=10),
           'n_estimators': [100, 500, 1000]}

skf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 123)

xgb_srch_model = RandomizedSearchCV(estimator = xgb,
                           param_distributions = param_dist, 
                           cv = skf, n_jobs = -1, 
                           verbose = 1,
                           n_iter = 100,
                           scoring = 'f1',
                           random_state = 123)

xgb_srch_model.fit(X_train, y_train)
print ('Best Parameters', xgb_srch_model.best_params_)
print ('Best Score', xgb_srch_model.best_score_)

y_pred = xgb_srch_model.best_estimator_.predict(X_test)
print ('Presicion:' , precision_score(y_test, y_pred))
print ('Recall:' , recall_score(y_test, y_pred))
print ('F1_Score:', f1_score(y_test, y_pred))


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Parameters {'subsample': 0.8177919990310334, 'n_estimators': 500, 'max_depth': 20, 'learning_rate': 0.30786718112683087, 'colsample_bytree': 0.8972608202973892, 'colsample_bylevel': 0.6505382144387293}
Best Score 0.9486137566137567
Presicion: 0.9333333333333333
Recall: 0.875
F1_Score: 0.9032258064516129
Wall time: 1min 39s


In [None]:
# # Test scores

# for clf, label in zip([logreg, gnb, knn, svm, rnd, xgb], ['LogisticRegression', 'GaussianNB', 'KNeighborsClassifier', 'svm', 'Random Forest', 'XGB Classifier', 'CatBoostClassifier']):
#     clf.fit(X_train, y_train)
#     y_pred = clf.predict(X_test)
# #     print("Presicion: %0.3f [%s]" % (precision_score(y_test, y_pred), label))
# #     print("Recall: %0.3f [%s]" % (recall_score(y_test, y_pred), label))
#     print("F1: %0.2f [%s]" % (f1_score(y_test, y_pred), label))
# #     print("Accuracy: %0.2f [%s]" % (accuracy_score(y_test, y_pred), label))
# #     print('\n')

In [None]:
# Feature importance - mean decrease in impurity
# NOTE(review): the values come from XGBoost's `feature_importances_`; confirm that
# the configured importance type matches the "mean decrease in impurity" axis label.

# cross_val_score and RandomizedSearchCV fit clones, so `xgb` itself is still
# unfitted at this point; fit it here so the cell works on a fresh kernel.
xgb.fit(X_train, y_train)

importances = xgb.feature_importances_ * 100

# One row per feature, most important first.
df_importance = (
    pd.DataFrame({'Feature': features, 'importance': importances})
      .sort_values(by = 'importance', ascending = False)
)

df_importance[:10].plot(x = 'Feature', y = 'importance', kind = 'bar', rot = 90, ylabel = 'mean decrease in impurity (%)')

In [None]:
%%time
# Feature importance - SHAP

xgb.fit(X_train, y_train)

xgb_explainer = shap.TreeExplainer(xgb)
shap_values = xgb_explainer.shap_values(X_train, y_train)
shap_values2 = xgb_explainer(X_train)
shap_interaction_values = xgb_explainer.shap_interaction_values(X_train)


In [None]:
# Global SHAP importance: two bar charts, then the per-sample summary plot.
shap.plots.bar(shap_values2)
shap.summary_plot(shap_values2, X_train, plot_type = "bar")

cool_cmap = plt.get_cmap("cool")
shap.summary_plot(shap_values2, X_train, plot_size = 2, cmap = cool_cmap)
# shap.plots.waterfall(shap_values2[0])


In [None]:
# Dependence plot

# X_train holds the raw columns listed in `features` ('Geo_Dist', 'TENB', ...),
# so the plots must name those columns. The former 'Log_TENB' / 'Log_Geo_Dist'
# names are not in X_train and cannot be resolved by shap.

# Main effect of TENB (no colouring by an interacting feature).
shap.dependence_plot('TENB', shap_values, X_train, interaction_index=None, show = False)

plt.gcf().set_size_inches(5, 5)

# Interaction effect between TENB and geographic distance.
shap.dependence_plot(('TENB', 'Geo_Dist'), shap_interaction_values, X_train, show = False, cmap=plt.get_cmap("hsv"), dot_size = 70)

plt.gcf().set_size_inches(5, 5)
# ax = plt.axes()
# ax.set_facecolor("black")

# shap.dependence_plot('Cog_Dist', shap_values, X_train, interaction_index=None)
# shap.dependence_plot('Country_Border', shap_values, X_train, interaction_index=None)
# shap.dependence_plot('Prov_Border', shap_values, X_train, interaction_index=None)
# shap.dependence_plot('NotContig', shap_values, X_train, interaction_index=None)


In [None]:
# Random Search Best parameters result (test data)

# Hyperparameters selected by the random search.
best_random_search_params = dict(n_estimators = 170,
                                 min_samples_split = 5,
                                 min_samples_leaf = 1,
                                 max_features = 'sqrt',
                                 criterion = 'gini',
                                 max_depth = 55,
                                 bootstrap = True)

rnd_best = RandomForestClassifier(random_state = 123, n_jobs = -1, **best_random_search_params)

rnd_best.fit(X_train, y_train)
y_pred_best = rnd_best.predict(X_test)

# Scalar metrics on the held-out test split, printed in a fixed order.
scalar_metrics = [('F1_Score:', f1_score),
                  ('Presicion:', precision_score),
                  ('Recall:', recall_score),
                  ('Accuracy:', accuracy_score)]
for metric_label, metric in scalar_metrics:
    print (metric_label, metric(y_test, y_pred_best))

print ('confusion_matrix:')
print (confusion_matrix(y_test, y_pred_best))
print ('Classification Report:')
print (classification_report(y_test, y_pred_best))

# rfc_cv_score = cross_val_score(rnd_best, X_train, y_train, cv = skf, scoring = 'f1')

# print ('F1_Score_cv_score:', rfc_cv_score.mean())


In [None]:
# Hyperparameter tuning - Randomforestclassifier - Grid search
# Narrow grid around n_estimators = 170, max_depth = 55, min_samples_split = 5.

n_estimators = [int(x) for x in np.linspace(start = 160, stop = 180, num = 3)]

# 'auto' was removed from this list: for classifiers it is an alias of 'sqrt'
# (so it only fitted every candidate twice) and recent scikit-learn releases
# reject the value.
max_features = ['sqrt']

max_depth = [int(x) for x in np.linspace(53, 57, num = 5)]
max_depth.append(None)   # None = grow each tree until its leaves are pure

min_samples_split = [4,5,6]

min_samples_leaf = [1]

bootstrap = [True, False]

criterion = ['gini', 'entropy']


param_grid = { 'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion' : criterion}

print('Parameters used in the grid:\n')
pprint(param_grid)

# NOTE: this rebinds `skf` to an unshuffled splitter, unlike the shuffled one
# used by the cross-validation and random-search cells.
skf = StratifiedKFold(n_splits = 5)

grid_model = GridSearchCV(estimator = rnd,
                         param_grid = param_grid,
                         cv = skf, n_jobs = -1,
                         verbose = 1,
                         scoring = 'f1')

grid_model.fit(X_train, y_train)

print ('Best Parameters', grid_model.best_params_)
print ('Best Score', grid_model.best_score_)


# Best Parameters {'bootstrap': True, 'criterion': 'gini', 'max_depth': 53, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 170}
# Best Score 0.8852592313348533

In [None]:
# grid Search Best parameters result (test data)

# Hyperparameters selected by the grid search. max_features is written as 'sqrt':
# the grid reported 'auto', which for classifiers is the same setting, and
# recent scikit-learn releases no longer accept 'auto'.
grd_best = RandomForestClassifier(random_state = 123, n_jobs = -1,
                                       criterion = 'gini',
                                       n_estimators = 170,
                                       min_samples_split = 5,
                                       min_samples_leaf = 1,
                                       max_features = 'sqrt',
                                       max_depth = 53,
                                       bootstrap = True)

grd_best.fit(X_train, y_train)
y_pred_best_grd = grd_best.predict(X_test)
# ROC AUC ranks samples by score, so it needs the positive-class probability;
# computed from hard 0/1 predictions it collapses to a single operating point.
y_score_best_grd = grd_best.predict_proba(X_test)[:, 1]

print ('F1_Score:', f1_score(y_test, y_pred_best_grd))
print ('Presicion:' , precision_score(y_test, y_pred_best_grd))
print ('Recall:' , recall_score(y_test, y_pred_best_grd))
print ('Accuracy:', accuracy_score(y_test, y_pred_best_grd))
print ('roc_auc_score:', roc_auc_score(y_test, y_score_best_grd))
print ('confusion_matrix:')
print (confusion_matrix(y_test, y_pred_best_grd))
print ('Classification Report:')
print (classification_report(y_test, y_pred_best_grd))

# rfc_cv_score = cross_val_score(rnd_best, X_train, y_train, cv = skf, scoring = 'f1')

# print ('F1_Score_cv_score:', rfc_cv_score.mean())


In [None]:
# cat = CatBoostClassifier(random_state = 123, verbose = False)
# params = {'iterations':1000,
#         'learning_rate':0.01,
#         'depth':3,
#         'eval_metric':'F1',
#         'loss_function': "Logloss",
#         'verbose':False,
#         'od_type':"Iter", # overfit detector
#         'od_wait':500, # most recent best iteration to wait before stopping
#         'random_state' : 123}

# cat1 = CatBoostClassifier(**params)
# logreg = LogisticRegression(max_iter = 500)
# rnd = RandomForestClassifier(random_state = 123, n_jobs = -1)
# gbc = GradientBoostingClassifier(random_state = 123)
# xgb = XGBClassifier(use_label_encoder=False, eval_metric = 'logloss', random_state = 123, n_jobs = -1)
# cat = CatBoostClassifier()
# knn = KNeighborsClassifier(n_jobs = -1)
# gnb = GaussianNB()
# smtk = SMOTETomek(n_jobs = 6)
# smnn = SMOTEENN(n_jobs = 6)
# scl = StandardScaler()
# feature_selection_selbest = SelectKBest(chi2, k=7)
# feature_selection_selmodel = SelectFromModel(rnd)
# voth = VotingClassifier(estimators=[('LogisticRegression', logreg), ('RandomForestClassifier', rnd), ('GradientBoostingClassifier', gbc), ('XGBClassifier', xgb), ('KNeighborsClassifier', knn),('GaussianNB', gnb)], voting='hard')
# vots = VotingClassifier(estimators=[('LogisticRegression', logreg), ('RandomForestClassifier', rnd), ('GradientBoostingClassifier', gbc), ('XGBClassifier', xgb), ('KNeighborsClassifier', knn),('GaussianNB', gnb)], voting='soft')
# voth = VotingClassifier(estimators=[('RandomForestClassifier', rnd), ('XGBClassifier', xgb)], voting='hard')
# vots = VotingClassifier(estimators=[('RandomForestClassifier', rnd), ('XGBClassifier', xgb)], voting='soft')