In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint

%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline as Pipeline, make_pipeline as make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.svm import SVC, LinearSVC
from sklearn.utils import resample
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import NearMiss, RandomUnderSampler
from imblearn.pipeline import Pipeline as Imbpipeline
from imblearn.pipeline import make_pipeline as Imb_make_pipeline

from imblearn.combine import SMOTETomek, SMOTEENN

from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel

# Location of the North America data set (CSV).
# The path used to be hard-coded to one user's machine. It can now be overridden
# with the GEOPATTERN_DATA_PATH environment variable so the notebook runs
# elsewhere without editing this cell; when the variable is not set, the
# original location is used unchanged.
import os

DATA_PATH = os.environ.get(
    'GEOPATTERN_DATA_PATH',
    r'C:\Users\moham\Dropbox\QSE\Thesis\Geopattern\My data\df_data_set_North_America.csv',
)

df_data_set = pd.read_csv(DATA_PATH)

# Names of the data-set columns used as model features. The order of this list
# is the column order of the feature matrix built from it, so it must not be
# re-sorted.
columns = [
    # Columns without a Province_ prefix.
    'Log_Geo_Dist',
    'TENB',
    'Cog_Dist',
    'Top_regions',
    'Prov_Border',
    'NotContig',
    # Province_1_* / Province_2_* columns, kept in their original order
    # (presumably one-hot indicators -- confirm against the CSV).
    'Province_1_Alberta',
    'Province_1_Arizona',
    'Province_1_California',
    'Province_1_Manitoba',
    'Province_1_Quebec',
    'Province_2_Alberta',
    'Province_2_Arizona',
    'Province_2_California',
    'Province_2_Missouri',
    'Province_2_Quebec',
    'Province_1_Colorado',
    'Province_1_Maryland',
    'Province_1_Massachusetts',
    'Province_1_Minnesota',
    'Province_1_Missouri',
    'Province_1_New York',
    'Province_1_Ontario',
    'Province_1_Pennsylvania',
    'Province_1_Washington',
    'Province_2_Colorado',
    'Province_2_Maryland',
    'Province_2_Massachusetts',
    'Province_2_Minnesota',
    'Province_2_New York',
    'Province_2_Ontario',
    'Province_2_Pennsylvania',
    'Province_2_Washington',
    'Province_1_Connecticut',
    'Province_1_Florida',
    'Province_1_Illinois',
    'Province_1_Iowa',
    'Province_1_Oklahoma',
    'Province_1_South Carolina',
    'Province_2_Connecticut',
    'Province_2_Florida',
    'Province_2_Illinois',
    'Province_2_Iowa',
    'Province_2_Oklahoma',
    'Province_2_South Carolina',
    'Province_1_Alabama',
    'Province_1_Arkansas',
    'Province_1_Ohio',
    'Province_1_Oregon',
    'Province_1_Wisconsin',
    'Province_2_Alabama',
    'Province_2_Arkansas',
    'Province_2_Manitoba',
    'Province_2_Ohio',
    'Province_2_Oregon',
    'Province_2_Wisconsin',
    'Province_1_British Columbia',
    'Province_1_Louisiana',
    'Province_1_Mississippi',
    'Province_1_New Jersey',
    'Province_1_Texas',
    'Province_1_Wyoming',
    'Province_2_British Columbia',
    'Province_2_Louisiana',
    'Province_2_Mississippi',
    'Province_2_New Jersey',
    'Province_2_Texas',
    'Province_2_Wyoming',
    'Province_1_Delaware',
    'Province_1_District of Columbia',
    'Province_1_Michigan',
    'Province_1_Saskatchewan',
    'Province_1_Tennessee',
    'Province_1_Virginia',
    'Province_2_Delaware',
    'Province_2_District of Columbia',
    'Province_2_Michigan',
    'Province_2_Saskatchewan',
    'Province_2_Tennessee',
    'Province_2_Virginia',
    'Province_1_Georgia',
    'Province_2_Georgia',
    'Province_2_North Carolina',
    'Province_1_Indiana',
    'Province_1_Kansas',
    'Province_1_North Carolina',
    'Province_2_Indiana',
    'Province_2_Kansas',
    'Province_1_Alaska',
    'Province_2_Alaska',
    'Province_1_New Mexico',
    'Province_2_New Mexico',
    'Province_1_Hawaii',
    'Province_1_Idaho',
    'Province_2_Hawaii',
    'Province_2_Idaho',
    'Province_1_Utah',
    'Province_1_West Virginia',
    'Province_2_Utah',
    'Province_2_West Virginia',
    'Province_1_Maine',
    'Province_1_Nova Scotia',
    'Province_1_Rhode Island',
    'Province_2_Maine',
    'Province_2_Nova Scotia',
    'Province_2_Rhode Island',
    'Province_1_Montana',
    'Province_1_Nebraska',
    'Province_1_Nevada',
    'Province_1_New Brunswick',
    'Province_1_New Hampshire',
    'Province_1_Vermont',
    'Province_2_Montana',
    'Province_2_Nebraska',
    'Province_2_Nevada',
    'Province_2_New Brunswick',
    'Province_2_New Hampshire',
    'Province_2_Vermont',
]

# Feature matrix: the columns named in `columns`, in that order.
X = df_data_set.loc[:, columns]

# Target: the `collaboration_binary` column of the same frame.
y = df_data_set['collaboration_binary']

In [None]:
# Class counts of the target over the whole data set.
unique, count = np.unique(y, return_counts=True)
y_value_count = dict(zip(unique, count))
print('Dataset', y_value_count)

# Hold out 10% of the rows for testing. The split is stratified on the target
# and uses a fixed seed, so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=123,
    stratify=y,
)

# Class counts of the target inside the held-out part.
unique, count = np.unique(y_test, return_counts=True)
y_value_count = dict(zip(unique, count))
print('Dataset Test', y_value_count)

In [None]:
# Feature selection
#
# Fits three selectors and prints, for each, the shape of the reduced matrix
# and the names of the columns that were kept.
#
# Every selector is fitted on the training split only. Fitting on the full
# data set would let the held-out test rows influence which features look
# useful, which leaks test information into any modelling decision made from
# these lists.

print ('Original data size: ', X_train.shape)

# 1) Variance threshold: .9 * (1 - .9) is the variance of a 0/1 column that is
#    1 in 90% of the rows; columns with a variance at or below it are dropped.
var = VarianceThreshold(threshold=(.9 * (1 - .9)))
var_fit = var.fit_transform(X_train)

print ('Low variance removal: ', var_fit.shape)

feature_idx = var.get_support()
my_features_var = X_train.columns[feature_idx].tolist()
print ('Variance threshold list of features:', my_features_var)

# 2) Univariate selection: keep the 40 columns with the highest chi2 score
#    against the target.
selbest = SelectKBest(chi2, k=40)
selbest_fit = selbest.fit_transform(X_train, y_train)
print ('SelectKBest: ', selbest_fit.shape)
feature_idx = selbest.get_support()
my_features_selbest = X_train.columns[feature_idx].tolist()
print ('SelectKBest list of features:', my_features_selbest)


# 3) Model-based selection driven by a logistic regression.
logreg = LogisticRegression(max_iter = 500)


selmodel = SelectFromModel(logreg).fit(X_train, y_train)
X_new = selmodel.transform(X_train)
print ('SelectFromModel', X_new.shape)

feature_idx = selmodel.get_support()
my_features_selmodel = X_train.columns[feature_idx].tolist()
# The label used to say 'SelectKBest', a copy-paste slip from the block above.
print ('SelectFromModel list of features:', my_features_selmodel)


In [None]:
# Shared building blocks for the pipelines defined in the following cells.
#
# Every component that accepts a seed gets the same one. Without it the
# resamplers (and the boosted models) draw from the global random state, so the
# reported scores change from run to run and cannot be reproduced.
RANDOM_STATE = 42

# Classifiers
logreg = LogisticRegression(max_iter = 500)
rnd = RandomForestClassifier(random_state = RANDOM_STATE, n_jobs = 6)
gbc = GradientBoostingClassifier(random_state = RANDOM_STATE)
xgb = XGBClassifier(use_label_encoder=False, eval_metric = 'logloss', random_state = RANDOM_STATE)
knn = KNeighborsClassifier()

# Resamplers for the class imbalance
rus = RandomUnderSampler(random_state = RANDOM_STATE)
ros = RandomOverSampler(random_state = RANDOM_STATE)
sm = SMOTE(random_state = RANDOM_STATE)
smtk = SMOTETomek(random_state = RANDOM_STATE, n_jobs = 6)
smnn = SMOTEENN(random_state = RANDOM_STATE, n_jobs = 6)

# Preprocessing and feature selection
scl = StandardScaler()
feature_selection_selbest = SelectKBest(chi2, k=7)
feature_selection_selmodel = SelectFromModel(logreg)

In [None]:
# Model building - RandomForestClassifier
#
# Four variants around the same random forest: plain, scaled, randomly
# over-sampled, and preceded by model-based feature selection. Each one is
# fitted on the training split and scored on the held-out test split.

pipe1 = Pipeline(steps=[('RandomForestClassifier', rnd)])

pipe2 = Pipeline(steps=[
    ('standard_scalar', scl),
    ('RandomForestClassifier', rnd),
])

pipe3 = Imbpipeline(steps=[
    ('random_over_sampler', ros),
    ('RandomForestClassifier', rnd),
])

pipe4 = Imbpipeline(steps=[
    ('feature_selection_selmodel', feature_selection_selmodel),
    ('RandomForestClassifier', rnd),
])

pipes = [pipe1, pipe2, pipe3, pipe4]

# Printed label -> metric function; evaluated in this order for every pipeline.
metric_report = (
    ('F1_Score:', f1_score),
    ('Presicion:', precision_score),
    ('Recall:', recall_score),
    ('Accuracy:', accuracy_score),
    ('confusion_matrix:', confusion_matrix),
)

for pipe in pipes:
    model = pipe
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    for label, metric in metric_report:
        print(label, metric(y_test, y_pred))
    # Optional visual check of the confusion matrix:
    # sns.heatmap(confusion_matrix(y_test, y_pred), annot = True, fmt = 'd')

In [None]:
# Model building - XGBClassifier
#
# The same four variants as for the random forest (plain, scaled, randomly
# over-sampled, model-based feature selection), with XGBoost as the final step.

pipe1 = Pipeline(steps=[('XGBClassifier', xgb)])

pipe2 = Pipeline(steps=[
    ('standard_scalar', scl),
    ('XGBClassifier', xgb),
])

pipe3 = Imbpipeline(steps=[
    ('random_over_sampler', ros),
    ('XGBClassifier', xgb),
])

pipe4 = Imbpipeline(steps=[
    ('feature_selection_selmodel', feature_selection_selmodel),
    ('XGBClassifier', xgb),
])

pipes = [pipe1, pipe2, pipe3, pipe4]

# Printed label -> metric function; evaluated in this order for every pipeline.
metric_report = (
    ('F1_Score:', f1_score),
    ('Presicion:', precision_score),
    ('Recall:', recall_score),
    ('Accuracy:', accuracy_score),
    ('confusion_matrix:', confusion_matrix),
)

for pipe in pipes:
    model = pipe
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    for label, metric in metric_report:
        print(label, metric(y_test, y_pred))

In [None]:
# Hyperparameter tuning - RandomForestClassifier
#
# Randomised search over the random-forest settings listed below, using
# stratified 4-fold cross-validation on the training split. The refitted best
# model is then scored once on the held-out test split.


# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split.
# 'auto' is rejected by current scikit-learn releases, and for classifiers it
# was only another name for 'sqrt', so the former ['auto', 'sqrt'] grid offered
# a single real setting. 'log2' gives the search an actual alternative.
max_features = ['sqrt', 'log2']

# Maximum number of levels in tree (None = grow until the other limits stop it)
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

print('Parameters used in the grid:\n')
pprint(random_grid)

skf = StratifiedKFold(n_splits = 4)

# Candidates are ranked by F1, the metric reported below. The default scorer
# for a classifier is accuracy, which can prefer a model that mostly predicts
# the majority class when the classes are imbalanced.
model = RandomizedSearchCV(estimator = rnd,
                           param_distributions = random_grid,
                           scoring = 'f1',
                           cv = skf, n_jobs = 6,
                           verbose = 3,
                           n_iter = 100,
                           random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print ('F1_Score:', f1_score(y_test, y_pred))
print ('Presicion' , precision_score(y_test, y_pred))
print ('Recall' , recall_score(y_test, y_pred))
print ('Best Parameters', model.best_params_)


In [None]:
# Hyperparameter tuning - XGBClassifier
#
# The parameter space passed to the search is empty and n_iter is 1, so there
# is nothing to vary: the classifier is cross-validated (stratified 5-fold)
# with the settings it was created with, and the resulting model is then
# scored on the held-out test split.

skf = StratifiedKFold(n_splits=5)

search_settings = dict(
    estimator=xgb,
    param_distributions={},
    cv=skf,
    n_jobs=6,
    verbose=3,
    n_iter=1,
)
model = RandomizedSearchCV(**search_settings)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

for label, metric in (
    ('F1_Score:', f1_score),
    ('Presicion', precision_score),
    ('Recall', recall_score),
):
    print(label, metric(y_test, y_pred))

In [None]:
# Catalogue of candidate pipelines. Every pipeline is assembled from the same
# few named steps, so each step is written once here and reused below.
# Only the pipelines listed in `pipes` are evaluated by the loop further down.
step_rus = ('random_under_sampler', rus)
step_ros = ('random_over_sampler', ros)
step_kbest = ('feature_selection', feature_selection_selbest)
step_selmodel = ('feature_selection', feature_selection_selmodel)
step_scale = ('standard_scalar', scl)
step_logreg = ('logreg', logreg)
step_xgb = ('XGBoost', xgb)
step_forest = ('RandomForestClassifier', rnd)

# --- Logistic regression -----------------------------------------------------
pipe1 = Pipeline(steps=[step_logreg])
pipe2 = Pipeline(steps=[step_kbest, step_logreg])
pipe3 = Imbpipeline(steps=[step_rus, step_kbest, step_logreg])
pipe4 = Imbpipeline(steps=[step_ros, step_kbest, step_logreg])
pipe5 = Imbpipeline(steps=[step_rus, step_kbest, step_scale, step_logreg])
pipe6 = Imbpipeline(steps=[step_ros, step_kbest, step_scale, step_logreg])

# --- XGBoost -----------------------------------------------------------------
pipe7 = Pipeline(steps=[step_xgb])
pipe8 = Pipeline(steps=[step_kbest, step_xgb])
pipe9 = Imbpipeline(steps=[step_rus, step_kbest, step_xgb])
pipe10 = Imbpipeline(steps=[step_ros, step_kbest, step_xgb])
pipe11 = Imbpipeline(steps=[step_rus, step_kbest, step_scale, step_xgb])
pipe12 = Imbpipeline(steps=[step_ros, step_kbest, step_scale, step_xgb])
pipe13 = Imbpipeline(steps=[step_scale, step_xgb])
pipe14 = Imbpipeline(steps=[step_ros, step_scale, step_xgb])
pipe15 = Imbpipeline(steps=[step_kbest, step_scale, step_xgb])

# --- Random forest -----------------------------------------------------------
pipe16 = Pipeline(steps=[step_forest])
pipe17 = Imbpipeline(steps=[step_scale, step_forest])
pipe18 = Imbpipeline(steps=[step_selmodel, step_forest])
# pipe19 and pipe20 differ only in whether selection runs before or after scaling.
pipe19 = Imbpipeline(steps=[step_selmodel, step_scale, step_forest])
pipe20 = Imbpipeline(steps=[step_scale, step_selmodel, step_forest])

# Pipelines evaluated in this run.
pipes = [pipe16, pipe17, pipe18, pipe19, pipe20]

skf = StratifiedKFold(n_splits=5)

# Search spaces that were tried at some point and are currently switched off:
# param_grid = {'logreg__max_iter' : [100, 500, 1000]}
# param_distributions = {'RandomForestClassifier__n_estimators' : [int(x) for x in np.linspace(start = 200, stop = 2000, num = 100)]}

# With an empty parameter space and n_iter = 1 each pipeline is simply
# cross-validated as configured, then scored on the held-out test split.
# `model` is overwritten on every pass, so after the loop it holds the search
# for the last pipeline in `pipes`.
for pipe in pipes:
    model = RandomizedSearchCV(
        estimator=pipe,
        param_distributions={},
        cv=skf,
        n_jobs=6,
        verbose=3,
        n_iter=1,
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    for label, metric in (
        ('F1_Score:', f1_score),
        ('Presicion', precision_score),
        ('Recall', recall_score),
    ):
        print(label, metric(y_test, y_pred))

In [None]:
# Show the parameters of the random-forest step inside the pipeline that the
# most recently fitted search (`model`) kept as its best estimator.
best_pipeline = model.best_estimator_
best_pipeline.named_steps['RandomForestClassifier'].get_params()

In [None]:
# Feature importances (and the booster's feature names) of the XGBoost step.
#
# `model` is whatever search was fitted last. When the notebook is run top to
# bottom that is a random-forest pipeline, which has no step called 'XGBoost',
# and an unguarded lookup fails with KeyError. The step is therefore looked up
# first and a short note is printed when it is absent.
fitted_steps = model.best_estimator_.named_steps

if 'XGBoost' in fitted_steps:
    xgb_fitted = fitted_steps['XGBoost']
    print (xgb_fitted.feature_importances_)
    print (xgb_fitted.get_booster().feature_names)
else:
    print ("No 'XGBoost' step in the last fitted pipeline; available steps:", list(fitted_steps))
