In [None]:
import numpy as np
import pandas as pd
import pickle
import datetime as dt
import sklearn.model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
import math
from sklearn.metrics import make_scorer, roc_auc_score, classification_report, confusion_matrix, f1_score, \
precision_score, recall_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import VotingRegressor, VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier



# suppressing pesky warnings

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from xgboost import XGBClassifier 
from xgboost import XGBRegressor
from sklearn.model_selection import StratifiedKFold
import catboost as cb

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime

# Visualisation
#import geopandas as gpd
#from geopandas import GeoDataFrame
#from pyproj import CRS
#from shapely.geometry import Point, Polygon

### We will build and compare the following models, then choose one to use on the test set:

- Decision Tree
- Random Forest
- K-Nearest Neighbors
- CatBoost
- XGBoost
- Voting Classifier that incorporates the above models

The basic procedure for building each model is similar. First, we split the training data 80-20. Each model is evaluated on the held-out 20%, and the model that performs best on that set will be used on the actual test set.

For each model, we create a set of model parameters to iterate through and use grid search to find the set of parameters that best fits the 80% training data, as determined by the best ROC AUC value. We then save those models to pickle files and use them as inputs to the voting classifier model.

In [None]:
# Load the training data; the first CSV column is used as the row index.
train_data = pd.read_csv("train_data.csv", index_col = 0)

# After the index, the first remaining column is the target and every other
# column is a feature.
X = train_data.iloc[:, 1:]
y = train_data.iloc[:, 0]

# Hold out 20% for model comparison. The seed is fixed so that the split (and
# therefore every metric reported below) is the same on each Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


In [None]:
# Sanity check: row counts of train X, train y, test X, test y and the full data.
tuple(len(part) for part in (x_train, y_train, x_test, y_test, train_data))

In [None]:
# Pairwise correlations between all columns of the training data,
# drawn as an annotated heatmap on an explicitly created figure/axes pair.
cor = train_data.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(25, 25))
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds, ax=heatmap_ax)
plt.show()

### Model 1: Decision Tree

In [None]:
def createRandomGrid():
    """Build the hyper-parameter grid searched for the decision tree.

    Returns:
        dict: Maps ``DecisionTreeClassifier`` parameter names to the candidate
        values that the grid search tries exhaustively.
    """
    return {
        "criterion": ["gini", "entropy"],
        # max_depth used to be computed here but was never added to the grid,
        # so the search silently ignored it and tree depth stayed unconstrained.
        "max_depth": range(1, 5),
        "min_samples_split": range(2, 10),
        "min_samples_leaf": range(1, 10),
    }

def runDecisionTree(random_grid, x_train, y_train):
    """Grid-search a decision tree and return the best estimator found.

    Args:
        random_grid: Parameter grid handed to ``GridSearchCV``.
        x_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the completed search.
    """
    search = GridSearchCV(
        estimator=DecisionTreeClassifier(random_state=42),
        param_grid=random_grid,
        scoring='roc_auc',
        cv=10,
        n_jobs=-1,
    )
    search.fit(x_train, y_train)

    # Report the winning parameter combination and its cross-validated score.
    print(search.best_params_)
    print(search.best_score_)
    return search.best_estimator_

# Tune the decision tree on the training split. GridSearchCV refits the winning
# configuration on all of x_train by default (refit=True), so the returned
# estimator is already trained and needs no additional fit() call.
param_set = createRandomGrid()
best_model = runDecisionTree(param_set, x_train, y_train)

# Persist the tuned model; the context manager guarantees the file is flushed
# and closed even if pickling fails.
with open("DecisionTreeModel.pkl", 'wb') as model_file:
    pickle.dump(best_model, model_file)

### Model 2: K-Nearest Neighbors

In [None]:
def createRandomGrid():
    """Return the hyper-parameter grid for the K-nearest-neighbours search.

    Returns:
        dict: ``KNeighborsClassifier`` parameter names mapped to the lists of
        candidate values to try.
    """
    return dict(
        n_neighbors=[2, 5, 10, 15],
        weights=['uniform', 'distance'],
        metric=['euclidean', 'manhattan'],
    )

def runKNeighbors(random_grid, x_train, y_train):
    """Grid-search a K-nearest-neighbours classifier and return the best one.

    Args:
        random_grid: Parameter grid handed to ``GridSearchCV``.
        x_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the completed search.
    """
    search_options = dict(cv=5, scoring='roc_auc', n_jobs=-1)
    knn_search = GridSearchCV(KNeighborsClassifier(), random_grid, **search_options)
    knn_search.fit(x_train, y_train)

    # Show the selected parameters followed by their cross-validated score.
    for reported_value in (knn_search.best_params_, knn_search.best_score_):
        print(reported_value)
    return knn_search.best_estimator_

# Tune K-nearest-neighbours on the training split. GridSearchCV refits the
# winning configuration on all of x_train by default (refit=True), so the
# returned estimator is already trained and needs no additional fit() call.
param_set = createRandomGrid()
best_model = runKNeighbors(param_set, x_train, y_train)

# Persist the tuned model; the context manager guarantees the file is flushed
# and closed even if pickling fails.
with open("KNeighbors.pkl", 'wb') as model_file:
    pickle.dump(best_model, model_file)

### Model 3: Random Forest

In [None]:
def createRandomGrid():
    """Return the hyper-parameter grid for the random-forest search.

    Returns:
        dict: ``RandomForestClassifier`` parameter names mapped to the lists
        of candidate values to try.
    """
    grid = {}
    grid['n_estimators'] = [10, 50, 100, 200, 400, 500]
    grid['bootstrap'] = [True, False]
    grid['max_features'] = [6, 8, 10, 12, 14]
    grid['max_depth'] = [2]
    grid['min_samples_leaf'] = [1, 5, 10, 25, 50]
    return grid

def runRandomForest(random_grid, x_train, y_train):
    """Grid-search a random forest and return the best estimator found.

    Args:
        random_grid: Parameter grid handed to ``GridSearchCV``.
        x_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the completed search.
    """
    base_forest = RandomForestClassifier(random_state=42)
    forest_search = GridSearchCV(
        base_forest,
        random_grid,
        n_jobs=-1,
        scoring='roc_auc',
        cv=5,
    ).fit(x_train, y_train)

    # Selected parameters first, then the cross-validated score they achieved.
    print(forest_search.best_params_)
    print(forest_search.best_score_)
    return forest_search.best_estimator_

# Tune the random forest on the training split. GridSearchCV refits the winning
# configuration on all of x_train by default (refit=True), so the returned
# estimator is already trained and needs no additional fit() call.
param_set = createRandomGrid()
best_model = runRandomForest(param_set, x_train, y_train)

# Persist the tuned model; the context manager guarantees the file is flushed
# and closed even if pickling fails.
with open("RandomForestModel.pkl", 'wb') as model_file:
    pickle.dump(best_model, model_file)

In [None]:
# rfc = RandomForestClassifier()
# rfc.fit(x_train, y_train)
# y_pred_prob = rfc.predict_proba(x_test)[:, 1]

In [None]:
# for i in np.round(np.linspace(0.05, 0.95, 19), 3):
#     print('#########################')
#     print('Threshold =', i)
#     yp = np.where(y_pred_prob >= i, 1, 0)
#     print(classification_report(y_test, yp), confusion_matrix(y_test, yp), roc_auc_score(y_test, yp))
#     print('#########################')

### Model 4: XGBoost

In [None]:
def createRandomGrid():
    """Return the hyper-parameter grid for the XGBoost search.

    Returns:
        dict: ``XGBClassifier`` parameter names mapped to the lists of
        candidate values to try.
    """
    fractions = [0.6, 0.8, 1.0]
    return dict(
        min_child_weight=[1, 5, 10],
        gamma=[0.5, 1, 1.5, 2, 5],
        subsample=list(fractions),
        colsample_bytree=list(fractions),
        max_depth=[3, 4, 5],
    )

def runXGB(random_grid, x_train, y_train):
    """Grid-search an XGBoost classifier and return the best estimator found.

    Args:
        random_grid: Parameter grid handed to ``GridSearchCV``.
        x_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the completed search.
    """
    # `silent` and `nthread` are deprecated in the XGBoost scikit-learn wrapper;
    # `verbosity=0` and `n_jobs=1` are their documented replacements.
    # Each booster stays single-threaded because the grid search below already
    # runs 4 fits in parallel.
    xgb = XGBClassifier(learning_rate=0.02, n_estimators=600, objective='binary:logistic',
                        verbosity=0, n_jobs=1)

    grid_search = GridSearchCV(xgb, random_grid, cv=5, scoring='roc_auc', n_jobs = 4, verbose=3)
    grid_search.fit(x_train, y_train)

    # Report the winning parameter combination and its cross-validated score.
    print(grid_search.best_params_)
    print(grid_search.best_score_)
    return grid_search.best_estimator_


# Tune XGBoost on the training split. GridSearchCV refits the winning
# configuration on all of x_train by default (refit=True), so the returned
# estimator is already trained and needs no additional fit() call.
param_set = createRandomGrid()
best_model = runXGB(param_set, x_train, y_train)

# Persist the tuned model; the context manager guarantees the file is flushed
# and closed even if pickling fails.
with open("XGBModel.pkl", 'wb') as model_file:
    pickle.dump(best_model, model_file)

### Model 5: CatBoost

In [None]:
def createRandomGrid():
    """Return the hyper-parameter grid for the CatBoost search.

    Returns:
        dict: ``CatBoostClassifier`` parameter names mapped to the lists of
        candidate values to try.
    """
    learning_rates = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    tree_depths = list(range(1, 8))
    l2_strengths = list(range(1, 11))
    return {
        'learning_rate': learning_rates,
        'depth': tree_depths,
        'l2_leaf_reg': l2_strengths,
    }

def runCat(random_grid, x_train, y_train):
    """Grid-search a CatBoost classifier and return the best estimator found.

    Args:
        random_grid: Parameter grid handed to ``GridSearchCV``.
        x_train: Training features.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` of the completed search.
    """
    cat_search = GridSearchCV(
        estimator=cb.CatBoostClassifier(silent=True),
        param_grid=random_grid,
        scoring='roc_auc',
        cv=5,
        n_jobs=4,
        verbose=3,
    )
    cat_search.fit(x_train, y_train)

    # Selected parameters first, then the cross-validated score they achieved.
    chosen_params = cat_search.best_params_
    print(chosen_params)
    print(cat_search.best_score_)
    return cat_search.best_estimator_


# Tune CatBoost on the training split. GridSearchCV refits the winning
# configuration on all of x_train by default (refit=True), so the returned
# estimator is already trained and needs no additional fit() call.
param_set = createRandomGrid()
best_model = runCat(param_set, x_train, y_train)

# Persist the tuned model; the context manager guarantees the file is flushed
# and closed even if pickling fails.
with open("CatModel.pkl", 'wb') as model_file:
    pickle.dump(best_model, model_file)

### Model 6: Voting Classifier

In [None]:
# Reload the tuned base models saved by the cells above. Each file is opened in
# a context manager so its handle is closed as soon as the model is read.
# NOTE: pickle.load executes arbitrary code; only load files this notebook wrote.
with open("RandomForestModel.pkl", 'rb') as model_file:
    loaded_rf_model = pickle.load(model_file)
with open("KNeighbors.pkl", 'rb') as model_file:
    loaded_kn_model = pickle.load(model_file)
with open("DecisionTreeModel.pkl", 'rb') as model_file:
    loaded_dt_model = pickle.load(model_file)
with open("XGBModel.pkl", 'rb') as model_file:
    loaded_xg_model = pickle.load(model_file)
with open('CatModel.pkl', 'rb') as model_file:
    loaded_cb_model = pickle.load(model_file)

# Combine the five tuned models into one ensemble. No `voting` argument is
# passed, so scikit-learn's default voting scheme applies.
vtr = VotingClassifier([('rf', loaded_rf_model), ('kn', loaded_kn_model), ('dt', loaded_dt_model), ('xg', loaded_xg_model),
                        ('cb', loaded_cb_model)])
vtr.fit(x_train,y_train)

with open("VotingClassifierModel.pkl", 'wb') as model_file:
    pickle.dump(vtr, model_file)

### Model Evaluation

Here we will load the optimal models we created and run them on our test set. For each model we calculate the following metrics:

- ROC AUC Score (train, cross-validated, and test)
- Brier Score
- Precision, Recall, and F1 Score
- Confusion matrix counts

In [None]:
#Loading Models

# Each pickle is read inside a context manager so the file handle is closed
# immediately instead of lingering until garbage collection.
# NOTE: pickle.load executes arbitrary code; only load files this notebook wrote.
with open("RandomForestModel.pkl", 'rb') as model_file:
    rf_model = pickle.load(model_file)
with open("KNeighbors.pkl", 'rb') as model_file:
    kn_model = pickle.load(model_file)
with open("DecisionTreeModel.pkl", 'rb') as model_file:
    dt_model = pickle.load(model_file)
with open("VotingClassifierModel.pkl", 'rb') as model_file:
    vtr_model = pickle.load(model_file)
with open("XGBModel.pkl", 'rb') as model_file:
    xg_model = pickle.load(model_file)
with open('CatModel.pkl', 'rb') as model_file:
    cb_model = pickle.load(model_file)

#x_train = pickle.load(open("SplitTrainX_V2.pkl", 'rb'))
#x_test = pickle.load(open("SplitTestX_V2.pkl", 'rb'))
#y_train = pickle.load(open("SplitTrainY_V2.pkl", 'rb'))
#y_test = pickle.load(open("SplitTestY_V2.pkl", 'rb'))

In [None]:
def brier_score(y_test, y_pred):
    """Mean squared difference between predicted probabilities and outcomes.

    Args:
        y_test: True outcomes; any array-like (list, ndarray, pandas Series).
        y_pred: Predicted probabilities, same length as ``y_test``.

    Returns:
        float: The mean of ``(y_pred - y_test) ** 2``, or ``nan`` when the
        inputs are empty.

    Raises:
        ValueError: If the two inputs do not have the same number of elements.
    """
    # Convert to plain arrays first: this accepts lists, and it makes pandas
    # objects compare by position rather than being aligned on index labels
    # (the labels are shuffled after a train/test split).
    actual = np.asarray(y_test, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()

    if actual.shape != predicted.shape:
        raise ValueError(
            "y_test and y_pred must have the same length, got "
            f"{actual.size} and {predicted.size}"
        )
    if predicted.size == 0:
        # Nothing to average; avoid dividing by zero.
        return float("nan")

    return float(np.mean((predicted - actual) ** 2))

In [None]:
#Looping through models and running them on test set

model_list = [rf_model, kn_model, dt_model, vtr_model,xg_model, cb_model]
model_names = ["RandomForest", "KNearestNeighbors", "DecisionTree", "VotingClassifier", "XGBoost", 'CatBoost']


def _positive_class_scores(model, features):
    """Return the second predict_proba column, or None if the model has no predict_proba."""
    predict_proba = getattr(model, "predict_proba", None)
    if predict_proba is None:
        return None
    return predict_proba(features)[:, 1]


roc_auc_list = []
results_df_list = []

# Converted once, outside the loop, so the global y_test is never rebound
# while the models are being evaluated.
y_test_actual = np.asarray(y_test)

# zip() keeps names and models paired without a hard-coded model count.
for model_name, model in zip(model_names, model_list):
    y_test_predictions = np.asarray(model.predict(x_test))
    y_train_predictions = model.predict(x_train)

    # ROC AUC measures how well scores rank the classes, so it is computed from
    # predicted probabilities (matching the 'roc_auc' scorer used for the CV
    # column). Models without predict_proba fall back to their hard labels.
    test_scores = _positive_class_scores(model, x_test)
    train_scores = _positive_class_scores(model, x_train)
    roc_auc_test = roc_auc_score(y_test, y_test_predictions if test_scores is None else test_scores)
    roc_auc_train = roc_auc_score(y_train, y_train_predictions if train_scores is None else train_scores)

    # NOTE(review): the 'roc_auc' scorer needs a probability or decision
    # output; for a model exposing neither, this column may come out as NaN
    # -- confirm for the voting ensemble.
    cv_scores = cross_val_score(model, x_train, y_train, cv=10, scoring='roc_auc', n_jobs=-1)
    roc_auc_cv = cv_scores.mean()

    f1 = f1_score(y_test, y_test_predictions)

    # The Brier score is only defined for probabilistic output.
    brier = np.nan if test_scores is None else brier_score(y_test, test_scores)

    precision_test = precision_score(y_test, y_test_predictions)
    recall_test = recall_score(y_test, y_test_predictions)

    # Flattened row by row: rows are the true class, columns the predicted one.
    cf = confusion_matrix(y_test, y_test_predictions).flatten()

    roc_auc_list.append([model_name, roc_auc_train, roc_auc_cv, roc_auc_test, f1, brier, precision_test, recall_test,
                         cf[0], cf[1], cf[2], cf[3]])

    # Keep the per-row actual vs. predicted labels for later inspection.
    dfi = pd.DataFrame({'Actual IR': y_test_actual, 'Predicted IR': y_test_predictions})
    results_df_list.append(dfi)

In [None]:
# Assemble the per-model metrics into one table, best test ROC AUC first.
# The last four values of each row come from confusion_matrix(...).flatten(),
# whose order is TN, FP, FN, TP (rows = actual class, columns = predicted
# class). The FP and FN labels were previously swapped.
roc_auc_df = pd.DataFrame(roc_auc_list, columns = ["Model Name", "Train ROC AUC", "CV ROC AUC", "Test ROC AUC",
                                                   'Test F1 Score', 'Brier Score', 'Test Precision', 'Test Recall',
                                                   'Test TN', 'Test FP', 'Test FN', 'Test TP'])
roc_auc_df.sort_values(by=['Test ROC AUC'], ascending = False)

In [None]:
# Keep only the second predict_proba column of the XGBoost model on the
# held-out split; the threshold sweep below works on these values.
xgb_class_probabilities = xg_model.predict_proba(x_test)
y_pred_prob = xgb_class_probabilities[:, 1]

In [None]:
# Sweep the decision threshold from 0.05 to 0.95 in 19 steps and print the
# resulting classification metrics for each cut-off.
candidate_thresholds = np.round(np.linspace(0.05, 0.95, 19), 3)
for threshold in candidate_thresholds:
    print('#########################')
    print('Threshold =', threshold)
    # Label a row 1 when its predicted probability reaches the threshold.
    thresholded_labels = np.where(y_pred_prob >= threshold, 1, 0)
    print(
        classification_report(y_test, thresholded_labels),
        confusion_matrix(y_test, thresholded_labels),
        roc_auc_score(y_test, thresholded_labels),
        recall_score(y_test, thresholded_labels),
    )
    print('#########################')