In [39]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.feature_extraction.text import CountVectorizer


from sklearn import metrics
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import (
    Ridge,RidgeCV,
    Lasso,LassoCV,
    ElasticNet, ElasticNetCV,
    LinearRegression
)
from sklearn.model_selection import(
    cross_val_score,
    train_test_split
)
from sklearn.preprocessing import (
    StandardScaler,
    PolynomialFeatures
)
# pd.set_option('display.max_rows', 1000)  # or 1000
pd.set_option('display.max_columns', 50)  # or 1000

from ipywidgets import *
from IPython.display import display

from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import ClusterCentroids
from imblearn.pipeline import Pipeline, make_pipeline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.inspection import permutation_importance



In [40]:
merged_train = pd.read_csv('../data/merged_train.csv')
merged_test = pd.read_csv('../data/merged_test.csv')

In [41]:
merged_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8475 entries, 0 to 8474
Data columns (total 46 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   date                    8475 non-null   object 
 1   address                 8475 non-null   object 
 2   species                 8475 non-null   object 
 3   block                   8475 non-null   int64  
 4   street                  8475 non-null   object 
 5   trap                    8475 non-null   object 
 6   addressnumberandstreet  8475 non-null   object 
 7   latitude                8475 non-null   float64
 8   longitude               8475 non-null   float64
 9   addressaccuracy         8475 non-null   int64  
 10  nummosquitos            8475 non-null   float64
 11  wnvcount                8475 non-null   float64
 12  wnvpresent              8475 non-null   float64
 13  station                 8475 non-null   int64  
 14  tmax                    8475 non-null   

In [42]:
merged_train['date'].dtypes !='O'

False

In [43]:
merged_train = pd.get_dummies(merged_train, columns = ['species'], drop_first = True)
merged_test = pd.get_dummies(merged_test, columns = ['species'], drop_first = True)

In [44]:
X = merged_train[[col for col in merged_train.columns if (merged_train[col].dtypes !='O') & (col not in ['block','addressaccuracy','nummosquitos','wnvcount','wnvpresent', 'station','species_CULEX SALINARIUS',
 'species_CULEX TARSALIS',
 'species_CULEX TERRITANS'])]]
y = merged_train['wnvpresent']

In [45]:
# Splitting X and y into training and testing datasets.

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [46]:
# Scaling X_train to the standard scale.

ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)

In [47]:
# Transforming X_test to the same scale.

X_test_sc = ss.transform(X_test)

### Class Balancing Techniques

In [48]:
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)
clustercentroids = ClusterCentroids(random_state=42)

In [49]:
X_train_sc_smote, y_train_smote = smote.fit_resample(X_train_sc, y_train)
X_train_sc_adasyn, y_train_adasyn = adasyn.fit_resample(X_train_sc, y_train)
X_train_sc_clustercentroids, y_train_clustercentroids = clustercentroids.fit_resample(X_train_sc, y_train)

In [50]:
# Checking the distribution of classes with SMOTE balancing technique.

y_train_smote.value_counts()

0.0    6013
1.0    6013
Name: wnvpresent, dtype: int64

In [51]:
# Checking the distribution of classes with SMOTE balancing technique.

y_train_smote.value_counts(normalize=True)

0.0    0.5
1.0    0.5
Name: wnvpresent, dtype: float64

In [52]:
# Checking the distribution of classes with ADASYN balancing technique.

y_train_adasyn.value_counts()

1.0    6064
0.0    6013
Name: wnvpresent, dtype: int64

In [53]:
# Checking the distribution of classes with ADASYN balancing technique.

y_train_adasyn.value_counts(normalize=True)

1.0    0.502111
0.0    0.497889
Name: wnvpresent, dtype: float64

In [54]:
# Checking the distribution of classes with ClusterCentroids balancing technique.

y_train_clustercentroids.value_counts()

0.0    343
1.0    343
Name: wnvpresent, dtype: int64

In [55]:
# Checking the distribution of classes with ClusterCentroids balancing technique.

y_train_clustercentroids.value_counts(normalize=True)

0.0    0.5
1.0    0.5
Name: wnvpresent, dtype: float64

As explained above, all three balancing techniques from the *imblearn* library help to balance the proportions of the classes in our sample population.

***SMOTE* and *ADASYN* are over-sampling techniques**, which means they create copies of the minority class data points with small variations, making the synthetic samples more diverse (as explained [here](https://www.kaggle.com/rafjaa/resampling-strategies-for-imbalanced-datasets)). So, as shown above, **the number of data points in the minority class increases to match the number of data points in the majority class (5885)**.

***ClusterCentroids* is an under-sampling technique**, which means it gets rid of data from the majority class in order to balance the two classes. It does so by finding clusters of data points in the majority class, and then inferring which data points in the majority class are 'central' in that cluster. The model then uses those centroids (central points) for the majority class instead of all the actual data points (as explained [here](https://dev.to/lberlin/balancing-the-imbalanced-2bgo)). So, as shown above, **the number of data points in the majority class decreases significantly to match the number of data points in the minority class (343)**.

## Classification Modelling

### *GridSearchCV* for *LogisticRegression* with *SMOTE* balancing technique

In [56]:
# Creating a pipeline object using imblearn.pipeline with SMOTE and LogisticRegression.

pipe1 = make_pipeline(SMOTE(random_state=42),
                      LogisticRegression(penalty='elasticnet', solver='saga', random_state=42, max_iter=500, tol=0.005)
                     )

In [57]:
pipe1.get_params()

{'memory': None,
 'steps': [('smote', SMOTE(random_state=42)),
  ('logisticregression',
   LogisticRegression(max_iter=500, penalty='elasticnet', random_state=42,
                      solver='saga', tol=0.005))],
 'verbose': False,
 'smote': SMOTE(random_state=42),
 'logisticregression': LogisticRegression(max_iter=500, penalty='elasticnet', random_state=42,
                    solver='saga', tol=0.005),
 'smote__k_neighbors': 5,
 'smote__n_jobs': None,
 'smote__random_state': 42,
 'smote__sampling_strategy': 'auto',
 'logisticregression__C': 1.0,
 'logisticregression__class_weight': None,
 'logisticregression__dual': False,
 'logisticregression__fit_intercept': True,
 'logisticregression__intercept_scaling': 1,
 'logisticregression__l1_ratio': None,
 'logisticregression__max_iter': 500,
 'logisticregression__multi_class': 'auto',
 'logisticregression__n_jobs': None,
 'logisticregression__penalty': 'elasticnet',
 'logisticregression__random_state': 42,
 'logisticregression__solver': '

In [58]:
# Establishing ranges of hyperparameters C and l1_ratio for GridSearchCV.

pipe1_params = {'logisticregression__C': np.logspace(-2, 2, 5),
                'logisticregression__l1_ratio': [0, 0.25, 0.5, 0.75, 1]
               }

In [59]:
# Creating a GridSearchCV object for the pipeline object defined above.

gs_pipe1 = GridSearchCV(pipe1, 
                        param_grid=pipe1_params, 
                        cv=5, 
                        scoring='roc_auc'
                       )

In [60]:
# Fitting GridSearchCV with SMOTE and LogisticRegression on X_train_sc and y_train.

gs_pipe1.fit(X_train_sc, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('smote', SMOTE(random_state=42)),
                                       ('logisticregression',
                                        LogisticRegression(max_iter=500,
                                                           penalty='elasticnet',
                                                           random_state=42,
                                                           solver='saga',
                                                           tol=0.005))]),
             param_grid={'logisticregression__C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'logisticregression__l1_ratio': [0, 0.25, 0.5, 0.75,
                                                          1]},
             scoring='roc_auc')

In [61]:
# Best combination of hyperparameters suggested by GridSearchCV.

gs_pipe1.best_params_

{'logisticregression__C': 100.0, 'logisticregression__l1_ratio': 1}

In [62]:
# Best roc_auc score obtained by above combination of hyperparameters.

gs_pipe1.best_score_

0.8204137876470015

In [63]:
# Scoring the model on training dataset (roc_auc score).

gs_pipe1.score(X_train_sc, y_train)

0.8364471730104697

In [64]:
# Scoring the model on testing dataset (roc_auc score).

gs_pipe1.score(X_test_sc, y_test)

0.7941549634685217

Even though the best parameters were suggested above by *GridSearchCV*, the model was further tuned manually using different combinations of hyperparameters. ***GridSearchCV* only optimizes the model using a single scoring parameter (set as *'roc_auc'* above). However, in this context, we need to find a good balance between *accuracy* and *sensitivity* scores, and so a manual tuning of hyperparameters is required.**

The final set of hyperparameters chosen after manual tuning of the model are as shown below.

In [65]:
# Creating a pipeline object using imblearn.pipeline with SMOTE and LogisticRegression using best params suggested by GridSearchCV above.

lr_smote_pipe1 = make_pipeline(SMOTE(random_state=42),
                               LogisticRegression(penalty='elasticnet',
                                                  solver='saga',
                                                  C=100,
                                                  l1_ratio=0.75,
                                                  random_state=42,
                                                  max_iter=500,
                                                  tol=0.005
                                                 )
                              )

In [66]:
# Fitting X_train_sc and y_train on the pipeline object defined above.

lr_smote_pipe1.fit(X_train_sc, y_train)

Pipeline(steps=[('smote', SMOTE(random_state=42)),
                ('logisticregression',
                 LogisticRegression(C=100, l1_ratio=0.75, max_iter=500,
                                    penalty='elasticnet', random_state=42,
                                    solver='saga', tol=0.005))])

In [67]:
# Scoring the model on training dataset.
# Training Accuracy

train_acc1 = lr_smote_pipe1.score(X_train_sc, y_train)
train_acc1

0.6875393329137822

In [68]:
# Estimated Testing Accuracy

est_test_acc1 = cross_val_score(lr_smote_pipe1, X_train_sc, y_train, cv=5).mean()
est_test_acc1

0.6914715793536511

In [69]:
# Actual Testing Accuracy

test_acc1 = lr_smote_pipe1.score(X_test_sc, y_test)
test_acc1

0.680509674374705

In [70]:
# Generating predictions on testing dataset using the model above.

y_pred1 = lr_smote_pipe1.predict(X_test_sc)

In [71]:
# Generating prediction probabilities on testing dataset using the model above.

y_pred_proba1 = lr_smote_pipe1.predict_proba(X_test_sc)

In [72]:
# Generating a confusion matrix.

tn, fp, fn, tp = confusion_matrix(y_test, y_pred1).ravel()

In [73]:
conf_mat1 = pd.DataFrame(columns=['Actual WnvPresent=1', 'Actual WnvPresent=0'], 
                         index=['Predicted WnvPresent=1', 'Predicted WnvPresent=0'],
                         data=[[f'{tp} (True Pos)', f'{fp} (False Pos)'], [f'{fn} (False Neg)', f'{tn} (True Neg)']]
                        )
conf_mat1

Unnamed: 0,Actual WnvPresent=1,Actual WnvPresent=0
Predicted WnvPresent=1,93 (True Pos),656 (False Pos)
Predicted WnvPresent=0,21 (False Neg),1349 (True Neg)


In [74]:
sensitivity1 = tp/(tp+fn)
sensitivity1

0.8157894736842105

In [75]:
specificity1 = tn/(tn+fp)
specificity1

0.6728179551122194

In [76]:
precision1 = tp/(tp+fp)
precision1

0.12416555407209613

In [77]:
roc_auc1 = roc_auc_score(y_test, y_pred_proba1[:, 1])
roc_auc1

0.7941637135232096

In [78]:
# Generating prediction probabilities on actual testing dataset using the model above (for kaggle submission).

submission_pred_proba1 = lr_smote_pipe1.predict_proba(act_test_X_sc)[:, 1]

NameError: name 'act_test_X_sc' is not defined

In [None]:
# Putting the above predicted probabilities into a dataframe and exporting it as a csv file for submission to kaggle.

sub1 = pd.DataFrame({'Id' : test['Id'].values, 'WnvPresent' : submission_pred_proba1})
sub1.to_csv('../kaggle_submissions/submission_1_logreg_smote.csv', index=False)

In [None]:
summary_df = pd.DataFrame(columns=['Classifier',
                                   'Class Balancing Technique',
                                   'Train Accuracy',
                                   'Est. Test Accuracy (cv=5)',
                                   'Actual Test Accuracy',
                                   'Overfit / Underfit',
                                   'Sensitivity',
                                   'Specificity',
                                   'Precision',
                                   'ROC-AUC',
                                   'Kaggle ROC-AUC'
                                  ])

In [None]:
summary_df.loc[1] = ["LogisticRegression",
                     "SMOTE",
                     round(train_acc1, 3),
                     round(est_test_acc1, 3),
                     round(test_acc1, 3),
                     round(train_acc1-test_acc1, 3),
                     round(sensitivity1, 3),
                     round(specificity1, 3),
                     round(precision1, 3),
                     round(roc_auc1, 3),
                     0.661
                    ]

summary_df

Unnamed: 0,Classifier,Class Balancing Technique,Train Accuracy,Est. Test Accuracy (cv=5),Actual Test Accuracy,Overfit / Underfit,Sensitivity,Specificity,Precision,ROC-AUC,Kaggle ROC-AUC
1,LogisticRegression,SMOTE,0.707,0.698,0.712,-0.004,0.851,0.704,0.143,0.858,0.661


### *RandomForestClassifier* with *SMOTE* balancing technique

Using a similar approach as with *LogisticRegression* above, manual tweaking of hyperparameters was done after using *GridSearchCV*. As explained above, ***GridSearchCV* only optimizes the model using a single scoring parameter. However, in this context, we need to find a good balance between *accuracy* and *sensitivity* scores, and so a manual tuning of hyperparameters is required**.

The final set of hyperparameters chosen after manual tuning of the model are as shown below.

In [None]:
# Creating a pipeline object using imblearn.pipeline with SMOTE and RandomForestClassifier.

rfc_smote_pipe2 = make_pipeline(SMOTE(random_state=42),
                                RandomForestClassifier(n_estimators=100,
                                                       ccp_alpha=0,
                                                       max_depth=5,
                                                       min_samples_split=2,
                                                       min_samples_leaf=3,
                                                       random_state=42
                                                      )
                               )

In [None]:
# Fitting X_train_sc and y_train on the pipeline object defined above.

rfc_smote_pipe2.fit(X_train_sc, y_train)

Pipeline(memory=None,
         steps=[('smote',
                 SMOTE(k_neighbors=5, n_jobs=None, random_state=42,
                       sampling_strategy='auto')),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0,
                                        class_weight=None, criterion='gini',
                                        max_depth=5, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=3, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=42,
                                        verbose=0, warm_start=F

In [None]:
# Scoring the model on training dataset.
# Training Accuracy

train_acc2 = rfc_smote_pipe2.score(X_train_sc, y_train)
train_acc2

0.7784200385356455

In [None]:
# Estimated Testing Accuracy

est_test_acc2 = cross_val_score(rfc_smote_pipe2, X_train_sc, y_train, cv=5).mean()
est_test_acc2

0.7681431343350933

In [None]:
# Actual Testing Accuracy

test_acc2 = rfc_smote_pipe2.score(X_test_sc, y_test)
test_acc2

0.7750481695568401

In [None]:
# Generating predictions on testing dataset using the model above.

y_pred2 = rfc_smote_pipe2.predict(X_test_sc)

In [None]:
# Generating prediction probabilities on testing dataset using the model above.

y_pred_proba2 = rfc_smote_pipe2.predict_proba(X_test_sc)

In [None]:
# Generating a confusion matrix.

tn, fp, fn, tp = confusion_matrix(y_test, y_pred2).ravel()

In [None]:
conf_mat2 = pd.DataFrame(columns=['Actual WnvPresent=1', 'Actual WnvPresent=0'], 
                         index=['Predicted WnvPresent=1', 'Predicted WnvPresent=0'],
                         data=[[f'{tp} (True Pos)', f'{fp} (False Pos)'], [f'{fn} (False Neg)', f'{tn} (True Neg)']]
                        )
conf_mat2

Unnamed: 0,Actual WnvPresent=1,Actual WnvPresent=0
Predicted WnvPresent=1,97 (True Pos),450 (False Pos)
Predicted WnvPresent=0,17 (False Neg),1512 (True Neg)


In [None]:
sensitivity2 = tp/(tp+fn)
sensitivity2

0.8508771929824561

In [None]:
specificity2 = tn/(tn+fp)
specificity2

0.7706422018348624

In [None]:
precision2 = tp/(tp+fp)
precision2

0.1773308957952468

In [None]:
roc_auc2 = roc_auc_score(y_test, y_pred_proba2[:, 1])
roc_auc2

0.8722369762326305

In [None]:
# Generating prediction probabilities on actual testing dataset using the model above (for kaggle submission).

submission_pred_proba2 = rfc_smote_pipe2.predict_proba(act_test_X_sc)[:, 1]

In [None]:
# Putting the above predicted probabilities into a dataframe and exporting it as a csv file for submission to kaggle.

sub2 = pd.DataFrame({'Id' : test['Id'].values, 'WnvPresent' : submission_pred_proba2})
sub2.to_csv('../kaggle_submissions/submission_2_rfc_smote.csv', index=False)

In [None]:
summary_df.loc[2] = ["RandomForestClassifier",
                     "SMOTE",
                     round(train_acc2, 3),
                     round(est_test_acc2, 3),
                     round(test_acc2, 3),
                     round(train_acc2-test_acc2, 3),
                     round(sensitivity2, 3),
                     round(specificity2, 3),
                     round(precision2, 3),
                     round(roc_auc2, 3),
                     0.706
                    ]

summary_df

Unnamed: 0,Classifier,Class Balancing Technique,Train Accuracy,Est. Test Accuracy (cv=5),Actual Test Accuracy,Overfit / Underfit,Sensitivity,Specificity,Precision,ROC-AUC,Kaggle ROC-AUC
1,LogisticRegression,SMOTE,0.707,0.698,0.712,-0.004,0.851,0.704,0.143,0.858,0.661
2,RandomForestClassifier,SMOTE,0.778,0.768,0.775,0.003,0.851,0.771,0.177,0.872,0.706


### *SVC* with *SMOTE* balancing technique

Using a similar approach as above, manual tweaking of hyperparameters was done after using *GridSearchCV*. As explained above, ***GridSearchCV* only optimizes the model using a single scoring parameter. However, in this context, we need to find a good balance between *accuracy* and *sensitivity* scores, and so a manual tuning of hyperparameters is required**.

The final set of hyperparameters chosen after manual tuning of the model are as shown below.

In [None]:
# Creating a pipeline object using imblearn.pipeline with SMOTE and SVC.

svc_smote_pipe3 = make_pipeline(SMOTE(random_state=42),
                                SVC(C=0.1,
                                    kernel='rbf',
                                    probability=True, 
                                    random_state=42
                                   )
                               )

In [None]:
# Fitting X_train_sc and y_train on the pipeline object defined above.

svc_smote_pipe3.fit(X_train_sc, y_train)

Pipeline(memory=None,
         steps=[('smote',
                 SMOTE(k_neighbors=5, n_jobs=None, random_state=42,
                       sampling_strategy='auto')),
                ('svc',
                 SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1, probability=True,
                     random_state=42, shrinking=True, tol=0.001,
                     verbose=False))],
         verbose=False)

In [None]:
# Scoring the model on training dataset.
# Training Accuracy

train_acc3 = svc_smote_pipe3.score(X_train_sc, y_train)
train_acc3

0.7032755298651252

In [None]:
# Estimated Testing Accuracy

est_test_acc3 = cross_val_score(svc_smote_pipe3, X_train_sc, y_train, cv=5).mean()
est_test_acc3

0.6931616030736106

In [None]:
# Actual Testing Accuracy

test_acc3 = svc_smote_pipe3.score(X_test_sc, y_test)
test_acc3

0.6936416184971098

In [None]:
# Generating predictions on testing dataset using the model above.

y_pred3 = svc_smote_pipe3.predict(X_test_sc)

In [None]:
# Generating prediction probabilities on testing dataset using the model above.

y_pred_proba3 = svc_smote_pipe3.predict_proba(X_test_sc)

In [None]:
# Generating a confusion matrix.

tn, fp, fn, tp = confusion_matrix(y_test, y_pred3).ravel()

In [None]:
conf_mat3 = pd.DataFrame(columns=['Actual WnvPresent=1', 'Actual WnvPresent=0'], 
                         index=['Predicted WnvPresent=1', 'Predicted WnvPresent=0'],
                         data=[[f'{tp} (True Pos)', f'{fp} (False Pos)'], [f'{fn} (False Neg)', f'{tn} (True Neg)']]
                        )
conf_mat3

Unnamed: 0,Actual WnvPresent=1,Actual WnvPresent=0
Predicted WnvPresent=1,98 (True Pos),620 (False Pos)
Predicted WnvPresent=0,16 (False Neg),1342 (True Neg)


In [None]:
sensitivity3 = tp/(tp+fn)
sensitivity3

0.8596491228070176

In [None]:
specificity3 = tn/(tn+fp)
specificity3

0.6839959225280327

In [None]:
precision3 = tp/(tp+fp)
precision3

0.13649025069637882

In [None]:
roc_auc3 = roc_auc_score(y_test, y_pred_proba3[:, 1])
roc_auc3

0.8443451901926069

In [None]:
# Generating prediction probabilities on actual testing dataset using the model above (for kaggle submission).

submission_pred_proba3 = svc_smote_pipe3.predict_proba(act_test_X_sc)[:, 1]

In [None]:
# Putting the above predicted probabilities into a dataframe and exporting it as a csv file for submission to kaggle.

sub3 = pd.DataFrame({'Id' : test['Id'].values, 'WnvPresent' : submission_pred_proba3})
sub3.to_csv('../kaggle_submissions/submission_3_svc_smote.csv', index=False)

In [None]:
summary_df.loc[3] = ["SVC",
                     "SMOTE",
                     round(train_acc3, 3),
                     round(est_test_acc3, 3),
                     round(test_acc3, 3),
                     round(train_acc3-test_acc3, 3),
                     round(sensitivity3, 3),
                     round(specificity3, 3),
                     round(precision3, 3),
                     round(roc_auc3, 3),
                     0.675
                    ]

summary_df

Unnamed: 0,Classifier,Class Balancing Technique,Train Accuracy,Est. Test Accuracy (cv=5),Actual Test Accuracy,Overfit / Underfit,Sensitivity,Specificity,Precision,ROC-AUC,Kaggle ROC-AUC
1,LogisticRegression,SMOTE,0.707,0.698,0.712,-0.004,0.851,0.704,0.143,0.858,0.661
2,RandomForestClassifier,SMOTE,0.778,0.768,0.775,0.003,0.851,0.771,0.177,0.872,0.706
3,SVC,SMOTE,0.703,0.693,0.694,0.01,0.86,0.684,0.136,0.844,0.675


### *GradientBoostingClassifier* with *SMOTE* balancing technique

Using a similar approach as above, manual tweaking of hyperparameters was done after using *GridSearchCV*. As explained above, ***GridSearchCV* only optimizes the model using a single scoring parameter. However, in this context, we need to find a good balance between *accuracy* and *sensitivity* scores, and so a manual tuning of hyperparameters is required**.

The final set of hyperparameters chosen after manual tuning of the model are as shown below.

In [None]:
# Creating a pipeline object using imblearn.pipeline with SMOTE and GradientBoostingClassifier.

grb_smote_pipe4 = make_pipeline(SMOTE(random_state=42),
                                GradientBoostingClassifier(learning_rate=0.01,
                                                           n_estimators=80,
                                                           ccp_alpha=0,
                                                           max_depth=5,
                                                           min_samples_split=2,
                                                           min_samples_leaf=1,
                                                           subsample=0.2,
                                                           random_state=42
                                                          )
                               )

In [None]:
# Fitting X_train_sc and y_train on the pipeline object defined above.

grb_smote_pipe4.fit(X_train_sc, y_train)

Pipeline(memory=None,
         steps=[('smote',
                 SMOTE(k_neighbors=5, n_jobs=None, random_state=42,
                       sampling_strategy='auto')),
                ('gradientboostingclassifier',
                 GradientBoostingClassifier(ccp_alpha=0,
                                            criterion='friedman_mse', init=None,
                                            learning_rate=0.01, loss='deviance',
                                            max_depth=5, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=80,
                      

In [None]:
# Scoring the model on training dataset.
# Training Accuracy

train_acc4 = grb_smote_pipe4.score(X_train_sc, y_train)
train_acc4

0.7800256904303147

In [None]:
# Estimated Testing Accuracy

est_test_acc4 = cross_val_score(grb_smote_pipe4, X_train_sc, y_train, cv=5).mean()
est_test_acc4

0.7782585881245689

In [None]:
# Actual Testing Accuracy

test_acc4 = grb_smote_pipe4.score(X_test_sc, y_test)
test_acc4

0.7649325626204239

In [None]:
# Generating predictions on testing dataset using the model above.

y_pred4 = grb_smote_pipe4.predict(X_test_sc)

In [None]:
# Generating prediction probabilities on testing dataset using the model above.

y_pred_proba4 = grb_smote_pipe4.predict_proba(X_test_sc)

In [None]:
# Generating a confusion matrix.

tn, fp, fn, tp = confusion_matrix(y_test, y_pred4).ravel()

In [None]:
conf_mat4 = pd.DataFrame(columns=['Actual WnvPresent=1', 'Actual WnvPresent=0'], 
                         index=['Predicted WnvPresent=1', 'Predicted WnvPresent=0'],
                         data=[[f'{tp} (True Pos)', f'{fp} (False Pos)'], [f'{fn} (False Neg)', f'{tn} (True Neg)']]
                        )
conf_mat4

Unnamed: 0,Actual WnvPresent=1,Actual WnvPresent=0
Predicted WnvPresent=1,95 (True Pos),469 (False Pos)
Predicted WnvPresent=0,19 (False Neg),1493 (True Neg)


In [None]:
sensitivity4 = tp/(tp+fn)
sensitivity4

0.8333333333333334

In [None]:
specificity4 = tn/(tn+fp)
specificity4

0.7609582059123343

In [None]:
precision4 = tp/(tp+fp)
precision4

0.16843971631205673

In [None]:
roc_auc4 = roc_auc_score(y_test, y_pred_proba4[:, 1])
roc_auc4

0.8709247634887423

In [None]:
# Generating prediction probabilities on actual testing dataset using the model above (for kaggle submission).

submission_pred_proba4 = grb_smote_pipe4.predict_proba(act_test_X_sc)[:, 1]

In [None]:
# Putting the above predicted probabilities into a dataframe and exporting it as a csv file for submission to kaggle.

sub4 = pd.DataFrame({'Id' : test['Id'].values, 'WnvPresent' : submission_pred_proba4})
sub4.to_csv('../kaggle_submissions/submission_4_gradboost_smote.csv', index=False)

In [None]:
summary_df.loc[4] = ["GradientBoostingClassifier",
                     "SMOTE",
                     round(train_acc4, 3),
                     round(est_test_acc4, 3),
                     round(test_acc4, 3),
                     round(train_acc4-test_acc4, 3),
                     round(sensitivity4, 3),
                     round(specificity4, 3),
                     round(precision4, 3),
                     round(roc_auc4, 3),
                     0.705
                    ]

summary_df

Unnamed: 0,Classifier,Class Balancing Technique,Train Accuracy,Est. Test Accuracy (cv=5),Actual Test Accuracy,Overfit / Underfit,Sensitivity,Specificity,Precision,ROC-AUC,Kaggle ROC-AUC
1,LogisticRegression,SMOTE,0.707,0.698,0.712,-0.004,0.851,0.704,0.143,0.858,0.661
2,RandomForestClassifier,SMOTE,0.778,0.768,0.775,0.003,0.851,0.771,0.177,0.872,0.706
3,SVC,SMOTE,0.703,0.693,0.694,0.01,0.86,0.684,0.136,0.844,0.675
4,GradientBoostingClassifier,SMOTE,0.78,0.778,0.765,0.015,0.833,0.761,0.168,0.871,0.705


Looking at the summary of classification metrics above, we note the following:

- ***LogisticRegression* with *SMOTE*** - Performs well with train and test *accuracies* of ~71.0% with slight under-fitting on the train data. The *sensitivity* is also high at 85.1%.
- ***RandomForestClassifier* with *SMOTE*** - Performs significantly better than *LogisticRegression* in terms of *accuracy* with very little over-fitting on the train data. The *sensitivity* score is same as *LogisticRegression* (85.1%).
- ***SVC* with *SMOTE*** - Performs similarly as *LogisticRegression* above with slightly lesser *accuracy* scores and a slight over-fit on the train data. It performs the best among the four models in terms of *sensitivity* (86.0%), even though it is not significantly higher than the *sensitivity* for the two models above (85.1%).
- ***GradientBoostingClassifier* with *SMOTE*** - Performs slightly better than the *RandomForestClassifier* model in terms of *accuracy*, but with a higher degree of over-fit on the train data. Also, it has the lowest *sensitivity* among the four models (83.3%).

So, from the above, the two models that performed the best in terms of *accuracy* are *RandomForestClassifier* and *GradientBoostingClassifier*. While *GradientBoostingClassifier* performed slightly better on the train data, *RandomForestClassifier* had a significantly lesser degree of over-fit, and so can be said to be generalizing better on unseen data. Furthermore, *RandomForestClassifier sensitivity* is quite high and while it is only slightly lower than *SVC*, it manages to mantain a significantly higher *accuracy* as well as *specificity*. It also has the highest *ROC-AUC* score among the four models.

So, for the above stated reasons, **we chose *RandomForestClassifier* to be our best performing classifier**. We will now pair *RandomForestClassifier* with **three other balancing techniques - *ADASYN, ClusterCentroids* and using hyperparameter *class_weight='balanced'*** to further model and select the best performing combination of class balancing technique and classifier for this project.

### *RandomForestClassifier* with *ADASYN* balancing technique

Using a similar approach as above, manual tweaking of hyperparameters was done after using *GridSearchCV*. As explained above, ***GridSearchCV* only optimizes the model using a single scoring parameter. However, in this context, we need to find a good balance between *accuracy* and *sensitivity* scores, and so a manual tuning of hyperparameters is required**.

The final set of hyperparameters chosen after manual tuning of the model are as shown below.

In [None]:
# Creating a pipeline object using imblearn.pipeline with ADASYN and RandomForestClassifier.

rfc_adasyn_pipe5 = make_pipeline(ADASYN(random_state=42),
                                 RandomForestClassifier(n_estimators=100,
                                                        ccp_alpha=0,
                                                        max_depth=5,
                                                        min_samples_split=2,
                                                        min_samples_leaf=3,
                                                        random_state=42
                                                       )
                                )

In [None]:
# Fitting X_train_sc and y_train on the pipeline object defined above.

rfc_adasyn_pipe5.fit(X_train_sc, y_train)

Pipeline(memory=None,
         steps=[('adasyn',
                 ADASYN(n_jobs=None, n_neighbors=5, random_state=42,
                        sampling_strategy='auto')),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0,
                                        class_weight=None, criterion='gini',
                                        max_depth=5, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=3, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=42,
                                        verbose=0, warm_star

In [None]:
# Scoring the model on training dataset.
# Training Accuracy

train_acc5 = rfc_adasyn_pipe5.score(X_train_sc, y_train)
train_acc5

0.7707129094412332

In [None]:
# Estimated Testing Accuracy

est_test_acc5 = cross_val_score(rfc_adasyn_pipe5, X_train_sc, y_train, cv=5).mean()
est_test_acc5

0.7605955120643086

In [None]:
# Actual Testing Accuracy

test_acc5 = rfc_adasyn_pipe5.score(X_test_sc, y_test)
test_acc5

0.7615606936416185

In [None]:
# Generating predictions on testing dataset using the model above.

y_pred5 = rfc_adasyn_pipe5.predict(X_test_sc)

In [None]:
# Generating prediction probabilities on testing dataset using the model above.

y_pred_proba5 = rfc_adasyn_pipe5.predict_proba(X_test_sc)

In [None]:
# Generating a confusion matrix.

tn, fp, fn, tp = confusion_matrix(y_test, y_pred5).ravel()

In [None]:
conf_mat5 = pd.DataFrame(columns=['Actual WnvPresent=1', 'Actual WnvPresent=0'], 
                         index=['Predicted WnvPresent=1', 'Predicted WnvPresent=0'],
                         data=[[f'{tp} (True Pos)', f'{fp} (False Pos)'], [f'{fn} (False Neg)', f'{tn} (True Neg)']]
                        )
conf_mat5

Unnamed: 0,Actual WnvPresent=1,Actual WnvPresent=0
Predicted WnvPresent=1,96 (True Pos),477 (False Pos)
Predicted WnvPresent=0,18 (False Neg),1485 (True Neg)


In [None]:
sensitivity5 = tp/(tp+fn)
sensitivity5

0.8421052631578947

In [None]:
specificity5 = tn/(tn+fp)
specificity5

0.7568807339449541

In [None]:
precision5 = tp/(tp+fp)
precision5

0.16753926701570682

In [None]:
roc_auc5 = roc_auc_score(y_test, y_pred_proba5[:, 1])
roc_auc5

0.8715573975714005

In [None]:
# Generating prediction probabilities on actual testing dataset using the model above (for kaggle submission).

submission_pred_proba5 = rfc_adasyn_pipe5.predict_proba(act_test_X_sc)[:, 1]

In [None]:
# Putting the above predicted probabilities into a dataframe and exporting it as a csv file for submission to kaggle.

sub5 = pd.DataFrame({'Id' : test['Id'].values, 'WnvPresent' : submission_pred_proba5})
sub5.to_csv('../kaggle_submissions/submission_5_rfc_adasyn.csv', index=False)

In [None]:
summary_df.loc[5] = ["RandomForestClassifier",
                     "ADASYN",
                     round(train_acc5, 3),
                     round(est_test_acc5, 3),
                     round(test_acc5, 3),
                     round(train_acc5-test_acc5, 3),
                     round(sensitivity5, 3),
                     round(specificity5, 3),
                     round(precision5, 3),
                     round(roc_auc5, 3),
                     0.713
                    ]

summary_df

Unnamed: 0,Classifier,Class Balancing Technique,Train Accuracy,Est. Test Accuracy (cv=5),Actual Test Accuracy,Overfit / Underfit,Sensitivity,Specificity,Precision,ROC-AUC,Kaggle ROC-AUC
1,LogisticRegression,SMOTE,0.707,0.698,0.712,-0.004,0.851,0.704,0.143,0.858,0.661
2,RandomForestClassifier,SMOTE,0.778,0.768,0.775,0.003,0.851,0.771,0.177,0.872,0.706
3,SVC,SMOTE,0.703,0.693,0.694,0.01,0.86,0.684,0.136,0.844,0.675
4,GradientBoostingClassifier,SMOTE,0.78,0.778,0.765,0.015,0.833,0.761,0.168,0.871,0.705
5,RandomForestClassifier,ADASYN,0.771,0.761,0.762,0.009,0.842,0.757,0.168,0.872,0.713


### *RandomForestClassifier* with *ClusterCentroids* balancing technique

Using a similar approach as above, manual tweaking of hyperparameters was done after using *GridSearchCV*. As explained above, ***GridSearchCV* only optimizes the model using a single scoring parameter. However, in this context, we need to find a good balance between *accuracy* and *sensitivity* scores, and so a manual tuning of hyperparameters is required**.

The final set of hyperparameters chosen after manual tuning of the model are as shown below.

In [None]:
# Creating a pipeline object using imblearn.pipeline with ClusterCentroids and RandomForestClassifier.

rfc_clustercentroids_pipe6 = make_pipeline(ClusterCentroids(random_state=42),
                                           RandomForestClassifier(n_estimators=100,
                                                                  ccp_alpha=0,
                                                                  max_depth=5,
                                                                  min_samples_split=2,
                                                                  min_samples_leaf=3,
                                                                  random_state=42
                                                                 )
                                          )

In [None]:
# Fitting X_train_sc and y_train on the pipeline object defined above.

rfc_clustercentroids_pipe6.fit(X_train_sc, y_train)

Pipeline(memory=None,
         steps=[('clustercentroids',
                 ClusterCentroids(estimator=None, n_jobs=None, random_state=42,
                                  sampling_strategy='auto', voting='auto')),
                ('randomforestclassifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0,
                                        class_weight=None, criterion='gini',
                                        max_depth=5, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=3, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=100, n_jobs=None,
                                        oob_score=False, random_state=42,
              

In [None]:
# Scoring the model on training dataset.
# Training Accuracy

train_acc6 = rfc_clustercentroids_pipe6.score(X_train_sc, y_train)
train_acc6

0.6274887604367373

In [None]:
# Estimated Testing Accuracy

est_test_acc6 = cross_val_score(rfc_clustercentroids_pipe6, X_train_sc, y_train, cv=5).mean()
est_test_acc6

0.6178545320930592

In [None]:
# Actual Testing Accuracy

test_acc6 = rfc_clustercentroids_pipe6.score(X_test_sc, y_test)
test_acc6

0.6242774566473989

In [None]:
# Generating predictions on testing dataset using the model above.

y_pred6 = rfc_clustercentroids_pipe6.predict(X_test_sc)

In [None]:
# Generating prediction probabilities on testing dataset using the model above.

y_pred_proba6 = rfc_clustercentroids_pipe6.predict_proba(X_test_sc)

In [None]:
# Generating a confusion matrix.

tn, fp, fn, tp = confusion_matrix(y_test, y_pred6).ravel()

In [None]:
conf_mat6 = pd.DataFrame(columns=['Actual WnvPresent=1', 'Actual WnvPresent=0'], 
                         index=['Predicted WnvPresent=1', 'Predicted WnvPresent=0'],
                         data=[[f'{tp} (True Pos)', f'{fp} (False Pos)'], [f'{fn} (False Neg)', f'{tn} (True Neg)']]
                        )
conf_mat6

Unnamed: 0,Actual WnvPresent=1,Actual WnvPresent=0
Predicted WnvPresent=1,105 (True Pos),771 (False Pos)
Predicted WnvPresent=0,9 (False Neg),1191 (True Neg)


In [None]:
sensitivity6 = tp/(tp+fn)
sensitivity6

0.9210526315789473

In [None]:
specificity6 = tn/(tn+fp)
specificity6

0.6070336391437309

In [None]:
precision6 = tp/(tp+fp)
precision6

0.11986301369863013

In [None]:
roc_auc6 = roc_auc_score(y_test, y_pred_proba6[:, 1])
roc_auc6

0.8479465100059017

In [None]:
# Generating prediction probabilities on actual testing dataset using the model above (for kaggle submission).

submission_pred_proba6 = rfc_clustercentroids_pipe6.predict_proba(act_test_X_sc)[:, 1]

In [None]:
# Putting the above predicted probabilities into a dataframe and exporting it as a csv file for submission to kaggle.

sub6 = pd.DataFrame({'Id' : test['Id'].values, 'WnvPresent' : submission_pred_proba6})
sub6.to_csv('../kaggle_submissions/submission_6_rfc_clustercentroids.csv', index=False)

In [None]:
summary_df.loc[6] = ["RandomForestClassifier",
                     "ClusterCentroids",
                     round(train_acc6, 3),
                     round(est_test_acc6, 3),
                     round(test_acc6, 3),
                     round(train_acc6-test_acc6, 3),
                     round(sensitivity6, 3),
                     round(specificity6, 3),
                     round(precision6, 3),
                     round(roc_auc6, 3),
                     0.707
                    ]

summary_df

Unnamed: 0,Classifier,Class Balancing Technique,Train Accuracy,Est. Test Accuracy (cv=5),Actual Test Accuracy,Overfit / Underfit,Sensitivity,Specificity,Precision,ROC-AUC,Kaggle ROC-AUC
1,LogisticRegression,SMOTE,0.707,0.698,0.712,-0.004,0.851,0.704,0.143,0.858,0.661
2,RandomForestClassifier,SMOTE,0.778,0.768,0.775,0.003,0.851,0.771,0.177,0.872,0.706
3,SVC,SMOTE,0.703,0.693,0.694,0.01,0.86,0.684,0.136,0.844,0.675
4,GradientBoostingClassifier,SMOTE,0.78,0.778,0.765,0.015,0.833,0.761,0.168,0.871,0.705
5,RandomForestClassifier,ADASYN,0.771,0.761,0.762,0.009,0.842,0.757,0.168,0.872,0.713
6,RandomForestClassifier,ClusterCentroids,0.627,0.618,0.624,0.003,0.921,0.607,0.12,0.848,0.707


### *RandomForestClassifier* with hyperparameter *class_weight='balanced_subsample'*

Using a similar approach as above, manual tweaking of hyperparameters was done after using *GridSearchCV*. As explained above, ***GridSearchCV* only optimizes the model using a single scoring parameter. However, in this context, we need to find a good balance between *accuracy* and *sensitivity* scores, and so a manual tuning of hyperparameters is required**.

The final set of hyperparameters chosen after manual tuning of the model are as shown below.

In [None]:
# Creating a RandomForestClassifier object.

rfc7 = RandomForestClassifier(class_weight='balanced_subsample', 
                              n_estimators=200,
                              ccp_alpha=0,
                              max_depth=5,
                              min_samples_split=2,
                              min_samples_leaf=1,
                              random_state=42
                             )

In [None]:
# Fitting X_train_sc and y_train on RandomForestClassifier object defined above.

rfc7.fit(X_train_sc, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0,
                       class_weight='balanced_subsample', criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=200, n_jobs=None, oob_score=False,
                       random_state=42, verbose=0, warm_start=False)

In [None]:
# Scoring the model on training dataset.
# Training Accuracy

train_acc7 = rfc7.score(X_train_sc, y_train)
train_acc7

0.7408477842003853

In [None]:
# Estimated Testing Accuracy

est_test_acc7 = cross_val_score(rfc7, X_train_sc, y_train, cv=5).mean()
est_test_acc7

0.7430972042262147

In [None]:
# Actual Testing Accuracy

test_acc7 = rfc7.score(X_test_sc, y_test)
test_acc7

0.7345857418111753

In [None]:
# Generating predictions on testing dataset using the model above.

y_pred7 = rfc7.predict(X_test_sc)

In [None]:
# Generating prediction probabilities on testing dataset using the model above.

y_pred_proba7 = rfc7.predict_proba(X_test_sc)

In [None]:
# Generating a confusion matrix.

tn, fp, fn, tp = confusion_matrix(y_test, y_pred7).ravel()

In [None]:
conf_mat7 = pd.DataFrame(columns=['Actual WnvPresent=1', 'Actual WnvPresent=0'], 
                         index=['Predicted WnvPresent=1', 'Predicted WnvPresent=0'],
                         data=[[f'{tp} (True Pos)', f'{fp} (False Pos)'], [f'{fn} (False Neg)', f'{tn} (True Neg)']]
                        )
conf_mat7

Unnamed: 0,Actual WnvPresent=1,Actual WnvPresent=0
Predicted WnvPresent=1,104 (True Pos),541 (False Pos)
Predicted WnvPresent=0,10 (False Neg),1421 (True Neg)


In [None]:
sensitivity7 = tp/(tp+fn)
sensitivity7

0.9122807017543859

In [None]:
specificity7 = tn/(tn+fp)
specificity7

0.7242609582059123

In [None]:
precision7 = tp/(tp+fp)
precision7

0.16124031007751938

In [None]:
roc_auc7 = roc_auc_score(y_test, y_pred_proba7[:, 1])
roc_auc7

0.8820372158735269

In [None]:
# Generating prediction probabilities on actual testing dataset using the model above (for kaggle submission).

submission_pred_proba7 = rfc7.predict_proba(act_test_X_sc)[:, 1]

In [None]:
# Putting the above predicted probabilities into a dataframe and exporting it as a csv file for submission to kaggle.

sub7 = pd.DataFrame({'Id' : test['Id'].values, 'WnvPresent' : submission_pred_proba7})
sub7.to_csv('../kaggle_submissions/submission_7_rfc_class_weight_bal.csv', index=False)

In [None]:
# Exporting the fitted model as a pickle file for deployment.

filename= 'final_rfc_model.pkl'
pickle.dump(rfc7, open(filename,'wb'))

In [None]:
summary_df.loc[7] = ["RandomForestClassifier",
                     "class_weight='balanced_subsample'",
                     round(train_acc7, 3),
                     round(est_test_acc7, 3),
                     round(test_acc7, 3),
                     round(train_acc7-test_acc7, 3),
                     round(sensitivity7, 3),
                     round(specificity7, 3),
                     round(precision7, 3),
                     round(roc_auc7, 3),
                     0.717
                    ]

summary_df

Unnamed: 0,Classifier,Class Balancing Technique,Train Accuracy,Est. Test Accuracy (cv=5),Actual Test Accuracy,Overfit / Underfit,Sensitivity,Specificity,Precision,ROC-AUC,Kaggle ROC-AUC
1,LogisticRegression,SMOTE,0.707,0.698,0.712,-0.004,0.851,0.704,0.143,0.858,0.661
2,RandomForestClassifier,SMOTE,0.778,0.768,0.775,0.003,0.851,0.771,0.177,0.872,0.706
3,SVC,SMOTE,0.703,0.693,0.694,0.01,0.86,0.684,0.136,0.844,0.675
4,GradientBoostingClassifier,SMOTE,0.78,0.778,0.765,0.015,0.833,0.761,0.168,0.871,0.705
5,RandomForestClassifier,ADASYN,0.771,0.761,0.762,0.009,0.842,0.757,0.168,0.872,0.713
6,RandomForestClassifier,ClusterCentroids,0.627,0.618,0.624,0.003,0.921,0.607,0.12,0.848,0.707
7,RandomForestClassifier,class_weight='balanced_subsample',0.741,0.743,0.735,0.006,0.912,0.724,0.161,0.882,0.717
