In [1]:
import pandas as pd
import numpy as np
import pandas_profiling as pdp

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn import metrics, svm
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor, BaggingRegressor, BaggingClassifier, AdaBoostClassifier



# display plots in the notebook
%matplotlib inline
# increase default figure and font sizes for easier viewing
plt.rcParams['figure.figsize'] = (16, 8)
plt.rcParams['font.size'] = 12

In [56]:
# Training features: the scaled train_X file, with its first CSV column used as the index
X = pd.read_csv("../assets/scaled_train_X.csv", index_col=0)

# Training target: only the WnvPresent column of the cleaned training file
y = pd.read_csv("../assets/clean_train.csv", index_col=0)["WnvPresent"]

In [57]:
# Read in the scaled test-set features; the first CSV column is used as the index.

test_X = pd.read_csv("../assets/scaled_test_X.csv", index_col=0)

In [58]:
# preview the first rows of the scaled training features
X.head()

Unnamed: 0,Tavg,DewPoint,PrecipTotal,StnPressure,AvgSpeed,Precip_7d_avg,wind_abv_1std,Latitude,Longitude,Mos_WNV_Prob,...,Trap_Species_Mos_75percent,Trap_Mos_Mean,Trap_Mos_Median,Trap_Mos_25percent,Trap_Mos_75percent,Species_Mos_Mean,Species_Mos_Median,Species_Mos_25percent,Species_Mos_75percent,Species_Obs_Proportion
0,0.703601,0.43334,-0.319916,0.870826,-0.649931,0.103875,0.0,1.032541,-1.263449,0.17102,...,0.384196,0.384598,0.81011,0.791777,0.502594,0.255114,0.913742,0.676959,0.97549,0.967227
1,0.703601,0.43334,-0.319916,0.870826,-0.649931,0.103875,0.0,1.214515,-1.546838,0.17102,...,-0.335797,-0.44033,-0.370459,-0.609928,-0.542962,0.255114,0.913742,0.676959,0.97549,0.967227
2,0.703601,0.43334,-0.319916,0.870826,-0.649931,0.103875,0.0,0.210971,0.482575,0.17102,...,0.224198,-0.124542,0.135499,0.090925,0.117389,0.255114,0.913742,0.676959,0.97549,0.967227
3,0.703601,0.43334,-0.319916,0.870826,-0.649931,0.103875,0.0,0.700966,0.006295,0.17102,...,-0.447795,-0.482957,-0.707764,-0.609928,-0.65302,0.255114,0.913742,0.676959,0.97549,0.967227
4,0.703601,0.43334,-0.319916,0.870826,-0.649931,0.103875,0.0,0.725562,0.745953,0.17102,...,-0.127799,-0.312977,-0.201806,0.090925,-0.322845,0.255114,0.913742,0.676959,0.97549,0.967227


In [59]:
# preview the first rows of the scaled test features (should show the same columns as X)
test_X.head()

Unnamed: 0,Tavg,DewPoint,PrecipTotal,StnPressure,AvgSpeed,Precip_7d_avg,wind_abv_1std,Latitude,Longitude,Mos_WNV_Prob,...,Trap_Species_Mos_75percent,Trap_Mos_Mean,Trap_Mos_Median,Trap_Mos_25percent,Trap_Mos_75percent,Species_Mos_Mean,Species_Mos_Median,Species_Mos_25percent,Species_Mos_75percent,Species_Obs_Proportion
0,0.703601,0.24584,-0.319916,0.165946,0.448568,0.953593,0.0,0.987891,-1.330045,0.885971,...,0.924092,0.699689,1.006187,0.995285,0.640559,0.97453,0.850425,-0.063763,1.565851,1.975787
1,0.703601,0.24584,-0.319916,0.165946,0.448568,0.953593,0.0,0.987891,-1.330045,-0.255449,...,0.339315,0.699689,1.006187,0.995285,0.640559,-0.009351,-0.130088,-0.59334,0.234302,0.156427
2,0.703601,0.24584,-0.319916,0.165946,0.448568,0.953593,0.0,0.987891,-1.330045,1.933189,...,1.070286,0.699689,1.006187,0.995285,0.640559,1.704737,0.360168,-0.063763,0.678152,1.058526
3,0.703601,0.24584,-0.319916,0.165946,0.448568,0.953593,0.0,0.987891,-1.330045,-0.846322,...,-0.440387,0.699689,1.006187,0.995285,0.640559,-1.0606,-1.110601,-0.59334,-1.097246,-0.83257
4,0.703601,0.24584,-0.319916,0.165946,0.448568,0.953593,0.0,0.987891,-1.330045,-0.846322,...,-0.440387,0.699689,1.006187,0.995285,0.640559,-0.980347,-1.110601,-0.59334,-1.097246,-0.817068


In [60]:
# preview the first values of the WnvPresent target
y.head()

0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: WnvPresent, dtype: float64

In [61]:
#creating function to test and fit classification model (created for SVM lab)
def do_cm_cr(model, X, y, names, test_size=0.33, random_state=42):
    """Fit ``model`` on a stratified train split and report hold-out performance.

    The data is split with ``train_test_split`` (stratified on ``y``), the
    model is fit on the training part only, and a confusion matrix plus a
    classification report for the held-out part are printed.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)``, ``predict(X)`` and ``score(X, y)``.
        It is fit in place, so the caller's object is the fitted model
        afterwards.
    X : feature matrix
    y : target labels; also used as the stratification variable.
    names : list of str
        Display names for the classes, passed to ``classification_report``
        as ``target_names``.
    test_size : float, optional
        Fraction of the data held out for evaluation (default 0.33).
    random_state : int, optional
        Seed for the split so that repeated calls compare models on the
        same hold-out rows (default 42).

    Returns
    -------
    The value of ``model.score(X_test, y_test)`` on the held-out split.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)
    model.fit(X_train, y_train)

    # predict() returns hard class labels, not probabilities
    y_pred = model.predict(X_test)

    # Single-argument, parenthesised print calls behave the same under
    # Python 2 and Python 3; print("") emits the blank separator line.
    print("Output for Tested Model:")
    print("Confusion Matrix of Predictions: ")
    print("")
    # Actual values are rows (0, 1), while predicted are columns (0, 1)
    print(confusion_matrix(y_test, y_pred))
    print("")

    # Classification report columns:
    #   precision = true positives / (true positives + false positives) - of all predicted, % correct
    #   recall    = true positives / (true positives + false negatives) - of all actual, % correct
    #   f1-score  = harmonic mean of precision and recall (best 1, worst 0)
    #   support   = number of true instances of each class
    print("Classification Matrix: ")
    print("")
    print(classification_report(y_test, y_pred, target_names=names))
    return model.score(X_test, y_test)

In [62]:
# We'll start with logistic regression (default settings) as the baseline.
# In the recorded output below it never predicts the positive class, so the
# ~0.946 accuracy is simply the share of WNV-negative rows in the hold-out set.
logreg = LogisticRegression()

do_cm_cr(logreg, X, y, ["no_Wnv", "yes_Wnv"])


Output for Tested Model:
Confusion Matrix of Predictions: 

[[2646    0]
 [ 151    0]]

Classification Matrix: 

             precision    recall  f1-score   support

     no_Wnv       0.95      1.00      0.97      2646
    yes_Wnv       0.00      0.00      0.00       151

avg / total       0.89      0.95      0.92      2797



  'precision', 'predicted', average, warn_for)


0.94601358598498386

In [63]:
# k-nearest neighbors (default settings)
knnc = KNeighborsClassifier()

do_cm_cr(knnc, X, y, ["no_Wnv", "yes_Wnv"])

# After scaling the X data, knnc scores about the same accuracy as logistic regression,
# but in the recorded output it catches none of the WNV-positive rows (3 false positives, 0 true positives).


Output for Tested Model:
Confusion Matrix of Predictions: 

[[2643    3]
 [ 151    0]]

Classification Matrix: 

             precision    recall  f1-score   support

     no_Wnv       0.95      1.00      0.97      2646
    yes_Wnv       0.00      0.00      0.00       151

avg / total       0.89      0.94      0.92      2797



0.94494100822309612

In [64]:
# SVM with linear kernel
lin_svm = svm.SVC(kernel='linear')

do_cm_cr(lin_svm, X, y, ["no_Wnv", "yes_Wnv"])

# The linear-kernel SVM predicts 0 instances of WNV - wow.

Output for Tested Model:
Confusion Matrix of Predictions: 

[[2646    0]
 [ 151    0]]

Classification Matrix: 

             precision    recall  f1-score   support

     no_Wnv       0.95      1.00      0.97      2646
    yes_Wnv       0.00      0.00      0.00       151

avg / total       0.89      0.95      0.92      2797



0.94601358598498386

In [65]:
# SVM with rbf kernel
rbf_svm = svm.SVC(kernel='rbf')

do_cm_cr(rbf_svm, X, y, ["no_Wnv", "yes_Wnv"])

# In the recorded output below, the rbf-kernel SVM also predicts 0 instances of WNV
# (confusion matrix [[2646 0], [151 0]]), so none of the 151 positive rows is caught.
# NOTE(review): this comment previously said one instance was predicted (correctly);
# that does not match the output recorded here.

Output for Tested Model:
Confusion Matrix of Predictions: 

[[2646    0]
 [ 151    0]]

Classification Matrix: 

             precision    recall  f1-score   support

     no_Wnv       0.95      1.00      0.97      2646
    yes_Wnv       0.00      0.00      0.00       151

avg / total       0.89      0.95      0.92      2797



0.94601358598498386

In [66]:
# Plain decision tree (default hyperparameters).
# random_state is fixed because the tree builder permutes features at each split,
# so an unseeded tree can change from run to run; 42 matches the split seed in do_cm_cr.
drc = DecisionTreeClassifier(random_state=42)

do_cm_cr(drc, X, y, ["no_Wnv", "yes_Wnv"])

Output for Tested Model:
Confusion Matrix of Predictions: 

[[2464  182]
 [ 135   16]]

Classification Matrix: 

             precision    recall  f1-score   support

     no_Wnv       0.95      0.93      0.94      2646
    yes_Wnv       0.08      0.11      0.09       151

avg / total       0.90      0.89      0.89      2797



0.88666428316052914

In [67]:
# Random forest (default hyperparameters).
# random_state is fixed so the bootstrap samples and per-split feature subsets,
# and therefore the reported metrics, are the same on every run;
# 42 matches the split seed in do_cm_cr.
rfc = RandomForestClassifier(random_state=42)

do_cm_cr(rfc, X, y, ["no_Wnv", "yes_Wnv"])

Output for Tested Model:
Confusion Matrix of Predictions: 

[[2640    6]
 [ 149    2]]

Classification Matrix: 

             precision    recall  f1-score   support

     no_Wnv       0.95      1.00      0.97      2646
    yes_Wnv       0.25      0.01      0.03       151

avg / total       0.91      0.94      0.92      2797



0.94458348230246691

In [68]:
# Gradient boosting classifier (default hyperparameters).
# random_state is fixed so the individual trees, and therefore the reported
# metrics, are reproducible across runs; 42 matches the split seed in do_cm_cr.
gbc = GradientBoostingClassifier(random_state=42)

do_cm_cr(gbc, X, y, ["no_Wnv", "yes_Wnv"])


Output for Tested Model:
Confusion Matrix of Predictions: 

[[2643    3]
 [ 150    1]]

Classification Matrix: 

             precision    recall  f1-score   support

     no_Wnv       0.95      1.00      0.97      2646
    yes_Wnv       0.25      0.01      0.01       151

avg / total       0.91      0.95      0.92      2797



0.94529853414372544

In [69]:
# Bagging classifier: an ensemble of gradient boosting classifiers, each fit on a
# random resample of the training data.
# Both estimators get a fixed random_state so the resampling and the boosted trees
# are reproducible across runs; 42 matches the split seed in do_cm_cr.
bcgbc = GradientBoostingClassifier(random_state=42)

bc = BaggingClassifier(bcgbc, random_state=42)

do_cm_cr(bc, X, y, ["no_Wnv", "yes_Wnv"])



Output for Tested Model:
Confusion Matrix of Predictions: 

[[2646    0]
 [ 151    0]]

Classification Matrix: 

             precision    recall  f1-score   support

     no_Wnv       0.95      1.00      0.97      2646
    yes_Wnv       0.00      0.00      0.00       151

avg / total       0.89      0.95      0.92      2797



0.94601358598498386

In [None]:
# Predicting WNV probabilities for the test set with one of the models fit above.
# `model` is a placeholder and is not defined in this notebook: replace it with the
# name of a fitted estimator (e.g. rfc or gbc) before running this cell.
# The estimator must implement predict_proba; svm.SVC only provides it when it is
# constructed with probability=True, which lin_svm and rbf_svm above are not.
# Note that do_cm_cr fits each model on the training part of its split only.
test_probs = model.predict_proba(test_X)

In [None]:
# Creating a DataFrame of the resulting predicted probabilities.
# Assumes predict_proba returned two columns ordered (no WNV, WNV) - TODO confirm
# against model.classes_.
test_probs = pd.DataFrame(test_probs, columns = ["No_Predict", "Yes_Predict"])

In [None]:
# Reading in the sample submission file; its first column becomes the index.

sampsub = pd.read_csv("../assets/sampleSubmission.csv", index_col = 0)

In [None]:
sampsub.head()
# The sample submission is simply a list of IDs, each with a WnvPresent prediction.
# Note from the Kaggle page: for each record in the test set, you should predict a
# real-valued probability that WNV is present, i.e. values such as 0.2 or 0.9
# rather than just 0 or 1.

# See: https://www.kaggle.com/c/predict-west-nile-virus#evaluation

In [None]:
# Label the rows of test_probs 1..n so they line up with the sampsub id numbers.
# The index is assigned outright rather than incremented with `+= 1`, which keeps
# this cell idempotent: re-running it no longer shifts the labels by one more each time.
test_probs.index = pd.RangeIndex(1, len(test_probs) + 1)

In [None]:
# Transferring the predicted WNV probabilities from test_probs into the sampsub df.
# Column assignment aligns on index labels, so the test_probs index must match the
# sampsub index (hence the re-labelling of test_probs in the previous cell);
# labels without a match would be filled with NaN.
sampsub["WnvPresent"] = test_probs["Yes_Predict"]

In [None]:
# Saving the submission file to the assets folder.
# Replace the bracketed placeholder in the file name with a unique name before
# running, so earlier submissions are not overwritten.
sampsub.to_csv("../assets/wnvsubmit_[your unique name here].csv")
