In [30]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [90]:
# Load the raw inputs. Spray records are not provided for the test-set years
# (2008, 2010, 2012, 2014), so spray info is loaded but not used for the analysis.
train, test, weather, spray = map(pd.read_csv, (
    './assets/train.csv',
    './assets/test.csv',
    './assets/weather.csv',
    './assets/spray.csv',
))
# Plain-text numeric map data, read into a numpy array.
mapdata = np.loadtxt("./assets/mapdata_copyright_openstreetmap_contributors.txt")

In [32]:
from datetime import datetime

# Convert the 'Date' column of every loaded frame to datetime64 using the
# explicit '%Y-%m-%d' format (each frame is modified in place).
for frame in (train, test, weather, spray):
    frame['Date'] = pd.to_datetime(frame['Date'], format='%Y-%m-%d')

In [33]:
# Derive calendar features from the parsed dates: week-of-year and year for
# train/test, year only for spray.
for frame in (train, test):
    frame['week'] = frame['Date'].dt.weekofyear
    frame['year'] = frame['Date'].dt.year
spray['year'] = spray['Date'].dt.year

In [34]:
# Show the column names of train and their order.
train.columns


Index([u'Date', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'NumMosquitos', u'WnvPresent', u'week', u'year'],
      dtype='object')

In [35]:
# Drop the address-related columns and the mosquito count, keeping
# Date, Species, Latitude, Longitude, WnvPresent, week and year.
# Columns are dropped by name rather than by integer position: a positional
# lookup silently removes *different* columns if this cell is run a second
# time, whereas dropping by name fails loudly instead.
train = train.drop(
    ['Address', 'Block', 'Street', 'Trap', 'AddressNumberAndStreet',
     'AddressAccuracy', 'NumMosquitos'],
    axis=1,
)

In [36]:
# Preview the first row of train after the column drop.
train.head(1)

Unnamed: 0,Date,Species,Latitude,Longitude,WnvPresent,week,year
0,2007-05-29,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,0,22,2007


In [37]:
# Drop the columns at these positions of test, leaving
# Date, Species, Latitude, Longitude, week and year.
# The labels are taken from `test.columns` explicitly instead of indexing the
# frame itself with integers, which depends on a positional fallback for
# integer keys and builds a throw-away sub-frame just to read its labels.
# NOTE(review): position 0 appears to be 'Id' (it is absent from the preview
# that follows), so 'Id' is no longer available from `test` after this cell.
test = test.drop(test.columns[[0, 2, 4, 5, 6, 7, 10]], axis=1)

In [38]:
# Preview the first rows of test after the column drop.
test.head()

Unnamed: 0,Date,Species,Latitude,Longitude,week,year
0,2008-06-11,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,24,2008
1,2008-06-11,CULEX RESTUANS,41.95469,-87.800991,24,2008
2,2008-06-11,CULEX PIPIENS,41.95469,-87.800991,24,2008
3,2008-06-11,CULEX SALINARIUS,41.95469,-87.800991,24,2008
4,2008-06-11,CULEX TERRITANS,41.95469,-87.800991,24,2008


In [39]:
# One-hot encode 'Species' and append the indicator columns to train.
train_species_dummies = pd.get_dummies(train['Species'])
train = pd.concat([train, train_species_dummies], axis=1)

In [40]:
# One-hot encode 'Species' and append the indicator columns to test.
test_species_dummies = pd.get_dummies(test['Species'])
test = pd.concat([test, test_species_dummies], axis=1)

In [41]:
# Preview test with the species indicator columns appended.
test.head()

Unnamed: 0,Date,Species,Latitude,Longitude,week,year,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,CULEX TERRITANS,UNSPECIFIED CULEX
0,2008-06-11,CULEX PIPIENS/RESTUANS,41.95469,-87.800991,24,2008,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,2008-06-11,CULEX RESTUANS,41.95469,-87.800991,24,2008,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,2008-06-11,CULEX PIPIENS,41.95469,-87.800991,24,2008,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2008-06-11,CULEX SALINARIUS,41.95469,-87.800991,24,2008,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,2008-06-11,CULEX TERRITANS,41.95469,-87.800991,24,2008,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [42]:
# The raw 'Species' text column is redundant once the indicator columns exist.
# Drop it by name rather than by position 1: a positional drop would silently
# remove whatever column sits at position 1 (e.g. Latitude) if this cell were
# re-run, whereas a by-name drop raises instead.
test = test.drop('Species', axis=1)
train = train.drop('Species', axis=1)

In [43]:
# Load the pre-computed weather table that is joined onto train/test below.
# NOTE(review): unlike the other inputs this path is relative to the working
# directory, not './assets/' — confirm the file location.
w30 = pd.read_csv('weather_ave_30.csv')

In [44]:
# Remove the 'Datetime_Date' column; 'Date' is the column used for the join.
w30 = w30.drop('Datetime_Date', axis=1)


In [45]:
# Parse 'Date' with the same '%Y-%m-%d' format used for train/test so the
# join keys have matching datetime dtype.
w30['Date'] = pd.to_datetime(w30['Date'], format='%Y-%m-%d')

In [46]:
# Add the weather data to the train and test data: index the weather table by
# date once, then join it onto each frame through its 'Date' column.
w30_by_date = w30.set_index('Date')
train_add = train.join(w30_by_date, on='Date')
test_add = test.join(w30_by_date, on='Date')

In [47]:

# Count how many test rows carry the 'UNSPECIFIED CULEX' indicator.
test_add['UNSPECIFIED CULEX'].value_counts()



0.0    101948
1.0     14345
Name: UNSPECIFIED CULEX, dtype: int64

In [48]:
# Add an all-zero 'UNSPECIFIED CULEX' column so train has the same set of
# species indicator columns as test.
# NOTE: the new column is appended as the LAST column of train_add, whereas in
# test_add it sits among the other species indicators; any code that passes
# both frames to a model as plain arrays must align the column order first.
train_add['UNSPECIFIED CULEX']= 0


In [49]:
# Target vector: the 'WnvPresent' column of the training frame.
y =train_add['WnvPresent']


In [50]:
# Remove the target from the feature frame now that it is held in `y`.
train_add = train_add.drop('WnvPresent', axis=1)


In [54]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.cross_validation import cross_val_score, StratifiedKFold ,train_test_split, KFold
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier,AdaBoostClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.grid_search import GridSearchCV


In [55]:
# List the columns of train_add (after the weather join and target removal).
train_add.columns

Index([u'Date', u'Latitude', u'Longitude', u'week', u'year',
       u'CULEX ERRATICUS', u'CULEX PIPIENS', u'CULEX PIPIENS/RESTUANS',
       u'CULEX RESTUANS', u'CULEX SALINARIUS', u'CULEX TARSALIS',
       u'CULEX TERRITANS', u'Tmax', u'Tmin', u'Tavg', u'Depart', u'DewPoint',
       u'WetBulb', u'Heat', u'Cool', u'PrecipTotal', u'StnPressure',
       u'SeaLevel', u'ResultSpeed', u'ResultDir', u'AvgSpeed', u'HZ', u'VC',
       u'FU', u'BC', u'SQ', u'FG+', u'MI', u'TS', u'DZ', u'RA', u'BR', u'FG',
       u'SN', u'UNSPECIFIED CULEX'],
      dtype='object')

In [56]:
# Feature matrix: every column of train_add except 'Date', in frame order.
feature_columns = [
    # location and calendar
    u'Latitude', u'Longitude', u'week', u'year',
    # species indicators produced by get_dummies
    u'CULEX ERRATICUS', u'CULEX PIPIENS', u'CULEX PIPIENS/RESTUANS',
    u'CULEX RESTUANS', u'CULEX SALINARIUS', u'CULEX TARSALIS',
    u'CULEX TERRITANS',
    # columns contributed by the w30 join
    u'Tmax', u'Tmin', u'Tavg', u'Depart', u'DewPoint', u'WetBulb',
    u'Heat', u'Cool', u'PrecipTotal', u'StnPressure', u'SeaLevel',
    u'ResultSpeed', u'ResultDir', u'AvgSpeed',
    u'HZ', u'VC', u'FU', u'BC', u'SQ', u'FG+', u'MI', u'TS', u'DZ',
    u'RA', u'BR', u'FG', u'SN',
    # indicator added manually to mirror the test set
    u'UNSPECIFIED CULEX',
]
X = train_add[feature_columns]

In [57]:
# Check the dtype of every feature column before scaling.
X.dtypes

Latitude                  float64
Longitude                 float64
week                        int64
year                        int64
CULEX ERRATICUS           float64
CULEX PIPIENS             float64
CULEX PIPIENS/RESTUANS    float64
CULEX RESTUANS            float64
CULEX SALINARIUS          float64
CULEX TARSALIS            float64
CULEX TERRITANS           float64
Tmax                      float64
Tmin                      float64
Tavg                      float64
Depart                    float64
DewPoint                  float64
WetBulb                   float64
Heat                      float64
Cool                      float64
PrecipTotal               float64
StnPressure               float64
SeaLevel                  float64
ResultSpeed               float64
ResultDir                   int64
AvgSpeed                  float64
HZ                        float64
VC                        float64
FU                        float64
BC                        float64
SQ            

In [58]:
# Rescale every feature with a MinMaxScaler fitted on the training features.
# After this cell X holds the scaler's array output instead of the DataFrame.
# NOTE(review): the scaler is fitted before the train/hold-out split below, so
# hold-out rows contribute to the fitted min/max.
scaler = MinMaxScaler()
scaler.fit(X)
X = scaler.transform(X)

In [59]:
# Hold out 33% of the labelled rows for evaluation; the seed is fixed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
)


In [106]:
##Grid search over Random Forest parameters
# model evaluation function
def evaluate_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    a = accuracy_score(y_test, y_pred)
    probabilities = model.predict_proba(X_test)
    #cm = confusion_matrix(y_test, y_pred)
    conmat = np.array(confusion_matrix(y_test, y_pred, labels=[1,0]))

    confusion = pd.DataFrame(conmat, index=['1', '0'],
                         columns=['predicted_1','predicted_0'])

    cr = classification_report(y_test, y_pred)
    
    print 'confusion matrix:'
    print confusion
    print 'classification_report:',cr
    print 'Accuracy of the model on test:',a
    return probabilities
#params = {'max_features ': [0.5,1.0],'max_depth':[0.5,1.0],'n_estimators':[5,10]}
max_depths = [5]
max_features = [1.0]
n_estimators = [6000]
rf = RandomForestClassifier(n_jobs=-1,random_state = 33, class_weight='balanced')
gsrf = GridSearchCV(estimator = rf,param_grid=dict(max_depth = max_depths, max_features=max_features,n_estimators=n_estimators), n_jobs=-1,cv=3, scoring='roc_auc')
gsrf.fit(X_train, y_train)
print 'best parameters for the model:',gsrf.best_params_
print 'best score on train:',gsrf.best_score_
probability = evaluate_model(gsrf.best_estimator_)


best parameters for the model: {'max_features': 1.0, 'n_estimators': 6000, 'max_depth': 5}
best score on train: 0.818554641639
confusion matrix:
   predicted_1  predicted_0
1          140           42
0          775         2510
classification_report:              precision    recall  f1-score   support

          0       0.98      0.76      0.86      3285
          1       0.15      0.77      0.26       182

avg / total       0.94      0.76      0.83      3467

Accuracy of the model on test: 0.764349581771


In [78]:
# 'Date' is not one of the model features (the training matrix excludes it),
# so remove it from the test frame as well.
test_add = test_add.drop('Date', axis=1)

In [79]:
# Transform the test features exactly as the training features were
# transformed: reuse the MinMaxScaler already fitted on the training matrix
# instead of fitting a different (Standard) scaler on the test set, which
# would feed the model values on a scale it was never trained on.
#
# The columns are also put into the training order first. In train_add the
# 'UNSPECIFIED CULEX' column is last, while in test_add it sits among the
# species indicators; without realigning, most features would land in the
# wrong position of the array.
feature_columns = [col for col in train_add.columns if col != 'Date']
test_add = scaler.transform(test_add[feature_columns])

In [88]:
# Predict class labels for the test set with the best estimator of `gsrf`.
# NOTE: `gsrf` is rebound to a KNN grid search in a later cell, so the model
# used here depends on which search was run most recently.
# NOTE(review): predict() yields hard labels; if the submission is scored on
# ROC AUC (the metric used for the grid search), predict_proba(...)[:, 1] may
# be what is wanted — confirm.
predictions = gsrf.best_estimator_.predict(test_add)

In [89]:
# Display the predicted labels.
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [93]:
# Wrap the prediction array in a Series so it can be concatenated with the ids.
predictions = pd.Series(predictions)

In [101]:
# The feature-selection step removed 'Id' from `test`, so on a fresh
# top-to-bottom run `test['Id']` raises a KeyError. Read the ids straight from
# the raw file instead; both Series carry the default 0..n-1 index, so they
# line up row by row.
test_ids = pd.read_csv('./assets/test.csv', usecols=['Id'])['Id']
submission = pd.concat([test_ids, predictions], axis=1)

In [102]:
# Name the two columns of the submission frame.
submission.columns = ['Id','WnvPresent']

In [103]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,Id,WnvPresent
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0


In [99]:
# Write only the 'Id' and 'WnvPresent' columns; without index=False pandas
# also writes the row index as an extra unnamed leading column.
submission.to_csv('submission.csv', index=False)

In [108]:
##Grid search over Knn parameters
# model evaluation function
def evaluate_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    a = accuracy_score(y_test, y_pred)
    probabilities = model.predict_proba(X_test)
    #cm = confusion_matrix(y_test, y_pred)
    conmat = np.array(confusion_matrix(y_test, y_pred, labels=[1,0]))

    confusion = pd.DataFrame(conmat, index=['1', '0'],
                         columns=['predicted_1','predicted_0'])

    cr = classification_report(y_test, y_pred)
    
    print 'confusion matrix:'
    print confusion
    print 'classification_report:'
    print cr
    print 'Accuracy of the model on test:',a
    return probabilities

n_neighbors =  range(1, 10)
knn = KNeighborsClassifier(weights='distance')
gsrf = GridSearchCV(estimator = knn,param_grid=dict( n_neighbors=n_neighbors),cv=3, scoring='roc_auc')
gsrf.fit(X_train, y_train)
print 'best parameters for the model:',gsrf.best_params_
print 'best score on train:',gsrf.best_score_
probability = evaluate_model(gsrf.best_estimator_)


best parameters for the model: {'n_neighbors': 8}
best score on train: 0.686687089884
confusion matrix:
   predicted_1  predicted_0
1           21          161
0           32         3253
classification_report:
             precision    recall  f1-score   support

          0       0.95      0.99      0.97      3285
          1       0.40      0.12      0.18       182

avg / total       0.92      0.94      0.93      3467

Accuracy of the model on test: 0.944332275743


In [None]:
##AdaBoosted RandomForest over Random Forest parameters
# model evaluation function
def evaluate_model(model):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    a = accuracy_score(y_test, y_pred)
    probabilities = model.predict_proba(X_test)
    #cm = confusion_matrix(y_test, y_pred)
    conmat = np.array(confusion_matrix(y_test, y_pred, labels=[1,0]))

    confusion = pd.DataFrame(conmat, index=['1', '0'],
                         columns=['predicted_1','predicted_0'])

    cr = classification_report(y_test, y_pred)
    
    print 'confusion matrix:'
    print confusion
    print 'classification_report:',cr
    print 'Accuracy of the model on test:',a
    return probabilities


# Number of boosting rounds to try for AdaBoost.
n_estimators = [50, 100]

# Base learner: a random forest with fixed hyper-parameters.
RFM = RandomForestClassifier(n_estimators=1000, max_depth=5, max_features=1.0)
ADB = AdaBoostClassifier(base_estimator=RFM, random_state=33)

gsADB = GridSearchCV(
    estimator=ADB,
    param_grid={'n_estimators': n_estimators},
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
gsADB.fit(X_train, y_train)

