### Starting clean. This is based on:
https://www.kaggle.com/abhishek/predict-west-nile-virus/vote-me-up

In [23]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import cPickle

# Notebook display setup: request high-resolution ("retina") inline figures
# and render matplotlib output inside the notebook.
%config InlineBackend.figure_format = 'retina'
%matplotlib inline

# plot styles: use the white-grid look for both matplotlib and seaborn
plt.style.use('seaborn-whitegrid')
sns.set_style('whitegrid')

In [24]:
# Read every input CSV from the assets folder in a single pass; the unpacking
# order on the left matches the order of the paths below.
spray, weather, train, test, sample = map(pd.read_csv, [
    '../assets/spray.csv',
    '../assets/weather.csv',
    '../assets/train.csv',
    '../assets/test.csv',
    '../assets/sampleSubmission.csv',
])

## Set y now:

In [25]:
# Keep the target labels as a plain array before `train` is reshaped below.
y = train['WnvPresent'].values

## make edits to train/test in parallel

In [26]:
from geopy.distance import vincenty

# calculate the distance between a test trap and a spray for a given date
#test = train[9100:9110]

def calc_dist_to_spray(df, no_spray_dist=25):
    """Return the distance in miles from a trap to the nearest same-day spray.

    Parameters
    ----------
    df : row-like object
        One trap record exposing ``Date`` (a date string), ``Latitude`` and
        ``Longitude``; suitable for ``DataFrame.apply(..., axis=1)``.
    no_spray_dist : number, optional
        Sentinel returned when no spray is recorded for the record's date
        (default 25, the value previously hard-coded).

    Returns
    -------
    number
        The smallest trap-to-spray distance in miles among the rows of the
        module-level ``spray`` frame whose ``Date`` equals the record's date,
        or ``no_spray_dist`` when there is no such row.
    """
    date = df.Date

    # only need to look at 2011 and 2013; other years skip the spray lookup
    if ('2011' not in date) and ('2013' not in date):
        return no_spray_dist

    # sprays recorded on the same date as this trap record
    same_day_sprays = spray[spray.Date == date]
    if len(same_day_sprays) == 0:
        return no_spray_dist

    # location of the trap site
    trap_location = (df.Latitude, df.Longitude)

    # smallest distance between the trap and any of that day's spray sites
    return min(
        vincenty(trap_location, spray_location).miles
        for spray_location in zip(same_day_sprays.Latitude,
                                  same_day_sprays.Longitude)
    )

In [27]:
# train['dist_to_spray'] = train.apply(calc_dist_to_spray,axis=1)
# test['dist_to_spray'] = 25  # no sprays occur in this date range

### clean the weather data

In [28]:
# CodeSum is not used for this benchmark
weather = weather.drop('CodeSum', axis=1)

# One frame per weather station (without the Station column), joined side by
# side on Date; the overlapping column names get the _x (station 1) and
# _y (station 2) suffixes.
weather_stn1 = weather.loc[weather['Station'] == 1].drop('Station', axis=1)
weather_stn2 = weather.loc[weather['Station'] == 2].drop('Station', axis=1)
weather = pd.merge(weather_stn1, weather_stn2, on='Date')

# Cells equal to one of these placeholder strings ('M', '-', and the 'T'
# variants with leading spaces) become -1
weather = weather.replace(['M', '-', 'T', ' T', '  T'], -1)

In [29]:
# Preview the first two rows of the station-joined weather frame.
weather.head(2)

Unnamed: 0,Date,Tmax_x,Tmin_x,Tavg_x,Depart_x,DewPoint_x,WetBulb_x,Heat_x,Cool_x,Sunrise_x,...,Sunset_y,Depth_y,Water1_y,SnowFall_y,PrecipTotal_y,StnPressure_y,SeaLevel_y,ResultSpeed_y,ResultDir_y,AvgSpeed_y
0,2007-05-01,83,50,67,14,51,56,0,2,448,...,-1,-1,-1,-1,0.0,29.18,29.82,2.7,25,9.6
1,2007-05-02,59,42,51,-3,42,47,14,0,447,...,-1,-1,-1,-1,0.0,29.44,30.08,13.3,2,13.4


### Add month, day and year columns to train/test

In [30]:
def create_month(x):
    return x.split('-')[1]

def create_day(x):
    return x.split('-')[2]

def create_month(x):
    return x.split('-')[0]

train['month'] = train.Date.apply(create_month)
train['day'] = train.Date.apply(create_day)
train['year']  = train.Date.apply(create_month)
test['month'] = test.Date.apply(create_month)
test['day'] = test.Date.apply(create_day)
test['year']  = test.Date.apply(create_month)

print train.head(1),'\n================================================\n'
print test.head(1)

         Date                                            Address  \
0  2007-05-29  4100 North Oak Park Avenue, Chicago, IL 60634,...   

                  Species  Block           Street  Trap  \
0  CULEX PIPIENS/RESTUANS     41   N OAK PARK AVE  T002   

              AddressNumberAndStreet  Latitude  Longitude  AddressAccuracy  \
0  4100  N OAK PARK AVE, Chicago, IL  41.95469 -87.800991                9   

   NumMosquitos  WnvPresent month day  year  
0             1           0  2007  29  2007   

   Id        Date                                            Address  \
0   1  2008-06-11  4100 North Oak Park Avenue, Chicago, IL 60634,...   

                  Species  Block           Street  Trap  \
0  CULEX PIPIENS/RESTUANS     41   N OAK PARK AVE  T002   

              AddressNumberAndStreet  Latitude  Longitude  AddressAccuracy  \
0  4100  N OAK PARK AVE, Chicago, IL  41.95469 -87.800991                9   

  month day  year  
0  2008  11  2008  


In [31]:
# Add integer latitude/longitude columns
train['Lat_int'] = train.Latitude.apply(int)
train['Long_int'] = train.Longitude.apply(int)
test['Lat_int'] = test.Latitude.apply(int)
test['Long_int'] = test.Longitude.apply(int)

print train.head(1),'\n================================================\n'
print test.head(1)

         Date                                            Address  \
0  2007-05-29  4100 North Oak Park Avenue, Chicago, IL 60634,...   

                  Species  Block           Street  Trap  \
0  CULEX PIPIENS/RESTUANS     41   N OAK PARK AVE  T002   

              AddressNumberAndStreet  Latitude  Longitude  AddressAccuracy  \
0  4100  N OAK PARK AVE, Chicago, IL  41.95469 -87.800991                9   

   NumMosquitos  WnvPresent month day  year  Lat_int  Long_int  
0             1           0  2007  29  2007       41       -87   

   Id        Date                                            Address  \
0   1  2008-06-11  4100 North Oak Park Avenue, Chicago, IL 60634,...   

                  Species  Block           Street  Trap  \
0  CULEX PIPIENS/RESTUANS     41   N OAK PARK AVE  T002   

              AddressNumberAndStreet  Latitude  Longitude  AddressAccuracy  \
0  4100  N OAK PARK AVE, Chicago, IL  41.95469 -87.800991                9   

  month day  year  Lat_int  Long_i

In [32]:
# Remove the columns that are not used as features: the free-text address
# fields from both frames, the target and the mosquito count from train, and
# the row Id from test.
train = train.drop(labels=['Address', 'AddressNumberAndStreet', 'WnvPresent', 'NumMosquitos'], axis=1)
test = test.drop(labels=['Id', 'Address', 'AddressNumberAndStreet'], axis=1)

### Merge with weather data

In [33]:
# Merge with weather data
train = train.merge(weather, on='Date')
test = test.merge(weather, on='Date')
train = train.drop(['Date'], axis = 1)
test = test.drop(['Date'], axis = 1)
print train.head(1),'\n================================================\n'
print test.head(1)

                  Species  Block           Street  Trap  Latitude  Longitude  \
0  CULEX PIPIENS/RESTUANS     41   N OAK PARK AVE  T002  41.95469 -87.800991   

   AddressAccuracy month day  year    ...      Sunset_y  Depth_y  Water1_y  \
0                9  2007  29  2007    ...            -1       -1        -1   

   SnowFall_y PrecipTotal_y StnPressure_y  SeaLevel_y ResultSpeed_y  \
0          -1          0.00         29.44       30.09           5.8   

  ResultDir_y AvgSpeed_y  
0          16        7.4  

[1 rows x 50 columns] 

                  Species  Block           Street  Trap  Latitude  Longitude  \
0  CULEX PIPIENS/RESTUANS     41   N OAK PARK AVE  T002  41.95469 -87.800991   

   AddressAccuracy month day  year    ...      Sunset_y  Depth_y  Water1_y  \
0                9  2008  11  2008    ...            -1       -1        -1   

   SnowFall_y PrecipTotal_y StnPressure_y  SeaLevel_y ResultSpeed_y  \
0          -1          0.00         29.34       29.97           9.4   


In [34]:
# Convert categorical data to numbers.
# The encoder is refit per column on the values of both frames together, so a
# category receives the same integer code in train and in test.
lbl = preprocessing.LabelEncoder()
for col in ('Species', 'Street', 'Trap'):
    lbl.fit(train[col].tolist() + test[col].tolist())
    train[col] = lbl.transform(train[col].values)
    test[col] = lbl.transform(test[col].values)


In [35]:
# Show the first row of each frame to check the encoded columns.
print train.head(1),'\n================================================\n'
print test.head(1)

   Species  Block  Street  Trap  Latitude  Longitude  AddressAccuracy month  \
0        2     41      36     1  41.95469 -87.800991                9  2007   

  day  year    ...      Sunset_y  Depth_y  Water1_y  SnowFall_y PrecipTotal_y  \
0  29  2007    ...            -1       -1        -1          -1          0.00   

  StnPressure_y  SeaLevel_y ResultSpeed_y ResultDir_y AvgSpeed_y  
0         29.44       30.09           5.8          16        7.4  

[1 rows x 50 columns] 

   Species  Block  Street  Trap  Latitude  Longitude  AddressAccuracy month  \
0        2     41      36     1  41.95469 -87.800991                9  2008   

  day  year    ...      Sunset_y  Depth_y  Water1_y  SnowFall_y PrecipTotal_y  \
0  11  2008    ...            -1       -1        -1          -1          0.00   

  StnPressure_y  SeaLevel_y ResultSpeed_y ResultDir_y AvgSpeed_y  
0         29.34       29.97           9.4          18       10.4  

[1 rows x 50 columns]


In [36]:
# drop columns with -1s
train = train.ix[:,(train != -1).any(axis=0)]
test = test.ix[:,(test != -1).any(axis=0)]

print train.columns
print test.columns


Index([u'Species', u'Block', u'Street', u'Trap', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'month', u'day', u'year', u'Lat_int', u'Long_int',
       u'Tmax_x', u'Tmin_x', u'Tavg_x', u'Depart_x', u'DewPoint_x',
       u'WetBulb_x', u'Heat_x', u'Cool_x', u'Sunrise_x', u'Sunset_x',
       u'Depth_x', u'SnowFall_x', u'PrecipTotal_x', u'StnPressure_x',
       u'SeaLevel_x', u'ResultSpeed_x', u'ResultDir_x', u'AvgSpeed_x',
       u'Tmax_y', u'Tmin_y', u'Tavg_y', u'DewPoint_y', u'WetBulb_y', u'Heat_y',
       u'Cool_y', u'PrecipTotal_y', u'StnPressure_y', u'SeaLevel_y',
       u'ResultSpeed_y', u'ResultDir_y', u'AvgSpeed_y'],
      dtype='object')
Index([u'Species', u'Block', u'Street', u'Trap', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'month', u'day', u'year', u'Lat_int', u'Long_int',
       u'Tmax_x', u'Tmin_x', u'Tavg_x', u'Depart_x', u'DewPoint_x',
       u'WetBulb_x', u'Heat_x', u'Cool_x', u'Sunrise_x', u'Sunset_x',
       u'Depth_x', u'SnowFall_x', u'PrecipTot

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV,RandomizedSearchCV

rfc = RandomForestClassifier()

rf_params = {
    'max_features':[None,'log2','sqrt', 2,3,4,5],
    'max_depth':[1,2,3,None],
    'min_samples_leaf':np.linspace(1,101,10),
    'n_estimators':[100,1000]
}

## gridsearch parameters, and cv =5
rf_gs = RandomizedSearchCV(rfc, rf_params, cv=5, verbose=1, n_jobs=-1,n_iter=100).fit(train,y)
print "best score", rf_gs.best_score_
print "best params", rf_gs.best_params_
# 0.67215 without spray distance
# 0.65207 with spray distance and year
# 0.63172 with out spray, distance, with year

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   56.3s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 13.5min finished


best score 0.947553778793
best params {'n_estimators': 100, 'max_features': 2, 'max_depth': 2, 'min_samples_leaf': 34.333333333333329}


In [38]:
# Score the test rows with the best estimator found by the search; the second
# probability column (index 1) is written out as the WnvPresent prediction.
class_probabilities = rf_gs.predict_proba(test)
predictions = class_probabilities[:, 1]
sample['WnvPresent'] = predictions
sample.to_csv('randomforest_rscv.csv', index=False)

In [39]:
# Random Forest Classifier
# min_samples_split=2 is the smallest valid setting (a node needs two samples
# to be split); the former value of 1 is rejected by scikit-learn >= 0.18.
# random_state makes the fitted forest, and the submission file, repeatable.
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=1000,
                                      min_samples_split=2, random_state=42)
clf.fit(train, y)

# create predictions and submission file
predictions = clf.predict_proba(test)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv('beat_the_benchmark.csv', index=False)
# 0.66611 with basic rfc and spray distance and year
# 0.67089 with out spray, distance, with year

In [40]:
# from sklearn.cross_validation import cross_val_score,train_test_split
# from sklearn.svm import SVC

# svm = SVC(C=1,probability=True).fit(train,y)

# # create predictions and submission file
# predictions = svm.predict_proba(test)[:,1]
# sample['WnvPresent'] = predictions
# sample.to_csv('svm.csv', index=False)
# # 0.52176 without spray distance
# # 0.51914 with spray distance and year

In [41]:

# tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
#                      'C': [1, 10, 100, 1000]},
#                     {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

# svm_gs = GridSearchCV(SVC(C=1,probability=True), tuned_parameters, cv=5,verbose=1,n_jobs=-1)

# svm_gs.fit(train,y)

# # create predictions and submission file
# predictions = svm_gs.predict_proba(test)[:,1]
# sample['WnvPresent'] = predictions
# sample.to_csv('svm_gs.csv', index=False)
# [Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  5.0min
# [Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 378.9min finished
# # 0.62295  Too slow!

## Also try again including the spray distance info