In [68]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing


#setting rootdirectory path 
rootdir = '/usr/bin/MachineLearning/WestNileVirusPrediction/'

# Load Train Dataframe, Test DataFrame and Weather DataFrame 

train_dataframe = pd.read_csv(rootdir + 'train.csv')
test_dataframe = pd.read_csv(rootdir + 'test.csv')
weather_dataframe = pd.read_csv(rootdir + 'weather.csv')


In [69]:
Y_target = train_dataframe.WnvPresent.values

In [70]:
# Dropping Code Sum, saw this in multiple kaggle scripts submitted. Will see whether to use this later or not
weather_dataframe = weather_dataframe.drop('CodeSum', axis=1)

In [71]:
weather_dataframe.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,Sunset,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,1849,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,-,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,1850,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,-,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,1851,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


In [65]:
# Split station 1 and 2 and join horizontally
weather_dataframe_stn1 = weather_dataframe[weather_dataframe['Station']==1]
weather_dataframe_stn2 = weather_dataframe[weather_dataframe['Station']==2]
weather_dataframe_stn1 = weather_dataframe_stn1.drop('Station', axis=1)
weather_dataframe_stn2 = weather_dataframe_stn2.drop('Station', axis=1)
weather_dataframe = weather_dataframe_stn1.merge(weather_dataframe_stn2, on='Date')

In [66]:
# replace some missing values and T with -1
weather_dataframe = weather_dataframe.replace('M', -1)
weather_dataframe = weather_dataframe.replace('-', -1)
weather_dataframe = weather_dataframe.replace('T', -1)
weather_dataframe = weather_dataframe.replace(' T', -1)
weather_dataframe = weather_dataframe.replace('  T', -1)

In [67]:
weather_dataframe.head()

Unnamed: 0,Date,Tmax_x,Tmin_x,Tavg_x,Depart_x,DewPoint_x,WetBulb_x,Heat_x,Cool_x,Sunrise_x,...,Sunset_y,Depth_y,Water1_y,SnowFall_y,PrecipTotal_y,StnPressure_y,SeaLevel_y,ResultSpeed_y,ResultDir_y,AvgSpeed_y
0,2007-05-01,83,50,67,14,51,56,0,2,448,...,-1,-1,-1,-1,0.0,29.18,29.82,2.7,25,9.6
1,2007-05-02,59,42,51,-3,42,47,14,0,447,...,-1,-1,-1,-1,0.0,29.44,30.08,13.3,2,13.4
2,2007-05-03,66,46,56,2,40,48,9,0,446,...,-1,-1,-1,-1,0.0,29.46,30.12,12.9,6,13.2
3,2007-05-04,66,49,58,4,41,50,7,0,444,...,-1,-1,-1,-1,0.0,29.36,30.04,10.1,7,10.4
4,2007-05-05,66,53,60,5,38,49,5,0,443,...,-1,-1,-1,-1,-1.0,29.46,30.09,11.2,7,11.5


In [53]:
# Functions to extract month and day from dataset. It will be used with Pandas apply to extract Month and day from the Date column
def create_month(x):
    return x.split('-')[1]

def create_day(x):
    return x.split('-')[2]


In [54]:
#adding new column Month and Day to Train and Test Data Frame
train_dataframe['month'] = train_dataframe.Date.apply(create_month)
train_dataframe['day'] = train_dataframe.Date.apply(create_day)
test_dataframe['month'] = test_dataframe.Date.apply(create_month)
test_dataframe['day'] = test_dataframe.Date.apply(create_day)

In [55]:
# Truncating the floating point value latitude/longitude columns and making it integer
train_dataframe['Lat_int'] = train_dataframe.Latitude.apply(int)
train_dataframe['Long_int'] = train_dataframe.Longitude.apply(int)
test_dataframe['Lat_int'] = test_dataframe.Latitude.apply(int)
test_dataframe['Long_int'] = test_dataframe.Longitude.apply(int)




In [61]:
# dropping columns not needed for train_dataframeing and prediction 
train_dataframe = train_dataframe.drop(['Address', 'AddressNumberAndStreet','WnvPresent', 'NumMosquitos'], axis = 1)
test_dataframe = test_dataframe.drop(['Id', 'Address', 'AddressNumberAndStreet'], axis = 1)

ValueError: labels ['Address' 'AddressNumberAndStreet' 'WnvPresent' 'NumMosquitos'] not contained in axis

In [57]:
# Join on Date column with weather data
train_dataframe = train_dataframe.merge(weather_dataframe, on='Date')
test_dataframe = test_dataframe.merge(weather_dataframe, on='Date')
train_dataframe = train_dataframe.drop(['Date'], axis = 1)
test_dataframe = test_dataframe.drop(['Date'], axis = 1)

In [58]:
# Convert categorical data to numbers
lbl = preprocessing.LabelEncoder()
lbl.fit(list(train_dataframe['Species'].values) + list(test_dataframe['Species'].values))
train_dataframe['Species'] = lbl.transform(train_dataframe['Species'].values)
test_dataframe['Species'] = lbl.transform(test_dataframe['Species'].values)

lbl.fit(list(train_dataframe['Street'].values) + list(test_dataframe['Street'].values))
train_dataframe['Street'] = lbl.transform(train_dataframe['Street'].values)
test_dataframe['Street'] = lbl.transform(test_dataframe['Street'].values)

lbl.fit(list(train_dataframe['Trap'].values) + list(test_dataframe['Trap'].values))
train_dataframe['Trap'] = lbl.transform(train_dataframe['Trap'].values)
test_dataframe['Trap'] = lbl.transform(test_dataframe['Trap'].values)

# drop columns with -1s
train_dataframe = train_dataframe.ix[:,(train_dataframe != -1).any(axis=0)]
test_dataframe = test_dataframe.ix[:,(test_dataframe != -1).any(axis=0)]

In [59]:
train_dataframe.dtypes

Species              int64
Block                int64
Street               int64
Trap                 int64
Latitude           float64
Longitude          float64
AddressAccuracy      int64
month               object
day                 object
Lat_int              int64
Long_int             int64
Tmax_x               int64
Tmin_x               int64
Tavg_x              object
Depart_x            object
DewPoint_x           int64
WetBulb_x           object
Heat_x              object
Cool_x              object
Sunrise_x           object
Sunset_x            object
Depth_x             object
SnowFall_x          object
PrecipTotal_x       object
StnPressure_x       object
SeaLevel_x          object
ResultSpeed_x      float64
ResultDir_x          int64
AvgSpeed_x          object
Tmax_y               int64
Tmin_y               int64
Tavg_y              object
DewPoint_y           int64
WetBulb_y           object
Heat_y              object
Cool_y              object
PrecipTotal_y       object
S

In [60]:
# Random Forest Classifier 
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=1000, min_samples_split=1)
clf.fit(train_dataframe, Y_target)

# create predictions and submission file
predictions = clf.predict_proba(test)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv(rootdir+'beat_the_benchmark.csv', index=False)

ValueError: invalid literal for float(): 2100  N CANNON DR, Chicago, IL

In [25]:
train_array = train_dataframe.values
test_array = test_dataframe.values

In [26]:
#Random Forest Regressor implementation for prediction
#Splitting Training result set into 2 for cross validation
from sklearn.cross_validation import *
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

tuned_parameters = {'n_estimators': [25, 45, 60], 'max_depth': [None, 1, 2, 3], 'min_samples_split': [1, 2, 3] ,'max_features' : ['auto','sqrt','log2']}
X_train, X_test, y_train, y_test = train_test_split(train_array,Y_target, test_size=0.20, random_state=0)

clf = GridSearchCV(ensemble.RandomForestClassifier(n_jobs=-1), tuned_parameters, cv=10, scoring='roc_curve')
clf.fit(X_train, y_train)
print(clf.best_estimator_)

ValueError: Found arrays with inconsistent numbers of samples: [10506 21012]