In [4]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing


# Directory that holds the input CSV files.
# NOTE(review): hard-coded absolute path -- confirm it exists on the machine running this notebook.
rootdir = '/usr/bin/MachineLearning/WestNileVirusPrediction/'

# Read the train, test and weather tables (in that order) from rootdir.
train_dataframe, test_dataframe, weather_dataframe = (
    pd.read_csv(rootdir + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'weather.csv')
)


In [5]:
# Capture the WnvPresent column as an array of targets; the column itself is
# removed from the feature frame further down.
Y_target = train_dataframe['WnvPresent'].values

In [6]:
# CodeSum is discarded here, following several published Kaggle scripts.
# TODO: revisit whether CodeSum is worth keeping as a feature.
weather_dataframe = weather_dataframe.drop(labels=['CodeSum'], axis=1)

In [7]:
# Give each weather station its own set of columns: select the rows of each
# station, discard the now-constant Station column, then join the two halves
# side by side on Date (overlapping column names receive pandas' default
# _x / _y suffixes).
weather_dataframe_stn1 = weather_dataframe.loc[weather_dataframe['Station'] == 1].drop('Station', axis=1)
weather_dataframe_stn2 = weather_dataframe.loc[weather_dataframe['Station'] == 2].drop('Station', axis=1)
weather_dataframe = pd.merge(weather_dataframe_stn1, weather_dataframe_stn2, on='Date')

In [8]:
# Swap the textual placeholders found in the weather data for the numeric
# sentinel -1, one placeholder at a time.
# NOTE(review): presumably 'M' / '-' mark missing readings and the 'T'
# variants mark trace amounts -- confirm against the dataset description.
for placeholder in ('M', '-', 'T', ' T', '  T'):
    weather_dataframe = weather_dataframe.replace(placeholder, -1)

In [9]:
# Helper functions that extract the month and day fields from a '-'-separated date string. They are used with pandas `apply` on the Date column.
def create_month(x):
    """Return the month field of a '-'-separated date string.

    The month is taken to be the second field (index 1) and is returned
    unchanged, i.e. as a string with no numeric conversion.
    """
    fields = x.split('-')
    return fields[1]

def create_day(x):
    """Return the day field of a '-'-separated date string.

    The day is taken to be the third field (index 2) and is returned
    unchanged, i.e. as a string with no numeric conversion.
    """
    fields = x.split('-')
    return fields[2]


In [10]:
# Derive 'month' and 'day' columns from Date, first for the train frame and
# then for the test frame.
for frame in (train_dataframe, test_dataframe):
    frame['month'] = frame['Date'].apply(create_month)
    frame['day'] = frame['Date'].apply(create_day)

In [13]:
# Add integer versions of the coordinates. int() discards the fractional
# part (truncation toward zero), it does not round.
for frame in (train_dataframe, test_dataframe):
    for source_column, integer_column in (('Latitude', 'Lat_int'), ('Longitude', 'Long_int')):
        frame[integer_column] = frame[source_column].apply(int)




In [14]:
# Remove the columns that are not used for training and prediction.
# WnvPresent has already been captured separately as the target.
train_dataframe = train_dataframe.drop(
    labels=['Address', 'AddressNumberAndStreet', 'WnvPresent', 'NumMosquitos'],
    axis=1,
)
test_dataframe = test_dataframe.drop(
    labels=['Id', 'Address', 'AddressNumberAndStreet'],
    axis=1,
)

In [15]:
# Attach the per-date weather columns to every train/test row, then drop
# Date, which is only needed as the join key.
#
# The weather table built in the earlier cells is `weather_dataframe`; no
# variable called `weather` is defined in this notebook, so referring to it
# fails with a NameError on a fresh kernel.
#
# how='left' keeps every row of the left frame, in its original order, even
# if a date has no weather record. That keeps train_dataframe row-aligned
# with Y_target (extracted before this join) and keeps all test rows.
train_dataframe = train_dataframe.merge(weather_dataframe, on='Date', how='left')
test_dataframe = test_dataframe.merge(weather_dataframe, on='Date', how='left')
train_dataframe = train_dataframe.drop(['Date'], axis=1)
test_dataframe = test_dataframe.drop(['Date'], axis=1)

In [16]:
# Convert the categorical columns to integer codes. For each column the
# encoder is fitted on the combined train + test values so that both frames
# share a single label -> integer mapping. The same encoder object is re-fitted
# per column, so after the loop `lbl` holds the mapping of the last column.
lbl = preprocessing.LabelEncoder()
for categorical_column in ('Species', 'Street', 'Trap'):
    lbl.fit(
        list(train_dataframe[categorical_column].values)
        + list(test_dataframe[categorical_column].values)
    )
    train_dataframe[categorical_column] = lbl.transform(train_dataframe[categorical_column].values)
    test_dataframe[categorical_column] = lbl.transform(test_dataframe[categorical_column].values)

# Keep only the columns that contain at least one value other than -1, i.e.
# drop columns made up entirely of the -1 placeholder. Columns that merely
# contain some -1 values are kept.
# `.loc` with a boolean column mask is used because `.ix` no longer exists in
# pandas >= 1.0.
# NOTE(review): the mask is computed independently per frame, so train and
# test could end up with different column sets -- confirm they match.
train_dataframe = train_dataframe.loc[:, (train_dataframe != -1).any(axis=0)]
test_dataframe = test_dataframe.loc[:, (test_dataframe != -1).any(axis=0)]

In [25]:
# Hand the model plain NumPy arrays rather than DataFrames.
train_array, test_array = train_dataframe.values, test_dataframe.values

In [None]:
# Random forest *classifier* tuned with an exhaustive grid search.
# 80% of the labelled data is used for the 10-fold cross-validated search;
# the remaining 20% is held out in X_test / y_test.
#
# sklearn.cross_validation and sklearn.grid_search were removed in
# scikit-learn 0.20; sklearn.model_selection provides both names.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVR  # not used in this cell; kept in case later cells rely on it

tuned_parameters = {
    'n_estimators': [25, 45, 60],
    'max_depth': [None, 1, 2, 3],
    # An integer min_samples_split must be >= 2; a value of 1 is rejected.
    'min_samples_split': [2, 3],
    # 'auto' meant the same as 'sqrt' for classifiers and has been removed
    # from scikit-learn, so only the two distinct options are searched.
    'max_features': ['sqrt', 'log2'],
}
X_train, X_test, y_train, y_test = train_test_split(
    train_array, Y_target, test_size=0.20, random_state=0
)

# 'r2' is a regression metric and is not meaningful for class predictions;
# ROC AUC is used instead.
# NOTE(review): assumes WnvPresent is a binary 0/1 label -- confirm.
# random_state makes the forest, and therefore the selected parameters,
# reproducible between runs.
clf = GridSearchCV(
    ensemble.RandomForestClassifier(n_jobs=-1, random_state=0),
    tuned_parameters,
    cv=10,
    scoring='roc_auc',
)
clf.fit(X_train, y_train)
print(clf.best_estimator_)