In [31]:

import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing

#setting rootdirectory path 
rootdir = '/usr/bin/MachineLearning/WestNileVirusPrediction/'

# Load Train Dataframe, Test DataFrame and Weather DataFrame 

train = pd.read_csv(rootdir + 'train.csv')
test = pd.read_csv(rootdir + 'test.csv')
weather = pd.read_csv(rootdir + 'weather.csv')
sample = pd.read_csv(rootdir+'sampleSubmission.csv')


# Get Target Value into Labels Array
labels = train.WnvPresent.values

# Not using codesum for this run
weather = weather.drop('CodeSum', axis=1)

# Split station 1 and 2 and join horizontally
weather_stn1 = weather[weather['Station']==1]
weather_stn2 = weather[weather['Station']==2]
weather_stn1 = weather_stn1.drop('Station', axis=1)
weather_stn2 = weather_stn2.drop('Station', axis=1)
weather = weather_stn1.merge(weather_stn2, on='Date')

# replace some missing values and T with -1
weather = weather.replace('M', -1)
weather = weather.replace('-', -1)
weather = weather.replace('T', -1)
weather = weather.replace(' T', -1)
weather = weather.replace('  T', -1)

# Functions to extract month and day from dataset
# You can also use parse_dates of Pandas.
def create_month(x):
    return x.split('-')[1]

def create_day(x):
    return x.split('-')[2]

train['month'] = train.Date.apply(create_month)
train['day'] = train.Date.apply(create_day)
test['month'] = test.Date.apply(create_month)
test['day'] = test.Date.apply(create_day)

# Add integer latitude/longitude columns
train['Lat_int'] = train.Latitude.apply(int)
train['Long_int'] = train.Longitude.apply(int)
test['Lat_int'] = test.Latitude.apply(int)
test['Long_int'] = test.Longitude.apply(int)

# drop address columns
train = train.drop(['Address', 'AddressNumberAndStreet','WnvPresent', 'NumMosquitos'], axis = 1)
test = test.drop(['Id', 'Address', 'AddressNumberAndStreet'], axis = 1)

# Merge with weather data
train = train.merge(weather, on='Date')
test = test.merge(weather, on='Date')
train = train.drop(['Date'], axis = 1)
test = test.drop(['Date'], axis = 1)



In [32]:
#test_dataframe.ix[test_dataframe["Species"]=='UNSPECIFIED CULEX'].Species = 'CULEX ERRATICUS'
test.loc[test.Species=='UNSPECIFIED CULEX','Species'] = 'CULEX ERRATICUS'



In [33]:
# Creating dummy variables for Species in Train and Test DataFrame
train_Species_dummies = pd.get_dummies(train['Species'], prefix='Species')
train_Species_dummies = train_Species_dummies.drop('Species_CULEX ERRATICUS',axis = 1)
train = train.drop('Species',1)
train = train.join(train_Species_dummies.ix[:,:])

test_Species_dummies = pd.get_dummies(test['Species'], prefix='Species')
test_Species_dummies = test_Species_dummies.drop('Species_CULEX ERRATICUS',axis = 1)
test= test.drop('Species',1)
test = test.join(test_Species_dummies.ix[:,:])

#Creating dummy variables for Block in Train and Test DataFrame
train_Block_dummies = pd.get_dummies(train['Block'], prefix='Block')
train_Block_dummies = train_Block_dummies.drop('Block_10',axis = 1)
train = train.drop('Block',1)
train = train.join(train_Block_dummies.ix[:,:])

test_Block_dummies = pd.get_dummies(test['Block'], prefix='Block')
test_Block_dummies = test_Block_dummies.drop('Block_10',axis = 1)
test= test.drop('Block',1)
test = test.join(test_Block_dummies.ix[:,:])


#Creating dummy variables for Street in Train and Test DataFrame
train_Street_dummies = pd.get_dummies(train['Street'], prefix='Street')
train_Street_dummies = train_Street_dummies.drop(train_Street_dummies.columns[0],axis = 1)
train = train.drop('Street',1)
train = train.join(train_Street_dummies.ix[:,:])

test_Street_dummies = pd.get_dummies(test['Street'], prefix='Street')
test_Street_dummies = test_Street_dummies.drop(test_Street_dummies.columns[0],axis = 1)
test = test.drop('Street',1)
test = test.join(test_Street_dummies.ix[:,:])

#Creating dummy variables for Trap in Train and Test DataFrame
train_Trap_dummies = pd.get_dummies(train['Trap'], prefix='Trap')
train_Trap_dummies = train_Trap_dummies.drop(train_Trap_dummies.columns[0],axis = 1)
train = train.drop('Trap',1)
train = train.join(train_Trap_dummies.ix[:,:])

test_Trap_dummies = pd.get_dummies(test['Trap'], prefix='Trap')
test_Trap_dummies = test_Trap_dummies.drop(test_Trap_dummies.columns[0],axis = 1)
test = test.drop('Trap',1)
test = test.join(test_Trap_dummies.ix[:,:])

In [40]:
# Finding out Missing columns in Train and Test Dataframe after Dummy Variable creation. 
# Without this step there will be a difference in columns between Train and Test DataFrame.
# Machine Algorithm will throw an error without this step

train_column_names_set = set(list(train.columns.values))
test_column_names_set = set(list(test.columns.values))

train_columns_notin_test = train_column_names_set - test_column_names_set
test_columns_notin_train = test_column_names_set - train_column_names_set

In [44]:
train_columns_notin_test

set()

In [None]:
test_columns_notin_train

In [43]:
# Dropping columns in Test that are not in Train Data Frame
test = test.drop(['Block_26', 'Street_ E 136TH ST', 'Street_ N KENNETH AVE', 'Street_ N KILBOURN AVE', 'Street_ N MELVINA AVE', 'Street_ S BALTIMORE AVE', 'Street_ S LOOMIS ST', 'Street_ S OGLESBY AVE', 'Street_ W 112TH ST', 'Street_ W 120TH ST', 'Street_ W 63RD PL', 'Street_ W DAKIN ST', 'Trap_T002A', 'Trap_T002B', 'Trap_T065A', 'Trap_T090A', 'Trap_T090B','Trap_T090C', 'Trap_T128A', 'Trap_T200A', 'Trap_T200B', 'Trap_T218A', 'Trap_T218B', 'Trap_T218C', 'Trap_T234'], axis = 1)

In [46]:
# drop columns with -1s
train = train.ix[:,(train != -1).any(axis=0)]
test = test.ix[:,(test != -1).any(axis=0)]

# Random Forest Classifier 
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=1000)
clf.fit(train, labels)

# create predictions and submission file
# currently this is one giving best results with the score of 0.71560
#
predictions = clf.predict_proba(test)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv(rootdir+'beat_the_benchmark.csv', index=False)

In [47]:
train_array = train.values
test_array = test.values

In [48]:
#Random Forest Classifier implementation for prediction
#Splitting Training result set into 2 for cross validation
from sklearn.cross_validation import *
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

tuned_parameters = {'n_estimators': [25, 45, 60], 'max_depth': [None, 1, 2, 3], 'min_samples_split': [1, 2, 3] ,'max_features' : ['auto','sqrt','log2']}
X_train, X_test, y_train, y_test = train_test_split(train_array,labels, test_size=0.20, random_state=0)

clf = GridSearchCV(ensemble.RandomForestClassifier(n_jobs=-1), tuned_parameters, cv=10, scoring='roc_auc')
clf.fit(X_train, y_train)
print(clf.best_estimator_)
print(clf.best_score_)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=45, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.797668627018


In [49]:
# create predictions and submission file
predictions = clf.predict_proba(test)[:,1]
sample['WnvPresent'] = predictions
sample.to_csv(rootdir+'beat_the_benchmark.csv', index=False)