In [266]:
# Import Needed Packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [267]:
# load in the csvs for the analysis

WNV = pd.read_csv('Datasets/west_nile/input/train.csv') 
# spray = pd.read_csv('Datasets/west_nile/input/spray.csv')
weather = pd.read_csv('Datasets/west_nile/input/weather.csv')

In [268]:
# Look at present columns

print WNV.columns
print weather.columns

Index([u'Date', u'Address', u'Species', u'Block', u'Street', u'Trap',
       u'AddressNumberAndStreet', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'NumMosquitos', u'WnvPresent'],
      dtype='object')
Index([u'Station', u'Date', u'Tmax', u'Tmin', u'Tavg', u'Depart', u'DewPoint',
       u'WetBulb', u'Heat', u'Cool', u'Sunrise', u'Sunset', u'CodeSum',
       u'Depth', u'Water1', u'SnowFall', u'PrecipTotal', u'StnPressure',
       u'SeaLevel', u'ResultSpeed', u'ResultDir', u'AvgSpeed'],
      dtype='object')


In [269]:
# Convert to datetime for merging
WNV.Date = pd.to_datetime(WNV['Date'])
weather.Date = pd.to_datetime(weather['Date'])

In [270]:
# Create Mask to Pick Appropriate Columns for Analysis
wnv_mask = ['Date', 'Address', 'Block', 'Street', 'Latitude', 'Longitude', 'WnvPresent']
weather_mask = ['Date', 'DewPoint', 'WetBulb', 'PrecipTotal', 'SeaLevel', 'Tmax', 'StnPressure']

In [271]:
wnv_merge = pd.merge(WNV[wnv_mask], weather[weather_mask], how='left', on='Date')

In [272]:
# Drop duplicates that remain as artefacts of the inital merge
wnv_ = wnv_merge.drop_duplicates()

In [274]:
# See if there are any null values (not counting 'm' or 't' values)
wnv_merge.isnull().sum()

Date           0
Address        0
Block          0
Street         0
Latitude       0
Longitude      0
WnvPresent     0
DewPoint       0
WetBulb        0
PrecipTotal    0
SeaLevel       0
Tmax           0
StnPressure    0
dtype: int64

In [275]:

wnv_.to_csv('WNV.csv')

In [276]:
WNV = pd.read_csv('WNV.csv')

In [277]:
WNV.dtypes

Unnamed: 0       int64
Date            object
Address         object
Block            int64
Street          object
Latitude       float64
Longitude      float64
WnvPresent       int64
DewPoint         int64
WetBulb         object
PrecipTotal     object
SeaLevel       float64
Tmax             int64
StnPressure     object
dtype: object

In [278]:
WNV['WetBulb'] = pd.to_numeric(WNV['WetBulb'], errors='coerce')

In [279]:
WNV.WetBulb.unique()

array([ 65.,  66.,  51.,  52.,  72.,  59.,  60.,  58.,  71.,  67.,  70.,
        69.,  76.,  73.,  74.,  68.,  61.,  62.,  49.,  50.,  53.,  54.,
        46.,  48.,  55.,  nan,  63.,  57.,  56.,  64.,  75.,  47.])

In [280]:
def trace(df, x):
    '''Replaces the Trace and Missing tags from the data'''
    df[x] = df[x].apply(lambda x: x == np.NaN if x == 'M' else x)
    df[x] = df[x].apply(lambda x: x == 0.01 if x in ['T', '  T'] else x)
    df[x] = pd.to_numeric(df[x])
    return df

In [281]:
WNV.to_csv('WNV_Not_Imputed.csv')

In [282]:
x= ['PrecipTotal', 'StnPressure']
for item in x:
    trace(WNV, item)
    
WNV = WNV.fillna(value=WNV.mean())

In [283]:
# Above loop converted wetbulb to a 
WNV.WetBulb = WNV.WetBulb.apply(lambda x: x == np.NaN if x == 'M' else x)
WNV.WetBulb = pd.to_numeric(WNV.WetBulb)

In [284]:
WNV.to_csv('WNV_Imputed.csv')

In [285]:
WNV.dtypes

Unnamed: 0       int64
Date            object
Address         object
Block            int64
Street          object
Latitude       float64
Longitude      float64
WnvPresent       int64
DewPoint         int64
WetBulb        float64
PrecipTotal    float64
SeaLevel       float64
Tmax             int64
StnPressure    float64
dtype: object

In [253]:
WNV.dropna(axis=0, how='any')

Unnamed: 0.1,Unnamed: 0,Date,Address,Block,Street,Latitude,Longitude,WnvPresent,DewPoint,WetBulb,PrecipTotal,SeaLevel,Tmax,StnPressure
0,0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",41,N OAK PARK AVE,41.954690,-87.800991,0,58,65.0,0.0,30.11,88,29.39
1,1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",41,N OAK PARK AVE,41.954690,-87.800991,0,59,66.0,0.0,30.09,88,29.44
2,4,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",62,N MANDELL AVE,41.994991,-87.769279,0,58,65.0,0.0,30.11,88,29.39
3,5,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",62,N MANDELL AVE,41.994991,-87.769279,0,59,66.0,0.0,30.09,88,29.44
4,6,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",79,W FOSTER AVE,41.974089,-87.824812,0,58,65.0,0.0,30.11,88,29.39
5,7,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",79,W FOSTER AVE,41.974089,-87.824812,0,59,66.0,0.0,30.09,88,29.44
6,10,2007-05-29,"1500 West Webster Avenue, Chicago, IL 60614, USA",15,W WEBSTER AVE,41.921600,-87.666455,0,58,65.0,0.0,30.11,88,29.39
7,11,2007-05-29,"1500 West Webster Avenue, Chicago, IL 60614, USA",15,W WEBSTER AVE,41.921600,-87.666455,0,59,66.0,0.0,30.09,88,29.44
8,12,2007-05-29,"2500 West Grand Avenue, Chicago, IL 60654, USA",25,W GRAND AVE,41.891118,-87.654491,0,58,65.0,0.0,30.11,88,29.39
9,13,2007-05-29,"2500 West Grand Avenue, Chicago, IL 60654, USA",25,W GRAND AVE,41.891118,-87.654491,0,59,66.0,0.0,30.09,88,29.44


In [242]:
mask = ['WnvPresent', 'DewPoint', 'WetBulb', 'PrecipTotal', 'SeaLevel']

WNV[mask] = WNV[mask].fillna(WNV.mean())

In [243]:
WNV.WnvPresent = WNV.WnvPresent.apply(lambda x: x == 0 if x not in [0,1] else x)

In [254]:
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split

In [255]:
lr = LogisticRegression()

params = {
    'C' : np.logspace(-2, 2, 50),
    'solver': ['liblinear'],
    'penalty' :['l1', 'l2']
}

In [256]:
Y = np.ravel(WNV['WnvPresent'])
X = WNV[['DewPoint', 'WetBulb', 'PrecipTotal', 'SeaLevel']]

In [257]:
X_train, x_test, Y_train, y_test = train_test_split(X, Y, test_size = 0.3)

In [258]:
gs = GridSearchCV(lr, params, cv=5 )

In [259]:
gs.fit(X_train, 
       Y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': array([  1.00000e-02,   1.20679e-02,   1.45635e-02,   1.75751e-02,
         2.12095e-02,   2.55955e-02,   3.08884e-02,   3.72759e-02,
         4.49843e-02,   5.42868e-02,   6.55129e-02,   7.90604e-02,
         9.54095e-02,   1.15140e-01,   1.38950e-01,   1.6...7e+01,   5.68987e+01,   6.86649e+01,
         8.28643e+01,   1.00000e+02]), 'solver': ['liblinear']},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [260]:
test = pd.read_csv('../Cojasami/Datasets/test.csv')

In [261]:
test.shape

(116293, 11)

In [262]:
preds = gs.predict(x_test)

In [263]:
from sklearn.metrics import confusion_matrix

In [264]:
test.head()

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


In [265]:
confusion_matrix(y_test, preds)

array([[2722,    0],
       [ 233,    0]])

In [98]:
test.Date = pd.to_datetime(test['Date'])

In [100]:
test_merge = pd.merge(test[['Date', 'Block', 'Latitude', 'Longitude']], weather[weather_mask], how='left', on='Date')

In [115]:
trace(test_merge, 'WetBulb')
trace(test_merge, 'PrecipTotal')

test_merge.dtypes

Date           datetime64[ns]
Block                   int64
Latitude              float64
Longitude             float64
DewPoint                int64
WetBulb                 int64
PrecipTotal           float64
SeaLevel               object
dtype: object

In [245]:
gs.predict_proba(test_merge[['DewPoint', 'Block', 'WetBulb', 'PrecipTotal', 'SeaLevel']])

array([[ 0.93336263,  0.06663737],
       [ 0.93547984,  0.06452016],
       [ 0.93336263,  0.06663737],
       ..., 
       [ 0.91231184,  0.08768816],
       [ 0.9181626 ,  0.0818374 ],
       [ 0.91231184,  0.08768816]])

In [150]:
test_merge[test_merge['Block']==41]


Unnamed: 0,Date,Block,Latitude,Longitude,DewPoint,WetBulb,PrecipTotal,SeaLevel
0,2008-06-11,41,41.95469,-87.800991,56,64,0.00,29.99
1,2008-06-11,41,41.95469,-87.800991,55,64,0.00,29.97
2,2008-06-11,41,41.95469,-87.800991,56,64,0.00,29.99
3,2008-06-11,41,41.95469,-87.800991,55,64,0.00,29.97
4,2008-06-11,41,41.95469,-87.800991,56,64,0.00,29.99
5,2008-06-11,41,41.95469,-87.800991,55,64,0.00,29.97
6,2008-06-11,41,41.95469,-87.800991,56,64,0.00,29.99
7,2008-06-11,41,41.95469,-87.800991,55,64,0.00,29.97
8,2008-06-11,41,41.95469,-87.800991,56,64,0.00,29.99
9,2008-06-11,41,41.95469,-87.800991,55,64,0.00,29.97
