# Project 4: West Nile Virus in the City of Chicago

Michael Schillawski, 9 March 2018

Data Science Immersive, General Assembly

## Setup

### Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime

% matplotlib inline

### Gather Data

In [None]:
#os.chdir('..')

In [2]:
# Point the working directory at the project's input folder so the read_csv
# calls below can use bare file names.
# Guarded so that re-running this cell does not append 'assets/input' a second
# time and fail inside os.chdir.
input_dir = os.path.join('assets', 'input')
path = os.path.normpath(os.getcwd())
if not path.endswith(os.sep + input_dir):
    path = os.path.join(path, input_dir)
    os.chdir(path)

In [3]:
ls

mapdata_copyright_openstreetmap_contributors.rds
mapdata_copyright_openstreetmap_contributors.txt
noaa_weather_qclcd_documentation.pdf
sampleSubmission.csv
spray.csv
test.csv
train.csv
weather.csv


In [4]:
# Load the three raw inputs from the current working directory.
train, spray, weather = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'spray.csv', 'weather.csv')
)
#test = pd.read_csv('test.csv')

## Data Cleaning

### Formatting & DeDuping

In [5]:
def DataInspect(dataframe):
    '''Print a quick overview of a Pandas dataframe.

       Original function (previously called eda) created by Ritika Bhasker.
       Modified by Michael Schillawski based on his preferences.

       Reports, in order: shape, number of fully duplicated rows, column
       dtypes, missing values per column, describe(include='all'), and the
       number of unique values in each column.

       Takes a Pandas dataframe as the argument; prints to stdout and
       returns None.'''
    print("Dataframe Shape:", dataframe.shape, "\n")
    print("Duplicate Rows:", dataframe.duplicated().sum(), "\n")

    # sep="" stops print() from inserting a space in front of each table,
    # which used to push the first row one column out of alignment.
    print("Dataframe Types \n\n", dataframe.dtypes, "\n", sep="")
    print("Missing Values \n\n", dataframe.isnull().sum(), "\n", sep="")

    # describe() raises ValueError on a frame with no columns
    if dataframe.shape[1] > 0:
        print("Dataframe Describe \n\n", dataframe.describe(include='all'), "\n", sep="")
    else:
        print("Dataframe Describe \n\n(no columns to describe)\n")

    print('Unique Values by Variable')
    for item in dataframe:
        print(item, ':', dataframe[item].nunique())

In [6]:
# Parse the Date column of every loaded frame into datetime64 so the frames
# can be joined and compared on it.
for frame in (train, spray, weather):
    frame['Date'] = pd.to_datetime(frame['Date'])
#test['Date'] = pd.to_datetime(test['Date'])

In [7]:
DataInspect(train)

Dataframe Shape: (10506, 12) 

Duplicate Rows: 813 

Dataframe Types 

 Date                      datetime64[ns]
Address                           object
Species                           object
Block                              int64
Street                            object
Trap                              object
AddressNumberAndStreet            object
Latitude                         float64
Longitude                        float64
AddressAccuracy                    int64
NumMosquitos                       int64
WnvPresent                         int64
dtype: object 

Missing Values 

 Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
NumMosquitos              0
WnvPresent                0
dtype: int64 

Dataframe Describe 

                        Date  

In [13]:
train[train.duplicated(keep=False)]

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
98,2007-06-26,"2200 West 113th Street, Chicago, IL 60643, USA",CULEX PIPIENS/RESTUANS,22,W 113TH ST,T086,"2200 W 113TH ST, Chicago, IL",41.688324,-87.676709,8,1,0
99,2007-06-26,"2200 West 113th Street, Chicago, IL 60643, USA",CULEX PIPIENS/RESTUANS,22,W 113TH ST,T086,"2200 W 113TH ST, Chicago, IL",41.688324,-87.676709,8,1,0
293,2007-07-11,"2200 West 113th Street, Chicago, IL 60643, USA",CULEX PIPIENS/RESTUANS,22,W 113TH ST,T086,"2200 W 113TH ST, Chicago, IL",41.688324,-87.676709,8,50,0
295,2007-07-11,"2200 West 113th Street, Chicago, IL 60643, USA",CULEX PIPIENS/RESTUANS,22,W 113TH ST,T086,"2200 W 113TH ST, Chicago, IL",41.688324,-87.676709,8,50,0
350,2007-07-11,"3500 West 116th Street, Chicago, IL 60655, USA",CULEX PIPIENS/RESTUANS,35,W 116TH ST,T158,"3500 W 116TH ST, Chicago, IL",41.682587,-87.707973,9,50,0
351,2007-07-11,"3500 West 116th Street, Chicago, IL 60655, USA",CULEX PIPIENS/RESTUANS,35,W 116TH ST,T158,"3500 W 116TH ST, Chicago, IL",41.682587,-87.707973,9,50,0
353,2007-07-11,"3500 West 116th Street, Chicago, IL 60655, USA",CULEX PIPIENS/RESTUANS,35,W 116TH ST,T158,"3500 W 116TH ST, Chicago, IL",41.682587,-87.707973,9,50,0
511,2007-07-18,"3300 West Randolph Street, Chicago, IL 60612, USA",CULEX RESTUANS,33,E RANDOLPH ST,T044,"3300 E RANDOLPH ST, Chicago, IL",41.883284,-87.705085,8,1,0
512,2007-07-18,"3300 West Randolph Street, Chicago, IL 60612, USA",CULEX RESTUANS,33,E RANDOLPH ST,T044,"3300 E RANDOLPH ST, Chicago, IL",41.883284,-87.705085,8,1,0
530,2007-07-18,"South Stony Island Avenue, Chicago, IL, USA",CULEX PIPIENS/RESTUANS,10,S STONY ISLAND AVE,T138,"1000 S STONY ISLAND AVE, Chicago, IL",41.726465,-87.585413,5,50,0


In [8]:
DataInspect(spray)

Dataframe Shape: (14835, 4) 

Duplicate Rows: 541 

Dataframe Types 

 Date         datetime64[ns]
Time                 object
Latitude            float64
Longitude           float64
dtype: object 

Missing Values 

 Date           0
Time         584
Latitude       0
Longitude      0
dtype: int64 

Dataframe Describe 

                        Date        Time      Latitude     Longitude
count                 14835       14251  14835.000000  14835.000000
unique                   10        8583           NaN           NaN
top     2013-08-15 00:00:00  7:44:32 PM           NaN           NaN
freq                   2668         541           NaN           NaN
first   2011-08-29 00:00:00         NaN           NaN           NaN
last    2013-09-05 00:00:00         NaN           NaN           NaN
mean                    NaN         NaN     41.904828    -87.736690
std                     NaN         NaN      0.104381      0.067292
min                     NaN         NaN     41.713925    -88.09646

In [14]:
spray[spray.duplicated(keep=False)]

Unnamed: 0,Date,Time,Latitude,Longitude
484,2011-09-07,7:43:40 PM,41.983917,-87.793088
485,2011-09-07,7:43:40 PM,41.983917,-87.793088
489,2011-09-07,7:44:32 PM,41.986460,-87.794225
490,2011-09-07,7:44:32 PM,41.986460,-87.794225
491,2011-09-07,7:44:32 PM,41.986460,-87.794225
492,2011-09-07,7:44:32 PM,41.986460,-87.794225
493,2011-09-07,7:44:32 PM,41.986460,-87.794225
494,2011-09-07,7:44:32 PM,41.986460,-87.794225
495,2011-09-07,7:44:32 PM,41.986460,-87.794225
496,2011-09-07,7:44:32 PM,41.986460,-87.794225


In [9]:
DataInspect(weather)

Dataframe Shape: (2944, 22) 

Duplicate Rows: 0 

Dataframe Types 

 Station                 int64
Date           datetime64[ns]
Tmax                    int64
Tmin                    int64
Tavg                   object
Depart                 object
DewPoint                int64
WetBulb                object
Heat                   object
Cool                   object
Sunrise                object
Sunset                 object
CodeSum                object
Depth                  object
Water1                 object
SnowFall               object
PrecipTotal            object
StnPressure            object
SeaLevel               object
ResultSpeed           float64
ResultDir               int64
AvgSpeed               object
dtype: object 

Missing Values 

 Station        0
Date           0
Tmax           0
Tmin           0
Tavg           0
Depart         0
DewPoint       0
WetBulb        0
Heat           0
Cool           0
Sunrise        0
Sunset         0
CodeSum        0
Depth          0

In [15]:
print(len(train))
print(len(spray))
print(len(weather))

10506
14835
2944


In [16]:
spray['Date'].describe()

count                   14835
unique                     10
top       2013-08-15 00:00:00
freq                     2668
first     2011-08-29 00:00:00
last      2013-09-05 00:00:00
Name: Date, dtype: object

In [17]:
train['Date'].describe()

count                   10506
unique                     95
top       2007-08-01 00:00:00
freq                      551
first     2007-05-29 00:00:00
last      2013-09-26 00:00:00
Name: Date, dtype: object

In [19]:
weather['Date'].describe()

count                    2944
unique                   1472
top       2014-08-27 00:00:00
freq                        2
first     2007-05-01 00:00:00
last      2014-10-31 00:00:00
Name: Date, dtype: object

In [20]:
train.head(3)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0


## EDA

### Join Weather Data to Trap Data

In [22]:
train.head(2)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0


In [44]:
print(len(train),len(weather))

10506 2944


In [49]:
weather.columns

Index(['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint',
       'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum', 'Depth',
       'Water1', 'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel',
       'ResultSpeed', 'ResultDir', 'AvgSpeed'],
      dtype='object')

In [79]:
# Split the weather log by station and tag each half with that station's
# coordinates (assign returns a new frame, so `weather` itself is untouched).

#Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
station1 = weather.loc[weather['Station'] == 1].assign(
    Latitude=41.995,
    Longitude=-87.9336,
)

#Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level
station2 = weather.loc[weather['Station'] == 2].assign(
    Latitude=41.78611,
    Longitude=-87.75222,
)


In [80]:
# One row per date with both stations side by side (_s1 = station 1, _s2 = station 2).
# validate= makes pandas raise MergeError if a station ever has more than one
# row for a date, instead of silently multiplying rows.
stations = pd.merge(station1, station2, on='Date', suffixes=('_s1','_s2'),
                    validate='one_to_one')

# Attach that day's readings from both stations to every trap observation.
traps_weather = pd.merge(train, stations, on='Date', validate='many_to_one')

# The joins are inner joins, so a trap date with no weather record would be
# dropped without warning; fail loudly instead.
assert len(traps_weather) == len(train), 'trap observations lost in the weather join'

In [81]:
DataInspect(traps_weather)

Dataframe Shape: (10506, 58) 

Duplicate Rows: 813 

Dataframe Types 

 Date                      datetime64[ns]
Address                           object
Species                           object
Block                              int64
Street                            object
Trap                              object
AddressNumberAndStreet            object
Latitude                         float64
Longitude                        float64
AddressAccuracy                    int64
NumMosquitos                       int64
WnvPresent                         int64
Station_s1                         int64
Tmax_s1                            int64
Tmin_s1                            int64
Tavg_s1                           object
Depart_s1                         object
DewPoint_s1                        int64
WetBulb_s1                        object
Heat_s1                           object
Cool_s1                           object
Sunrise_s1                        object
Sunset_s1                 

### Calculate point estimates of weather data at trap location

In [104]:
# calculate distance of traps to weather stations
# (straight-line distance in degrees of latitude/longitude, not miles or km)
dist_1 = np.sqrt((traps_weather['Latitude'] - traps_weather['Latitude_s1'])**2 +
                 (traps_weather['Longitude'] - traps_weather['Longitude_s1'])**2)

dist_2 = np.sqrt((traps_weather['Latitude'] - traps_weather['Latitude_s2'])**2 +
                 (traps_weather['Longitude'] - traps_weather['Longitude_s2'])**2)

# calculate distance weights to each trap
# to weight the weather data by proximity
# The nearer station must get the larger weight, so each station's weight is
# the *other* station's share of the total distance. Using a station's own
# share (dist_1 / total) would favour the station that is farther away.
# The two weights always sum to 1.
total_dist = dist_1 + dist_2
traps_weather['weight_1'] = dist_2 / total_dist
traps_weather['weight_2'] = dist_1 / total_dist

In [114]:
# Apply distance weights to weather data
# weight along the gradient
station1_list = [col for col in traps_weather.columns
                 if '_s1' in col and col not in ('Station_s1','Latitude_s1','Longitude_s1')]
station2_list = [col for col in traps_weather.columns
                 if '_s2' in col and col not in ('Station_s2','Latitude_s2','Longitude_s2')]

# NOTE(review): CodeSum looks like a free-text code column rather than a
# measurement, so it is left untouched here -- confirm against the NOAA docs.
not_weighted = ('CodeSum_s1', 'CodeSum_s2')

# Many weather columns are loaded as object (string) dtype, and multiplying a
# float weight by a string raises. Coerce to numbers first; anything that is
# not a number becomes NaN.
# NOTE: this overwrites the station columns in place, so running the cell a
# second time would apply the weights twice.
for weight_col, station_cols in (('weight_1', station1_list),
                                 ('weight_2', station2_list)):
    for col in station_cols:
        if col in not_weighted:
            continue
        readings = pd.to_numeric(traps_weather[col], errors='coerce')
        traps_weather[col] = traps_weather[weight_col] * readings

In [120]:
# Add the two weighted station readings into a single point estimate per trap,
# stored under the bare variable name (e.g. Tmax_s1 + Tmax_s2 -> Tmax).
# Columns are selected by the '_s1' suffix and only that suffix is stripped,
# so the selection and the derived name cannot disagree.
s1_suffix = '_s1'
station_only = ('Station_s1', 'Latitude_s1', 'Longitude_s1')
for col in [c for c in traps_weather.columns
            if c.endswith(s1_suffix) and c not in station_only]:
    name = col[:-len(s1_suffix)]
    traps_weather[name] = traps_weather[col] + traps_weather[name + '_s2']

### Spraying Data

In [144]:
len(spray)

14835

In [145]:
spray.head(3)

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157


In [219]:
# The idea here is that the targeted intervention has an effect that decays in
# two dimensions, time and distance. The reference location we care about is
# the trap, so we evaluate spraying by
#   - how far the trap is from where the spraying occurred, AND
#   - how long before the trap observation the spraying occurred.
# We therefore calculate both deltas for every spraying against every trap.
#
# TODO: sprayings that happened AFTER the observation (negative time delta)
# are meant to be zeroed out; that step is not implemented yet.
#
# Open question: should we cross-multiply the distance and time? Spraying that
# is close in both time and distance should be privileged.

# Column vectors of trap values broadcast against row vectors of spray values,
# giving one (n_traps x n_sprays) array per measure without a Python loop.
# NOTE: both results are dense n_traps x n_sprays frames, so this is memory hungry.
trap_lat = traps_weather['Latitude'].values[:, np.newaxis]
trap_long = traps_weather['Longitude'].values[:, np.newaxis]
trap_date = traps_weather['Date'].values[:, np.newaxis]

# calculate distance from traps to spray locations (in degrees of lat/long)
distance = pd.DataFrame(
    np.sqrt((spray['Latitude'].values - trap_lat)**2 +
            (spray['Longitude'].values - trap_long)**2),
    columns=spray.index,
)

# calculate time since spray (positive when the spray preceded the observation)
time = pd.DataFrame(trap_date - spray['Date'].values, columns=spray.index)

In [None]:
# Convert every time-since-spray column from timedeltas to fractional days.
SECONDS_PER_DAY = 24 * 60 * 60
for col in time.columns:
    # Float columns have already been converted; skipping them makes the cell
    # safe to re-run (floats have no total_seconds()).
    if time[col].dtype.kind == 'f':
        continue
    # One vectorised conversion per column instead of two element-wise apply passes
    time[col] = pd.to_timedelta(time[col]).dt.total_seconds() / SECONDS_PER_DAY

In [None]:
time.head()

In [196]:
# Put the distance and time deltas side by side, one row per trap observation.
# The two frames share their column labels, so distance columns get the '_d'
# suffix and time columns get '_t'.
data = distance.merge(
    time,
    left_index=True,
    right_index=True,
    how='inner',
    suffixes=('_d', '_t'),
)
data.shape

### Outcome variable(s)

In [None]:
# trim dataset
# drop string geographic variables, info encoded in lat/long
# drop _s1/_s2 weather variables --> stored after weighted average across distances

In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): as built above, `data` holds only the spray distance/time
# features and has no WnvPresent column, so the target falls back to
# traps_weather. This assumes `data` and traps_weather still share the same
# row index (both were built row-for-row from traps_weather) -- confirm once
# the final modelling frame is assembled.
if 'WnvPresent' in data.columns:
    X = data.drop('WnvPresent', axis=1)
    Y = data[['WnvPresent']]
else:
    X = data
    Y = traps_weather.loc[data.index, ['WnvPresent']]

# train_test_split needs the arrays to split; calling it with none raises.
# random_state makes the split reproducible, and stratify keeps the share of
# WNV-positive rows the same in the train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, random_state=42, stratify=Y)

## Modeling

We need to predict when, where, and among which species West Nile Virus will occur.
In other words: in which traps will we observe West Nile Virus?

How to define outcome variable?

Outcome: WNV 1/0

Variable selection:
- weather
- spray
- species
- location-based

In [None]:
### Logistic Regression

In [None]:
### KNN

In [None]:
### Random Forest

In [None]:
### ExtraTrees

In [None]:
### Adaboost

In [None]:
### Gradient Boosting