# Predicting West Nile Virus

## Matt Garton, Katrina Miller, Rex Littlefield

## General Assembly Boston - DSI - September 2018

Link to presentation: https://docs.google.com/presentation/d/1MMZhOJnfq-JJzOlV3dmPOhjXRPjlVWUu07YNDUGYcJU/edit#slide=id.p

# Feature Engineering Notebook

Goal: Build a classification model to predict where and when West Nile Virus will be found in Chicago.

The purpose of this notebook is to build features from the data, explore some relationships and try to identify which features will be most useful, then start building some models. This notebook will also include some EDA and model-building, although I expect to have separate notebooks handling the bulk of those.

In [1]:
# import necessary modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
pd.options.display.max_columns = 1000

In [2]:
# Load the cleaned dataset (relative path: ../data/clean_data.csv)
west_nile = pd.read_csv(filepath_or_buffer='../data/clean_data.csv')

In [3]:
def preprocess_data(data):
    '''Generate the engineered variables needed for analysis.

    Adds the following 0/1 indicator columns to ``data`` and converts the
    ``Date`` column to datetime:

    * ``carrier``: 1 when ``Species`` is CULEX PIPIENS, CULEX RESTUANS or
      CULEX PIPIENS/RESTUANS.
    * ``TSRA``, ``RA``, ``BR``, ``HZ``, ``VCTS``, ``DZ``, ``TS``, ``FG``: 1 when
      the code text occurs anywhere in ``CodeSum``.
    * ``high_temp``: 1 when ``Tavg`` is greater than 70.
    * ``dry``: 1 when ``PrecipTotal`` equals 0.
    * ``fall_migrate``: 1 when the month of ``Date`` is October or November.
    * ``spring_migrate``: 1 when the month of ``Date`` is April or May.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``Species``, ``CodeSum``, ``Tavg``,
        ``PrecipTotal`` and ``Date``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    '''

    # create dummy variable for WNV-carrying species
    carrier_species = ['CULEX PIPIENS', 'CULEX RESTUANS', 'CULEX PIPIENS/RESTUANS']
    data['carrier'] = np.where(data['Species'].isin(carrier_species), 1, 0)

    # create dummy variables for Code Summary (weather codes)
    # Matching is by substring, so a longer code also sets the shorter codes it
    # contains (e.g. 'TSRA' sets 'RA' and 'TS' as well).
    codes = ['TSRA', 'RA', 'BR', 'HZ', 'VCTS', 'DZ', 'TS', 'FG']
    for code in codes:
        # na=False: a missing CodeSum must yield 0. Without it str.contains
        # returns NaN, which np.where treats as truthy and turns into 1.
        # regex=False: the codes are literal text, not patterns.
        has_code = data['CodeSum'].str.contains(code, regex=False, na=False)
        data[code] = np.where(has_code, 1, 0)

    # create dummy for high temperature
    data['high_temp'] = np.where(data['Tavg'] > 70, 1, 0)

    # create dummy for dry conditions
    data['dry'] = np.where(data['PrecipTotal'] == 0, 1, 0)

    # Convert 'Date' column to a datetime object
    data['Date'] = pd.to_datetime(data['Date'])

    # create dummies to represent if the date falls within one of the migration periods for American Robins
    month = data['Date'].dt.month
    data['fall_migrate'] = np.where(month.isin([10, 11]), 1, 0)
    data['spring_migrate'] = np.where(month.isin([4, 5]), 1, 0)

    return data

In [4]:
# Add the engineered columns, then preview the first five rows
west_nile = preprocess_data(data=west_nile)
west_nile.head(n=5)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,WnvPresent,NumMosquitos,Station,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,CodeSum,carrier,TSRA,RA,BR,HZ,VCTS,DZ,TS,FG,high_temp,dry,fall_migrate,spring_migrate
0,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX PIPIENS/RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,0,1,1.5,88.0,62.5,75.5,10.0,58.5,65.5,0.0,10.5,421.0,1917.0,0.0,29.415,30.1,5.8,17.0,6.95,BR HZBR HZ,1,0,0,1,1,0,0,0,0,1,1,0,1
1,2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,11,W ROOSEVELT,T048,"1100 W ROOSEVELT, Chicago, IL",41.867108,-87.654224,8,0,2,1.5,88.0,62.5,75.5,10.0,58.5,65.5,0.0,10.5,421.0,1917.0,0.0,29.415,30.1,5.8,17.0,6.95,BR HZBR HZ,1,0,0,1,1,0,0,0,0,1,1,0,1
2,2007-05-29,"1100 South Peoria Street, Chicago, IL 60608, USA",CULEX RESTUANS,11,S PEORIA ST,T091,"1100 S PEORIA ST, Chicago, IL",41.862292,-87.64886,8,0,1,1.5,88.0,62.5,75.5,10.0,58.5,65.5,0.0,10.5,421.0,1917.0,0.0,29.415,30.1,5.8,17.0,6.95,BR HZBR HZ,1,0,0,1,1,0,0,0,0,1,1,0,1
3,2007-05-29,"1100 West Chicago Avenue, Chicago, IL 60642, USA",CULEX RESTUANS,11,W CHICAGO,T049,"1100 W CHICAGO, Chicago, IL",41.896282,-87.655232,8,0,1,1.5,88.0,62.5,75.5,10.0,58.5,65.5,0.0,10.5,421.0,1917.0,0.0,29.415,30.1,5.8,17.0,6.95,BR HZBR HZ,1,0,0,1,1,0,0,0,0,1,1,0,1
4,2007-05-29,"1500 North Long Avenue, Chicago, IL 60651, USA",CULEX RESTUANS,15,N LONG AVE,T153,"1500 N LONG AVE, Chicago, IL",41.907645,-87.760886,8,0,1,1.5,88.0,62.5,75.5,10.0,58.5,65.5,0.0,10.5,421.0,1917.0,0.0,29.415,30.1,5.8,17.0,6.95,BR HZBR HZ,1,0,0,1,1,0,0,0,0,1,1,0,1


In [5]:
# Save the preprocessed data; the row index is not written to the file
west_nile.to_csv(path_or_buf='../data/preprocessed_data.csv', index=False)

# Modeling with the full dataset

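These are early attempts at building simple models (a decision tree and a k-nearest neighbors classifier) on the full dataset; they serve as a starting point for further analysis.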

In [None]:
# import necessary modules
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

# Set up X and y
features = ['pipiens', 'restuans', 'Latitude', 'Longitude', 'NumMosquitos', 'Tavg', 'PrecipTotal', 'StnPressure',
           'BR', 'HZ', 'RA', 'TSRA', 'VCTS', 'DZ', 'TS', 'FG']

# 'pipiens' and 'restuans' are not created by preprocess_data, so selecting
# them straight from west_nile raises a KeyError. Build whichever of the two
# is missing from the 'Species' text; a column that already exists is kept.
# NOTE(review): assumes the combined 'CULEX PIPIENS/RESTUANS' label should set
# both flags -- confirm against the intended definition of these features.
species_tokens = {'pipiens': 'PIPIENS', 'restuans': 'RESTUANS'}
species_flags = {
    column: west_nile['Species'].str.contains(token, regex=False, na=False).astype(int)
    for column, token in species_tokens.items()
    if column not in west_nile.columns
}

# assign() returns a new frame, so west_nile itself is left unchanged
X = west_nile.assign(**species_flags)[features]
y = west_nile['WnvPresent']

# train test split (stratified on y so both splits keep the same class balance)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, stratify = y)

In [None]:
# Decision tree fitted on the unscaled training features.
# random_state is fixed (same seed as the train/test split) so the fitted
# tree, and therefore the score and predictions, are reproducible across runs.
dt = DecisionTreeClassifier(random_state = 42) # Instantiate model
tree = dt.fit(X_train, y_train) # Fit model (fit returns the estimator, so `tree` is `dt`)
predictions = tree.predict(X_test) # Predict y
tree.score(X_test, y_test) # Score model (mean accuracy on the test split)

In [None]:
# Confusion matrix for the current `predictions` against the test labels
cm = confusion_matrix(y_test, predictions)

# Wrap it in a labelled DataFrame: rows are actual classes, columns are predicted classes
cm_df = pd.DataFrame(
    data = cm,
    index = ['Actual No West Nile', 'Actual West Nile'],
    columns = ['Predicted No West Nile', 'Predicted West Nile'],
)

cm_df

In [None]:
# Standardize the features: the scaler is fitted on the training split only,
# and that same fitted scaler is then applied to both splits
ss = StandardScaler().fit(X_train)

X_train_scaled = ss.transform(X_train)
X_test_scaled = ss.transform(X_test)

In [None]:
# k-nearest neighbors with default settings, trained on the standardized features
knn = KNeighborsClassifier().fit(X_train_scaled, y_train)
predictions = knn.predict(X_test_scaled) # rebinds `predictions` to the KNN output
knn.score(X_test_scaled, y_test) # mean accuracy on the test split

In [None]:
# Confusion matrix for the latest `predictions` against the test labels
cm = confusion_matrix(y_test, predictions)

# Label the matrix: index = actual classes, columns = predicted classes
cm_df = pd.DataFrame(
    cm,
    index = ['Actual No West Nile', 'Actual West Nile'],
    columns = ['Predicted No West Nile', 'Predicted West Nile'],
)

cm_df