In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from haversine import haversine
from sklearn.model_selection import train_test_split
np.random.seed(42)
import scipy.stats as stats
import seaborn as sns

from matplotlib import rcParams
# Turn on matplotlib's automatic layout adjustment for every figure.
rcParams['figure.autolayout'] = True

# "presentation" is not defined in this notebook — presumably a style sheet
# installed in the local matplotlib config; verify it exists before running.
plt.style.use(["presentation"])

%matplotlib inline

In [20]:
prediction_data=pd.read_csv('./assets/test.csv')

In [21]:
working_data=pd.read_csv('./assets/train.csv')

In [22]:
NOAA=pd.read_csv('./assets/weather.csv')

In [23]:
spray=pd.read_csv('./assets/spray.csv')

In [24]:
standing=pd.read_csv('./extra_data/Sanitation_Revised.csv')

# A. Data QA/Cleaning

In [25]:

train, test=train_test_split(working_data, test_size=.20, random_state=523)

In [26]:
train.groupby(by='Species')['WnvPresent'].mean()

Species
CULEX PIPIENS             0.085952
CULEX PIPIENS/RESTUANS    0.054068
CULEX RESTUANS            0.016988
CULEX SALINARIUS          0.000000
CULEX TARSALIS            0.000000
CULEX TERRITANS           0.000000
Name: WnvPresent, dtype: float64

In [27]:
# Detach both splits from working_data before they are modified
# (presumably to avoid SettingWithCopyWarning on later column assignment).
train, test = train.copy(), test.copy()

In [29]:
def clean(df):
    """Return a cleaned copy of a trap-observation frame.

    Steps:
      * parse ``Date`` to datetime;
      * collapse ``Species`` into PIPIENS / MIX / RESTUANS / OTHER and
        one-hot encode it, with OTHER as the dropped baseline;
      * combine ``Latitude``/``Longitude`` into a ``Location`` tuple;
      * drop the address columns, the raw coordinates and ``Species``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date``, ``Species``, ``Latitude``, ``Longitude`` and
        the address columns that are dropped below.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left untouched. The indicator columns
        ``MIX``, ``PIPIENS`` and ``RESTUANS`` are always present, as 0/1
        integers, even when a species does not occur in ``df``.

    Raises
    ------
    KeyError
        If a required column is missing.
    """
    species_labels = {
        'CULEX PIPIENS': 'PIPIENS',
        'CULEX PIPIENS/RESTUANS': 'MIX',
        'CULEX RESTUANS': 'RESTUANS',
    }
    # Fixed schema, so train / test / prediction frames always line up.
    indicator_columns = ['MIX', 'PIPIENS', 'RESTUANS']

    # Work on a copy: the caller's frame must not be modified as a side effect.
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])

    # Anything not listed above (including missing values) becomes 'OTHER'.
    df['Species'] = df['Species'].map(
        lambda name: species_labels.get(name, 'OTHER')
    )

    # reindex both drops the 'OTHER' baseline (without failing when it is
    # absent) and adds an all-zero column for any species missing from df.
    dummies = (
        pd.get_dummies(df['Species'])
        .reindex(columns=indicator_columns, fill_value=0)
        .astype(int)
    )
    df = pd.concat([df, dummies], axis=1)

    df['Location'] = list(zip(df['Latitude'], df['Longitude']))
    df = df.drop(['Address', 'Latitude', 'Longitude', 'Block', 'Street', 'Trap',
                  'AddressNumberAndStreet', 'AddressAccuracy', 'Species'], axis=1)

    return df

In [30]:
# Run the same cleaning over all three frames, in this order.
train, test, prediction_data = (
    clean(frame) for frame in (train, test, prediction_data)
)

In [31]:
standing['Coord']=list(zip(standing['Latitude'], standing['Longitude']))

In [32]:
standing=standing['Coord'].dropna()

In [33]:
train.head()

Unnamed: 0,Date,NumMosquitos,WnvPresent,MIX,PIPIENS,RESTUANS,Location
4526,2009-07-06,4,0,1,0,0,"(41.803422999999995, -87.642984)"
2076,2007-08-16,50,1,0,1,0,"(41.673408, -87.599862)"
3574,2007-09-24,1,0,0,0,0,"(41.984809000000006, -87.728492)"
3427,2007-09-24,50,0,0,1,0,"(41.731922, -87.677512)"
3932,2009-06-02,3,0,1,0,0,"(41.800737, -87.71188000000001)"


# Get the distance to the nearest location where a standing-water complaint was ever filed

In [34]:
def measure_to_standing(input_location):
    """Return the distance from ``input_location`` to the closest standing-water spot.

    ``input_location`` is a (latitude, longitude) pair. It is compared against
    every entry of the module-level ``standing`` collection using
    ``haversine``, and the smallest distance is returned. The unit is whatever
    ``haversine`` returns by default (kilometres, per that package's docs).

    Raises ValueError if ``standing`` is empty.
    """
    return min(
        haversine(complaint_location, input_location)
        for complaint_location in standing
    )
        

# Each frame is measured from its OWN trap locations. Mapping
# train['Location'] into the other frames would be aligned on train's index,
# giving NaN or another row's distance instead of the frame's own value.
for frame in (train, test, prediction_data):
    frame['dist_to_standing_water']=frame['Location'].map(measure_to_standing)
                                                      

In [35]:
train.head()

Unnamed: 0,Date,NumMosquitos,WnvPresent,MIX,PIPIENS,RESTUANS,Location,dist_to_standing_water
4526,2009-07-06,4,0,1,0,0,"(41.803422999999995, -87.642984)",0.204493
2076,2007-08-16,50,1,0,1,0,"(41.673408, -87.599862)",1.499085
3574,2007-09-24,1,0,0,0,0,"(41.984809000000006, -87.728492)",1.040195
3427,2007-09-24,50,0,0,1,0,"(41.731922, -87.677512)",1.110843
3932,2009-06-02,3,0,1,0,0,"(41.800737, -87.71188000000001)",0.303958


In [161]:
train['dist_to_standing_water'].median()

0.5192411590273422

In [166]:
# Split mosquito counts at the median distance to standing water.
# The threshold is computed rather than typed in as a rounded constant, so it
# stays correct when the data or the split changes and rows just above the
# rounded value are not put in the wrong group.
median_dist_to_standing=train['dist_to_standing_water'].median()
# pos: traps farther than the median; neg: traps at or closer than the median.
pos=train.loc[train['dist_to_standing_water']>median_dist_to_standing, 'NumMosquitos']
neg=train.loc[train['dist_to_standing_water']<=median_dist_to_standing, 'NumMosquitos']

# C. Get elevation data

In [1]:
# following method from http://geologyandpython.com/dem-processing.html

west, east, north, south = (-88, -87.5, 41.6, 42.1)

In [4]:
import elevation

In [None]:
import os
# Location relative to the working directory; how `output` is consumed is
# outside this view (presumably the target for the downloaded DEM — confirm).
dem_path = './'
# Join with a path separator: plain string concatenation of os.getcwd() and
# './' produces '<cwd>./', which is not a valid path.
output = os.path.join(os.getcwd(), dem_path)