In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from haversine import haversine
from sklearn.model_selection import train_test_split
np.random.seed(42)
import scipy.stats as stats
import seaborn as sns

from matplotlib import rcParams
rcParams.update({'figure.autolayout':True})

plt.style.use(["presentation"])

%matplotlib inline

In [2]:
# Unlabelled rows to score later; run through the same cleaning as the training data below.
prediction_data = pd.read_csv('./assets/test.csv')

In [3]:
# Labelled data; split into train/test partitions further down.
working_data = pd.read_csv('./assets/train.csv')

In [4]:
# Weather table, loaded as-is.
NOAA = pd.read_csv('./assets/weather.csv')

In [5]:
# Spray table, loaded as-is.
spray = pd.read_csv('./assets/spray.csv')

In [6]:
# Sanitation records; their Latitude/Longitude columns feed the
# standing-water distance feature computed in section B.
standing = pd.read_csv('./extra_data/Sanitation_Revised.csv')

# A. Data QA/Cleaning

In [7]:

# Hold out 20% of the labelled rows; the fixed random_state keeps the split reproducible.
train, test = train_test_split(
    working_data,
    test_size=.20,
    random_state=523,
)

In [8]:
# Share of training rows with WnvPresent set, per raw species label.
train.groupby('Species')['WnvPresent'].mean()

Species
CULEX PIPIENS             0.085952
CULEX PIPIENS/RESTUANS    0.054068
CULEX RESTUANS            0.016988
CULEX SALINARIUS          0.000000
CULEX TARSALIS            0.000000
CULEX TERRITANS           0.000000
Name: WnvPresent, dtype: float64

In [9]:
# Detach both partitions from working_data so later column assignments
# operate on independent frames.
train, test = train.copy(), test.copy()

In [10]:
def clean(df):
    """Return a cleaned copy of a trap-observation frame.

    The input frame is left untouched. The returned frame has:

    * ``Date`` parsed with ``pd.to_datetime``;
    * three species indicator columns, ``MIX``, ``PIPIENS`` and ``RESTUANS``,
      which are always present, even when a species does not occur in ``df``.
      Any species other than the three mapped below (including missing
      values) is treated as ``OTHER`` and gets all-zero indicators;
    * ``Location``, a ``(Latitude, Longitude)`` tuple per row;
    * the address, coordinate, trap and raw species columns removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date``, ``Species``, ``Latitude``, ``Longitude``,
        ``Address``, ``Trap``, ``AddressNumberAndStreet`` and
        ``AddressAccuracy``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the row index of ``df`` is preserved.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    # Work on a copy so the caller's frame is never modified as a side effect.
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])

    species_labels = {
        'CULEX PIPIENS': 'PIPIENS',
        'CULEX PIPIENS/RESTUANS': 'MIX',
        'CULEX RESTUANS': 'RESTUANS',
    }
    short_species = df['Species'].map(
        lambda name: species_labels.get(name, 'OTHER')
    )

    # Encode against a fixed category list: every frame then yields the same
    # indicator columns, and 'OTHER' always exists so dropping it cannot fail
    # on a frame that happens to contain only the three named species.
    short_species = pd.Series(
        pd.Categorical(short_species,
                       categories=['MIX', 'OTHER', 'PIPIENS', 'RESTUANS']),
        index=df.index,
    )
    dummies = pd.get_dummies(short_species).drop(['OTHER'], axis=1)
    # get_dummies labels the columns with a categorical index here; use plain
    # string labels so the concatenated frame has ordinary column names.
    dummies.columns = [str(label) for label in dummies.columns]

    df = pd.concat([df, dummies], axis=1)
    df['Location'] = list(zip(df['Latitude'], df['Longitude']))
    df = df.drop(['Address', 'Latitude', 'Longitude', 'Trap',
                  'AddressNumberAndStreet', 'AddressAccuracy', 'Species'], axis=1)

    return df

In [11]:
# Run the same cleaning over all three frames so they share one column layout.
train, test, prediction_data = [
    clean(frame) for frame in (train, test, prediction_data)
]

In [12]:
# Inspect the column dtypes of the cleaned prediction frame.
prediction_data.dtypes

Id                   int64
Date        datetime64[ns]
Block                int64
Street              object
MIX                  uint8
PIPIENS              uint8
RESTUANS             uint8
Location            object
dtype: object

In [13]:
# Pair each record's latitude and longitude into one (lat, lon) tuple.
standing['Coord'] = [
    (latitude, longitude)
    for latitude, longitude in zip(standing['Latitude'], standing['Longitude'])
]

In [14]:
# Keep only the coordinate tuples of records that have both a latitude and a
# longitude. A (lat, lon) tuple is never itself null, so dropna() on the tuple
# column would leave entries such as (nan, nan) in place; those would then feed
# NaN coordinates into the distance search, and min() over values that include
# NaN depends on element order. Filter on the coordinate columns instead.
has_coordinates = standing[['Latitude', 'Longitude']].notnull().all(axis=1)
standing = standing.loc[has_coordinates, 'Coord']

In [15]:
# Preview the first rows of the cleaned training frame.
train.head()

Unnamed: 0,Date,Block,Street,NumMosquitos,WnvPresent,MIX,PIPIENS,RESTUANS,Location
4526,2009-07-06,50,S UNION AVE,4,0,1,0,0,"(41.803422999999995, -87.642984)"
2076,2007-08-16,12,S DOTY AVE,50,1,0,1,0,"(41.673408, -87.599862)"
3574,2007-09-24,58,N PULASKI RD,1,0,0,0,0,"(41.984809000000006, -87.728492)"
3427,2007-09-24,22,W 89TH ST,50,0,0,1,0,"(41.731922, -87.677512)"
3932,2009-06-02,35,W 51ST ST,3,0,1,0,0,"(41.800737, -87.71188000000001)"


# B. Get distance to the nearest location where a standing-water complaint was ever filed

In [16]:
def measure_to_standing(input_location):
    """Return the smallest haversine distance from any entry of ``standing`` to ``input_location``.

    ``standing`` is read from the notebook's global scope and iterated in
    order; each entry is passed to ``haversine`` as the first point and
    ``input_location`` as the second. The result is in whatever unit
    ``haversine`` returns by default.

    Raises ``ValueError`` (from ``min``) when ``standing`` is empty.
    """
    return min(
        haversine(complaint_location, input_location)
        for complaint_location in standing
    )
        

# Many rows share the same Location, and each measure_to_standing call scans
# every standing-water entry. Compute the distance once per distinct location
# and look it up per row; the resulting column is identical.
train_location_distances = {
    location: measure_to_standing(location)
    for location in set(train['Location'])
}
train['dist_to_standing_water'] = train['Location'].map(train_location_distances.get)


                                                      

In [17]:
# Many rows share the same Location, and each measure_to_standing call scans
# every standing-water entry. Compute the distance once per distinct location
# and look it up per row; the resulting column is identical.
test_location_distances = {
    location: measure_to_standing(location)
    for location in set(test['Location'])
}
test['dist_to_standing_water'] = test['Location'].map(test_location_distances.get)


In [18]:
# Many rows share the same Location, and each measure_to_standing call scans
# every standing-water entry. Compute the distance once per distinct location
# and look it up per row; the resulting column is identical.
prediction_location_distances = {
    location: measure_to_standing(location)
    for location in set(prediction_data['Location'])
}
prediction_data['dist_to_standing_water'] = prediction_data['Location'].map(
    prediction_location_distances.get
)


In [19]:
# Persist the three feature-augmented frames for the next step of the pipeline.
for frame, destination in (
    (train, './assets/train_step1'),
    (test, './assets/test_step1'),
    (prediction_data, './assets/prediction_step1'),
):
    frame.to_csv(destination)