In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
np.random.seed(42)

from matplotlib import rcParams
rcParams.update({'figure.autolayout':True})

plt.style.use(["presentation"])

%matplotlib inline

# Importing cleaning and splitting data

In [2]:
prediction_data=pd.read_csv('./assets/test.csv')

In [3]:
working_data=pd.read_csv('./assets/train.csv')

In [4]:
NOAA=pd.read_csv('./assets/weather.csv')

In [5]:
spray=pd.read_csv('./assets/spray.csv')

In [6]:
train, test=train_test_split(working_data, test_size=.20, random_state=523)

In [7]:
from haversine import haversine

In [8]:
def clean(df):
    df['Date']=pd.to_datetime(df['Date'])
    dummies=pd.get_dummies(df['Species']).drop(['CULEX TERRITANS'], axis=1)
    df=pd.concat([df, dummies], axis=1)
    df['Location']=list(zip(df['Latitude'],df['Longitude']))
    df=df.drop(['Address','Block', 'Street', 'Trap', 
                'AddressNumberAndStreet', 'AddressAccuracy','Species'], axis=1)
                    
    return df

In [9]:
df = clean(working_data)

In [10]:
df.head(2)

Unnamed: 0,Date,Latitude,Longitude,NumMosquitos,WnvPresent,CULEX ERRATICUS,CULEX PIPIENS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX SALINARIUS,CULEX TARSALIS,Location
0,2007-05-29,41.95469,-87.800991,1,0,0,0,1,0,0,0,"(41.95469, -87.800991)"
1,2007-05-29,41.95469,-87.800991,1,0,0,0,0,1,0,0,"(41.95469, -87.800991)"


## Distance calculation

** Data Preparation **

Spray Locations

In [10]:
spray.head(3)

Unnamed: 0,Date,Time,Latitude,Longitude
0,2011-08-29,6:56:58 PM,42.391623,-88.089163
1,2011-08-29,6:57:08 PM,42.391348,-88.089163
2,2011-08-29,6:57:18 PM,42.391022,-88.089157


In [11]:
spray['Location']=list(zip(spray['Latitude'],spray['Longitude']))

In [12]:
# getting the unique spray locations
spray.drop_duplicates(['Location'], keep='last',inplace=True)

In [13]:
spray.shape

(14294, 5)

Test locations

In [14]:
test['Location']=list(zip(test['Latitude'],test['Longitude']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
test.head(2)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,Location
3477,2007-09-24,"3700 North Kedvale Avenue, Chicago, IL 60641, USA",CULEX PIPIENS/RESTUANS,37,N KEDVALE AVE,T218,"3700 N KEDVALE AVE, Chicago, IL",41.948167,-87.730698,8,7,0,"(41.948167, -87.730698)"
5484,2009-08-25,"3700 118th Street, Chicago, IL 60617, USA",CULEX PIPIENS/RESTUANS,37,E 118TH ST,T212,"3700 E 118TH ST, Chicago, IL",41.680946,-87.535198,8,1,0,"(41.680946, -87.535198)"


Train Locations

In [16]:
train['Location']=list(zip(train['Latitude'],train['Longitude']))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [17]:
train.head(2)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,Location
4526,2009-07-06,"5000 South Union Avenue, Chicago, IL 60609, USA",CULEX PIPIENS/RESTUANS,50,S UNION AVE,T082,"5000 S UNION AVE, Chicago, IL",41.803423,-87.642984,8,4,0,"(41.803422999999995, -87.642984)"
2076,2007-08-16,"South Doty Avenue, Chicago, IL, USA",CULEX PIPIENS,12,S DOTY AVE,T115,"1200 S DOTY AVE, Chicago, IL",41.673408,-87.599862,5,50,1,"(41.673408, -87.599862)"


## Spray Distanec Calculations

In [None]:
# assign list of spray locations as iteration target
traps =spray['Location'].dropna()

def measure_to_spray(input_location):
    distances=[]
    for location in traps:
        trap_distance=haversine(location, input_location)
        distances+=[trap_distance]
    return min(distances)
        

df['dist_to_traps']=df['Location'].map(measure_to_spray)
train['dist_to_traps']=train['Location'].map(measure_to_spray)
test['dist_to_traps']=train['Location'].map(measure_to_spray)
prediction_data['dist_to_traps']=train['Location'].map(measure_to_spray)

In [30]:
prediction_data.head(2)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,dist_to_traps
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,0.256049
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,0.256049


In [28]:
train.head(2)


Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,Location,dist_to_traps
4526,2009-07-06,"5000 South Union Avenue, Chicago, IL 60609, USA",CULEX PIPIENS/RESTUANS,50,S UNION AVE,T082,"5000 S UNION AVE, Chicago, IL",41.803423,-87.642984,8,4,0,"(41.803422999999995, -87.642984)",4.701486
2076,2007-08-16,"South Doty Avenue, Chicago, IL, USA",CULEX PIPIENS,12,S DOTY AVE,T115,"1200 S DOTY AVE, Chicago, IL",41.673408,-87.599862,5,50,1,"(41.673408, -87.599862)",4.605039


## Saving files to pickle

In [31]:
train.to_pickle('train_Hg.pickle')
test.to_pickle('test_Hg.pickle')
prediction_data.to_pickle('prediection_data_Hg.pickle')