In [5]:
import pandas as pd
import numpy as np
from geopy.distance import vincenty

# Load the three CSV datasets (weather, spray, train) from the assets folder.
weather_path, spray_path, train_path = (
    '../assets/weather.csv',
    '../assets/spray.csv',
    '../assets/train.csv',
)
# One read per path, in the same order as the paths above.
weather_df, spray_df, train_df = map(
    pd.read_csv, (weather_path, spray_path, train_path)
)

# Weather-station coordinates, taken from the Kaggle description page.
# The two stations in the weather data are two airports:
#   Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
#   Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level
LAT1, LON1 = 41.995, -87.933
LAT2, LON2 = 41.786, -87.752
# (latitude, longitude) pair for each station
L1, L2 = (LAT1, LON1), (LAT2, LON2)


# Restrict the weather data to the dates that appear in the training set.
training_days = train_df.Date.unique()
# Each message is built as a single formatted string so the line is valid under
# both the Python 2 print statement and the Python 3 print function, and the
# printed text is the same on either.
print('unique days in training set %d' % len(training_days))
print('unique days in weather data %d' % len(weather_df.Date.unique()))

# Only weather rows for days that have training records are needed;
# subsetting for efficiency.
weather_sub = weather_df[weather_df['Date'].isin(training_days)]
print('total weather row count: %d  only relevant days: %d'
      % (len(weather_df), len(weather_sub)))



# adding the fields:
# initially taking the temperature of the closer airport. Don't expect the temperature
# to drastically change between the two airports.
# will use an apply function to add several columns: the distance to each airport,
# the rough distance % for each airport,
# and the closer airport (more dominant)

def makeAirportPct(x):
    """Annotate one record with its distances to the two weather stations.

    Parameters
    ----------
    x : row-like object supporting item access (e.g. a row passed by
        ``DataFrame.apply(..., axis=1)``); must provide 'Latitude' and
        'Longitude'.

    Returns
    -------
    The same object ``x``, modified in place, with these entries added:
      STATION1DIST / STATION2DIST : vincenty distance in miles to L1 / L2.
      STATION1PCT / STATION2PCT   : each distance divided by the sum of both,
                                    so a larger share means the point is
                                    farther from that station.
      CLOSER_STATION              : 1 when station 1 is strictly nearer,
                                    otherwise 2 (ties go to station 2).
    """
    here = (x['Latitude'], x['Longitude'])
    miles_to_1 = vincenty(here, L1).miles
    miles_to_2 = vincenty(here, L2).miles
    total_miles = miles_to_1 + miles_to_2

    x['STATION1DIST'] = miles_to_1
    x['STATION2DIST'] = miles_to_2
    x['STATION1PCT'] = miles_to_1 / total_miles
    x['STATION2PCT'] = miles_to_2 / total_miles
    x['CLOSER_STATION'] = 1 if miles_to_1 < miles_to_2 else 2
    return x

# Add the station distance, share and closer-station columns to every training row.
train_df = train_df.apply(makeAirportPct, axis=1)

# Finally bring the weather in: a left join keeps every training row and
# matches it to the weather row for the same Date and its closer station.
train_df = pd.merge(
    train_df,
    weather_sub,
    how='left',
    left_on=['Date', 'CLOSER_STATION'],
    right_on=['Date', 'Station'],
)




unique days in training set 95
unique days in weather data 1472
total weather row count: 2944  only relevant days: 190


In [6]:
# Show the first rows of train_df (head() with its default row count).
train_df.head()

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,BR HZ,0,M,0.0,0.0,29.39,30.11,5.8,18,6.5
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,...,BR HZ,0,M,0.0,0.0,29.39,30.11,5.8,18,6.5
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,...,BR HZ,0,M,0.0,0.0,29.39,30.11,5.8,18,6.5
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,...,BR HZ,0,M,0.0,0.0,29.39,30.11,5.8,18,6.5
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,...,BR HZ,0,M,0.0,0.0,29.39,30.11,5.8,18,6.5
