In [14]:
import pandas as pd
import numpy as np
from geopy.distance import vincenty
import cPickle as pickle

def get_merged_weather(csv_path, clean_weather_path='../assets/weather_clean.csv'):
    """Attach nearest-station weather and one-hot month columns to a CSV of locations.

    Steps performed:
      1. Read the cleaned weather CSV and the CSV at ``csv_path``.
      2. For every row of the input, compute the distance in miles to the two
         weather stations and record which one is closer.
      3. Left-merge the weather rows onto the input, matching on ``Date`` and
         on the closer station (``CLOSER_STATION`` == weather ``Station``).
      4. Replace ``Date`` with its month number and add twelve integer-named
         columns (1..12) holding a 0/1 indicator for that month.

    Parameters
    ----------
    csv_path : str
        Path to a CSV that has ``Date``, ``Latitude`` and ``Longitude`` columns.
    clean_weather_path : str, optional
        Path to the cleaned weather CSV; it must have ``Date`` and ``Station``
        columns. Defaults to the location this function originally hard-coded.

    Returns
    -------
    pandas.DataFrame
        The merged frame. It gains the columns ``STATION1DIST``,
        ``STATION2DIST``, ``STATION1PCT``, ``STATION2PCT``, ``CLOSER_STATION``,
        the weather columns, and the month indicator columns 1..12. ``Date``
        holds the month number, not the original date.

    Notes
    -----
    Progress is printed to stdout. The messages say "training set" regardless
    of which file is passed in. Every print uses a single pre-formatted string
    so the output is the same whether ``print`` is a statement or a function.
    """
    weather_df = pd.read_csv(clean_weather_path)
    # keep only the part of each weather 'Date' string before the first space
    weather_df['Date'] = weather_df['Date'].map(lambda x: x.split(' ')[0])
    base_df = pd.read_csv(csv_path)

    # From Kaggle description page:
    # for the weather data, here are the GEO coordinates for the two stations (two airports)
    # Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
    # Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level
    LAT1 = 41.995
    LON1 = -87.933
    LAT2 = 41.786
    LON2 = -87.752
    L1 = (LAT1, LON1)
    L2 = (LAT2, LON2)

    # looking only for the particular days present in the input file
    training_days = base_df.Date.unique()
    print('unique days in training set %d' % len(training_days))
    print('unique days in weather data %d' % len(weather_df.Date.unique()))

    # only need weather data for the days that were recorded, subsetting for efficiency
    weather_sub = weather_df[weather_df['Date'].isin(training_days)]
    print('total weather row count: %d  only relevant days: %d'
          % (len(weather_df), len(weather_sub)))

    # adding the fields,
    # initially taking the temperature of the closer airport. Don't expect the temperature
    # to drastically change between the two airports.
    # will use an apply function to add several columns: distance to each airport,
    # the rough distance %,
    # and the closer airport (more dominant)

    def makeAirportPct(x):
        """Add station distances, distance shares and the closer station to row ``x``."""
        current = (x['Latitude'], x['Longitude'])
        dist_to_stat1 = vincenty(current, L1).miles
        dist_to_stat2 = vincenty(current, L2).miles
        denom = dist_to_stat1 + dist_to_stat2
        x['STATION1DIST'] = dist_to_stat1
        x['STATION2DIST'] = dist_to_stat2
        # share of the summed distance: a larger share means the station is farther away
        x['STATION1PCT'] = dist_to_stat1 / denom
        x['STATION2PCT'] = dist_to_stat2 / denom
        # a tie is assigned to station 2
        if dist_to_stat1 < dist_to_stat2:
            x['CLOSER_STATION'] = 1
        else:
            x['CLOSER_STATION'] = 2
        return x

    print('comparing locations to airports...')
    base_df = base_df.apply(makeAirportPct, axis=1)
    print('assignments complete, preparing to merge in weather')
    print('pre-merge rows %s' % (base_df.shape,))

    # lastly will merge the weather in, will match by DATE and STATION
    merged_df = base_df.merge(weather_sub, how='left',
                              left_on=['Date', 'CLOSER_STATION'],
                              right_on=['Date', 'Station'])

    print('post-merge rows %s' % (merged_df.shape,))

    def create_month_features(df):
        """Replace 'Date' with its month number and add 0/1 columns named 1..12."""
        months = pd.to_datetime(df['Date']).dt.month
        df['Date'] = months
        # One vectorized comparison per month instead of a per-cell `.ix` write;
        # this also does not depend on the frame having a 0..n-1 index.
        for i in range(1, 13):
            df[i] = (months == i).astype('int64')
        return df

    merged_df = create_month_features(merged_df)
    # reported only once the month columns actually exist
    print('finished adding month features %s' % (merged_df.shape,))
    return merged_df

In [17]:
# Build the weather-merged training table and cache it on disk as a pickle.
train_df = get_merged_weather('../assets/train.csv')
with open('../assets/train_w_weather.p','wb') as f:
    # Ask for the highest binary protocol explicitly: Python 2's pickle/cPickle
    # falls back to the slow, bulky ASCII protocol 0 when none is given.
    # pickle.load detects the protocol on its own, so readers need no change.
    pickle.dump(train_df, f, pickle.HIGHEST_PROTOCOL)
    


unique days in training set 95
unique days in weather data 1472
total weather row count: 2944  only relevant days: 190
pre-merge rows (10506, 17)
pre-merge rows (10506, 17)


In [None]:
# Build the weather-merged test table and cache it on disk as a pickle.
test_df = get_merged_weather('../assets/test.csv')
with open('../assets/test_w_weather.p','wb') as f:
    # Ask for the highest binary protocol explicitly: Python 2's pickle/cPickle
    # falls back to the slow, bulky ASCII protocol 0 when none is given.
    # pickle.load detects the protocol on its own, so readers need no change.
    pickle.dump(test_df, f, pickle.HIGHEST_PROTOCOL)

unique days in training set 95
unique days in weather data 1472
total weather row count: 2944  only relevant days: 190


In [None]:
import pandas as pd
import numpy as np
from geopy.distance import vincenty
import cPickle as pickle


# importing all datasets
weather_path = '../assets/weather.csv'
clean_weather_path = '../assets/weather_clean.csv'
spray_path = '../assets/spray.csv'
train_path = '../assets/train.csv'
weather_df = pd.read_csv(clean_weather_path)
weather_df['Date'] = weather_df['Date'].map(lambda x : x.split(' ')[0]) 
spray_df = pd.read_csv(spray_path)
train_df = pd.read_csv(train_path)

# From Kaggle description page:
# for the weather data, here are the GEO coordinates for the two stations (two airports)
# Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
# Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level
LAT1 = 41.995
LON1 = -87.933
LAT2 = 41.786
LON2 = -87.752
L1 = (LAT1,LON1)
L2 = (LAT2,LON2)


# looking only for these particular days (in training set)
training_days = train_df.Date.unique()
print 'unique days in training set', len(train_df.Date.unique())
print 'unique days in weather data',len(weather_df.Date.unique())

# only need weather data for the day that was recorded, subsetting for efficiency
weather_sub = weather_df[weather_df['Date'].isin(training_days)]
print 'total weather row count:', len(weather_df),' only relevant days:', len(weather_sub)



# adding the fields,
# initially taking the temperature of the closer airport. Don't expect the temperature
# to drastically change between the two airports.
# will use an apply function to add several columns: the distance to each airport,
# the rough distance %,
# and the closer airport (more dominant)

def makeAirportPct(x):
    """Annotate one row with its distance to each weather station.

    Reads ``x['Latitude']`` and ``x['Longitude']``, measures the distance in
    miles to the module-level station coordinates ``L1`` and ``L2`` with
    ``vincenty``, and writes five entries back into ``x``:

    * ``STATION1DIST`` / ``STATION2DIST`` - miles to station 1 / station 2
    * ``STATION1PCT`` / ``STATION2PCT`` - each distance divided by their sum
    * ``CLOSER_STATION`` - 1 when station 1 is strictly nearer, otherwise 2

    ``x`` is modified in place and also returned, which is the shape
    ``DataFrame.apply(..., axis=1)`` needs to build the widened frame.
    """
    here = (x['Latitude'], x['Longitude'])
    miles_to_1 = vincenty(here, L1).miles
    miles_to_2 = vincenty(here, L2).miles
    total_miles = miles_to_1 + miles_to_2

    x['STATION1DIST'] = miles_to_1
    x['STATION2DIST'] = miles_to_2
    x['STATION1PCT'] = miles_to_1 / total_miles
    x['STATION2PCT'] = miles_to_2 / total_miles
    # equal distances fall through to station 2
    x['CLOSER_STATION'] = 1 if miles_to_1 < miles_to_2 else 2
    return x

train_df = train_df.apply(makeAirportPct,axis=1)

print 'pre-merge rows', train_df.shape

#lastly will merge the weather in, will match by DATE and STATION
train_df = train_df.merge(weather_sub,how='left',left_on=['Date','CLOSER_STATION'], right_on=['Date','Station'])

print 'pre-merge rows', train_df.shape