In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
weather_raw = pd.read_csv('../kaggle_competition/assets/weather.csv')
train_raw = pd.read_csv('../kaggle_competition/assets/train.csv')

In [3]:
weather_raw.head().T

Unnamed: 0,0,1,2,3,4
Station,1,2,1,2,1
Date,2007-05-01,2007-05-01,2007-05-02,2007-05-02,2007-05-03
Tmax,83,84,59,60,66
Tmin,50,52,42,43,46
Tavg,67,68,51,52,56
Depart,14,M,-3,M,2
DewPoint,51,51,42,42,40
WetBulb,56,57,47,47,48
Heat,0,0,14,13,9
Cool,2,3,0,0,0


In [37]:
cols = [col for col in weather_raw.columns]
target_cols = ['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'WetBulb', 'PrecipTotal', 'ResultSpeed', 'AvgSpeed']

weather_clean = weather_raw[target_cols]

# Drop Wetbulb rows with 'M'
weather_clean = weather_clean[weather_clean['WetBulb'] != 'M']
weather_clean['WetBulb'] = weather_clean['WetBulb'].astype(int)

# Proxy for PrecipTotal rows with 'T'
weather_clean['PrecipTotal'] = weather_clean['PrecipTotal'].map(lambda x: 0 if x in ['  T','M'] else x)
weather_clean['PrecipTotal'] = weather_clean['PrecipTotal'].astype(float)

# Drop AvgSpeed rows with 'M'
weather_clean = weather_clean[weather_clean['AvgSpeed'] != 'M']
weather_clean['AvgSpeed'] = weather_clean['AvgSpeed'].astype(float)

# Proxy for avg temp where values are 'M'
weather_clean['Tavg'] = weather_clean.apply(lambda row: (row['Tmax'] + row['Tmin'])/2 if row['Tavg'] == 'M' 
                                            else row['Tavg'], axis=1)

weather_clean['Tavg'] = weather_clean['Tavg'].astype(int)


In [41]:
weather_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2937 entries, 0 to 2943
Data columns (total 9 columns):
Station        2937 non-null int64
Date           2937 non-null object
Tmax           2937 non-null int64
Tmin           2937 non-null int64
Tavg           2937 non-null int64
WetBulb        2937 non-null int64
PrecipTotal    2937 non-null float64
ResultSpeed    2937 non-null float64
AvgSpeed       2937 non-null float64
dtypes: float64(3), int64(5), object(1)
memory usage: 229.5+ KB


In [42]:
ORD_clean = weather_clean[weather_clean['Station'] == 1]
MDW_clean = weather_clean[weather_clean['Station'] == 2]

In [43]:
train_raw.head().T

Unnamed: 0,0,1,2,3,4
Date,2007-05-29,2007-05-29,2007-05-29,2007-05-29,2007-05-29
Address,"4100 North Oak Park Avenue, Chicago, IL 60634,...","4100 North Oak Park Avenue, Chicago, IL 60634,...","6200 North Mandell Avenue, Chicago, IL 60646, USA","7900 West Foster Avenue, Chicago, IL 60656, USA","7900 West Foster Avenue, Chicago, IL 60656, USA"
Species,CULEX PIPIENS/RESTUANS,CULEX RESTUANS,CULEX RESTUANS,CULEX PIPIENS/RESTUANS,CULEX RESTUANS
Block,41,41,62,79,79
Street,N OAK PARK AVE,N OAK PARK AVE,N MANDELL AVE,W FOSTER AVE,W FOSTER AVE
Trap,T002,T002,T007,T015,T015
AddressNumberAndStreet,"4100 N OAK PARK AVE, Chicago, IL","4100 N OAK PARK AVE, Chicago, IL","6200 N MANDELL AVE, Chicago, IL","7900 W FOSTER AVE, Chicago, IL","7900 W FOSTER AVE, Chicago, IL"
Latitude,41.9547,41.9547,41.995,41.9741,41.9741
Longitude,-87.801,-87.801,-87.7693,-87.8248,-87.8248
AddressAccuracy,9,9,9,8,8


In [44]:
train_dist = train_raw[['Date', 'Latitude', 'Longitude']]
train_dist.head()

Unnamed: 0,Date,Latitude,Longitude
0,2007-05-29,41.95469,-87.800991
1,2007-05-29,41.95469,-87.800991
2,2007-05-29,41.994991,-87.769279
3,2007-05-29,41.974089,-87.824812
4,2007-05-29,41.974089,-87.824812


In [45]:
# Distances need to be updated with lat long calc
train_dist['Dist_ORD'] = 0.5 
train_dist['Dist_MDW'] = 1

# Calculate closest station
train_dist['Station'] = train_dist.apply(lambda row: 'ORD' if min(row['Dist_ORD'], row['Dist_MDW']) == row['Dist_ORD'] else 'MDW', axis=1)
train_dist['Number'] = train_dist['Station'].map(lambda x: 1 if x == 'ORD' else 2)

train_dist.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas

Unnamed: 0,Date,Latitude,Longitude,Dist_ORD,Dist_MDW,Station,Number
0,2007-05-29,41.95469,-87.800991,0.5,1,ORD,1
1,2007-05-29,41.95469,-87.800991,0.5,1,ORD,1
2,2007-05-29,41.994991,-87.769279,0.5,1,ORD,1
3,2007-05-29,41.974089,-87.824812,0.5,1,ORD,1
4,2007-05-29,41.974089,-87.824812,0.5,1,ORD,1
