In [139]:
import pandas as pd
from datetime import date, datetime, timedelta
from time import sleep
import html5lib
import string
import os

## Clean Hourly Weather Data

In [2]:
# Column names assigned to the raw hourly weather CSV when it is read,
# listed in file order.
columns = [
    'Time (CST)',
    'Temp',
    'Dew Point',
    'Humidity',
    'Pressure',
    'Visibility',
    'Wind Dir',
    'Wind Speed',
    'Gust Speed',
    'Precip',
    'Events',
    'Conditions',
    'Airport',
    'Date',
]

In [3]:
# Load the raw hourly weather observations, labelling the columns with the
# names listed in `columns`.
df = pd.read_csv(
    '../Assets/Datasets/Weather/Weather_Hourly_2016125129.csv',
    names=columns,
)

In [4]:
# Remove the 'Gust Speed' column: it has too many null values to be useful.
df = df.drop('Gust Speed', axis=1)

In [8]:
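# Convert 'Time (CST)' from 12-hour text (e.g. '2:51 AM') into a 24-hour integer (e.g. 251) to match the BTS time format,
# and add an 'Hour' column holding the 24-hour hour of day.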
def get_hour(x):
    """Return the 24-hour hour of day as an int.

    x is the result of splitting a 12-hour time string on ':' --
    x[0] is the hour text and x[1] is the minutes followed by a space and
    'AM' or 'PM' (for example ['2', '51 PM']).
    """
    hour_text = str(x[0])
    meridiem = x[1].split(' ')[1]
    is_twelve = hour_text == '12'

    # 12 AM is midnight, i.e. hour 0.
    if meridiem == 'AM' and is_twelve:
        return 0
    # Afternoon hours other than 12 PM shift by twelve.
    if meridiem == 'PM' and not is_twelve:
        return int(hour_text) + 12
    return int(hour_text)

def get_time(x):
    """Return the time of day as a 24-hour integer (e.g. 1451 for 2:51 PM).

    x is the result of splitting a 12-hour time string on ':' --
    x[0] is the hour text and x[1] is the minutes followed by a space and
    'AM' or 'PM' (for example ['2', '51 PM']).
    """
    hour_text = str(x[0])
    pieces = x[1].split(' ')
    minute_text = pieces[0]
    meridiem = pieces[1]

    if meridiem == 'AM' and hour_text == '12':
        # 12 AM is midnight, i.e. hour 00.
        hour_24 = '00'
    elif meridiem == 'PM' and hour_text != '12':
        # Afternoon hours other than 12 PM shift by twelve.
        hour_24 = str(int(hour_text) + 12)
    else:
        hour_24 = hour_text

    # Concatenate hour and minute digits, then read them as one integer.
    return int(hour_24 + minute_text)

# Split each time string on ':' once and derive both columns from the pieces.
# 'Hour' must be computed from the original text, so it is assigned before
# 'Time (CST)' is overwritten with its integer form.
time_parts = df['Time (CST)'].str.split(':')
df['Hour'] = time_parts.apply(get_hour)
df['Time (CST)'] = time_parts.apply(get_time)

In [9]:
# Keep only the first row for each (Airport, Date, Hour) combination.
df = df.drop_duplicates(subset=['Airport', 'Date', 'Hour'])

In [10]:
# Rebind `columns` to the frame's current column labels (no longer the
# original list of CSV names) so later cells iterate over the columns that
# actually exist in df at this point.
columns = df.columns

In [11]:
# Remove every character that is not in string.printable (this drops
# non-ASCII characters).
printable = set(string.printable)
def clean_cell(cell):
    """Return `cell` converted to text with non-printable characters removed.

    The value is first passed through str(), so non-string values (numbers,
    NaN, ...) come back as their string form.

    The result is built with ''.join so that it is always a string; a bare
    filter() call returns a lazy filter object on Python 3 rather than text.
    """
    cell = str(cell)
    return ''.join(char for char in cell if char in printable)
    
# Run every cell of every column through clean_cell.
for column in columns:
    df[column] = df[column].apply(clean_cell)

In [12]:
# Missing values arrive here as the text 'nan'. For 'Precip', both 'nan' and
# the marker 'T' are replaced with 0; for 'Events', 'nan' becomes 'No_Event'.
zero_precip_markers = ['nan', 'T']
df['Precip'] = df['Precip'].apply(
    lambda value: 0 if value in zero_precip_markers else value
)
df['Events'] = df['Events'].apply(
    lambda value: 'No_Event' if value == 'nan' else value
)

In [13]:
# Columns that should hold numbers only; non-numeric characters are stripped
# from these before they are converted to numeric dtypes.
num_columns = [
    'Hour',
    'Time (CST)',
    'Temp',
    'Dew Point',
    'Humidity',
    'Pressure',
    'Visibility',
    'Wind Speed',
    'Precip',
]

def num_only(x):
    """Return the numeric characters of `x` as a string.

    `x` is converted with str(); digits and '.' are kept and everything else
    (units, letters, whitespace) is discarded. A leading minus sign is
    preserved when at least one digit or '.' follows, so negative readings
    keep their sign instead of silently becoming positive. Values with no
    numeric characters (including a lone '-') yield ''.
    """
    text = str(x)
    kept = ''.join(char for char in text if char.isdigit() or char == '.')
    # Only re-attach the sign when there is a number to attach it to.
    if kept and text.lstrip().startswith('-'):
        return '-' + kept
    return kept

# Reduce each numeric column to its numeric characters.
for column in num_columns:
    df[column] = df[column].apply(num_only)

In [14]:
# Convert the cleaned text columns to numeric dtypes. pd.to_numeric is called
# once per column (vectorized) rather than once per cell.
for column in num_columns:
    df[column] = pd.to_numeric(df[column])

In [15]:
# Add 4-hour dayparts
def daypart(x):
    """Return the name of the 4-hour daypart that hour `x` falls in.

    Hours below 4 are 'red_eye', below 8 'early_morning', below 12 'morning',
    below 16 'afternoon', below 20 'evening'; anything else is 'night'.
    """
    # (exclusive upper bound, label), in ascending order.
    upper_bounds = (
        (4, 'red_eye'),
        (8, 'early_morning'),
        (12, 'morning'),
        (16, 'afternoon'),
        (20, 'evening'),
    )
    for bound, label in upper_bounds:
        if x < bound:
            return label
    return 'night'
            
# Label every row with the daypart of its hour.
df['Daypart'] = df['Hour'].apply(daypart)

In [16]:
# When 'Wind Dir' is 'Calm' the wind speed is calm as well, so set
# 'Wind Speed' to 0 on those rows and leave every other row unchanged.
# Done as a single vectorized mask instead of a Python call per row.
df['Wind Speed'] = df['Wind Speed'].mask(df['Wind Dir'] == 'Calm', 0)

In [51]:
# Preview the first rows only rather than rendering the entire frame.
df.head(10)

Unnamed: 0,Time (CST),Temp,Dew Point,Humidity,Pressure,Visibility,Wind Dir,Wind Speed,Precip,Events,Conditions,Airport,Date,Hour,Daypart
0,251,37.9,25.0,60.0,29.78,10.0,WSW,11.5,0.00,No_Event,Overcast,KMDW,2011-01-01,2,red_eye
1,351,36.0,26.1,67.0,29.78,10.0,SW,13.8,0.00,No_Event,Mostly Cloudy,KMDW,2011-01-01,3,red_eye
2,451,33.1,25.0,72.0,29.79,10.0,WSW,17.3,0.00,No_Event,Scattered Clouds,KMDW,2011-01-01,4,early_morning
3,551,30.9,21.9,69.0,29.81,10.0,WSW,16.1,0.00,No_Event,Scattered Clouds,KMDW,2011-01-01,5,early_morning
4,648,30.2,21.2,69.0,29.80,10.0,WSW,16.1,0.00,No_Event,Mostly Cloudy,KMDW,2011-01-01,6,early_morning
6,751,28.0,17.1,63.0,29.85,10.0,WSW,16.1,0.00,No_Event,Mostly Cloudy,KMDW,2011-01-01,7,early_morning
7,851,26.1,14.0,60.0,29.90,10.0,WSW,16.1,0.00,No_Event,Overcast,KMDW,2011-01-01,8,morning
8,951,24.1,10.0,55.0,29.93,10.0,WSW,23.0,0.00,No_Event,Overcast,KMDW,2011-01-01,9,morning
9,1051,21.9,8.1,55.0,29.94,10.0,WSW,21.9,0.00,No_Event,Overcast,KMDW,2011-01-01,10,morning
10,1151,21.0,7.0,55.0,29.94,10.0,WSW,20.7,0.00,No_Event,Mostly Cloudy,KMDW,2011-01-01,11,morning


In [136]:
# Count the null values in each column.
df.isnull().sum()

Time (CST)     0
Temp          11
Dew Point     13
Humidity      14
Pressure      18
Visibility    16
Wind Dir       0
Wind Speed    16
Precip         0
Events         0
Conditions     0
Airport        0
Date           0
Hour           0
Daypart        0
dtype: int64

In [144]:
# Discard every row that still has a null in any column, then show the
# resulting (rows, columns) shape.
df = df.dropna()
df.shape

(97575, 15)

## Error Checking

In [140]:
# Create table of all expected Airport, Date, Hours
def date_range(start, end):
    """Return a list of consecutive days from `start` through `end`, inclusive.

    Each element is `start` plus a whole number of days. The list is empty
    when `start` is later than `end`.
    """
    day_count = (end - start).days + 1
    return [start + timedelta(days=offset) for offset in range(day_count)]

# Expected coverage: every hour of every day in the range, for each airport.
airports = ['KMDW', 'KORD']
start_date = date(2011, 1, 1)
end_date = date(2016, 7, 31)
dates = date_range(start_date, end_date)
hours = range(0, 24)

# One [airport, day, hour] row per expected observation. The iteration
# variable is named `day`, not `date`: rebinding `date` would shadow the
# imported datetime.date class and make this cell fail when re-run.
table = [
    [airport, day, hour]
    for airport in airports
    for day in dates
    for hour in hours
]

table = pd.DataFrame(table, columns=['Airport', 'Date', 'Hour'])
table['Hour'] = table['Hour'].astype(int)
# Year plus zero-padded month and day, used to rebuild 'Date' as a
# 'YYYY-MM-DD' string so it can be joined against df's text dates.
table['Year'] = table['Date'].apply(lambda d: str(d.year))
table['Month'] = table['Date'].apply(lambda d: '{:02d}'.format(d.month))
table['Day'] = table['Date'].apply(lambda d: '{:02d}'.format(d.day))
table['Date'] = table['Year'] + '-' + table['Month'] + '-' + table['Day']

In [141]:
# Left-join the expected (Airport, Date, Hour) keys onto df; expected rows
# with no matching observation come back with a null 'Time (CST)'.
join_keys = ['Airport', 'Date', 'Hour']
match = pd.merge(table[join_keys], df, how='left', on=join_keys)
has_no_observation = match['Time (CST)'].isnull()
missing = match[has_no_observation]
missing.shape

In [142]:
# Number of missing hours for each airport and day, largest counts first.
missing_per_day = missing.groupby(['Airport', 'Date'])['Hour'].count()
missing_per_day.sort_values(ascending=False)

Airport  Date      
KMDW     2015-06-17    5
KORD     2015-06-17    5
KMDW     2014-05-06    4
         2011-02-17    4
KORD     2013-01-19    4
         2014-05-06    4
         2014-02-12    3
         2014-08-03    3
         2014-07-26    3
         2014-05-19    3
KMDW     2014-07-26    3
KORD     2014-01-27    3
         2014-09-06    3
KMDW     2014-08-13    3
KORD     2012-12-20    3
KMDW     2015-10-02    3
KORD     2014-08-13    3
KMDW     2016-01-04    3
KORD     2015-10-02    3
         2016-01-04    3
KMDW     2011-12-09    2
KORD     2012-12-17    2
         2014-06-28    2
         2014-06-12    2
         2014-06-09    2
         2014-02-27    2
         2015-10-03    2
KMDW     2014-05-19    2
KORD     2013-02-19    2
         2013-01-17    2
                      ..
         2011-07-24    1
         2011-06-07    1
         2011-05-28    1
         2011-05-12    1
         2011-03-13    1
         2011-01-01    1
KMDW     2016-07-06    1
KORD     2012-10-01    1
     

## Export Final

In [145]:
# Write the cleaned hourly weather data to CSV without the index column.
df.to_csv(
    '../Assets/Datasets/Weather/Clean_Weather_Hourly.csv',
    index=False,
)