In [1]:
import pandas as pd
import numpy as np
from datetime import date, datetime, timedelta
from time import sleep
import html5lib
import string
import os

## Clean Hourly Weather Data

In [62]:
# Column names supplied to pd.read_csv(..., names=columns) for the raw hourly weather files
columns = [
    'Time (CST)',
    'Temp',
    'Dew Point',
    'Humidity',
    'Pressure',
    'Visibility',
    'Wind Dir',
    'Wind Speed',
    'Gust Speed',
    'Precip',
    'Events',
    'Conditions',
    'Airport',
    'Date',
]

In [63]:
# Read the raw hourly weather CSV; column names come from `columns` rather than the file.
# NOTE(review): the next cell assigns `df` again from a different CSV, so in a top-to-bottom
# run that later read replaces this frame — confirm which input file is intended.
df = pd.read_csv('../Assets/Datasets/Weather/Weather_Hourly_2016125129.csv', names=columns)

In [5]:
# Read two hourly weather CSV files, both using the shared `columns` names.
# NOTE(review): `df_2` is not referenced again in the visible cells — confirm whether it
# should be combined with `df` or whether this load is left over from an earlier run.
df = pd.read_csv('../Assets/Datasets/Weather/Weather_Hourly_DEN201612101537.csv', names=columns)
df_2 = pd.read_csv('../Assets/Datasets/Weather/Weather_Hourly_DEN201612101738.csv', names=columns)

In [64]:
# Drop Gust Speed: too many null values to be useful.
# (Only this column is removed; `columns` contains no wind-chill field.)
df = df.drop('Gust Speed', axis=1)

In [65]:
# Format time to match BTS time (24hr int)
# add hours column
def get_hour(x):
    """Convert a split 12-hour clock reading into a 24-hour hour string.

    ``x`` is the result of splitting a time such as ``'1:53 PM'`` on ``':'``:
    ``x[0]`` is the hour and ``x[1]`` is ``'<minutes> <AM|PM>'``.

    Returns the hour as a string: ``'00'`` for 12 AM, the hour plus 12 for
    PM hours other than 12, and the hour text unchanged in every other case.
    """
    clock_hour = str(x[0])
    meridiem = x[1].split(' ')[1]
    is_twelve = clock_hour == '12'

    if meridiem == 'AM':
        # 12 AM is midnight; all other AM hours are already correct
        return '00' if is_twelve else clock_hour
    if meridiem == 'PM' and not is_twelve:
        # 12 PM stays 12; 1 PM..11 PM shift to 13..23
        return str(int(clock_hour) + 12)
    return clock_hour

# Derive the 24-hour 'Hour' string from the 12-hour clock text
df['Hour'] = df['Time (CST)'].str.split(':').apply(get_hour)

# Combine date and hour into one timestamp column (minutes are fixed at ':00').
# NOTE(review): infer_datetime_format is deprecated in pandas 2.x — confirm the target version.
df['DateTime'] = pd.to_datetime(
    df['Date'] + ' ' + df['Hour'] + ':00',
    infer_datetime_format=True,
)

# The original clock text is no longer needed
df.drop(['Time (CST)'], axis=1, inplace=True)

In [66]:
# Keep only the first row for each (Airport, DateTime) pair;
# DateTime already encodes both the date and the hour.
df.drop_duplicates(subset=['Airport', 'DateTime'], keep='first', inplace=True)

In [67]:
# remove non-ascii characters
# Columns whose cells are run through clean_cell() to strip non-printable characters
non_ascii_columns = [
    'Temp',
    'Dew Point',
    'Humidity',
    'Pressure',
    'Visibility',
    'Wind Dir',
    'Wind Speed',
    'Precip',
    'Events',
    'Conditions',
    'Airport',
]
# string.printable contains only ASCII characters, so membership in this set
# also acts as the non-ASCII filter.
printable = set(string.printable)
def clean_cell(cell):
    """Return ``cell`` as a string with all non-printable characters removed.

    The value is first converted with ``str()`` (so NaN becomes ``'nan'``),
    then every character not found in ``string.printable`` is dropped.

    Always returns a real ``str``. A bare ``filter()`` call would return a
    lazy iterator on Python 3, which breaks later string comparisons.
    """
    cell = str(cell)
    return ''.join(ch for ch in cell if ch in printable)
    
# Run the character filter over every listed column, cell by cell
for column in non_ascii_columns:
    df[column] = df[column].map(clean_cell)

In [68]:
# After clean_cell() missing values are the literal string 'nan'.
# Precip: 'nan' and 'T' become 0 (presumably 'T' marks trace precipitation — confirm with the data source).
# Events: 'nan' becomes the label 'No_Event'.
df['Precip'] = df['Precip'].apply(lambda value: value if value not in ('nan', 'T') else 0)
df['Events'] = df['Events'].apply(lambda value: value if value != 'nan' else 'No_Event')

In [20]:
# # replace null values in precip and events
# df = df.fillna(value={'Precip' : 0, 'Events' : 'No_Event'}, inplace=True)

In [70]:
# Record the sign of each temperature as a +1/-1 multiplier so it can be
# reapplied after the non-numeric characters (including '-') are stripped
df['negative_temp'] = df['Temp'].apply(lambda reading: 1 if '-' not in reading else -1)

In [75]:
# remove non-numeric characters, change numeric columns to numeric
# Measurement columns that get stripped to digits and converted to a numeric dtype
num_columns = [
    'Temp',
    'Dew Point',
    'Humidity',
    'Pressure',
    'Visibility',
    'Wind Speed',
    'Precip',
]

def num_only(x):
    """Return only the digits and '.' characters of ``str(x)``, in order.

    Everything else — units, whitespace, letters and the minus sign — is
    discarded, so the sign of a negative value is lost. A value with no
    digits or dots yields an empty string.
    """
    return ''.join(ch for ch in str(x) if ch.isdigit() or ch == '.')

# num_only() strips '-' along with the units, so a negative Dew Point would come back
# positive. Capture its sign first, mirroring the 'negative_temp' handling for Temp.
df['negative_dew_point'] = df['Dew Point'].apply(lambda x: -1 if '-' in str(x) else 1)

# Strip non-numeric characters, then convert each measurement column to a numeric dtype.
# Cells left empty by num_only() become NaN.
for column in num_columns:
    df[column] = pd.to_numeric(df[column].apply(num_only))

# Reapply the signs removed by num_only(), then discard the helper columns.
df['Temp'] = df['Temp'] * df['negative_temp']
df['Dew Point'] = df['Dew Point'] * df['negative_dew_point']
df.drop(['negative_temp', 'negative_dew_point'], axis=1, inplace=True)

In [86]:
# Add 4-hour dayparts
def daypart(x):
    """Map an hour of the day to the name of its four-hour daypart.

    ``x`` is converted with ``int()``. Hours below 4 are 'red_eye', below 8
    'early_morning', below 12 'morning', below 16 'afternoon', below 20
    'evening', and anything else is 'night'.
    """
    hour = int(x)
    # (exclusive upper bound, label) pairs in ascending order
    upper_bounds = (
        (4, 'red_eye'),
        (8, 'early_morning'),
        (12, 'morning'),
        (16, 'afternoon'),
        (20, 'evening'),
    )
    for limit, label in upper_bounds:
        if hour < limit:
            return label
    return 'night'
            
# Label every row with the daypart of its 'Hour' value
df['Daypart'] = df['Hour'].apply(daypart)

In [87]:
# A 'Calm' wind direction means there is no wind speed reading to parse,
# so set Wind Speed to 0 on those rows and leave every other row unchanged
df['Wind Speed'] = df['Wind Speed'].mask(df['Wind Dir'] == 'Calm', 0)

In [88]:
# Count the missing values remaining in each column after cleaning
df.isnull().sum()

Temp          11
Dew Point     13
Humidity      14
Pressure      18
Visibility    16
Wind Dir       0
Wind Speed    16
Precip         0
Events         0
Conditions     0
Airport        0
Date           0
Hour           0
DateTime       0
Daypart        0
dtype: int64

In [89]:
# Discard every row that still has a missing value in any column,
# then show the remaining (rows, columns) size
df.dropna(axis=0, how='any', inplace=True)
df.shape

(97575, 15)

## Error Checking

In [90]:
# Create table of all expected Airport, Date, Hours
def date_range(start, end):
    """Return every day from ``start`` through ``end`` inclusive, one day apart.

    The list is empty when ``end`` is earlier than ``start``.
    """
    whole_days = (end - start).days
    return [start + timedelta(days=offset) for offset in range(whole_days + 1)]

# Build one row per (airport, day, hour) that is expected to exist in the weather data.
airports = df['Airport'].unique().tolist()
start_date = date(2011, 1, 1)
end_date = date(2016, 7, 31)
dates = date_range(start_date, end_date)
hours = range(0, 24)

# The loop variable is named `day`, not `date`: rebinding `date` would shadow the imported
# datetime.date class and make this cell fail when it is run a second time.
table = pd.DataFrame(
    [
        [airport, datetime(day.year, day.month, day.day, hour), day]
        for airport in airports
        for day in dates
        for hour in hours
    ],
    # Passing the names here also keeps the frame well-formed when there are no rows
    columns=['Airport', 'DateTime', 'Dates'],
)


In [91]:
# Left-join the expected hours onto the cleaned data; expected rows with no
# match get a null 'Date', which identifies the hours absent from df
match = table.merge(df, how='left', on=['Airport', 'DateTime'])
missing = match.loc[match['Date'].isnull()]
missing.shape

(297, 16)

In [25]:
# Count the missing hours for each airport and day, largest gaps first.
# A day missing all 24 hours needs to be pulled again;
# a day missing fewer than 24 hours is missing on the source site itself.
(
    missing
    .groupby(['Airport', 'Dates'])['DateTime']
    .count()
    .sort_values(ascending=False)
)

Airport  Dates     
KDEN     2011-07-08    6
         2016-01-04    4
         2014-05-06    4
         2015-06-17    4
         2015-10-02    4
         2011-07-09    4
         2014-07-26    3
         2012-04-12    2
         2014-05-19    2
         2012-12-17    2
         2013-02-19    2
         2013-03-10    2
         2015-03-17    2
         2015-06-20    2
         2015-06-16    2
         2011-02-04    2
         2011-01-26    2
         2016-06-26    2
         2014-08-13    2
         2012-10-18    1
         2013-02-04    1
         2012-11-04    1
         2012-12-05    1
         2012-12-13    1
         2013-07-02    1
         2013-01-15    1
         2013-11-03    1
         2013-01-17    1
         2013-02-05    1
         2013-07-01    1
                      ..
         2016-03-13    1
         2016-05-28    1
         2016-06-11    1
         2014-09-16    1
         2014-09-08    1
         2014-09-05    1
         2014-06-19    1
         2014-05-04    1
     

## Export Final

In [92]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): the following cell writes the same `df` to Clean_Arr_Weather_Hourly.csv;
# presumably the notebook is re-run with a different input file for each export — confirm.
df.to_csv('../Assets/Datasets/Weather/Clean_Dep_Weather_Hourly.csv', index=False)

In [26]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): the preceding cell writes the same `df` to Clean_Dep_Weather_Hourly.csv;
# in a single top-to-bottom run both files would be identical — confirm the intended input.
df.to_csv('../Assets/Datasets/Weather/Clean_Arr_Weather_Hourly.csv', index=False)