In [1]:
import pandas as pd
from datetime import date, datetime, timedelta
from time import sleep
import html5lib
import string
import os

## Clean Hourly Weather Data

### The original scrape had all columns; the rescrape is missing the Windchill column

In [2]:
# Original
# Field names for the original scrape, which includes 'Windchill'
columns = [
    'Time (CST)',
    'Temp',
    'Windchill',
    'Dew Point',
    'Humidity',
    'Pressure',
    'Visibility',
    'Wind Dir',
    'Wind Speed',
    'Gust Speed',
    'Precip',
    'Events',
    'Conditions',
    'Airport',
    'Date',
]

In [3]:
# Original
# Load the original hourly scrape, labelling its fields with the names in `columns`
df = pd.read_csv(
    '../Assets/Datasets/Weather/Weather_Hourly_201611201715.csv',
    names=columns,
)

In [4]:
# Original
# Windchill and Gust Speed have too many null values, so both columns are dropped
df = df.drop(labels=['Windchill', 'Gust Speed'], axis=1)

In [None]:
# Rescrape
# Field names for the rescrape file: same as the original list minus 'Windchill'
columns = [
    'Time (CST)',
    'Temp',
    'Dew Point',
    'Humidity',
    'Pressure',
    'Visibility',
    'Wind Dir',
    'Wind Speed',
    'Gust Speed',
    'Precip',
    'Events',
    'Conditions',
    'Airport',
    'Date',
]

In [None]:
# Rescrape
# Load the rescraped hourly data, labelling its fields with the names in `columns`
df = pd.read_csv(
    '../Assets/Datasets/Weather/Weather_Hourly_rescrape.csv',
    names=columns,
)

In [None]:
# Rescrape
# Only Gust Speed is dropped here (too many null values); the rescrape column
# list has no Windchill column to remove
df = df.drop(labels=['Gust Speed'], axis=1)

In [5]:
# Format time to match BTS time (24hr int)
# add hours column
def get_hour(x):
    """Return the 24-hour clock hour for a time string that was split on ':'.

    x is a two-item sequence such as ['1', '53 PM']: x[0] is the 12-hour
    clock hour and x[1] holds the minutes, a space, then the AM/PM marker.
    The result is an int (0 for 12 AM, 13 for 1 PM, and so on).
    """
    clock_hour = str(x[0])
    meridiem = x[1].split(' ')[1]
    if clock_hour == '12':
        # 12 AM is midnight; 12 with any other marker is left as 12
        return 0 if meridiem == 'AM' else 12
    if meridiem == 'PM':
        return int(clock_hour) + 12
    return int(clock_hour)

def get_time(x):
    """Return a 24-hour HHMM integer for a time string that was split on ':'.

    x is a two-item sequence such as ['1', '53 PM']. The converted hour and
    the minute text are concatenated and parsed as one int, so 12:53 AM
    becomes 53 and 1:05 PM becomes 1305.
    """
    clock_hour = str(x[0])
    minute_and_marker = x[1].split(' ')
    minute = minute_and_marker[0]
    meridiem = minute_and_marker[1]
    if clock_hour == '12':
        # Only midnight needs rewriting; 12 PM already reads correctly
        if meridiem == 'AM':
            clock_hour = '00'
    elif meridiem == 'PM':
        clock_hour = str(int(clock_hour) + 12)
    return int(clock_hour + minute)

# 'Hour' must be derived first: it needs the original 12-hour text, which the
# second statement overwrites with the integer HHMM form
df['Hour'] = df['Time (CST)'].str.split(':').apply(get_hour)
df['Time (CST)'] = df['Time (CST)'].str.split(':').apply(get_time)

In [6]:
# Keep a single row for each (Airport, Date, Hour) combination
df = df.drop_duplicates(subset=['Airport', 'Date', 'Hour'])

In [7]:
# Rebind `columns` to the frame's current column labels (previously it held the
# list of names passed to read_csv); the cleaning loop iterates over these
columns = df.columns

In [8]:
# remove non-ascii characters
printable = set(string.printable)
def clean_cell(cell):
    """Return `cell` as a string containing only characters in string.printable.

    The value is first converted with str(), so non-string cells (numbers,
    NaN) come back as their cleaned string form, e.g. NaN becomes 'nan'.
    """
    cell = str(cell)
    # Join explicitly: on Python 3 filter() yields a lazy iterator rather than
    # a str, which would store filter objects in the DataFrame
    return ''.join(char for char in cell if char in printable)
    
# Replace every value in every column with the result of clean_cell
for column in columns:
    df[column] = df[column].apply(clean_cell)

In [9]:
# Precip: the strings 'nan' (a missing value) and 'T' are both replaced with 0
# NOTE(review): 'T' presumably stands for a trace amount -- confirm with the data source
df['Precip'] = df['Precip'].apply(lambda value: value if value not in ['nan', 'T'] else 0)
# Events: the string 'nan' (a missing value) is replaced with 'No_Event'
df['Events'] = df['Events'].apply(lambda value: value if value != 'nan' else 'No_Event')

In [10]:
# remove non-numeric characters
# Columns that are stripped down to numeric text and later converted to numbers
num_columns = [
    'Hour',
    'Time (CST)',
    'Temp',
    'Dew Point',
    'Humidity',
    'Pressure',
    'Visibility',
    'Wind Speed',
    'Precip',
]

def num_only(x):
    """Return the numeric text of `x`: its digits and '.' characters, with sign.

    `x` is converted with str() first. Every character that is not a digit
    or '.' is removed. A leading '-' is kept when at least one digit or '.'
    follows, so negative readings such as '-5.1' stay negative instead of
    silently turning positive. Values with no numeric characters (for
    example '-', 'Calm' or 'N/A') yield an empty string.
    """
    text = str(x)
    digits = ''.join(char for char in text if char.isdigit() or char == '.')
    # A lone '-' has no digits and must still collapse to the empty string
    if digits and text.lstrip().startswith('-'):
        return '-' + digits
    return digits

# Reduce each numeric column to numeric text (still stored as strings at this point)
for column in num_columns:
    df[column] = df[column].apply(num_only)

In [11]:
# Change types
# pd.to_numeric converts a whole column in one call; calling it once per cell
# through Series.apply gives the same values but is much slower
for column in num_columns:
    df[column] = pd.to_numeric(df[column])

In [12]:
# Add 4-hour dayparts
def daypart(x):
    """Return the name of the 4-hour daypart that hour `x` falls in.

    Upper bounds are exclusive and checked in ascending order: below 4 is
    'red_eye', below 8 'early_morning', below 12 'morning', below 16
    'afternoon', below 20 'evening'. Any value that is not below 20 is 'night'.
    """
    upper_bounds = (
        (4, 'red_eye'),
        (8, 'early_morning'),
        (12, 'morning'),
        (16, 'afternoon'),
        (20, 'evening'),
    )
    for bound, label in upper_bounds:
        if x < bound:
            return label
    return 'night'
            
# Label every row with the daypart of its 'Hour' value
df['Daypart'] = df['Hour'].apply(daypart)

In [13]:
# When Wind Dir is Calm, Wind speed is also calm
# Replace wind speed as 0 if wind dir is calm
# Series.mask substitutes 0 wherever the condition is True and keeps the
# existing value elsewhere, without a Python-level pass over every row
df['Wind Speed'] = df['Wind Speed'].mask(df['Wind Dir'] == 'Calm', 0)

## Error Checking

In [None]:
# Rows where Wind Speed or Visibility is null (Precip is not part of this filter).
# These tables had no Windchill column on Wunderground
# NOTE(review): 'Conditions' is matched against airport codes and 'Airport' is
# split like a yyyy-mm-dd date, so the values in these rows are presumably shifted
# one column to the left by the missing Windchill field -- confirm against the raw CSV.
missing_mask = df['Wind Speed'].isnull() | df['Visibility'].isnull()
repull = df.loc[missing_mask, ['Conditions', 'Airport']]
repull = repull[repull['Conditions'].isin(['KMDW', 'KORD'])]
# Function-call form prints the same text on Python 2 and also parses on Python 3
print(repull.shape)
# Reassign an explicit copy instead of mutating a filtered slice in place
repull = repull.drop_duplicates().copy()
# Split the date-like text once and reuse the pieces for all three columns
date_parts = repull['Airport'].str.split('-')
repull['Year'] = date_parts.apply(lambda parts: parts[0])
repull['Month'] = date_parts.apply(lambda parts: parts[1])
repull['Day'] = date_parts.apply(lambda parts: parts[2])
repull = repull.drop('Airport', axis=1)
# The remaining 'Conditions' column is the one filtered on airport codes; relabel it
repull.columns = ['Airport', 'Year', 'Month', 'Day']


In [None]:
# Rescrape
# Write the airport / year / month / day rows in `repull` to disk, without the index
repull.to_csv(
    '../Assets/Datasets/Weather/incorrectly_scraped.csv',
    index=False,
)

## Error Correcting

In [None]:
# Rescrape
# Plain assignment: df_repull refers to the same DataFrame object as df (no copy is made).
# NOTE(review): presumably meant to run while df holds the cleaned rescrape data, with
# df_clean built in a separate pass over the original file -- confirm the intended run order.
df_repull = df

In [14]:
# Remove rescrape rows from df
# i.e. keep only the rows where both Wind Speed and Visibility are present
df_clean = df[df['Wind Speed'].notnull() & df['Visibility'].notnull()]

In [None]:
# (rows, columns) of the frame left after removing rows with null Wind Speed or Visibility
df_clean.shape

In [None]:
# Stack the two frames row-wise; df_clean rows come first, followed by df_repull rows
df_final = pd.concat([df_clean, df_repull])

In [None]:
# Drop duplicate rows for airport, date, hour
# The first occurrence is kept (pandas default), so a df_clean row wins over a
# df_repull row with the same key
df_final = df_final.drop_duplicates(subset=['Airport', 'Date', 'Hour'])


## Export Final

In [None]:
# Number of null values in each column of df_final
df_final.isnull().sum()

In [None]:
# Discard every row that still has a null value in any column
df_final = df_final.dropna()

In [None]:
# (rows, columns) of the frame that will be exported
df_final.shape

In [None]:
# Write the final cleaned hourly weather table to disk, without the index
df_final.to_csv(
    '../Assets/Datasets/Weather/Clean_Weather_Hourly.csv',
    index=False,
)