# Weather Data Preparation 

In [21]:
import pandas as pd
import numpy as np
from datetime import datetime

In [22]:
# load the raw hourly weather export; the first CSV column is used as the index
df = pd.read_csv(filepath_or_buffer='data/raw/weather_hourly.csv', index_col=0)

In [23]:
# keep only the columns that are used further down by discarding the rest
unused_columns = ['Dew Point', 'Wind', 'Wind Gust', 'Pressure', 'boolean_column']
df = df.drop(columns=unused_columns)

In [24]:
# strip the unit suffix from every measurement column and cast it to a number
integer_columns = (
    ('Temperature', r'°F'),
    ('Humidity', r'°%'),
    ('Wind Speed', r'°mph'),
)
for column, unit in integer_columns:
    df[column] = df[column].str.replace(unit, '').astype(int)

# precipitation is fractional: drop the unit, normalise whitespace, cast to float
precip_text = df['Precip.'].str.replace(r'°in', '')
df['Precip.'] = precip_text.str.split().str.join(' ').astype(float)

### create datetime column

In [25]:
# Build the full timestamp with one vectorised parse.
# The raw 'date' strings hold day and month only ('%d_%m') and 'Time' holds a
# 12-hour clock value ('%I:%M %p'), so the year is prepended before parsing.
# Parsing a day of month without a year is deprecated since Python 3.13 (and
# cannot represent 29 February), and a single pd.to_datetime call replaces the
# slow element-wise / row-wise apply calls.
observation_year = 2014
df['datetime'] = pd.to_datetime(
    str(observation_year) + '_' + df['date'] + ' ' + df['Time'],
    format='%Y_%d_%m %I:%M %p',
)

# the separate date and time columns are no longer needed
df = df.drop(['date', 'Time'], axis=1)

# set index
df = df.set_index('datetime')

In [26]:
# TODO skip part

#round to Xmin interval
#df['60min_rounded'] = df['datetime'].apply(lambda x: x.round('60min'))
#df['30min_rounded'] = df['datetime'].apply(lambda x: x.round('30min'))
#df['15min_rounded'] = df['datetime'].apply(lambda x: x.round('15min'))

# duplicates per time intervals 
#print('number of duplicates for 60min: ' + str(len(df[df.duplicated(subset=['60min_rounded'])])))
#print('number of duplicates for 30min: ' + str(len(df[df.duplicated(subset=['30min_rounded'])])))
#print('number of duplicates for 15min: ' + str(len(df[df.duplicated(subset=['15min_rounded'])])))

# sorting by datetime
#df.sort_values('datetime', inplace=True)

# discard all rows with same hour 
#df.drop_duplicates(subset=['60min_rounded'], keep='first', inplace=True)

### categorize and encode conditions

In [27]:
# inspect categorical values
# (not the last expression of the cell, so the counts are computed but not rendered)
df['Condition'].value_counts()

# subtract 'windy' condition -> information is covered by wind speed column
df['cond_new'] = df['Condition'].str.replace(' / Windy', '', regex=False)

# TODO check if needed
# check precip vs rain feature  -> precip column does not capture occurrence of rain like condition column
#len(df[(df["Precip."] >= 0.01)])
#len(df.where[df['Condition' == 'Rain'] & df['Condition' == 'Light Rain']])

# simplify conditions
# .str.replace(...) rewrites the text wherever it occurs inside a value,
# .replace(...) only rewrites values that match completely.
df['cond_new'] = (
    df['cond_new']
    .str.replace('Snow and Sleet', 'Snow', regex=False)
    .replace({'Rain and Snow': 'Snow', 'Light Snow': 'Snow'})  # snow variants -> 'Snow'
    .str.replace('Haze', 'Fog', regex=False)
    .replace('Patches of Fog', 'Fog')  # haze / patches of fog -> 'Fog'
    .str.replace('Light Rain with Thunder', 'Thunder', regex=False)
    .str.replace('Heavy T-Storm', 'T-Storm', regex=False)
)

# ordinal encoding of features rain + clouds
# conditions that are not listed (and missing values) are encoded as 0
cloud_levels = {'Partly Cloudy': 1, 'Mostly Cloudy': 2, 'Cloudy': 3}
rain_levels = {'Light Drizzle': 1, 'Light Rain': 2, 'Rain': 3, 'Heavy Rain': 4}
df['cloud'] = df['cond_new'].map(cloud_levels).fillna(0).astype(float)
df['rain'] = df['cond_new'].map(rain_levels).fillna(0).astype(float)

# one hot encoding of nominal features
# ordinal conditions are blanked out first so they do not get a dummy column
ordinal_conditions = list(cloud_levels) + list(rain_levels)
nominal_features = df['cond_new'].where(~df['cond_new'].isin(ordinal_conditions))
# explicit float dtype: the default dummy dtype differs between pandas versions
one_hot = pd.get_dummies(nominal_features, dtype=float)
# concat keeps the rows one-to-one; an index join would multiply rows that
# share the same timestamp
df = pd.concat([df, one_hot], axis=1)
df = df.drop(['Condition', 'cond_new'], axis=1)  # drop unused columns

In [28]:
# TODO feature: extreme weather condition

### resample hourly 

In [29]:
# compare the number of rows in the dataset with the number of hours in 365 days
n_observed = len(df)
n_expected = 365 * 24
print('number of actual observations: ' + str(n_observed))
print('number of expected obersavtions: ' + str(n_expected))

number of actual observations: 10344
number of expected obersavtions: 8760


In [30]:
# resample hourly: each hourly bin keeps the last value found in it
# (lowercase 'h' is the current alias; uppercase 'H' is deprecated since pandas 2.2)
df = df.resample('h').last()
df.dtypes # check new dtypes
# TODO convert dtypes

Temperature    float64
Humidity       float64
Wind Speed     float64
Precip.        float64
cloud          float64
rain           float64
Fair           float64
Fog            float64
Snow           float64
T-Storm        float64
Thunder        float64
dtype: object

In [31]:
# check for NaN
# (not the last expression of the cell, so this selection is not rendered)
df[df.isna().any(axis=1)]

# interpolate metric values and temperature linearly along the time axis
metric_columns = ['Temperature', 'Humidity', 'Wind Speed', 'Precip.']
df[metric_columns] = df[metric_columns].interpolate(method='linear', axis=0)

# round back to the precision used in the rest of the notebook
# NOTE(review): one decimal for 'Precip.' turns values below 0.05 into 0.0 - confirm this is intended
whole_number_columns = ['Temperature', 'Humidity', 'Wind Speed']
df[whole_number_columns] = df[whole_number_columns].round(decimals=0)
df['Precip.'] = df['Precip.'].round(decimals=1)

# binary + ordinal features: carry the last known value forward
# (ffill() replaces interpolate(method='pad'), which is deprecated since pandas 2.1)
encoded_columns = ['cloud', 'rain', 'Fair', 'Fog', 'Snow', 'T-Storm', 'Thunder']
df[encoded_columns] = df[encoded_columns].ffill()


# rows that still contain NaN after filling
df[df.isna().any(axis=1)]

Unnamed: 0_level_0,Temperature,Humidity,Wind Speed,Precip.,cloud,rain,Fair,Fog,Snow,T-Storm,Thunder
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1


In [32]:
# convert fahrenheit to celsius
def celsius(x):
    """Convert a temperature from degrees Fahrenheit to degrees Celsius.

    Parameters
    ----------
    x : number
        Temperature in degrees Fahrenheit.

    Returns
    -------
    float
        The temperature in degrees Celsius.
    """
    offset_removed = x - 32
    return float(offset_removed * 5 / 9)

# convert every temperature reading to Celsius and keep one decimal place
temperature_celsius = df['Temperature'].map(celsius)
df['Temperature'] = temperature_celsius.round(1)

In [33]:
# TODO normalize data
# display the prepared frame as the cell output
df

Unnamed: 0_level_0,Temperature,Humidity,Wind Speed,Precip.,cloud,rain,Fair,Fog,Snow,T-Storm,Thunder
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2014-01-01 00:00:00,-10.0,84.0,7.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2014-01-01 01:00:00,-8.9,84.0,6.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2014-01-01 02:00:00,-8.3,84.0,7.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2014-01-01 03:00:00,-7.8,84.0,7.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2014-01-01 04:00:00,-7.2,85.0,6.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2014-12-31 19:00:00,-7.2,47.0,20.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2014-12-31 20:00:00,-7.2,50.0,16.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2014-12-31 21:00:00,-6.7,48.0,17.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2014-12-31 22:00:00,-6.7,46.0,21.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### saving dataframe

In [14]:
# persist the prepared frame as a pickle file
df.to_pickle(path="data/weather.pickle")