### Dataset Visualizations

Dataset used: [Metro dataset](https://archive.ics.uci.edu/ml/datasets/Metro+Interstate+Traffic+Volume)

##### Features:
* holiday: US National Holiday + Minnesota State Holiday
* temp: average Temperature in Kelvin
* rain_1h: mm of Rain
* snow_1h: mm of Snow
* clouds_all: percentage of cloud cover
* weather_main: short text descr. of weather
* weather_description: longer text descr. of weather
* date_time: datetime
* traffic_volume: westbound Traffic Volume (Ground Truth)

In [1]:
# All imports needed
import pandas as pd 
import numpy as np

import datetime

In [2]:
# Load the raw metro traffic records from disk and preview the first rows
raw_csv_path = "../data/metro/metro_raw.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


In [3]:
# Collect every row that contains at least one missing value.
# The metro dataset has none, so the reported shape should have zero rows.
has_missing = df.isnull().any(axis=1)
null_vals = df.loc[has_missing]
null_vals.shape

(0, 9)

In [4]:
# Report each text-typed (object dtype) column together with its number of distinct values
summary_template = "Feature '{col_name}' has {unique_cat} unique categories"
for column, column_dtype in df.dtypes.items():
    if column_dtype != 'object':
        continue  # numeric columns are not categorical candidates
    n_distinct = len(df[column].unique())
    print(summary_template.format(col_name=column, unique_cat=n_distinct))

Feature 'holiday' has 12 unique categories
Feature 'weather_main' has 11 unique categories
Feature 'weather_description' has 38 unique categories
Feature 'date_time' has 40575 unique categories


In [5]:
# clean up holiday feature --> binary flag (1 = holiday, 0 = regular day)
# A regular day is stored as the literal string 'None'. Depending on the pandas
# version, read_csv either keeps that string or parses it as a missing value,
# so both forms must map to 0. Integer 0 is excluded as well so that re-running
# this cell on the already converted column does not flip every row to 1.
no_holiday = df['holiday'].isna() | df['holiday'].isin(['None', 0])
df['holiday'] = (~no_holiday).astype('int64')
print(df['holiday'].value_counts())

0    48143
1       61
Name: holiday, dtype: int64


In [6]:
# Preview the first rows to check that `holiday` now holds 0/1 flags
df.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,0,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,0,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,0,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,0,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,0,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


In [7]:
# clean up date_time feature --> split to weekday and hour

# Parse the text timestamps once. The result is deliberately not named
# `datetime`, because that would rebind the `datetime` module imported in the
# first cell for the rest of the notebook.
timestamps = pd.to_datetime(df['date_time'])
df['weekday'] = timestamps.dt.dayofweek  # Monday=0 ... Sunday=6
df['hour'] = timestamps.dt.hour
# Use the `columns` keyword: passing the axis positionally (`drop(label, 1)`)
# is rejected by pandas 2.0 and later.
df = df.drop(columns='date_time')

In [8]:
# Preview the first rows: `date_time` is gone, `weekday` and `hour` were appended
df.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,traffic_volume,weekday,hour
0,0,288.28,0.0,0.0,40,Clouds,scattered clouds,5545,1,9
1,0,289.36,0.0,0.0,75,Clouds,broken clouds,4516,1,10
2,0,289.58,0.0,0.0,90,Clouds,overcast clouds,4767,1,11
3,0,290.13,0.0,0.0,90,Clouds,overcast clouds,5026,1,12
4,0,291.14,0.0,0.0,75,Clouds,broken clouds,4918,1,13


In [9]:
def clean_weather(df, main=False, descr=False):
    """One-hot encode the weather text columns and drop the raw text columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the columns 'weather_main' and
        'weather_description'. The frame is not modified.
    main : bool, default False
        If True, append one indicator column per distinct lower-cased
        'weather_main' value, named with the prefix 'wmain_'.
    descr : bool, default False
        If True, append one indicator column per distinct lower-cased
        'weather_description' value, named with the prefix 'wdescr_'.

    Returns
    -------
    pandas.DataFrame
        A new frame holding all other columns of `df`, followed by the
        requested indicator columns. Both raw weather text columns are always
        removed, even when neither flag is set.
    """
    # `columns=` replaces the positional axis argument (`drop(label, 1)`),
    # which pandas 2.0 and later no longer accept.
    parts = [df.drop(columns=['weather_main', 'weather_description'])]

    # Lower-casing happens on temporary Series so the caller's frame keeps its
    # original text; values differing only in case share one indicator column.
    if main:
        parts.append(pd.get_dummies(df['weather_main'].str.lower(), prefix='wmain'))
    if descr:
        parts.append(pd.get_dummies(df['weather_description'].str.lower(), prefix='wdescr'))

    return pd.concat(parts, axis=1)

In [10]:
# Build the three weather encodings: main category only, description only, both
df_weather_main = clean_weather(df, main=True)
df_weather_descr = clean_weather(df, descr=True)
df_weather_all = clean_weather(df, main=True, descr=True)

# Write each variant to its own CSV file
for csv_path, encoded_frame in (
    ('../data/metro/metro_weather_main.csv', df_weather_main),
    ('../data/metro/metro_weather_descr.csv', df_weather_descr),
    ('../data/metro/metro_weather_all.csv', df_weather_all),
):
    encoded_frame.to_csv(csv_path)