### Dataset Visualizations

Dataset used: [Metro dataset](https://archive.ics.uci.edu/ml/datasets/Metro+Interstate+Traffic+Volume)

##### Features:
* holiday: US National Holiday + Minnesota State Holiday
* temp: average Temperature in Kelvin
* rain_1h: mm of Rain
* snow_1h: mm of Snow
* clouds_all: percentage of cloud cover
* weather_main: short text descr. of weather
* weather_description: longer text descr. of weather
* date_time: datetime
* traffic_volume: westbound Traffic Volume (Ground Truth)

In [24]:
# All imports needed
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [25]:
# Read data from file.
# The `holiday` column uses the literal string "None" for non-holiday rows.
# pandas >= 2.0 includes "None" among read_csv's default NA markers, which
# would turn those rows into NaN. Restrict NA detection to empty fields so the
# column is loaded as text regardless of the installed pandas version.
df = pd.read_csv("../data/metro/metro.csv", keep_default_na=False, na_values=[""])
df.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,traffic_volume
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00,5545
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00,4516
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00,4767
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00,5026
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00,4918


In [26]:
# Collect every row that holds at least one missing value;
# a result with zero rows means the metro dataset has no gaps.
null_vals = df.loc[df.isna().any(axis=1)]
null_vals.shape

(0, 9)

In [27]:
# Split the frame into the ground-truth series (Y) and the feature frame (X)
Y = df['traffic_volume']
X = df.drop(columns=['traffic_volume'])

In [28]:
# Preview the first five rows of the feature frame
X.head(n=5)

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time
0,,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00
1,,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00
2,,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00
3,,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00
4,,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00


In [29]:
# Report every object-dtype (text) column together with its number of
# distinct values; a missing value, if present, counts as its own category.
for name, dtype in X.dtypes.items():
    if dtype != 'object':
        continue
    n_unique = X[name].nunique(dropna=False)
    print("Feature '{col_name}' has {unique_cat} unique categories".format(
        col_name=name, unique_cat=n_unique))

Feature 'holiday' has 12 unique categories
Feature 'weather_main' has 11 unique categories
Feature 'weather_description' has 38 unique categories
Feature 'date_time' has 40575 unique categories


In [30]:
# clean up holiday feature: collapse the named holidays into a 0/1 flag.
# The raw values are read from the untouched `df` (X shares its index), so
# re-running this cell does not re-encode the already encoded 0/1 column.
holiday_raw = df['holiday']
print(holiday_raw.value_counts(dropna=False))
# Non-holiday rows carry the string 'None'; newer pandas versions may parse
# that string as NaN on load, so both forms are treated as "not a holiday".
X['holiday'] = (holiday_raw.notna() & holiday_raw.ne('None')).astype('int64')
print()
print(X['holiday'].value_counts())

None                         48143
Labor Day                        7
Thanksgiving Day                 6
Martin Luther King Jr Day        6
New Years Day                    6
Christmas Day                    6
Columbus Day                     5
Memorial Day                     5
Independence Day                 5
Veterans Day                     5
State Fair                       5
Washingtons Birthday             5
Name: holiday, dtype: int64

0    48143
1       61
Name: holiday, dtype: int64


In [31]:
# Preview the feature frame after the holiday column was encoded
X.head(n=5)

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time
0,0,288.28,0.0,0.0,40,Clouds,scattered clouds,2012-10-02 09:00:00
1,0,289.36,0.0,0.0,75,Clouds,broken clouds,2012-10-02 10:00:00
2,0,289.58,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 11:00:00
3,0,290.13,0.0,0.0,90,Clouds,overcast clouds,2012-10-02 12:00:00
4,0,291.14,0.0,0.0,75,Clouds,broken clouds,2012-10-02 13:00:00


In [32]:
# weather_main feature: row count per coarse weather category
wmain_counts = X['weather_main'].value_counts(dropna=False)
print(wmain_counts)

Clouds          15164
Clear           13391
Mist             5950
Rain             5672
Snow             2876
Drizzle          1821
Haze             1360
Thunderstorm     1034
Fog               912
Smoke              20
Squall              4
Name: weather_main, dtype: int64

In [33]:
# one-hot dummies for weather_main
# Guarded so the cell can be re-run: once the column has been replaced by its
# indicator columns there is nothing left to encode.
if 'weather_main' in X.columns:
    # dtype is pinned to keep 0/1 integer indicators; newer pandas versions
    # default to boolean dummies.
    dummies = pd.get_dummies(X['weather_main'], prefix='wmain', dtype='uint8')
    # `columns=` replaces the positional axis argument, which pandas 2.0 no
    # longer accepts.
    X = pd.concat([X.drop(columns='weather_main'), dummies], axis=1)
X.head()

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_description,date_time,wmain_Clear,wmain_Clouds,wmain_Drizzle,wmain_Fog,wmain_Haze,wmain_Mist,wmain_Rain,wmain_Smoke,wmain_Snow,wmain_Squall,wmain_Thunderstorm
0,0,288.28,0.0,0.0,40,scattered clouds,2012-10-02 09:00:00,0,1,0,0,0,0,0,0,0,0,0
1,0,289.36,0.0,0.0,75,broken clouds,2012-10-02 10:00:00,0,1,0,0,0,0,0,0,0,0,0
2,0,289.58,0.0,0.0,90,overcast clouds,2012-10-02 11:00:00,0,1,0,0,0,0,0,0,0,0,0
3,0,290.13,0.0,0.0,90,overcast clouds,2012-10-02 12:00:00,0,1,0,0,0,0,0,0,0,0,0
4,0,291.14,0.0,0.0,75,broken clouds,2012-10-02 13:00:00,0,1,0,0,0,0,0,0,0,0,0


In [34]:
# clean up weather_description feature: normalise the text to lower case so
# labels that differ only in capitalisation fall into the same category
wdesc_lower = X['weather_description'].str.lower()
X['weather_description'] = wdesc_lower
print(wdesc_lower.value_counts(dropna=False))

sky is clear                           13391
mist                                    5950
overcast clouds                         5081
broken clouds                           4666
scattered clouds                        3461
light rain                              3372
few clouds                              1956
light snow                              1946
moderate rain                           1664
haze                                    1360
light intensity drizzle                 1100
fog                                      912
proximity thunderstorm                   673
drizzle                                  651
heavy snow                               616
heavy intensity rain                     467
snow                                     293
proximity shower rain                    136
thunderstorm                             125
heavy intensity drizzle                   64
thunderstorm with heavy rain              63
thunderstorm with light rain              54
proximity 

In [35]:
# Preview the one-hot encoding of weather_description (X itself is not changed)
wdesc_dummies = pd.get_dummies(X['weather_description'])
wdesc_dummies.head()

Unnamed: 0,broken clouds,drizzle,few clouds,fog,freezing rain,haze,heavy intensity drizzle,heavy intensity rain,heavy snow,light intensity drizzle,...,smoke,snow,squalls,thunderstorm,thunderstorm with drizzle,thunderstorm with heavy rain,thunderstorm with light drizzle,thunderstorm with light rain,thunderstorm with rain,very heavy rain
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# clean up date_time feature --> make it only time
# Guarded so the cell can be re-run after date_time has already been replaced.
if 'date_time' in X.columns:
    # In 'YYYY-MM-DD HH:MM:SS' strings, positions 11 to 15 are the 'HH:MM' part.
    X['time'] = X['date_time'].str.slice(start=11, stop=16)
    # `columns=` replaces the positional axis argument, which pandas 2.0 no
    # longer accepts.
    X = X.drop(columns='date_time')
print(X['time'].value_counts(dropna=False))

04:00    2091
06:00    2087
08:00    2080
07:00    2079
10:00    2078
05:00    2063
01:00    2049
23:00    2040
00:00    2037
03:00    2025
02:00    2019
09:00    2018
22:00    1994
16:00    1988
18:00    1986
21:00    1982
20:00    1979
14:00    1969
19:00    1961
12:00    1955
11:00    1952
15:00    1934
17:00    1933
13:00    1905
Name: time, dtype: int64

In [37]:
# Preview the feature frame after date_time was reduced to a time column
X.head(n=5)

Unnamed: 0,holiday,temp,rain_1h,snow_1h,clouds_all,weather_description,wmain_Clear,wmain_Clouds,wmain_Drizzle,wmain_Fog,wmain_Haze,wmain_Mist,wmain_Rain,wmain_Smoke,wmain_Snow,wmain_Squall,wmain_Thunderstorm,time
0,0,288.28,0.0,0.0,40,scattered clouds,0,1,0,0,0,0,0,0,0,0,0,09:00
1,0,289.36,0.0,0.0,75,broken clouds,0,1,0,0,0,0,0,0,0,0,0,10:00
2,0,289.58,0.0,0.0,90,overcast clouds,0,1,0,0,0,0,0,0,0,0,0,11:00
3,0,290.13,0.0,0.0,90,overcast clouds,0,1,0,0,0,0,0,0,0,0,0,12:00
4,0,291.14,0.0,0.0,75,broken clouds,0,1,0,0,0,0,0,0,0,0,0,13:00
