In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import minmax_scale

## Traffic
The traffic dataset appears to be already normalized, so it is not rescaled in this notebook.

In [2]:
# Load the transformed MMDA traffic file for Taft Ave. (2015) and keep only the
# two traffic-status columns (statusN and statusS).
traffic_dataset = pd.read_csv(
    'merged_mmda_wwo_Taft Ave._2015_transformed.csv',
    skipinitialspace=True,
    encoding='cp1252',
)
traffic_dataset = traffic_dataset.loc[:, ['statusN', 'statusS']]
traffic_dataset

Unnamed: 0,statusN,statusS
0,0.231394,0.408168
1,0.237548,0.407201
2,0.243492,0.406253
3,0.249231,0.405322
4,0.254765,0.404408
5,0.260099,0.403512
6,0.265234,0.402633
7,0.270174,0.401771
8,0.274921,0.400926
9,0.279478,0.400098


# Normalize Traffic and Weather
Each weather variable is rescaled with min-max normalization, using `minmax_scale` from scikit-learn. The traffic columns are left as loaded.

In [3]:
def normalize(data):
    """Return `data` rescaled with scikit-learn's `minmax_scale` (default settings)."""
    return minmax_scale(data)

In [4]:
# Load the interpolated 2015 weather file, give the columns short names, and
# discard the 'dt' column so that only the weather variables remain.
weather_dataset = pd.read_csv(
    'interpolated_weather_with_precip_2015.csv',
    skipinitialspace=True,
    encoding='cp1252',
)
weather_dataset.columns = [
    'dt',
    'tempC', 'dewptC', 'windspeedKMH',
    'presMB', 'presTenMB',
    'ccOKTA', 'lowCCOKTA',
    'HKM', 'visKM', 'precipMM',
]
weather_dataset = weather_dataset.drop('dt', axis=1)

In [5]:
# Rescale every weather column independently, overwriting it in place.
columns = weather_dataset.columns
for column in columns:
    scaled_values = normalize(weather_dataset[column])
    weather_dataset[column] = scaled_values
weather_dataset

Unnamed: 0,tempC,dewptC,windspeedKMH,presMB,presTenMB,ccOKTA,lowCCOKTA,HKM,visKM,precipMM
0,0.310811,0.321739,0.230769,0.743590,0.837838,1.0,0.75,1.0,0.777778,0.000000
1,0.319257,0.322464,0.237179,0.745370,0.806306,1.0,0.75,1.0,0.759259,0.000000
2,0.327703,0.323188,0.243590,0.747151,0.774775,1.0,0.75,1.0,0.740741,0.000000
3,0.336149,0.323913,0.250000,0.748932,0.743243,1.0,0.75,1.0,0.722222,0.000000
4,0.344595,0.324638,0.256410,0.750712,0.711712,1.0,0.75,1.0,0.703704,0.000000
5,0.353041,0.325362,0.262821,0.752493,0.680180,1.0,0.75,1.0,0.685185,0.000000
6,0.361486,0.326087,0.269231,0.754274,0.648649,1.0,0.75,1.0,0.666667,0.000000
7,0.369932,0.326812,0.275641,0.756054,0.617117,1.0,0.75,1.0,0.648148,0.000000
8,0.378378,0.327536,0.282051,0.757835,0.585586,1.0,0.75,1.0,0.629630,0.000000
9,0.386824,0.328261,0.288462,0.759615,0.554054,1.0,0.75,1.0,0.611111,0.000000


# Merge Traffic and Weather

In [18]:
# Place the traffic and weather columns side by side; the inner join keeps only
# the index labels that are present in both frames.
dataset = pd.concat(
    (traffic_dataset, weather_dataset),
    axis='columns',
    join='inner',
)
dataset

Unnamed: 0,statusN,statusS,tempC,dewptC,windspeedKMH,presMB,presTenMB,ccOKTA,lowCCOKTA,HKM,visKM,precipMM
0,0.231394,0.408168,0.310811,0.321739,0.230769,0.743590,0.837838,1.0,0.75,1.0,0.777778,0.000000
1,0.237548,0.407201,0.319257,0.322464,0.237179,0.745370,0.806306,1.0,0.75,1.0,0.759259,0.000000
2,0.243492,0.406253,0.327703,0.323188,0.243590,0.747151,0.774775,1.0,0.75,1.0,0.740741,0.000000
3,0.249231,0.405322,0.336149,0.323913,0.250000,0.748932,0.743243,1.0,0.75,1.0,0.722222,0.000000
4,0.254765,0.404408,0.344595,0.324638,0.256410,0.750712,0.711712,1.0,0.75,1.0,0.703704,0.000000
5,0.260099,0.403512,0.353041,0.325362,0.262821,0.752493,0.680180,1.0,0.75,1.0,0.685185,0.000000
6,0.265234,0.402633,0.361486,0.326087,0.269231,0.754274,0.648649,1.0,0.75,1.0,0.666667,0.000000
7,0.270174,0.401771,0.369932,0.326812,0.275641,0.756054,0.617117,1.0,0.75,1.0,0.648148,0.000000
8,0.274921,0.400926,0.378378,0.327536,0.282051,0.757835,0.585586,1.0,0.75,1.0,0.629630,0.000000
9,0.279478,0.400098,0.386824,0.328261,0.288462,0.759615,0.554054,1.0,0.75,1.0,0.611111,0.000000


# Correlate Traffic and Weather

## Correlate

In [7]:
# Spearman rank correlation between every pair of columns in the merged
# traffic + weather frame.
corr = dataset.corr(method='spearman')

# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before writing (needed for a clean run on a fresh checkout).
os.makedirs('corr_results', exist_ok=True)
corr.to_csv('corr_results/corr_results_mmda_ogimet_Taft_2015.csv')
corr

Unnamed: 0,statusN,statusS,tempC,dewptC,windspeedKMH,presMB,presTenMB,ccOKTA,lowCCOKTA,HKM,visKM,precipMM
statusN,1.0,0.102014,0.148535,0.026394,0.061294,-0.079067,-0.039135,0.065032,0.078929,-0.020362,-0.012332,0.011075
statusS,0.102014,1.0,0.185627,0.009969,0.168094,-0.000363,-0.096688,0.112272,0.109883,0.015237,0.31788,0.042502
tempC,0.148535,0.185627,1.0,0.420948,0.459881,-0.401972,-0.200783,-0.130756,-0.009183,0.023074,0.511638,-0.160606
dewptC,0.026394,0.009969,0.420948,1.0,0.254152,-0.664596,-0.03301,0.276498,0.255923,-0.124631,-0.015261,0.250934
windspeedKMH,0.061294,0.168094,0.459881,0.254152,1.0,-0.335768,-0.120778,0.159596,0.182469,-0.06646,0.298297,0.133285
presMB,-0.079067,-0.000363,-0.401972,-0.664596,-0.335768,1.0,0.197011,-0.227592,-0.226809,0.13322,0.041036,-0.252829
presTenMB,-0.039135,-0.096688,-0.200783,-0.03301,-0.120778,0.197011,1.0,-0.05219,-0.11608,-0.056396,-0.13571,0.060323
ccOKTA,0.065032,0.112272,-0.130756,0.276498,0.159596,-0.227592,-0.05219,1.0,0.747408,-0.160326,-0.10843,0.425054
lowCCOKTA,0.078929,0.109883,-0.009183,0.255923,0.182469,-0.226809,-0.11608,0.747408,1.0,-0.085609,-0.114992,0.376702
HKM,-0.020362,0.015237,0.023074,-0.124631,-0.06646,0.13322,-0.056396,-0.160326,-0.085609,1.0,0.104921,-0.229509


## Correlate with Lags

In [24]:
# Upper bound (exclusive) for the lag loop: lags 1 .. total_lags - 1 are
# computed, i.e. 30 lagged correlation matrices for total_lags = 31.
total_lags = 31

# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before the loop starts writing into it.
os.makedirs('corr_mmda_ogimet', exist_ok=True)

for i in range(1, total_lags):
    # Pair the weather values in row t with the traffic status found i rows
    # later. Each lag is derived directly from `dataset` (instead of shifting
    # the previous iteration's frame once more), so no column is assigned into
    # a slice of an earlier frame and every iteration is independent.
    new_dataset = dataset.assign(
        statusN=dataset.statusN.shift(-i),
        statusS=dataset.statusS.shift(-i),
    )
    # The last i rows have no traffic value after the shift; drop them.
    # max(..., 0) keeps the result empty when i exceeds the number of rows.
    new_dataset = new_dataset.iloc[:max(len(dataset) - i, 0)]

    # Full Spearman correlation matrix for this lag, one CSV file per lag.
    corr = new_dataset.corr(method='spearman')
    corr.to_csv('corr_mmda_ogimet/corr_mmda_ogimet_Taft_2015_lag' + str(i) + '.csv')