In [1]:
# import packages
import numpy as np
import matplotlib
import os
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, r2_score
import datetime as dt
from statsmodels.tsa.stattools import adfuller
import statsmodels.graphics.tsaplots as sg
from statsmodels.tsa.stattools import arma_order_select_ic
from statsmodels.tsa.arima_model import ARIMA
from sklearn.mixture import GaussianMixture
from scipy import stats
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

In [2]:
def addLag(dataset, maxlag, column = 'arrival'):
    """Return a copy of ``dataset`` extended with lagged copies of one column.

    For every ``lag`` in ``1..maxlag`` a column named ``'lag<lag>'`` is added
    whose value in a given row is the value ``column`` had ``lag`` rows
    earlier.  The first ``maxlag`` rows have no complete lag history and are
    dropped, so the result has ``len(dataset) - maxlag`` rows (zero rows if
    the dataset is not longer than ``maxlag``) and a fresh 0..n-1 index.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows in the order the lags should follow; must contain ``column``.
        The input frame is not modified.
    maxlag : int
        Number of lag columns to create.  Values <= 0 add no lag columns and
        return all rows.
    column : str, default 'arrival'
        Name of the column to lag.

    Returns
    -------
    pandas.DataFrame
        The trimmed dataset followed by the columns ``lag1`` .. ``lag<maxlag>``.
    """
    maxlag = max(maxlag, 0)
    # Number of rows that have a full history of `maxlag` earlier rows.
    n_out = max(len(dataset) - maxlag, 0)
    source = dataset[column]

    # Slice by position (iloc), not by label: the row arithmetic below is
    # positional, so it must not depend on the frame having a default index.
    trimmed = dataset.iloc[maxlag:].reset_index(drop = True)
    lagged = {}
    for lag in range(1, maxlag + 1):
        start = maxlag - lag
        lagged['lag' + str(lag)] = source.iloc[start:start + n_out].reset_index(drop = True)

    # Passing the index keeps the (possibly empty) lag frame aligned with `trimmed`.
    lagdata = pd.DataFrame(lagged, index = trimmed.index)
    return pd.concat([trimmed, lagdata], axis = 1)

In [3]:
# Configuration: facilities to load, traffic type, and how many lag columns to add.
FACILITY = ['JFK', 'LGA', 'PENN']
TRAFFIC = ['vehicle']
maxlag = 12

# One lag-augmented frame per facility, read from '<facility>.csv'
# (path is relative to the current working directory).
DATA = dict()
for facility in FACILITY:
    dataset = pd.read_csv(''.join([facility, '.csv']))
    DATA.update({facility: addLag(dataset, maxlag)})


In [4]:
# Per-facility results, keyed by facility name:
#   *_REGRESSOR        -> model fitted on the facility's full dataset
#   *_CROSS_VAL_SCORE  -> array of the 10 per-fold cross-validation scores
LINEAR_REGRESSOR = {}
DECISIONTREE_REGRESSOR = {}
DECISIONTREE_CROSS_VAL_SCORE = {}
LINEAR_CROSS_VAL_SCORE = {}

for facility in FACILITY:
    # Predictors are every column except 'date', 'yellow', 'fhv' and the target 'vehicle'.
    featureData = DATA[facility].drop(['date','yellow','fhv','vehicle'], axis = 1)
    targetData = DATA[facility]['vehicle']

    # linear regression: statsmodels OLS of 'vehicle' on all predictor columns
    feature = featureData.columns.values.tolist()
    featureString = ' + '.join(feature)
    linearModel = smf.ols(formula = 'vehicle ~ ' + featureString, data = DATA[facility]).fit()
    # Keep the fitted model; previously it was overwritten on the next iteration.
    LINEAR_REGRESSOR[facility] = linearModel

    # Decision Tree Regressor fitted on the full data (random_state fixed for reproducibility)
    DECISIONTREE_REGRESSOR[facility] = DecisionTreeRegressor(random_state = 0).fit(featureData, targetData)

    # 10-fold cross-validation using each estimator's default score.
    # NOTE(review): the lag columns suggest the rows are time-ordered; if so, plain
    # k-fold splits train on data that comes after the test fold. Confirm whether a
    # time-aware splitter (e.g. sklearn's TimeSeriesSplit) is more appropriate.
    DECISIONTREE_CROSS_VAL_SCORE[facility] = cross_val_score(DecisionTreeRegressor(random_state = 0), featureData, targetData, cv = 10)
    LINEAR_CROSS_VAL_SCORE[facility] = cross_val_score(LinearRegression(), featureData, targetData, cv = 10)

# Mean cross-validation score per facility: decision tree results first ...
for key, value in DECISIONTREE_CROSS_VAL_SCORE.items():
    print(key)
    print(np.mean(value))
# ... then linear regression results.
for key, value in LINEAR_CROSS_VAL_SCORE.items():
    print(key)
    print(np.mean(value))

['arrival', 'ifmon', 'iftue', 'ifwed', 'ifthu', 'iffri', 'ifsat', 'ifsun', 'if0', 'if1', 'if2', 'if3', 'if4', 'if5', 'if6', 'if7', 'if8', 'if9', 'if10', 'if11', 'if12', 'if13', 'if14', 'if15', 'if16', 'if17', 'if18', 'if19', 'if20', 'if21', 'if22', 'if23', 'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd', 'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow', 'lag1', 'lag2', 'lag3', 'lag4', 'lag5', 'lag6', 'lag7', 'lag8', 'lag9', 'lag10', 'lag11', 'lag12']
['arrival', 'ifmon', 'iftue', 'ifwed', 'ifthu', 'iffri', 'ifsat', 'ifsun', 'if0', 'if1', 'if2', 'if3', 'if4', 'if5', 'if6', 'if7', 'if8', 'if9', 'if10', 'if11', 'if12', 'if13', 'if14', 'if15', 'if16', 'if17', 'if18', 'if19', 'if20', 'if21', 'if22', 'if23', 'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd', 'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow', 'lag1', 'lag2', 'lag3', 'lag4', 'lag5', 'lag6', 'lag7', 'lag8', 'lag9', 'lag10', 'lag11', 'lag12']
['arrival', 'ifmon', 'iftue', 'ifwed', 'ifthu', 'iffri', 'ifsat', 