In [1]:
import pandas as pd
#from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

### Helper Functions

In [2]:
def loadData(file):
    data = pd.read_csv(file)
    print('Raw shape: ',data.shape)
    data['Date'] = pd.to_datetime(data.Date)
    print('Days: ',len(set(data.Date)))
    return data

In [3]:
def getTimeSeries(df):
    table = pd.pivot_table(df, values='vehicle_count', index=['Date','Hour'],
                    columns=['DOLocationID'], aggfunc=np.sum, fill_value=0)
    return table

In [4]:
def zscoreNormalizeSpatial(matrix):
    m = matrix.copy()
    for i in range(m.shape[0]):
        m[i, :] = (m[i, :] - m[i, :].mean()) / (m[i, :].std()+1e-10)
        
    return m

In [5]:
def standardize(matrix):
    m = matrix.copy()
    scaler = StandardScaler()
    scaler.fit(matrix)
    t = scaler.transform(matrix)
    return scaler, t

In [6]:
def inverse_standardize(matrix, scaler):
    t = matrix.copy()
    return scaler.inverse_transform(t)

In [28]:
def addLag(dataset, maxlag, lagvar):
    # return the full dataset with lag
    LAG = np.arange(1,maxlag + 1)
    lagdata = pd.DataFrame([])
    for lag in LAG:
        varname = str(lagvar)+'_lag' + str(lag)
        lagdata[varname] = dataset[lagvar].loc[maxlag-lag:len(dataset)-lag-1].reset_index(drop = True)
    dataset = pd.concat([dataset.loc[maxlag:].reset_index(drop = True), lagdata.reset_index(drop = True)], axis = 1, sort = False)
    return dataset

In [8]:
def get_rmse(matrix1, matrix2):
    sumSquareError = np.mean(np.power(matrix1 - matrix2,2))
    rmse = np.power(sumSquareError,0.5)
    return rmse

#### Preparing Data

In [9]:
file = '/home/urwa/Documents/Projects/NYU Remote/project/data/JfkVehiceByHour.csv'

In [10]:
data = loadData(file)

Raw shape:  (4520160, 4)
Days:  730


In [11]:
data = getTimeSeries(data)

In [12]:
_index = data.index

In [13]:
matrix = data.values.astype(np.float64)

In [26]:
data.head(2)

Unnamed: 0_level_0,DOLocationID,1,2,3,4,5,6,7,8,9,10,...,254,255,256,257,258,259,260,261,262,263
Date,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2017-01-01,0,0,0,0,1,0,0,2,0,0,6,...,0,3,0,2,0,0,1,0,2,0
2017-01-01,1,0,0,0,1,0,0,3,0,0,5,...,0,3,1,0,0,0,0,1,2,3


### Normalization

In [14]:
scaler, s_matrix = standardize(matrix)

## PCA

In [23]:
pcaDf = pd.DataFrame(matrix)
pcaDf.index = _index
pcaDf.columns = data.columns
pcaDf = pcaDf.reset_index()

In [24]:
pcaDf.head(3)

DOLocationID,Date,Hour,1,2,3,4,5,6,7,8,...,254,255,256,257,258,259,260,261,262,263
0,2017-01-01,0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,...,0.0,3.0,0.0,2.0,0.0,0.0,1.0,0.0,2.0,0.0
1,2017-01-01,1,0.0,0.0,0.0,1.0,0.0,0.0,3.0,0.0,...,0.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,3.0
2,2017-01-01,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


### Lag Variables

In [30]:
maxlag = 12
DateColumns = ['Date', 'Hour']
lagColumns = [c for c in pcaDf.columns if c not in DateColumns]
for c in lagColumns:
    dataset = addLag(pcaDf, maxlag, c)

### Modelling

In [32]:
featureData = dataset.drop(lagColumns + DateColumns , axis = 1)
targetData = dataset[lagColumns]
targetData.shape

(17508, 258)

In [33]:
rf = RandomForestRegressor()

In [34]:
rf.fit(featureData,targetData)
rf.score(featureData,targetData)



0.8890883848274697

### Predict

In [35]:
pca_prediction = rf.predict(featureData)
pca_prediction.shape

(17508, 258)

In [36]:
# network_prediction = inverse_pca(pca_prediction,pca)
# network_prediction.shape

In [37]:
network_prediction = inverse_standardize(network_prediction, scaler)
network_prediction.shape

NameError: name 'network_prediction' is not defined

### Evaluate

In [28]:
get_rmse(matrix[maxlag:], network_prediction)

1.6874852592711531

In [27]:
r2_score(matrix[maxlag:], network_prediction)

0.547722471856429