In [1]:
import pandas as pd
#from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split

### Helper Functions

In [2]:
def loadData(file):
    data = pd.read_csv(file)
    print('Raw shape: ',data.shape)
    data['Date'] = pd.to_datetime(data.Date)
    print('Days: ',len(set(data.Date)))
    return data

In [3]:
def getTimeSeries(df):
    table = pd.pivot_table(df, values='vehicle_count', index=['Date','Hour'],
                    columns=['DOLocationID'], aggfunc=np.sum, fill_value=0)
    return table

In [4]:
def zscoreNormalizeSpatial(matrix):
    m = matrix.copy()
    for i in range(m.shape[0]):
        m[i, :] = (m[i, :] - m[i, :].mean()) / (m[i, :].std()+1e-10)
        
    return m

In [5]:
def standardize(matrix):
    m = matrix.copy()
    scaler = StandardScaler()
    scaler.fit(m)
    t = scaler.transform(m)
    return scaler, t

In [6]:
def inverse_standardize(matrix, scaler):
    t = matrix.copy()
    return scaler.inverse_transform(t)

In [7]:
def addLag(dataset, maxlag, lagvar):
    # return the full dataset with lag
    LAG = np.arange(1,maxlag + 1)
    lagdata = pd.DataFrame([])
    for lag in LAG:
        varname = str(lagvar)+'_lag' + str(lag)
        lagdata[varname] = dataset[lagvar].loc[maxlag-lag:len(dataset)-lag-1].reset_index(drop = True)
    dataset = pd.concat([dataset.loc[maxlag:].reset_index(drop = True), lagdata.reset_index(drop = True)], axis = 1, sort = False)
    return dataset

In [8]:
def get_rmse(matrix1, matrix2):
    sumSquareError = np.mean(np.power(matrix1 - matrix2,2))
    rmse = np.power(sumSquareError,0.5)
    return rmse

In [9]:
def pca_performance(matrix, components):
    rmseList = []
    r2List = []
    for n in components:
        s,k = standardize(matrix)
        p,k = getPCAFeatures(k, _index, n=n)
        k = inverse_pca(k,p)
        k = inverse_standardize(k, s)
        
        rmseList.append(get_rmse(matrix, k))
        r2List.append(r2_score(matrix, k))
        
    plt.figure()
    plt.title('RMSE')
    plt.xlabel('components')
    plt.ylabel('rmse')
    plt.plot(component,rmseList)
    plt.figure()
    plt.title('R Squared')
    plt.xlabel('components')
    plt.ylabel('R2')
    plt.plot(component,r2List)
    
    return rmseList,r2List

#### Preparing Data

In [10]:
file = '/home/urwa/Documents/Projects/NYU Remote/project/data/JfkVehiceByHour.csv'

In [11]:
data = loadData(file)

Raw shape:  (4520160, 4)
Days:  730


In [12]:
data = getTimeSeries(data)

In [13]:
_index = data.index

In [14]:
matrix = data.values.astype(np.float64)

In [15]:
data.head(2)

Unnamed: 0_level_0,DOLocationID,1,2,3,4,5,6,7,8,9,10,...,254,255,256,257,258,259,260,261,262,263
Date,Hour,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2017-01-01,0,0,0,0,1,0,0,2,0,0,6,...,0,3,0,2,0,0,1,0,2,0
2017-01-01,1,0,0,0,1,0,0,3,0,0,5,...,0,3,1,0,0,0,0,1,2,3


### Normalization

In [16]:
scaler, s_matrix = standardize(matrix)

## PCA

In [17]:
pcaDf = pd.DataFrame(s_matrix)
pcaDf.index = _index
pcaDf.columns = data.columns
pcaDf = pcaDf.reset_index()

In [18]:
pcaDf.head(3)

DOLocationID,Date,Hour,1,2,3,4,5,6,7,8,...,254,255,256,257,258,259,260,261,262,263
0,2017-01-01,0,-0.707687,-0.054701,-0.488711,-0.382197,-0.180696,-0.28904,-0.679973,-0.1451,...,-0.55223,-0.667575,-1.118449,0.6311,-0.978605,-0.560696,-0.153305,-1.110956,-0.566347,-1.15781
1,2017-01-01,1,-0.707687,-0.054701,-0.488711,-0.382197,-0.180696,-0.28904,-0.397491,-0.1451,...,-0.55223,-0.667575,-0.845618,-0.863982,-0.978605,-0.560696,-0.934547,-0.786487,-0.566347,-0.483477
2,2017-01-01,2,-0.707687,-0.054701,-0.488711,-0.923558,-0.180696,-0.28904,-1.244938,-0.1451,...,-0.55223,-1.200808,-1.118449,-0.863982,-0.310777,-0.560696,-0.934547,-1.110956,-1.148972,-1.15781


### Lag Variables

In [20]:
maxlag = 12
DateColumns = ['Date', 'Hour']
lagColumns = [c for c in pcaDf.columns if c not in DateColumns]
dataset = pcaDf
for c in lagColumns:
    dataset = addLag(dataset, maxlag, c)

### Modelling

In [21]:
featureData = dataset.drop(lagColumns + DateColumns , axis = 1)
targetData = dataset[lagColumns]
targetData.shape

(14424, 258)

In [22]:
featureData.shape

(14424, 3096)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(featureData, targetData, test_size=0.3)

In [24]:
rf = RandomForestRegressor(random_state = 0, n_estimators=200, 
                           min_samples_split=10,
                           min_samples_leaf= 3, 
                           max_features= 'sqrt',
                           max_depth= 30, 
                           bootstrap= True)

In [25]:
rf.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [26]:
rf.score(X_train,y_train)

0.459172789472832

In [27]:
rf.score(X_test,y_test)

0.2328085176365642

### Predict

In [28]:
test_prediction = rf.predict(X_test)
test_prediction.shape

(4328, 258)

In [29]:
# network_prediction = inverse_pca(pca_prediction,pca)
# network_prediction.shape

In [None]:
network_prediction = inverse_standardize(test_prediction, scaler)
network_prediction.shape

### Evaluate

In [None]:
get_rmse(matrix[maxlag:], network_prediction)

In [None]:
r2_score(matrix[maxlag:], network_prediction)