In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor

### Helper Functions

In [2]:
def loadData(file):
    """Read a CSV into a DataFrame and parse its ``Date`` column.

    Prints the raw shape and the number of distinct ``Date`` values as a
    quick sanity check.

    Parameters
    ----------
    file : anything accepted by ``pandas.read_csv`` (path or file-like).

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``Date`` converted by ``pd.to_datetime``.
    """
    frame = pd.read_csv(file)
    print('Raw shape: ',frame.shape)
    frame['Date'] = pd.to_datetime(frame['Date'])
    distinct_days = set(frame['Date'])
    print('Days: ',len(distinct_days))
    return frame

In [3]:
def getTimeSeries(df):
    """Pivot long-format counts into a wide (Date, Hour) x DOLocationID table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Date``, ``Hour``, ``DOLocationID`` and
        ``vehicle_count``.

    Returns
    -------
    pandas.DataFrame
        One row per (Date, Hour) pair and one column per ``DOLocationID``;
        each cell is the summed ``vehicle_count``, with 0 where a
        combination has no rows.
    """
    # Use the string alias rather than the np.sum callable: same result, but
    # recent pandas versions warn when a NumPy reducer is passed as aggfunc.
    table = pd.pivot_table(df, values='vehicle_count', index=['Date', 'Hour'],
                           columns=['DOLocationID'], aggfunc='sum', fill_value=0)
    return table

In [4]:
# def standardize(matrix):
#     m = matrix.copy()
#     scaler = StandardScaler()
#     scaler.fit(m)
#     t = scaler.transform(m)
#     return scaler, t

In [5]:
# def inverse_standardize(matrix, scaler):
#     t = matrix.copy()
#     return scaler.inverse_transform(t)

In [6]:
# def getPCAFeatures(matrix, n=10):
#     pca = PCA(n_components=n)
#     pca.fit(matrix)
#     reducedMatrixPCA = pca.transform(matrix)
#     reducedMatrixPCA.shape

#     reducedDict = {str(i+1):reducedMatrixPCA[:,i] for i in range(reducedMatrixPCA.shape[1])}
#     reducedDf = pd.DataFrame(reducedDict)
#     #reducedDf.index = index
#     return pca,reducedDf

In [7]:
# def PCA_test(matrix, pca):

#     reducedMatrixPCA = pca.transform(matrix)

#     reducedDict = {str(i+1):reducedMatrixPCA[:,i] for i in range(reducedMatrixPCA.shape[1])}
#     reducedDf = pd.DataFrame(reducedDict)
#     #reducedDf.index = index
#     return reducedDf

In [8]:
# def inverse_pca(matrix,pca):
#     m = matrix.copy()
#     return pca.inverse_transform(m)

In [9]:
def addLag(dataset, maxlag):
    """Append row-shifted copies of every column for lags 1..maxlag.

    Each lagged copy is named ``<column>_lag_<k>``. Rows containing any NaN
    after concatenation (which includes the first ``maxlag`` rows created by
    shifting) are dropped.

    Parameters
    ----------
    dataset : pandas.DataFrame
    maxlag : int
        Largest shift to add.

    Returns
    -------
    pandas.DataFrame
        Original columns followed by the lag-1 block, the lag-2 block, etc.
    """
    def _shifted(lag):
        # One lagged copy of the frame with suffixed column labels.
        lagged = dataset.shift(lag)
        lagged.columns = ['_'.join((str(name), 'lag', str(lag)))
                          for name in lagged.columns]
        return lagged

    frames = [dataset] + [_shifted(lag) for lag in range(1, maxlag + 1)]
    return pd.concat(frames, axis=1).dropna()

In [10]:
def get_rmse(matrix1, matrix2):
    """Root-mean-square error over all elements of two array-likes.

    Both inputs are converted to float64 NumPy arrays first, so the result
    is a single scalar whether the caller passes ndarrays, nested lists or
    DataFrames. DataFrames are therefore compared positionally; their
    index/column labels are ignored.

    Parameters
    ----------
    matrix1, matrix2 : array-like
        Values to compare; shapes must be equal or broadcastable.

    Returns
    -------
    numpy.float64
        sqrt(mean((matrix1 - matrix2) ** 2)) taken over every element.
    """
    diff = np.asarray(matrix1, dtype=np.float64) - np.asarray(matrix2, dtype=np.float64)
    meanSquareError = np.mean(np.square(diff))
    return np.sqrt(meanSquareError)

### Preparing Data

In [11]:
# Location of the raw hourly CSV consumed by loadData.
# NOTE(review): this is an absolute path on one machine; the notebook will not
# run elsewhere until it is pointed at a local copy of the file.
file = '/home/urwa/Documents/Projects/NYU Remote/project/data/JfkVehiceByHour.csv'

In [12]:
# Read the raw long-format records; loadData prints the shape and day count.
data = loadData(file)

Raw shape:  (4520160, 4)
Days:  730


In [13]:
# Pivot to one row per (Date, Hour) and one column per DOLocationID.
# NOTE(review): this rebinds `data`, so the cell cannot be re-run on its own;
# re-run the loadData cell first.
data = getTimeSeries(data)

In [14]:
# (rows, location columns) of the pivoted table.
data.shape

(17520, 258)

In [15]:
# Row position separating the first 80% of the pivoted rows (train) from the
# remaining 20% (test).
sep = int(len(data) * 0.8)
sep

14016

In [16]:
# Positional split at `sep`: earlier rows train, later rows test (no shuffling).
trainData, testData = data[:sep], data[sep:]

In [17]:
# Sanity check: size of the training slice.
trainData.shape

(14016, 258)

In [18]:
# Sanity check: size of the test slice.
testData.shape

(3504, 258)

In [19]:
# Plain float64 copies of the count tables, taken while Date/Hour are still
# in the index so that only the location columns are included.
# `testmatrix` is used in the evaluation cells; `trainmatrix` is only
# referenced by the commented-out normalisation code.
trainmatrix = trainData.values.astype(np.float64)
testmatrix = testData.values.astype(np.float64)

### Normalization

In [20]:
# scaler, s_train_matrix = standardize(trainmatrix)
# s_test_matrix = scaler.transform(testmatrix)

### PCA

In [21]:
# pca,pcaTrain = getPCAFeatures(s_train_matrix,n=10)
# pcaTest = PCA_test(s_test_matrix, pca)

In [22]:
# pcaTrain.shape, pcaTest.shape

In [23]:
# pcaTrain.head(3)

In [24]:
# Move the (Date, Hour) index levels into ordinary columns.
# The frame is rebuilt from `data`/`sep` rather than from `testData` itself so
# that re-running this cell is harmless; resetting an already-reset frame would
# add a spurious `index` column that later flows into the lag features.
testData = data[sep:].reset_index()

In [25]:
# Move the (Date, Hour) index levels into ordinary columns.
# The frame is rebuilt from `data`/`sep` rather than from `trainData` itself so
# that re-running this cell is harmless; resetting an already-reset frame would
# add a spurious `index` column that later flows into the lag features.
trainData = data[:sep].reset_index()

In [26]:
# Two more columns than before: Date and Hour are now regular columns.
trainData.shape

(14016, 260)

In [27]:
# Columns that identify the time step rather than carry counts.
DateColumns = ['Date', 'Hour']
# Every remaining column is a per-location count series; these are both the
# series that get lagged and, later, the prediction targets.
lagColumns = [c for c in trainData.columns if c not in DateColumns]
# NOTE(review): columnsToRemove is not referenced again in the visible notebook.
columnsToRemove = DateColumns + lagColumns

### Lag Variables

In [28]:
# Number of previous rows whose counts are offered to the model as features.
maxlag = 12

# Lag only the count columns; Date/Hour are dropped before shifting.
dataset_train = addLag(trainData.drop(DateColumns, axis=1), maxlag)

dataset_train.shape

(14004, 3354)

In [29]:
# Peek at the first rows that survive addLag's dropna.
dataset_train.head(3)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,254_lag_12,255_lag_12,256_lag_12,257_lag_12,258_lag_12,259_lag_12,260_lag_12,261_lag_12,262_lag_12,263_lag_12
12,0,0,0,3,0,0,3,0,2,3,...,0.0,3.0,0.0,2.0,0.0,0.0,1.0,0.0,2.0,0.0
13,1,1,0,2,0,0,3,0,0,3,...,0.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,3.0
14,0,0,0,1,0,0,5,0,0,7,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Same lag construction for the test slice. Lags are computed within the test
# rows only, so its first `maxlag` rows are dropped by addLag.
dataset_test = addLag(testData.drop(DateColumns, axis=1), maxlag)
dataset_test.shape

(3492, 3354)

### Modelling

In [31]:
# Targets are the current-row counts for every location ...
y_train, y_test = dataset_train[lagColumns], dataset_test[lagColumns]
# ... and features are everything else, i.e. the *_lag_k columns.
X_train = dataset_train.drop(lagColumns, axis=1)
X_test = dataset_test.drop(lagColumns, axis=1)

In [32]:
# Feature matrices: train and test must have the same number of columns.
X_train.shape, X_test.shape

((14004, 3096), (3492, 3096))

In [33]:
# Target matrices: one column per location.
y_train.shape, y_test.shape

((14004, 258), (3492, 258))

In [34]:
# Confirm that only lagged columns remain among the features.
X_train.columns

Index(['1_lag_1', '2_lag_1', '3_lag_1', '4_lag_1', '5_lag_1', '6_lag_1',
       '7_lag_1', '8_lag_1', '9_lag_1', '10_lag_1',
       ...
       '254_lag_12', '255_lag_12', '256_lag_12', '257_lag_12', '258_lag_12',
       '259_lag_12', '260_lag_12', '261_lag_12', '262_lag_12', '263_lag_12'],
      dtype='object', length=3096)

In [35]:
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3)

In [36]:
# def train(X_train,y_train,lagColumns):
#     models = {}
#     for cc in lagColumns:
#         curr_cols = [c for c in X_train.columns if c.split('_')[0]==str(cc)]
#         curr_train_X = X_train[curr_cols]  
#         curr_train_Y = y_train[cc]
#         models[cc] = RandomForestRegressor(random_state = 0, n_estimators=100, 
#                                    min_samples_split=2,
#                                    min_samples_leaf= 1, 
#                                    max_features= 'sqrt',
#                                    max_depth= 50, 
#                                    bootstrap= True)
#         models[cc].fit(curr_train_X,curr_train_Y)
        
#     return models

In [37]:
# rf_model = train(X_train,y_train,lagColumns)

In [39]:
# cc

In [40]:
# One random forest fitted jointly on all target columns; the seed is fixed
# so repeated runs build the same trees.
rf2 = RandomForestRegressor(
    n_estimators=100,
    max_depth=50,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    random_state=0,
)

In [41]:
# Fit on the lag features against every location column at once.
rf2.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=50,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [42]:
# In-sample score; compare with the held-out score in the next cell.
rf2.score(X_train,y_train)

0.9369011071571428

In [43]:
# Held-out score on the later 20% of rows.
rf2.score(X_test,y_test)

0.5376473411411318

In [None]:
# R^2 on the test set computed directly from predictions.
# NOTE(review): depending on the scikit-learn version, `score` and `r2_score`
# may aggregate the per-column R^2 differently for multi-output targets;
# confirm before comparing this number with the cell above.
r2_score(y_test,rf2.predict(X_test))

### Predict

In [None]:
# Predicted counts for every test row and location; reused by the
# evaluation cells below.
network_prediction = rf2.predict(X_test)
network_prediction.shape

In [None]:
# network_prediction = inverse_pca(pca_prediction,pca)
# network_prediction.shape

In [None]:
# network_prediction = inverse_standardize(network_prediction, scaler)
# network_prediction.shape

### Evaluate

In [None]:
# Overall RMSE across all test rows and locations.
# Compare against y_test (the targets row-aligned with X_test) instead of a
# manually re-sliced testmatrix, so the rows cannot drift out of alignment.
# `.values` hands get_rmse a plain ndarray.
get_rmse(y_test.values, network_prediction)

In [None]:
# R^2 of the stored predictions against y_test, the targets row-aligned with
# X_test (rather than a manually re-sliced testmatrix).
r2_score(y_test, network_prediction)