In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor

### Helper Functions

In [2]:
def loadData(file):
    """Read a CSV file into a DataFrame and parse its ``Date`` column.

    Prints the raw shape of the table and the number of distinct values in
    ``Date`` as a quick sanity check.

    Parameters
    ----------
    file : str or path-like
        Location of the CSV file; it must contain a ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded table with ``Date`` converted by ``pd.to_datetime``.
    """
    frame = pd.read_csv(file)
    print('Raw shape: ', frame.shape)

    frame['Date'] = pd.to_datetime(frame['Date'])
    distinct_days = set(frame['Date'])
    print('Days: ', len(distinct_days))

    return frame

In [3]:
def getTimeSeries(df):
    """Pivot long-format counts into a (Date, Hour) x drop-off-location table.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with ``Date``, ``Hour``, ``DOLocationID`` and
        ``vehicle_count`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (Date, Hour) pair and one column per ``DOLocationID``
        value, holding the summed ``vehicle_count``. Combinations that do not
        occur in ``df`` are filled with 0.
    """
    # The string 'sum' selects pandas' built-in aggregation. Passing np.sum
    # gives the same numbers but triggers a FutureWarning on recent pandas.
    table = pd.pivot_table(df, values='vehicle_count', index=['Date', 'Hour'],
                           columns=['DOLocationID'], aggfunc='sum', fill_value=0)
    return table

In [4]:
def zscoreNormalizeSpatial(matrix):
    """Z-score every row of a 2-D matrix across its columns.

    Each row has its own mean subtracted and is divided by its own
    (population) standard deviation plus a small epsilon.

    Parameters
    ----------
    matrix : array-like, 2-D
        Input values; the input is not modified.

    Returns
    -------
    numpy.ndarray
        New array of the same shape. Floating-point inputs keep their dtype;
        any other dtype is promoted to float64.
    """
    m = np.asarray(matrix)
    if not np.issubdtype(m.dtype, np.inexact):
        # Writing z-scores back into an integer array would silently truncate
        # them, so non-float input is promoted before any arithmetic.
        m = m.astype(np.float64)

    row_mean = m.mean(axis=1, keepdims=True)
    row_std = m.std(axis=1, keepdims=True)
    # The epsilon keeps constant rows (std == 0) from dividing by zero.
    return (m - row_mean) / (row_std + 1e-10)

In [5]:
def standardize(matrix):
    """Fit a ``StandardScaler`` on ``matrix`` and return it with the scaled data.

    Parameters
    ----------
    matrix : array-like with a ``copy`` method
        Data to scale; a copy is used for both fitting and transforming.

    Returns
    -------
    tuple
        ``(scaler, scaled)``: the fitted scaler and the transformed copy.
    """
    working = matrix.copy()
    fitted = StandardScaler().fit(working)
    return fitted, fitted.transform(working)

In [6]:
def inverse_standardize(matrix, scaler):
    """Undo a scaler's transform on a copy of ``matrix``.

    Parameters
    ----------
    matrix : array-like with a ``copy`` method
        Scaled values; the input itself is not passed to the scaler.
    scaler : object
        Fitted scaler exposing ``inverse_transform``.

    Returns
    -------
    Whatever ``scaler.inverse_transform`` returns for the copied data.
    """
    scaled = matrix.copy()
    restored = scaler.inverse_transform(scaled)
    return restored

In [7]:
def getPCAFeatures(matrix, n=10, random_state=None):
    """Fit a PCA on ``matrix`` and return it with the projected data.

    Parameters
    ----------
    matrix : array-like
        Data the PCA is fitted on and then projected with.
    n : int, default 10
        Passed to ``PCA`` as ``n_components``.
    random_state : int, RandomState instance or None, default None
        Seed forwarded to ``PCA`` so the decomposition can be made
        reproducible. The default ``None`` keeps the previous behaviour.

    Returns
    -------
    tuple
        ``(pca, reducedDf)``: the fitted PCA and a DataFrame with one column
        per component, named ``'1'``, ``'2'``, ... The frame has a default
        index; the index of ``matrix`` is not carried over.
    """
    pca = PCA(n_components=n, random_state=random_state)
    pca.fit(matrix)
    reducedMatrixPCA = pca.transform(matrix)

    reducedDict = {str(i + 1): reducedMatrixPCA[:, i]
                   for i in range(reducedMatrixPCA.shape[1])}
    reducedDf = pd.DataFrame(reducedDict)
    return pca, reducedDf

In [8]:
def PCA_test(matrix, pca):
    """Project ``matrix`` with an already fitted PCA.

    Parameters
    ----------
    matrix : array-like
        Data to project.
    pca : object
        Fitted transformer exposing ``transform``.

    Returns
    -------
    pandas.DataFrame
        One column per projected component, named ``'1'``, ``'2'``, ...,
        with a default index.
    """
    projected = pca.transform(matrix)

    component_columns = {}
    for position in range(projected.shape[1]):
        component_columns[str(position + 1)] = projected[:, position]

    return pd.DataFrame(component_columns)

In [9]:
def inverse_pca(matrix,pca):
    """Map PCA-space values back through ``pca.inverse_transform``.

    Parameters
    ----------
    matrix : array-like with a ``copy`` method
        Values in component space; a copy is handed to the PCA.
    pca : object
        Fitted transformer exposing ``inverse_transform``.

    Returns
    -------
    Whatever ``pca.inverse_transform`` returns for the copied data.
    """
    components = matrix.copy()
    reconstructed = pca.inverse_transform(components)
    return reconstructed

In [10]:
def addLag(dataset, maxlag):
    """Append lagged copies of every column to ``dataset``.

    For each lag ``k`` in ``1..maxlag`` the whole frame is shifted down by
    ``k`` rows and its columns are renamed ``<column>_lag_<k>``. The original
    and lagged frames are concatenated side by side, and every row containing
    a missing value is dropped (this includes the first ``maxlag`` rows).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose rows are in the order the lags should follow.
    maxlag : int
        Largest shift to add; values below 1 add no lagged columns.

    Returns
    -------
    pandas.DataFrame
        Original columns followed by the lagged ones, without rows that
        contain NaN.
    """
    frames = [dataset]

    for lag in range(1, maxlag + 1):
        lagged = dataset.shift(lag)
        # Formatting the label (instead of `label + '_lag_'`) also works for
        # non-string column labels such as integers.
        lagged.columns = [f'{col}_lag_{lag}' for col in lagged.columns]
        frames.append(lagged)

    return pd.concat(frames, axis=1).dropna()

In [11]:
def get_rmse(matrix1, matrix2):
    """Root-mean-square error between two equally shaped inputs.

    Parameters
    ----------
    matrix1, matrix2 : array-like supporting element-wise subtraction
        Values to compare.

    Returns
    -------
    Square root of ``np.mean`` of the squared element-wise differences.
    """
    residual = matrix1 - matrix2
    mean_squared = np.mean(np.power(residual, 2))
    return np.power(mean_squared, 0.5)

### Preparing Data

In [14]:
# Location of the hourly vehicle-count CSV.
# NOTE(review): absolute, machine-specific path, while the zones file further down is read via a relative path.
file = '/home/urwa/Documents/Projects/NYU Remote/project/data/JfkVehiceByHour.csv'

In [19]:
# Load the CSV; loadData prints the raw shape and the number of distinct dates.
data = loadData(file)

Raw shape:  (4520160, 4)
Days:  730


In [21]:
# Taxi-zone lookup table; preview two rows to see the available columns.
zones = pd.read_csv('../Data/taxi_zones.csv')
zones.head(2)

Unnamed: 0,OBJECTID,Shape_Leng,the_geom,Shape_Area,zone,LocationID,borough
0,1,0.116357,MULTIPOLYGON (((-74.18445299999996 40.69499599...,0.000782,Newark Airport,1,EWR
1,2,0.43347,MULTIPOLYGON (((-73.82337597260663 40.63898704...,0.004866,Jamaica Bay,2,Queens


In [23]:
# Lookup from zone LocationID to borough name.
zontoBorough = dict(zip(zones.LocationID.values,zones.borough.values))

In [25]:
# Replace every drop-off zone id with its borough; an id missing from the lookup raises KeyError.
# NOTE(review): the column is overwritten in place, so re-running this cell on already-mapped
# values is expected to raise KeyError (borough names are not keys of the lookup).
data['DOLocationID'] = data['DOLocationID'].apply(lambda x:zontoBorough[x])

In [27]:
# Pivot to one row per (Date, Hour) and one column per borough.
# NOTE(review): rebinds `data` from the long table to the pivoted one, so this cell cannot be re-run on its own output.
data = getTimeSeries(data)

In [28]:
# Rows are (Date, Hour) pairs, columns are boroughs.
data.shape

(17520, 6)

### Train/Test Split

In [29]:
# Row position of the first test row: the first 80% of rows go to training.
sep = int(0.8*len(data))
sep

14016

In [30]:
# Positional split without shuffling; rows keep the order of the pivoted table.
trainData = data[:sep]
testData = data[sep:]

In [31]:
# Sanity check: size of the training split.
trainData.shape

(14016, 6)

In [32]:
# Sanity check: size of the test split.
testData.shape

(3504, 6)

In [33]:
# Plain float64 arrays for the scaler; the row index and column labels are not carried along.
trainmatrix = trainData.values.astype(np.float64)
testmatrix = testData.values.astype(np.float64)

### Normalization

In [40]:
# Fit the scaler on the training split only, then apply the same scaler to the test split.
scaler, s_train_matrix = standardize(trainmatrix)
s_test_matrix = scaler.transform(testmatrix)

In [44]:
# Wrap the scaled arrays in DataFrames that reuse the original column labels
# (the frames get a fresh default row index).
s_train_matrix_df = pd.DataFrame(s_train_matrix, columns=trainData.columns)
s_test_matrix_df = pd.DataFrame(s_test_matrix, columns=testData.columns)

### Lag Variables

In [45]:
# Number of lagged copies (row shifts 1..maxlag) added for every column.
maxlag = 12
DateColumns = ['Date', 'Hour']
# Target columns: every unlagged column that is not a date field.
# NOTE(review): the scaled frame is built from the pivot table's value columns only, so
# 'Date'/'Hour' are presumably never present and this filter removes nothing — confirm.
lagColumns = [c for c in s_train_matrix_df.columns if c not in DateColumns]

dataset_train = addLag(s_train_matrix_df, maxlag)

dataset_train.shape

(14004, 78)

In [46]:
# Lags for the test split are built from test rows only; addLag drops the first maxlag rows.
dataset_test = addLag(s_test_matrix_df, maxlag)
dataset_test.shape

(3492, 78)

### Modelling

In [47]:
# Features are the lagged columns; targets are the unlagged (current-row) columns.
X_train = dataset_train.drop(columns=lagColumns)
X_test = dataset_test.drop(columns=lagColumns)
y_train = dataset_train.loc[:, lagColumns]
y_test = dataset_test.loc[:, lagColumns]

In [48]:
# Sanity check: feature matrices hold only the lagged columns.
X_train.shape, X_test.shape

((14004, 72), (3492, 72))

In [49]:
# Sanity check: one target column per unlagged column.
y_train.shape, y_test.shape

((14004, 6), (3492, 6))

### Hyperparameter Tuning

In [86]:
# Search space for the randomized hyperparameter search.

# Forest sizes: three evenly spaced tree counts between 150 and 300
n_estimators = np.linspace(start=150, stop=300, num=3).astype(int).tolist()
# Feature-subset rule tried at every split
max_features = ['sqrt']
# Depth limits: five evenly spaced caps between 50 and 110, plus "unlimited"
max_depth = [*np.linspace(50, 110, num=5).astype(int).tolist(), None]
# Smallest node size that may still be split
min_samples_split = [2, 3, 4]
# Smallest allowed leaf size
min_samples_leaf = [2, 3]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Assemble the grid handed to the search as param_distributions
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
print(random_grid)

{'n_estimators': [150, 225, 300], 'max_features': ['sqrt'], 'max_depth': [50, 65, 80, 95, 110, None], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [2, 3], 'bootstrap': [True, False]}


In [87]:
# Randomized search over `random_grid`: 5 sampled candidates, each scored with 5-fold CV.
# The base estimator is seeded too. The search's own random_state only fixes which
# candidates are sampled, so an unseeded forest would make scores (and possibly
# best_params_) change between runs.
rf = RandomForestRegressor(random_state=42)
# NOTE(review): an integer `cv` presumably means plain, unshuffled K-fold here, so some
# folds validate on rows that precede the training rows. For time-ordered data a
# forward-chaining splitter such as TimeSeriesSplit may be more appropriate — confirm.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=5,
                               cv=5, verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)
rf_random.best_params_

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  23 out of  25 | elapsed:  1.7min remaining:    9.0s
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:  1.8min finished


{'n_estimators': 150,
 'min_samples_split': 3,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': None,
 'bootstrap': False}

### Training the Best Model

In [89]:
# Final model: hyperparameters copied by hand from the randomized-search output,
# with a fixed seed so the fitted forest is reproducible.
rf2 = RandomForestRegressor(
    n_estimators=150,
    max_depth=None,
    max_features='sqrt',
    min_samples_split=3,
    min_samples_leaf=2,
    bootstrap=False,
    random_state=2019,
)

In [90]:
# Fit one multi-output forest that predicts all target columns jointly.
rf2.fit(X_train,y_train)

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=3,
           min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
           oob_score=False, random_state=2019, verbose=0, warm_start=False)

In [91]:
# Model score on the training data (targets are still in standardized units).
rf2.score(X_train,y_train)

0.9690208210931406

In [92]:
# Same score on the held-out test data; compare with the training score to gauge overfitting.
rf2.score(X_test,y_test)

0.5800145925575442

### Predict

In [93]:
# Predictions for the test rows, still in standardized units.
prediction = rf2.predict(X_test)
prediction.shape

(3492, 6)

In [94]:
# Map the predictions back to the original scale with the scaler fitted on the training split.
network_prediction = inverse_standardize(prediction, scaler)
network_prediction.shape

(3492, 6)

### Evaluate

In [95]:
# RMSE on the original scale; the first maxlag test rows are skipped because addLag dropped them.
get_rmse(testmatrix[maxlag:], network_prediction)

36.96784401313524

In [96]:
# R^2 on the original scale, combining the per-column scores with the 'variance_weighted' option.
r2_score(testmatrix[maxlag:], network_prediction, multioutput='variance_weighted')

0.8326573153718216

In [97]:
# Mean of all test values, as a reference scale for the RMSE above.
testmatrix.mean()

101.86020738203958