In [28]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

### Helper Functions

In [2]:
def loadData(file):
    data = pd.read_csv(file)
    print('Raw shape: ',data.shape)
    data['Date'] = pd.to_datetime(data.Date)
    print('Days: ',len(set(data.Date)))
    return data

In [3]:
def getTimeSeries(df):
    table = pd.pivot_table(df, values='vehicle_count', index=['Date','Hour'],
                    columns=['DOLocationID'], aggfunc=np.sum, fill_value=0)
    return table

In [4]:
def zscoreNormalizeSpatial(matrix):
    m = matrix.copy()
    for i in range(m.shape[0]):
        m[i, :] = (m[i, :] - m[i, :].mean()) / (m[i, :].std()+1e-10)
        
    return m

In [5]:
def standardize(matrix):
    m = matrix.copy()
    scaler = StandardScaler()
    scaler.fit(m)
    t = scaler.transform(m)
    return scaler, t

In [6]:
def inverse_standardize(matrix, scaler):
    t = matrix.copy()
    return scaler.inverse_transform(t)

In [7]:
def getPCAFeatures(matrix, index, n=10):
    pca = PCA(n_components=n)
    pca.fit(matrix)
    reducedMatrixPCA = pca.transform(matrix)
    reducedMatrixPCA.shape

    reducedDict = {str(i+1):reducedMatrixPCA[:,i] for i in range(reducedMatrixPCA.shape[1])}
    reducedDf = pd.DataFrame(reducedDict)
    reducedDf.index = index
    return pca,reducedDf

In [8]:
def inverse_pca(matrix,pca):
    m = matrix.copy()
    return pca.inverse_transform(m)

In [9]:
def addLag(dataset, maxlag, lagvar):
    # return the full dataset with lag
    LAG = np.arange(1,maxlag + 1)
    lagdata = pd.DataFrame([])
    for lag in LAG:
        varname = lagvar+'_lag' + str(lag)
        lagdata[varname] = dataset[lagvar].loc[maxlag-lag:len(dataset)-lag-1].reset_index(drop = True)
    dataset = pd.concat([dataset.loc[maxlag:].reset_index(drop = True), lagdata.reset_index(drop = True)], axis = 1, sort = False)
    return dataset

In [10]:
def get_rmse(matrix1, matrix2):
    sumSquareError = np.mean(np.power(matrix1 - matrix2,2))
    rmse = np.power(sumSquareError,0.5)
    return rmse

#### Preparing Data

In [11]:
file = '/home/urwa/Documents/Projects/NYU Remote/project/data/JfkVehiceByHour.csv'

In [12]:
data = loadData(file)

Raw shape:  (4520160, 4)
Days:  730


In [13]:
data = getTimeSeries(data)

In [14]:
_index = data.index

In [15]:
matrix = data.values.astype(np.float64)

### Normalization

In [16]:
scaler, s_matrix = standardize(matrix)

## PCA

In [17]:
pca,pcaDf = getPCAFeatures(s_matrix,_index,n=10)
pcaDf = pcaDf.reset_index()

In [18]:
pcaDf.head(3)

Unnamed: 0,Date,Hour,1,2,3,4,5,6,7,8,9,10
0,2017-01-01,0,-5.879216,3.666016,0.823052,-0.245443,0.193578,-0.260518,0.38787,1.838882,-0.373252,1.147021
1,2017-01-01,1,-10.155525,0.939707,1.724198,0.201495,-0.116128,-0.1583,-0.69979,0.225235,0.223539,0.42041
2,2017-01-01,2,-12.153637,0.822044,1.133211,-0.37645,0.376917,-0.303484,0.253443,0.02067,0.231,0.308479


### Lag Variables

In [19]:
maxlag = 12
DateColumns = ['Date', 'Hour']
lagColumns = [c for c in pcaDf.columns if c not in DateColumns]
for c in lagColumns:
    dataset = addLag(pcaDf, maxlag, c)

### Modelling

In [20]:
featureData = dataset.drop(lagColumns + DateColumns , axis = 1)
targetData = dataset[lagColumns]

In [29]:
X_train, X_test, y_train, y_test = train_test_split(featureData, targetData, test_size=0.2)

In [36]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [1, 112, 223, 334, 445, 556, 667, 778, 889, 1000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [37]:
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5, \
                               cv = 5, verbose=2, random_state=42, n_jobs = -1)

NameError: name 'RandomizedSearchCV' is not defined

In [35]:
rf = RandomForestRegressor?

In [None]:
rf = RandomForestRegressor

In [None]:
rf = RandomForestRegressor

In [31]:
rf.fit(X_train,y_train)



0.07878360591145675

In [32]:
rf.score(X_train,y_train)

0.8333411718789149

In [33]:
rf.score(X_test,y_test)

0.07878360591145675

### Predict

In [23]:
pca_prediction = rf.predict(featureData)
pca_prediction.shape

(17508, 10)

In [24]:
network_prediction = inverse_pca(pca_prediction,pca)
network_prediction.shape

(17508, 258)

In [25]:
network_prediction = inverse_standardize(network_prediction, scaler)
network_prediction.shape

(17508, 258)

### Evaluate

In [26]:
get_rmse(matrix[maxlag:], network_prediction)

1.7213582840932657

In [27]:
r2_score(matrix[maxlag:], network_prediction)

0.2900140133014976