In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor

### Helper Functions

In [2]:
def loadData(file):
    """Read a CSV into a DataFrame and parse its ``Date`` column as datetimes.

    Prints the raw frame shape and the number of distinct dates as a quick
    sanity check.

    Parameters
    ----------
    file : str or file-like
        Anything accepted by ``pandas.read_csv``; must contain a ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``Date`` converted by ``pandas.to_datetime``.
    """
    frame = pd.read_csv(file)
    print('Raw shape: ',frame.shape)
    frame['Date'] = pd.to_datetime(frame['Date'])
    distinct_days = set(frame['Date'])
    print('Days: ',len(distinct_days))
    return frame

In [3]:
def getTimeSeries(df):
    """Pivot long-format counts into a wide (Date, Hour) x DOLocationID table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Date``, ``Hour``, ``DOLocationID`` and
        ``vehicle_count``.

    Returns
    -------
    pandas.DataFrame
        One row per (``Date``, ``Hour``) pair and one column per
        ``DOLocationID``. Each cell is the sum of ``vehicle_count`` for that
        combination; combinations absent from ``df`` are filled with 0.
    """
    # Use the string alias rather than the np.sum callable: newer pandas
    # releases emit a FutureWarning for the callable form and recommend
    # 'sum' to keep the current (built-in groupby sum) behaviour.
    table = pd.pivot_table(
        df,
        values='vehicle_count',
        index=['Date', 'Hour'],
        columns=['DOLocationID'],
        aggfunc='sum',
        fill_value=0,
    )
    return table

In [4]:
def zscoreNormalizeSpatial(matrix):
    """Z-score every row of a 2-D matrix independently.

    Each row is shifted by its own mean and divided by its own (population)
    standard deviation. A small epsilon is added to the denominator so that
    constant rows yield zeros instead of a division by zero.

    Parameters
    ----------
    matrix : array-like, shape (n_rows, n_cols)
        Input values. The input is never modified.

    Returns
    -------
    numpy.ndarray
        Floating-point array of the same shape with row-wise z-scores.
    """
    # np.array always copies, so the caller's data stays untouched.
    m = np.array(matrix)
    # Integer (or bool) input must be promoted first: writing z-scores back
    # into an integer array would silently truncate them.
    if not np.issubdtype(m.dtype, np.floating):
        m = m.astype(np.float64)

    # keepdims keeps the statistics as (n_rows, 1) columns so they broadcast
    # across each row without an explicit Python loop.
    row_mean = m.mean(axis=1, keepdims=True)
    row_std = m.std(axis=1, keepdims=True)
    return (m - row_mean) / (row_std + 1e-10)

In [5]:
def standardize(matrix):
    """Fit a ``StandardScaler`` on ``matrix`` and return it with the scaled data.

    Parameters
    ----------
    matrix : object with a ``copy()`` method (e.g. numpy.ndarray)
        Data to fit on; a copy is used, so the input is left untouched.

    Returns
    -------
    tuple
        ``(scaler, transformed)`` where ``scaler`` is the fitted
        ``StandardScaler`` and ``transformed`` is ``scaler.transform`` applied
        to the copied data.
    """
    working = matrix.copy()
    fitted_scaler = StandardScaler()
    fitted_scaler.fit(working)
    return fitted_scaler, fitted_scaler.transform(working)

In [6]:
def inverse_standardize(matrix, scaler):
    """Undo a scaling by calling ``scaler.inverse_transform`` on a copy of ``matrix``.

    Parameters
    ----------
    matrix : object with a ``copy()`` method (e.g. numpy.ndarray)
        Scaled data; the input itself is not passed to the scaler.
    scaler : object
        Fitted transformer exposing ``inverse_transform``.

    Returns
    -------
    Whatever ``scaler.inverse_transform`` returns for the copied data.
    """
    return scaler.inverse_transform(matrix.copy())

In [7]:
def getPCAFeatures(matrix, n=10):
    """Fit a PCA on ``matrix`` and return it with the projected data.

    Parameters
    ----------
    matrix : array-like, shape (n_samples, n_features)
        Data used both to fit the PCA and to compute the projection.
    n : int, default 10
        Passed to ``PCA`` as ``n_components``.

    Returns
    -------
    tuple
        ``(pca, reducedDf)`` where ``pca`` is the fitted ``PCA`` object and
        ``reducedDf`` is a DataFrame holding one column per output component,
        named ``'1'``, ``'2'``, ... in component order, with a default index.
    """
    pca = PCA(n_components=n)
    pca.fit(matrix)
    projected = pca.transform(matrix)

    # Column k of the projection is stored under the 1-based string label str(k).
    total = projected.shape[1]
    named_columns = {str(k): projected[:, k - 1] for k in range(1, total + 1)}
    return pca, pd.DataFrame(named_columns)

In [8]:
def PCA_test(matrix, pca):
    """Project ``matrix`` with an already-fitted PCA and return a DataFrame.

    Parameters
    ----------
    matrix : array-like, shape (n_samples, n_features)
        Data to project.
    pca : object
        Fitted transformer exposing ``transform``; it is not refitted here.

    Returns
    -------
    pandas.DataFrame
        One column per output component, named ``'1'``, ``'2'``, ... in
        component order, with a default index.
    """
    projected = pca.transform(matrix)

    # Column k of the projection is stored under the 1-based string label str(k).
    total = projected.shape[1]
    named_columns = {str(k): projected[:, k - 1] for k in range(1, total + 1)}
    return pd.DataFrame(named_columns)

In [9]:
def inverse_pca(matrix,pca):
    """Map data back from component space via ``pca.inverse_transform``.

    Parameters
    ----------
    matrix : object with a ``copy()`` method (e.g. numpy.ndarray)
        Data in PCA component space; a copy is handed to the PCA object.
    pca : object
        Fitted transformer exposing ``inverse_transform``.

    Returns
    -------
    Whatever ``pca.inverse_transform`` returns for the copied data.
    """
    return pca.inverse_transform(matrix.copy())

In [10]:
def addLag(dataset, maxlag):
    """Append lagged copies of every column for lags ``1..maxlag``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose rows are in time order. Column labels may be of any type;
        they are converted with ``str()`` when building the lagged names.
    maxlag : int
        Largest lag to add. Values below 1 add no lagged columns.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by ``<col>_lag_<k>`` columns for each
        lag ``k``. Rows containing any NaN are dropped, which removes the
        first ``maxlag`` rows (they have no full lag history) and also any
        row that already had missing values in ``dataset``.
    """
    frames = [dataset]

    for lag in range(1, maxlag + 1):
        shifted = dataset.shift(lag)
        # str(col) keeps this working for non-string labels (e.g. integer
        # column names), where plain `col + '_lag_'` would raise TypeError.
        shifted.columns = [str(col) + '_lag_' + str(lag) for col in shifted.columns]
        frames.append(shifted)

    return pd.concat(frames, axis=1).dropna()

In [11]:
def get_rmse(matrix1, matrix2):
    """Root-mean-square error between two equally shaped inputs.

    Computes the element-wise difference, squares it, averages with
    ``np.mean`` and takes the square root of that mean.

    Parameters
    ----------
    matrix1, matrix2 : numpy.ndarray
        Operands supporting ``matrix1 - matrix2``.

    Returns
    -------
    The square root of the mean squared difference.
    """
    residual = matrix1 - matrix2
    meanSquareError = np.mean(np.power(residual, 2))
    return np.power(meanSquareError, 0.5)

### Preparing Data

In [12]:
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
file = '/home/urwa/Documents/Projects/NYU Remote/project/data/JfkVehiceByHour.csv'

In [13]:
data = loadData(file)

Raw shape:  (4520160, 4)
Days:  730


In [14]:
# Pivot the long-format records into one row per (Date, Hour) and one column per DOLocationID.
# NOTE(review): rebinding `data` makes this cell non-idempotent; re-run the loadData cell before re-running it.
data = getTimeSeries(data)

In [15]:
data.shape

(17520, 258)

In [16]:
# Row position of the 80/20 split: rows before `sep` are used for training, the rest for testing.
sep = int(0.8*len(data))
sep

14016

In [17]:
# Split by row position without shuffling: the first `sep` rows train, the remaining rows test.
trainData = data[:sep]
testData = data[sep:]

In [18]:
trainData.shape

(14016, 258)

In [19]:
testData.shape

(3504, 258)

In [20]:
# Plain float64 arrays of the table values (index and column labels are not carried over).
trainmatrix = trainData.values.astype(np.float64)
testmatrix = testData.values.astype(np.float64)

### Normalization

In [21]:
# Fit the scaler on the training rows only, then apply that same fitted transform to the test rows.
scaler, s_train_matrix = standardize(trainmatrix)
s_test_matrix = scaler.transform(testmatrix)

### PCA

In [22]:
# Fit a 10-component PCA on the standardized training data, then project the
# standardized test data with that same fitted PCA (no refit on test).
pca,pcaTrain = getPCAFeatures(s_train_matrix,n=10)
pcaTest = PCA_test(s_test_matrix, pca)

In [23]:
pcaTrain.shape, pcaTest.shape

((14016, 10), (3504, 10))

In [24]:
pcaTrain.head(3)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,-5.343973,3.997718,0.678648,-0.221603,0.288819,-0.67436,0.589661,1.034974,0.082162,-1.13681
1,-9.769835,1.061747,1.704459,0.183137,-0.175229,-0.13204,-0.369262,0.197216,0.029548,0.252415
2,-11.843036,0.896696,1.072944,-0.341663,0.261827,-0.49403,0.299545,0.116666,-0.181864,-0.381054


### Lag Variables

In [25]:
# Number of previous rows whose component values are added as lagged predictor columns.
maxlag = 12
DateColumns = ['Date', 'Hour']
# Un-lagged PCA component columns; these are used as the prediction targets later on.
# NOTE(review): pcaTrain's columns are the component labels '1'..'10', so this
# DateColumns filter appears to exclude nothing here - confirm it is still needed.
lagColumns = [c for c in pcaTrain.columns if c not in DateColumns]

# Append lag 1..maxlag copies of every component; addLag drops rows containing NaN,
# i.e. the leading rows that lack a full lag history.
dataset_train = addLag(pcaTrain, maxlag)

dataset_train.shape

(14004, 130)

In [26]:
# Lags for the test set are built from the test rows alone, so its first `maxlag`
# rows are dropped as well (no lag history is borrowed from the training set).
dataset_test = addLag(pcaTest, maxlag)
dataset_test.shape

(3492, 130)

### Modelling

In [27]:
# Predictors: everything except the un-lagged component columns (i.e. the lagged columns).
# Targets: the un-lagged PCA component columns listed in lagColumns.
X_train = dataset_train.drop(lagColumns , axis = 1)
X_test = dataset_test.drop(lagColumns , axis = 1)
y_train = dataset_train[lagColumns]
y_test = dataset_test[lagColumns]

In [28]:
X_train.shape, X_test.shape

((14004, 120), (3492, 120))

In [29]:
y_train.shape, y_test.shape

((14004, 10), (3492, 10))

In [30]:
# X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3)

In [31]:
# Random forest with fixed hyperparameters; random_state is pinned so repeated runs build the same forest.
rf2 = RandomForestRegressor(random_state = 0, n_estimators=200, 
                           min_samples_split=10,
                           min_samples_leaf= 3, 
                           max_features= 'sqrt',
                           max_depth= 30, 
                           bootstrap= True)

In [32]:
# y_train holds all component columns, so a single forest is fitted to predict them jointly.
rf2.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=30,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=3, min_samples_split=10,
           min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [33]:
rf2.score(X_train,y_train)

0.8672445484949778

In [34]:
rf2.score(X_test,y_test)

0.6989923770821103

### Predict

In [35]:
# Predicted PCA component values for every test row that has a full lag history.
pca_prediction = rf2.predict(X_test)
pca_prediction.shape

(3492, 10)

In [36]:
# Map the predictions from PCA component space back to the standardized per-location columns.
network_prediction = inverse_pca(pca_prediction,pca)
network_prediction.shape

(3492, 258)

In [37]:
# Undo the standardization so predictions are on the same scale as testmatrix.
# NOTE(review): rebinding `network_prediction` makes this cell non-idempotent;
# re-run the inverse_pca cell before re-running this one.
network_prediction = inverse_standardize(network_prediction, scaler)
network_prediction.shape

(3492, 258)

### Evaluate

In [38]:
# Skip the first `maxlag` test rows, which addLag dropped, so truth and prediction rows line up.
get_rmse(testmatrix[maxlag:], network_prediction)

1.960473264932762

In [39]:
# Same row alignment as the RMSE above: the first `maxlag` test rows have no prediction.
r2_score(testmatrix[maxlag:], network_prediction)

0.23780149509353318