In [2]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

### Helper Functions

In [3]:
def loadData(file):
    data = pd.read_csv(file)
    print('Raw shape: ',data.shape)
    data['Date'] = pd.to_datetime(data.Date)
    print('Days: ',len(set(data.Date)))
    return data

In [4]:
def getTimeSeries(df):
    table = pd.pivot_table(df, values='vehicle_count', index=['Date','Hour'],
                    columns=['DOLocationID'], aggfunc=np.sum, fill_value=0)
    return table

In [5]:
def zscoreNormalizeSpatial(matrix):
    for i in range(matrix.shape[0]):
        matrix[i, :] = (matrix[i, :] - matrix[i, :].mean()) / (matrix[i, :].std()+1e-10)
        
    return matrix

In [6]:
def getPCAFeatures(matrix, n=10):
    pca = PCA(n_components=n)
    pca.fit(matrix)
    reducedMatrixPCA = pca.transform(matrix)
    reducedMatrixPCA.shape

    reducedDict = {str(i+1):reducedMatrixPCA[:,i] for i in range(reducedMatrixPCA.shape[1])}
    reducedDf = pd.DataFrame(reducedDict)
    reducedDf.index = data.index
    return reducedDf

In [26]:
def getPCACost(matrix, n=10):
    pca = PCA(n_components=n)
    pca.fit(matrix)
    reducedMatrixPCA = pca.transform(matrix)
    reconMatrixPCA = pca.inverse_transform(reducedMatrixPCA)
    reconCost = np.sum(reconMatrixPCA - matrix, axis=1)

    reducedDict = {'rec_cost':reconCost}
    reducedDf = pd.DataFrame(reducedDict)
    reducedDf.index = data.index
    return reducedDf

#### Preparing Data

In [12]:
file = '/home/urwa/Documents/Projects/NYU Remote/project/data/JfkVehiceByHour.csv'

In [13]:
data = loadData(file)

Raw shape:  (4520160, 4)
Days:  730


In [14]:
data = getTimeSeries(data)

In [15]:
matrix = data.values

### Normalization

In [16]:
matrix = zscoreNormalizeSpatial(matrix)
matrix.shape

(17520, 258)

In [27]:
recCost = getPCACost(matrix,n=100)

In [28]:
recCost.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rec_cost
Date,Hour,Unnamed: 2_level_1
2017-01-01,0,1.368361
2017-01-01,1,1.787756
2017-01-01,2,-3.20123
2017-01-01,3,-11.87914
2017-01-01,4,-0.050818


In [29]:
recCost.to_csv('PCA_recon_cost_features.csv')