In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

### Helper Functions

In [2]:
def loadData(file):
    """Read a CSV file into a DataFrame and parse its ``Date`` column.

    Prints the raw table shape and the number of distinct dates as a quick
    sanity check.

    Parameters
    ----------
    file : str or file-like
        Anything accepted by ``pandas.read_csv``; the table must contain a
        ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded table with ``Date`` converted via ``pandas.to_datetime``.
    """
    frame = pd.read_csv(file)
    print('Raw shape: ', frame.shape)

    frame['Date'] = pd.to_datetime(frame['Date'])
    distinct_days = set(frame['Date'])
    print('Days: ', len(distinct_days))

    return frame

In [3]:
def getTimeSeries(df):
    """Pivot long-format records into a (Date, Hour) x DOLocationID table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Date``, ``Hour``, ``DOLocationID`` and
        ``vehicle_count``.

    Returns
    -------
    pandas.DataFrame
        One row per (``Date``, ``Hour``) pair and one column per
        ``DOLocationID``. Each cell is the sum of ``vehicle_count`` for that
        combination; combinations with no rows are filled with 0.
    """
    # Name the aggregation as a string: passing NumPy's sum function to pandas
    # emits a FutureWarning on recent pandas releases, whereas 'sum' selects
    # the same group-wise sum without the warning.
    table = pd.pivot_table(df, values='vehicle_count', index=['Date','Hour'],
                    columns=['DOLocationID'], aggfunc='sum', fill_value=0)
    return table

In [4]:
def gridSearchPCAComponent(matrix, component):
    """Plot the PCA reconstruction RMSE for each candidate component count.

    For every value in ``component`` a PCA is fitted on ``matrix``; the data is
    projected, mapped back with ``inverse_transform``, and the root-mean-square
    difference from ``matrix`` is recorded. The errors are then drawn against
    ``component`` with ``plt.plot``. Nothing is returned.

    Parameters
    ----------
    matrix : array-like
        Data handed to ``PCA.fit`` and ``PCA.transform``.
    component : sequence
        Values to try for ``n_components``; also used as the x values of the
        plot.
    """
    def reconstructionError(n):
        # Fit, project down to n components, then map back to the input space.
        model = PCA(n_components=n)
        model.fit(matrix)
        projected = model.transform(matrix)
        restored = model.inverse_transform(projected)

        # Root of the mean squared element-wise difference.
        meanSquared = np.mean(np.power(restored - matrix, 2))
        return np.power(meanSquared, 0.5)

    errors = [reconstructionError(n) for n in component]
    plt.plot(component, errors)

#### Preparing Data

In [5]:
# Location of the input CSV that the following cell passes to loadData.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
file = '/home/urwa/Documents/Projects/NYU Remote/project/JfkVehiceByHour.csv'

In [6]:
# Read the CSV and parse its Date column; loadData also prints the raw shape
# and the number of distinct dates.
data = loadData(file)

Raw shape:  (4520160, 4)
Days:  730


In [7]:
# Pivot to one row per (Date, Hour) and one column per DOLocationID.
# NOTE(review): `data` is rebound from the raw records to the pivoted table, so
# this cell cannot be re-run without first re-running the load cell.
data = getTimeSeries(data)

In [8]:
# Underlying NumPy array of the pivoted table (rows: Date/Hour pairs,
# columns: DOLocationID values); display its shape.
matrix = data.values
matrix.shape

(17520, 258)

In [9]:
# Sum across columns (axis=1) to get a single total per row of the pivot table.
# NOTE(review): `matrix` is rebound from 2-D to 1-D, so re-running this cell on
# its own would fail on axis=1.
matrix = matrix.sum(axis=1)
matrix.shape

(17520,)

In [10]:
# Attach the per-row totals to the pivoted table as a new column.
data['Total_Outgoing'] = matrix

In [11]:
# Keep only the totals column; the list selector keeps the result a DataFrame.
data = data[['Total_Outgoing']]

In [12]:
# Sanity check: a single column should remain after the selection above.
data.shape

(17520, 1)

In [13]:
# Preview the first rows of the totals table.
data.head()

Unnamed: 0_level_0,DOLocationID,Total_Outgoing
Date,Hour,Unnamed: 2_level_1
2017-01-01,0,245
2017-01-01,1,115
2017-01-01,2,47
2017-01-01,3,32
2017-01-01,4,16


#### Saving results

In [14]:
# Write the totals table (the index is written too, by to_csv's default) to a
# CSV file at a path relative to the current working directory.
data.to_csv('Total_Out.csv')