In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor

### Helper Functions

In [2]:
def loadData(file):
    """Read a CSV, parse its ``Date`` column and print two summary lines.

    Parameters
    ----------
    file : path or file-like object accepted by ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``Date`` converted by ``pd.to_datetime``.

    Prints the raw ``(rows, columns)`` shape and the number of distinct
    ``Date`` values.
    """
    frame = pd.read_csv(file)
    print('Raw shape: ', frame.shape)

    frame['Date'] = pd.to_datetime(frame['Date'])
    distinct_dates = set(frame['Date'])
    print('Days: ', len(distinct_dates))

    return frame

In [3]:
def getTimeSeries(df):
    """Pivot long-format records into a wide (Date, Hour) x DOLocationID table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Date``, ``Hour``, ``DOLocationID`` and
        ``vehicle_count``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(Date, Hour)`` with one column per ``DOLocationID``.
        Each cell is the sum of ``vehicle_count`` for that combination;
        combinations that never occur are filled with 0.
    """
    # The string 'sum' replaces the np.sum callable: recent pandas versions
    # deprecate passing NumPy reducers as aggfunc, and the result is the same.
    table = pd.pivot_table(df, values='vehicle_count', index=['Date', 'Hour'],
                           columns=['DOLocationID'], aggfunc='sum', fill_value=0)
    return table

In [4]:
def zscoreNormalizeSpatial(matrix):
    """Z-score every row of a 2-D matrix independently.

    Each row has its own mean subtracted and is divided by its own
    (population) standard deviation plus 1e-10, so a constant row maps to
    zeros instead of dividing by zero.

    Parameters
    ----------
    matrix : 2-D array-like (rows are normalised one by one).

    Returns
    -------
    numpy.ndarray
        A new floating-point array of the same shape; the input is not
        modified.
    """
    m = np.array(matrix, copy=True)
    # Integer (or boolean) input must be promoted first: writing float
    # z-scores back into an integer array would silently truncate them.
    if not np.issubdtype(m.dtype, np.floating):
        m = m.astype(float)

    row_mean = m.mean(axis=1, keepdims=True)
    row_std = m.std(axis=1, keepdims=True)
    return (m - row_mean) / (row_std + 1e-10)

In [5]:
def standardize(matrix):
    """Fit a ``StandardScaler`` on a copy of ``matrix`` and scale that copy.

    Returns
    -------
    tuple
        ``(scaler, scaled)``: the fitted scaler (pass it to
        ``inverse_standardize`` to undo the scaling) and the transformed data.
    """
    working = matrix.copy()
    fitted = StandardScaler()
    fitted.fit(working)
    scaled = fitted.transform(working)
    return fitted, scaled

In [6]:
def inverse_standardize(matrix, scaler):
    """Undo a scaling by applying ``scaler.inverse_transform`` to a copy.

    Parameters
    ----------
    matrix : object with a ``.copy()`` method (e.g. ndarray or DataFrame).
    scaler : object exposing ``inverse_transform``.

    Returns whatever ``scaler.inverse_transform`` returns for the copy.
    """
    scaled_copy = matrix.copy()
    restored = scaler.inverse_transform(scaled_copy)
    return restored

In [7]:
def addLag(dataset, maxlag, lagColumns):
    """Append lagged copies of selected columns to ``dataset``.

    For every lag ``l`` in ``1..maxlag`` and every column ``c`` in
    ``lagColumns`` a new column named ``'<c>_lag_<l>'`` is added, holding the
    value of ``c`` from ``l`` rows earlier.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Rows are assumed to be in the order the lags should follow.
    maxlag : int
        Largest lag to create; ``0`` adds no lag columns.
    lagColumns : list
        Column labels to lag. Labels need not be strings.

    Returns
    -------
    pandas.DataFrame
        Original columns followed by the lag columns. Every row containing
        a missing value in ANY column is dropped, which always includes the
        first ``maxlag`` rows.
    """
    frames = [dataset]
    # Select once, then shift: only the lag columns need shifting.
    lag_source = dataset[lagColumns]

    for lag in range(1, maxlag + 1):
        shifted = lag_source.shift(lag)
        # str(c) keeps this working for non-string labels (e.g. integer ids).
        shifted.columns = [str(c) + '_lag_' + str(lag) for c in shifted.columns]
        frames.append(shifted)

    return pd.concat(frames, axis=1).dropna()

In [8]:
def get_rmse(matrix1, matrix2):
    """Root of the mean squared difference between two inputs.

    The difference is squared element-wise, averaged with ``np.mean`` and
    the 0.5 power of that mean is returned. For NumPy arrays the mean runs
    over all elements, giving a single scalar.
    """
    squared_diff = np.power(matrix1 - matrix2, 2)
    mean_squared = np.mean(squared_diff)
    return np.power(mean_squared, 0.5)

In [9]:
def get_weights(rawdata, zontoBorough):
    """Compute each zone's share of its group's total ``vehicle_count``.

    Parameters
    ----------
    rawdata : pandas.DataFrame
        Needs the columns ``DOLocationID`` and ``vehicle_count``. It is only
        read; no column is added to it.
    zontoBorough : mapping
        Maps every ``DOLocationID`` to its group label (borough or
        community). A zone id missing from the mapping raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Borough``, ``DOLocationID``, ``zone_weight``, where
        ``zone_weight`` = zone total / group total. Rows are ordered by group
        label, then by ``DOLocationID``.
    """
    # Total per zone first: the mapping is then looked up once per zone
    # instead of once per raw record, and rawdata is left untouched.
    zone_totals = (rawdata[['vehicle_count', 'DOLocationID']]
                   .groupby(by='DOLocationID').sum().reset_index())
    zone_totals['Borough'] = zone_totals['DOLocationID'].apply(lambda x: zontoBorough[x])

    # Group totals are the sums of the zone totals within each group.
    borough_totals = (zone_totals[['vehicle_count', 'Borough']]
                      .groupby(by='Borough').sum().reset_index())

    merged = pd.merge(borough_totals, zone_totals, on=['Borough'], how='inner',
                      suffixes=('_borough', '_zone'))
    merged['zone_weight'] = merged['vehicle_count_zone'] / merged['vehicle_count_borough']

    return merged[['Borough', 'DOLocationID', 'zone_weight']]

### Preparing Data

In [10]:
# Notebook configuration.
# `hub` is concatenated into the input file names used later in the notebook
# (hub + 'VehiceByHour.csv' and hub.upper() + '.csv').
hub = 'Penn'
# NOTE(review): `tune_hyp_params` is not referenced in any visible cell — confirm it is still needed.
tune_hyp_params = False

In [11]:
# Location of the per-hub input CSV.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere without editing it.
dataDir = '/home/urwa/Documents/Projects/NYU Remote/project/data/processedData/'
# NOTE(review): the file name is spelled 'Vehice' (not 'Vehicle'); presumably this matches the file on disk — verify.
file = dataDir + hub + 'VehiceByHour.csv'

In [12]:
# Load the hub CSV; loadData parses `Date` and prints the raw shape and the number of distinct dates.
rawdata = loadData(file)

Raw shape:  (2251320, 4)
Days:  365


In [13]:
# Zone-level wide table: one row per (Date, Hour), one column per DOLocationID.
edge_data = getTimeSeries(rawdata)
# Turn the (Date, Hour) index back into ordinary columns.
edge_data = edge_data.reset_index()
edge_data.head(3)

DOLocationID,Date,Hour,1,2,3,4,5,6,7,8,...,254,255,256,257,258,259,260,261,262,263
0,2018-01-01,0,0,0,0,4,0,0,1,0,...,0,3,1,0,1,0,0,2,7,3
1,2018-01-01,1,0,0,0,8,0,0,6,0,...,0,13,4,3,0,0,1,3,5,13
2,2018-01-01,2,1,0,0,7,0,0,8,0,...,0,4,2,1,0,0,1,4,4,10


### Community Level Aggregation

In [14]:
# Zone-to-community lookup table (columns `start_id` and `start_community`).
# NOTE(review): absolute, machine-specific path.
zones = pd.read_csv('/home/urwa/Documents/Projects/NYU Remote/project/UrbanTemporalNetworks/Data/ZonetoComm.csv')
zones.head(2)

Unnamed: 0,start_id,start_community
0,1,0.0
1,2,4.2


In [15]:
# Store community labels as strings (e.g. '4.2'); they later become column names.
zones['start_community'] = zones.start_community.astype(str)

# Mapping: zone id (start_id) -> community label.
zontoComm = dict(zip(zones.start_id.values,zones.start_community.values))

In [16]:
# Work on a deep copy so `rawdata` keeps its original zone ids.
comm_data = rawdata.copy(deep=True)
# Replace each zone id by its community label; a zone id missing from zontoComm raises KeyError.
comm_data['DOLocationID'] = comm_data['DOLocationID'].apply(lambda x:zontoComm[x])
comm_data.head(2)

Unnamed: 0,DOLocationID,Date,Hour,vehicle_count
0,0.0,2018-01-01,0,0.0
1,4.2,2018-01-01,0,0.0


In [17]:
# Community-level wide table: zone counts sharing a community label are summed by the pivot.
comm_data = getTimeSeries(comm_data)
# Turn the (Date, Hour) index back into ordinary columns.
comm_data = comm_data.reset_index()
comm_data.head(2)

DOLocationID,Date,Hour,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,...,4.0,4.1,4.2,4.3,4.4,4.5,5.0,5.1,5.2,5.3
0,2018-01-01,0,119,62,116,4,18,3,14,5,...,0,2,4,2,0,8,0,0,0,0
1,2018-01-01,1,162,60,244,8,29,10,47,1,...,0,4,2,7,0,12,0,0,1,0


In [18]:
# Share of each community's total vehicle_count contributed by each zone.
# The `Borough` column holds community labels here because zontoComm is passed as the mapping.
zone_weights = get_weights(rawdata, zontoComm)
zone_weights.head(2)

Unnamed: 0,Borough,DOLocationID,zone_weight
0,0.0,1,0.01463
1,0.0,48,0.097804


### Merge External Data Features

In [19]:
# External feature file for the hub; the file name is the upper-cased hub name.
# NOTE(review): absolute, machine-specific path.
externalDataDir = "/home/urwa/Documents/Projects/NYU Remote/project/data/HongData/"
extFile = externalDataDir + hub.upper() + ".csv"

In [20]:
# Load the external features and inspect their shape and first rows.
extDf = pd.read_csv(extFile)
print(extDf.shape)
extDf.head(2)

(8760, 46)


Unnamed: 0,date,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,ifthu,iffri,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,18/1/1 0:00,0,147,319,466,1,0,0,0,0,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0
1,18/1/1 1:00,1,347,397,744,1,0,0,0,0,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0


In [21]:
# Parse the `date` strings (e.g. '18/1/1 0:00') as year-first timestamps.
# NOTE(review): an explicit format= would make the parsing unambiguous — confirm the format is uniform across all rows first.
extDf['date'] = pd.to_datetime(extDf['date'], yearfirst=True)
extDf.head(2)

Unnamed: 0,date,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,ifthu,iffri,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01 00:00:00,0,147,319,466,1,0,0,0,0,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0
1,2018-01-01 01:00:00,1,347,397,744,1,0,0,0,0,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0


In [22]:
# Sanity check: time span covered by the external data.
min(extDf.date), max(extDf.date)

(Timestamp('2018-01-01 00:00:00'), Timestamp('2018-12-31 23:00:00'))

In [23]:
# Derive the merge keys (Date, Hour) and a day-of-week feature from the timestamp.
extDf['Hour'] = extDf['date'].dt.hour
extDf['Dow'] = extDf['date'].dt.dayofweek
# `.dt.date` yields plain Python date objects (object dtype), not datetime64.
extDf['Date'] = extDf['date'].dt.date

In [24]:
# List the available columns before choosing which to keep.
extDf.columns

Index(['date', 'arrival', 'fhv', 'yellow', 'vehicle', 'ifmon', 'iftue',
       'ifwed', 'ifthu', 'iffri', 'ifsat', 'ifsun', 'if0', 'if1', 'if2', 'if3',
       'if4', 'if5', 'if6', 'if7', 'if8', 'if9', 'if10', 'if11', 'if12',
       'if13', 'if14', 'if15', 'if16', 'if17', 'if18', 'if19', 'if20', 'if21',
       'if22', 'if23', 'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow', 'Hour', 'Dow',
       'Date'],
      dtype='object')

In [25]:
# External columns kept for modelling: the merge keys (Date, Hour) plus day of week,
# `arrival` and the weather-related fields.
selected_columns = ['Date', 'Hour', 'Dow', 'arrival','maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow']

In [26]:
# Keep only the selected columns. `.copy()` makes the result an independent
# frame, so the later `extDf['Date'] = ...` assignment does not write into a
# slice of the original frame (which triggers SettingWithCopyWarning).
extDf = extDf[selected_columns].copy()

In [27]:
# Both frames should have the same number of rows before they are merged.
print(comm_data.shape)
print(extDf.shape)

(8760, 26)
(8760, 14)


In [28]:
# Give `Date` the same datetime dtype on both sides so the merge keys compare equal.
comm_data['Date'] = pd.to_datetime(comm_data['Date'])
extDf['Date'] = pd.to_datetime(extDf['Date'])

In [29]:
# Attach the external features to every (Date, Hour) row; an inner join drops unmatched rows,
# so the printed shape shows whether any were lost.
comm_data = pd.merge(comm_data,extDf, on=['Date', 'Hour'], how='inner')
print(comm_data.shape)
# Convert `Date` back to plain Python date objects.
comm_data['Date'] = comm_data['Date'].dt.date
comm_data.head()

(8760, 38)


Unnamed: 0,Date,Hour,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,119,62,116,4,18,3,14,5,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0
1,2018-01-01,1,162,60,244,8,29,10,47,1,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0
2,2018-01-01,2,178,65,174,12,23,7,55,4,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0
3,2018-01-01,3,134,37,126,15,16,8,35,3,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0
4,2018-01-01,4,82,22,71,6,11,3,21,3,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0


In [30]:
# Column inventory after the merge: keys, community counts, then external features.
comm_data.columns

Index(['Date', 'Hour', '0.0', '0.1', '0.2', '1.0', '1.1', '1.2', '1.3', '2.0',
       '2.1', '2.2', '2.3', '3.0', '3.1', '3.2', '4.0', '4.1', '4.2', '4.3',
       '4.4', '4.5', '5.0', '5.1', '5.2', '5.3', 'Dow', 'arrival', 'maxtemp',
       'mintemp', 'avgtemp', 'departure', 'hdd', 'cdd', 'participation',
       'newsnow', 'snowdepth', 'ifSnow'],
      dtype='object')

In [31]:
# Community-level count columns that the model predicts.
targetColumns = [
    '0.0', '0.1', '0.2',
    '1.0', '1.1', '1.2', '1.3',
    '2.0', '2.1', '2.2', '2.3',
    '3.0', '3.1', '3.2',
    '4.0', '4.1', '4.2', '4.3', '4.4', '4.5',
    '5.0', '5.1', '5.2', '5.3',
]

# Columns that receive lagged copies: every target plus the `arrival` feature.
lagColumns = targetColumns + ['arrival']

# Non-numeric column(s) removed from the feature matrix.
DateColumns = ['Date']

In [32]:
# Number of past hourly rows offered to the model as lag features.
maxlag = 12

# Adds lags 1..maxlag of every column in lagColumns; addLag drops rows with missing
# values, which removes the first `maxlag` rows.
comm_data_lag = addLag(comm_data, maxlag, lagColumns)

comm_data_lag.shape

(8748, 338)

### Train/Test Split (each month held out in turn)

In [33]:
# sep = int(0.75*len(comm_data_lag))
# sep

In [34]:
# Leave-one-month-out evaluation.
# For each calendar month: train a random forest on the other eleven months,
# predict community-level counts for the held-out month, split each community
# prediction across its zones with `zone_weights`, and score both levels.
CommR2List = []          # community-level test R2, one entry per month
EdgeR2List = []          # zone-level test R2, one entry per month
residualDf_list = []     # community-level residual frames, one per month

# Zone-level columns: everything in edge_data except the time keys.
zone_cols = [c for c in edge_data.columns if c not in ['Date','Hour']]

# Align zone-level rows with the lagged community rows through their index
# labels rather than a positional `[False] * maxlag` offset, and verify the
# alignment so a mismatch fails loudly instead of scoring the wrong rows.
edge_aligned = edge_data.loc[comm_data_lag.index]
assert np.array_equal(pd.to_datetime(edge_aligned['Date']).values,
                      pd.to_datetime(comm_data_lag['Date']).values), \
    "edge_data and comm_data_lag rows are not aligned on Date"
assert np.array_equal(edge_aligned['Hour'].values,
                      comm_data_lag['Hour'].values), \
    "edge_data and comm_data_lag rows are not aligned on Hour"

# Community label and within-community share of every zone, ordered like
# zone_cols. Loop-invariant, so computed once.
zone_lookup = zone_weights.set_index('DOLocationID').loc[zone_cols]
zone_community = list(zone_lookup['Borough'])
zone_share = zone_lookup['zone_weight'].values

row_month = pd.to_datetime(comm_data_lag.Date).dt.month

for m in range(1,13):
    print()
    print("month: ",m)
    month_index = row_month == m

    # NOTE(review): the first `maxlag` training rows after the held-out month
    # carry lag features built from held-out-month targets — confirm this
    # overlap is acceptable for the evaluation.
    dataset_train = comm_data_lag[~month_index]
    dataset_test = comm_data_lag[month_index]
    print("Train Size: ",dataset_train.shape)
    print("Test Size: ",dataset_test.shape)

    edge_testData = edge_aligned.loc[month_index, zone_cols]
    print("edge test data shape: ",edge_testData.shape)

    X_train = dataset_train.drop(targetColumns+DateColumns , axis = 1)
    X_test = dataset_test.drop(targetColumns+DateColumns , axis = 1)
    y_train = dataset_train[targetColumns]
    y_test = dataset_test[targetColumns]

    rf2 = RandomForestRegressor(random_state = 2019, n_estimators=150,
                               min_samples_split=3,
                               min_samples_leaf= 2,
                               max_features= 'sqrt',
                               max_depth= None,
                               bootstrap= False)

    rf2.fit(X_train,y_train)

    # NOTE(review): rf2.score and the edge-level r2_score below may average
    # the outputs differently (the edge score explicitly uses
    # 'variance_weighted') — confirm the two numbers are meant to be compared.
    print("Train R2: ",rf2.score(X_train,y_train))
    test_r2 = rf2.score(X_test,y_test)
    print("Test R2: ",test_r2)

    comm_prediction = rf2.predict(X_test)
    comm_prediction_df = pd.DataFrame(comm_prediction, columns=y_test.columns)

    # Community-level residuals, kept with their Date/Hour keys.
    residual = y_test - comm_prediction
    residual_df = pd.concat([dataset_test[['Date','Hour']], residual], axis=1)

    # Zone prediction = prediction of the zone's community * zone share,
    # computed for all zones in one vectorised step.
    edge_prediction_df = pd.DataFrame(
        comm_prediction_df[zone_community].values * zone_share,
        columns=zone_cols)

    edge_r2 = r2_score(edge_testData.values, edge_prediction_df.values, multioutput='variance_weighted')
    print("Edge R2: ",edge_r2)

    CommR2List.append(test_r2)
    EdgeR2List.append(edge_r2)
    residualDf_list.append(residual_df)


month:  1
Train Size:  (8016, 338)
Test Size:  (732, 338)
edge test data shape:  (732, 257)
Train R2:  0.9931886858605989
Test R2:  0.856861852019376
Edge R2:  0.530348269869422

month:  2
Train Size:  (8076, 338)
Test Size:  (672, 338)
edge test data shape:  (672, 257)
Train R2:  0.9931215433189904
Test R2:  0.8815216894306375
Edge R2:  0.5539930755214771

month:  3
Train Size:  (8004, 338)
Test Size:  (744, 338)
edge test data shape:  (744, 257)
Train R2:  0.9932071855590934
Test R2:  0.8725161873980556
Edge R2:  0.5367241364700431

month:  4
Train Size:  (8028, 338)
Test Size:  (720, 338)
edge test data shape:  (720, 257)
Train R2:  0.9930775907293694
Test R2:  0.9088344164884865
Edge R2:  0.562598728288345

month:  5
Train Size:  (8004, 338)
Test Size:  (744, 338)
edge test data shape:  (744, 257)
Train R2:  0.9930439707355283
Test R2:  0.9135562177668293
Edge R2:  0.5719174223298332

month:  6
Train Size:  (8028, 338)
Test Size:  (720, 338)
edge test data shape:  (720, 257)
Train

In [35]:
# Average test R2 over the held-out months: community level first, then zone level.
print(np.mean(CommR2List))
print(np.mean(EdgeR2List))

0.8840373479973257
0.5391601780164489


In [36]:
# Stack the per-month residual frames into one table covering every held-out row.
res_df = pd.concat(residualDf_list, axis = 0)
print(res_df.shape)
res_df.head()

(8748, 26)


Unnamed: 0,Date,Hour,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,...,4.0,4.1,4.2,4.3,4.4,4.5,5.0,5.1,5.2,5.3
12,2018-01-01,12,4.515556,-22.593333,-31.028889,-0.473333,-1.528889,1.524444,2.096667,-0.67,...,0.0,4.465556,5.978889,0.398889,-0.07,-0.061111,-0.02,-0.053333,-0.052222,-0.027778
13,2018-01-01,13,16.723333,-7.936667,-12.345556,6.096667,2.012222,-0.826667,-1.426667,1.153333,...,0.0,0.336667,4.998889,1.201111,-0.077778,2.794444,0.95,0.957778,-0.048889,-0.056667
14,2018-01-01,14,-44.23,-12.697778,-32.47,1.956667,5.143333,0.032222,5.632222,0.907778,...,-0.004444,0.304444,-3.17,-0.827778,-0.043333,4.405556,-0.045556,-0.073333,-0.025556,-0.024444
15,2018-01-01,15,28.075556,5.295556,2.993333,1.054444,10.474444,4.497778,-3.603333,-0.315556,...,0.0,0.232222,-0.374444,-1.125556,-0.073333,-3.023333,-0.058889,-0.066667,-0.044444,-0.038889
16,2018-01-01,16,42.561111,-7.11,12.101111,2.68,19.763333,1.393333,15.561111,-1.242222,...,-0.002222,1.16,2.711111,4.734444,-0.09,3.704444,-0.083333,-0.065556,-0.046667,-0.071111


In [41]:
# Persist the community-level residuals; the row index is written as the first CSV column.
# NOTE(review): absolute, machine-specific output path with the hub name hard-coded ('penn') instead of derived from `hub`.
res_df.to_csv('/home/urwa/Documents/Projects/NYU Remote/project/data/residuals/penn_comm24.csv')

In [40]:
# Displays the residual table.
# NOTE(review): this cell's execution count (40) precedes the save cell above (41) — the cells were run out of order.
res_df

Unnamed: 0,Date,Hour,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,...,4.0,4.1,4.2,4.3,4.4,4.5,5.0,5.1,5.2,5.3
12,2018-01-01,12,4.515556,-22.593333,-31.028889,-0.473333,-1.528889,1.524444,2.096667,-0.670000,...,0.000000,4.465556,5.978889,0.398889,-0.070000,-0.061111,-0.020000,-0.053333,-0.052222,-0.027778
13,2018-01-01,13,16.723333,-7.936667,-12.345556,6.096667,2.012222,-0.826667,-1.426667,1.153333,...,0.000000,0.336667,4.998889,1.201111,-0.077778,2.794444,0.950000,0.957778,-0.048889,-0.056667
14,2018-01-01,14,-44.230000,-12.697778,-32.470000,1.956667,5.143333,0.032222,5.632222,0.907778,...,-0.004444,0.304444,-3.170000,-0.827778,-0.043333,4.405556,-0.045556,-0.073333,-0.025556,-0.024444
15,2018-01-01,15,28.075556,5.295556,2.993333,1.054444,10.474444,4.497778,-3.603333,-0.315556,...,0.000000,0.232222,-0.374444,-1.125556,-0.073333,-3.023333,-0.058889,-0.066667,-0.044444,-0.038889
16,2018-01-01,16,42.561111,-7.110000,12.101111,2.680000,19.763333,1.393333,15.561111,-1.242222,...,-0.002222,1.160000,2.711111,4.734444,-0.090000,3.704444,-0.083333,-0.065556,-0.046667,-0.071111
17,2018-01-01,17,19.647778,-12.154444,35.284444,10.665556,8.285556,1.884444,16.446667,0.547778,...,0.000000,0.987778,1.364444,2.362222,-0.137778,6.230000,-0.123333,-0.095556,-0.037778,-0.054444
18,2018-01-01,18,47.233333,5.232222,4.335556,10.491111,11.147778,-0.470000,3.902222,1.196667,...,0.000000,-2.057778,-0.336667,-1.005556,-0.102222,0.272222,-0.062222,-0.051111,-0.037778,-0.061111
19,2018-01-01,19,-32.007778,-11.315556,-52.508889,6.945556,6.968889,4.086667,11.167778,-1.835556,...,0.000000,-1.118889,1.626667,-1.267778,-0.115556,-3.764444,-0.102222,-0.074444,-0.070000,-0.086667
20,2018-01-01,20,-35.608889,-21.566667,-85.345556,-1.146667,-12.426667,-3.586667,-13.396667,-1.300000,...,0.000000,-0.488889,-5.718889,-0.592222,-0.117778,-0.671111,-0.134444,-0.107778,-0.067778,-0.074444
21,2018-01-01,21,34.491111,-7.993333,43.637778,9.685556,8.442222,1.060000,2.524444,0.292222,...,0.000000,-1.775556,-2.384444,-3.088889,-0.127778,1.388889,-0.135556,-0.108889,-0.075556,0.886667


In [38]:
# `residual_df` is the loop variable left over from the last iteration, i.e. the residuals of month 12 only.
residual_df.head()

Unnamed: 0,Date,Hour,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,...,4.0,4.1,4.2,4.3,4.4,4.5,5.0,5.1,5.2,5.3
8016,2018-12-01,0,10.165556,11.772222,19.046667,-3.68,1.955556,5.044444,-2.626667,-1.39,...,0.0,-1.518889,-3.567778,-1.353333,-0.406667,0.121111,-0.26,-0.36,-0.188889,1.846667
8017,2018-12-01,1,-5.981111,-7.912222,3.627778,7.007778,2.296667,0.323333,3.567778,-0.2,...,0.0,0.39,-2.524444,-1.855556,-0.324444,1.867778,-0.145556,0.808889,-0.124444,-0.097778
8018,2018-12-01,2,-22.36,-0.551111,-3.152222,-1.496667,-0.624444,-0.017778,1.554444,-1.792222,...,0.0,4.902222,0.1,2.435556,-0.13,-2.346667,-0.072222,-0.137778,-0.057778,-0.125556
8019,2018-12-01,3,-2.871111,-1.966667,-11.968889,-3.057778,4.374444,-2.102222,-1.741111,-1.888889,...,0.0,-1.606667,0.881111,-2.702222,-0.112222,5.576667,-0.09,-0.1,-0.058889,-0.034444
8020,2018-12-01,4,5.996667,0.836667,-3.407778,-2.392222,-1.977778,0.104444,3.241111,0.022222,...,0.0,-0.53,-2.804444,0.642222,-0.064444,2.966667,-0.09,-0.045556,-0.046667,-0.023333
