In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor

### Helper Functions

In [2]:
def loadData(file):
    """Read a CSV file and parse its ``Date`` column as datetimes.

    Prints the shape of the loaded frame and the number of distinct dates
    as a quick sanity check.

    Parameters
    ----------
    file : str or file-like
        Anything accepted by ``pandas.read_csv``. The CSV must contain a
        ``Date`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``Date`` converted by ``pd.to_datetime``.
    """
    frame = pd.read_csv(file)
    print('Raw shape: ', frame.shape)

    frame['Date'] = pd.to_datetime(frame['Date'])
    distinct_days = set(frame['Date'])
    print('Days: ', len(distinct_days))

    return frame

In [3]:
def getTimeSeries(df):
    """Pivot long-format counts into a wide (Date, Hour) x DOLocationID table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Date``, ``Hour``, ``DOLocationID`` and
        ``vehicle_count``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (``Date``, ``Hour``) with one column per ``DOLocationID``.
        Each cell holds the summed ``vehicle_count``; combinations that do
        not occur in ``df`` are filled with 0.
    """
    # The string alias 'sum' selects the same aggregation as np.sum but
    # avoids the FutureWarning recent pandas versions raise when a NumPy
    # callable is passed as aggfunc.
    return pd.pivot_table(df, values='vehicle_count', index=['Date', 'Hour'],
                          columns=['DOLocationID'], aggfunc='sum', fill_value=0)

In [4]:
def zscoreNormalizeSpatial(matrix):
    """Z-score normalize every row of a 2-D array independently.

    Each row has its own mean subtracted and is divided by its own
    (population) standard deviation. A small constant (1e-10) is added to
    the standard deviation so constant rows yield zeros instead of a
    division by zero.

    Parameters
    ----------
    matrix : array-like, 2-D
        Input values. The input is never modified.

    Returns
    -------
    numpy.ndarray
        A new floating-point array with the same shape as ``matrix``.
    """
    m = np.asarray(matrix)
    # Integer input must be promoted to float first; otherwise the
    # normalized values would be truncated back to integers.
    if not np.issubdtype(m.dtype, np.floating):
        m = m.astype(float)

    row_mean = m.mean(axis=1, keepdims=True)
    row_std = m.std(axis=1, keepdims=True)
    return (m - row_mean) / (row_std + 1e-10)

In [5]:
def standardize(matrix):
    """Fit a ``StandardScaler`` on ``matrix`` and return it with the scaled data.

    Parameters
    ----------
    matrix : object with a ``copy()`` method that ``StandardScaler`` accepts
        The scaler is fitted on a copy of this input.

    Returns
    -------
    tuple
        ``(scaler, transformed)`` where ``scaler`` is the fitted
        ``StandardScaler`` and ``transformed`` is the scaled copy of
        ``matrix``.
    """
    values = matrix.copy()
    scaler = StandardScaler()
    scaled = scaler.fit_transform(values)
    return scaler, scaled

In [6]:
def inverse_standardize(matrix, scaler):
    """Undo a scaling by calling ``scaler.inverse_transform`` on a copy.

    Parameters
    ----------
    matrix : object with a ``copy()`` method
        Scaled values; a copy is handed to the scaler.
    scaler : object exposing ``inverse_transform``
        Typically the scaler returned by ``standardize``.

    Returns
    -------
    Whatever ``scaler.inverse_transform`` returns for the copied input.
    """
    return scaler.inverse_transform(matrix.copy())

In [7]:
def addLag(dataset, maxlag, lagColumns):
    """Append lagged copies of selected columns to ``dataset``.

    For every lag ``k`` in ``1..maxlag`` and every column ``c`` in
    ``lagColumns``, a column named ``c + '_lag_' + str(k)`` is added that
    holds the values of ``c`` shifted down by ``k`` rows. Rows that contain
    any missing value after the concatenation are dropped.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source frame; it is not modified.
    maxlag : int
        Largest lag to generate.
    lagColumns : list of str
        Names of the columns to lag.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` plus the lag columns, without rows containing NaN.
    """
    def _lagged(lag):
        # Shift the whole frame, then keep only the columns being lagged.
        shifted = dataset.shift(lag)[lagColumns]
        shifted.columns = [name + '_lag_' + str(lag) for name in shifted.columns]
        return shifted

    frames = [dataset] + [_lagged(lag) for lag in range(1, maxlag + 1)]
    return pd.concat(frames, axis=1).dropna()

In [8]:
def get_rmse(matrix1, matrix2):
    """Root-mean-square error between two equally shaped inputs.

    The squared element-wise difference is averaged with ``np.mean`` and the
    square root of that mean is returned. For NumPy arrays the mean is taken
    over all elements, giving a single scalar; for other input types the
    result follows whatever ``np.mean`` returns for them.

    Parameters
    ----------
    matrix1, matrix2 : array-like supporting subtraction
        Values to compare.

    Returns
    -------
    The square root of the mean squared difference.
    """
    squared_error = np.power(matrix1 - matrix2, 2)
    return np.power(np.mean(squared_error), 0.5)

In [9]:
def get_weights(rawdata, zontoBorough):
    """Compute each zone's share of the total count of its group ("borough").

    Parameters
    ----------
    rawdata : pandas.DataFrame
        Must contain ``DOLocationID`` and ``vehicle_count``. The frame is
        left untouched (no ``Borough`` column is added to it).
    zontoBorough : mapping
        Maps every ``DOLocationID`` to its group label. A zone missing from
        the mapping raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Borough``, ``DOLocationID`` and ``zone_weight``, where
        ``zone_weight`` is the zone's summed ``vehicle_count`` divided by the
        summed ``vehicle_count`` of its group. A group whose total is zero
        yields NaN weights (0 / 0).
    """
    # Work on a narrow private copy so the caller's frame is not mutated.
    counts = rawdata[['vehicle_count', 'DOLocationID']].copy()
    counts['Borough'] = counts['DOLocationID'].apply(lambda x: zontoBorough[x])

    borough_df = counts[['vehicle_count', 'Borough']].groupby(by='Borough').sum().reset_index()

    zone_df = counts[['vehicle_count', 'DOLocationID']].groupby(by='DOLocationID').sum().reset_index()
    zone_df['Borough'] = zone_df['DOLocationID'].apply(lambda x: zontoBorough[x])

    # After the merge, vehicle_count_x is the group total (left frame) and
    # vehicle_count_y is the zone total (right frame).
    zone_df = pd.merge(borough_df, zone_df, on=['Borough'], how='inner')
    zone_df['zone_weight'] = zone_df.vehicle_count_y / zone_df.vehicle_count_x

    return zone_df[['Borough', 'DOLocationID', 'zone_weight']]

### Preparing Data

In [10]:
hub = 'Penn'
tune_hyp_params = False

In [11]:
# Directory holding the processed per-hub CSV files.
# NOTE(review): this is an absolute, machine-specific path; the notebook
# will not run elsewhere without editing it.
dataDir = '/home/urwa/Documents/Projects/NYU Remote/project/data/processedData/'
# Input file for the selected hub, e.g. '<dataDir>PennVehiceByHour.csv'.
# NOTE(review): 'Vehice' looks like a misspelling of 'Vehicle'; presumably it
# matches the file name on disk — confirm before changing it.
file = dataDir + hub + 'VehiceByHour.csv'

In [12]:
rawdata = loadData(file)

Raw shape:  (2251320, 4)
Days:  365


In [13]:
edge_data = getTimeSeries(rawdata)
edge_data = edge_data.reset_index()
edge_data.head(3)

DOLocationID,Date,Hour,1,2,3,4,5,6,7,8,...,254,255,256,257,258,259,260,261,262,263
0,2018-01-01,0,0,0,0,4,0,0,1,0,...,0,3,1,0,1,0,0,2,7,3
1,2018-01-01,1,0,0,0,8,0,0,6,0,...,0,13,4,3,0,0,1,3,5,13
2,2018-01-01,2,1,0,0,7,0,0,8,0,...,0,4,2,1,0,0,1,4,4,10


### Community Level Aggregation

In [14]:
zones = pd.read_csv('/home/urwa/Documents/Projects/NYU Remote/project/UrbanTemporalNetworks/Data/ZonetoComm.csv')
zones.head(2)

Unnamed: 0,start_id,start_community
0,1,0.0
1,2,4.2


In [15]:
zones['start_community'] = zones.start_community.astype(str)

zontoComm = dict(zip(zones.start_id.values,zones.start_community.values))

In [16]:
comm_data = rawdata.copy(deep=True)
comm_data['DOLocationID'] = comm_data['DOLocationID'].apply(lambda x:zontoComm[x])
comm_data.head(2)

Unnamed: 0,DOLocationID,Date,Hour,vehicle_count
0,0.0,2018-01-01,0,0.0
1,4.2,2018-01-01,0,0.0


In [17]:
comm_data = getTimeSeries(comm_data)
comm_data = comm_data.reset_index()
comm_data.head(2)

DOLocationID,Date,Hour,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,...,4.0,4.1,4.2,4.3,4.4,4.5,5.0,5.1,5.2,5.3
0,2018-01-01,0,119,62,116,4,18,3,14,5,...,0,2,4,2,0,8,0,0,0,0
1,2018-01-01,1,162,60,244,8,29,10,47,1,...,0,4,2,7,0,12,0,0,1,0


In [18]:
zone_weights = get_weights(rawdata, zontoComm)
zone_weights.head(2)

Unnamed: 0,Borough,DOLocationID,zone_weight
0,0.0,1,0.01463
1,0.0,48,0.097804


### Merge External Data Features

In [19]:
externalDataDir = "/home/urwa/Documents/Projects/NYU Remote/project/data/HongData/"
extFile = externalDataDir + hub.upper() + ".csv"

In [20]:
extDf = pd.read_csv(extFile)
print(extDf.shape)
extDf.head(2)

(8760, 46)


Unnamed: 0,date,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,ifthu,iffri,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,18/1/1 0:00,0,147,319,466,1,0,0,0,0,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0
1,18/1/1 1:00,1,347,397,744,1,0,0,0,0,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0


In [21]:
extDf['date'] = pd.to_datetime(extDf['date'], yearfirst=True)
extDf.head(2)

Unnamed: 0,date,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,ifthu,iffri,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01 00:00:00,0,147,319,466,1,0,0,0,0,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0
1,2018-01-01 01:00:00,1,347,397,744,1,0,0,0,0,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0


In [22]:
min(extDf.date), max(extDf.date)

(Timestamp('2018-01-01 00:00:00'), Timestamp('2018-12-31 23:00:00'))

In [23]:
extDf['Hour'] = extDf['date'].dt.hour
extDf['Dow'] = extDf['date'].dt.dayofweek
extDf['Date'] = extDf['date'].dt.date

In [24]:
extDf.columns

Index(['date', 'arrival', 'fhv', 'yellow', 'vehicle', 'ifmon', 'iftue',
       'ifwed', 'ifthu', 'iffri', 'ifsat', 'ifsun', 'if0', 'if1', 'if2', 'if3',
       'if4', 'if5', 'if6', 'if7', 'if8', 'if9', 'if10', 'if11', 'if12',
       'if13', 'if14', 'if15', 'if16', 'if17', 'if18', 'if19', 'if20', 'if21',
       'if22', 'if23', 'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow', 'Hour', 'Dow',
       'Date'],
      dtype='object')

In [25]:
selected_columns = ['Date', 'Hour', 'Dow', 'arrival','maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow']

In [26]:
# Take an explicit copy: extDf has columns assigned to it later, and
# assigning into a column-subset slice triggers pandas'
# SettingWithCopyWarning (and may not behave as intended).
extDf = extDf[selected_columns].copy()

In [27]:
print(comm_data.shape)
print(extDf.shape)

(8760, 26)
(8760, 14)


In [28]:
comm_data['Date'] = pd.to_datetime(comm_data['Date'])
extDf['Date'] = pd.to_datetime(extDf['Date'])

In [29]:
comm_data = pd.merge(comm_data,extDf, on=['Date', 'Hour'], how='inner')
print(comm_data.shape)
comm_data['Date'] = comm_data['Date'].dt.date
comm_data.head()

(8760, 38)


Unnamed: 0,Date,Hour,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,119,62,116,4,18,3,14,5,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0
1,2018-01-01,1,162,60,244,8,29,10,47,1,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0
2,2018-01-01,2,178,65,174,12,23,7,55,4,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0
3,2018-01-01,3,134,37,126,15,16,8,35,3,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0
4,2018-01-01,4,82,22,71,6,11,3,21,3,...,19,7,13.0,-20.4,52,0,0.0,0.0,0,0


### Train/Test Split

In [30]:
sep = int(0.75*len(comm_data))
sep

6570

In [31]:
trainData = comm_data[:sep]
testData = comm_data[sep:]

In [32]:
trainData.shape

(6570, 38)

In [33]:
testData.shape

(2190, 38)

### Lag Variables

In [34]:
trainData.columns

Index(['Date', 'Hour', '0.0', '0.1', '0.2', '1.0', '1.1', '1.2', '1.3', '2.0',
       '2.1', '2.2', '2.3', '3.0', '3.1', '3.2', '4.0', '4.1', '4.2', '4.3',
       '4.4', '4.5', '5.0', '5.1', '5.2', '5.3', 'Dow', 'arrival', 'maxtemp',
       'mintemp', 'avgtemp', 'departure', 'hdd', 'cdd', 'participation',
       'newsnow', 'snowdepth', 'ifSnow'],
      dtype='object')

In [35]:
# Community-level series the model predicts (one column per community).
targetColumns = [
    '0.0', '0.1', '0.2',
    '1.0', '1.1', '1.2', '1.3',
    '2.0', '2.1', '2.2', '2.3',
    '3.0', '3.1', '3.2',
    '4.0', '4.1', '4.2', '4.3', '4.4', '4.5',
    '5.0', '5.1', '5.2', '5.3',
]

# Columns that receive lagged copies: every target plus 'arrival'.
lagColumns = targetColumns + ['arrival']

# Date column(s), kept separately from the numeric columns.
DateColumns = ['Date']

In [36]:
maxlag = 12

dataset_train = addLag(trainData, maxlag, lagColumns)

dataset_train.shape

(6558, 338)

In [37]:
dataset_test = addLag(testData, maxlag, lagColumns)
dataset_test.shape

(2178, 338)

### Modelling

In [38]:
X_train = dataset_train.drop(targetColumns+DateColumns , axis = 1)
X_test = dataset_test.drop(targetColumns+DateColumns , axis = 1)
y_train = dataset_train[targetColumns]
y_test = dataset_test[targetColumns]

In [39]:
X_train.shape, X_test.shape

((6558, 313), (2178, 313))

In [40]:
y_train.shape, y_test.shape

((6558, 24), (2178, 24))

### Hyperparameter Tuning

In [41]:
# Candidate values for each random-forest hyperparameter.

# Number of trees in the forest
n_estimators = list(map(int, np.linspace(start=150, stop=300, num=3)))
# Number of features considered at every split
max_features = ['sqrt']
# Maximum tree depth; None places no limit on depth
max_depth = list(map(int, np.linspace(50, 110, num=5))) + [None]
# Minimum number of samples required to split a node
min_samples_split = [2, 3, 4]
# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 3]
# Whether each tree is trained on a bootstrap sample
bootstrap = [True, False]

# Search space handed to the randomized search
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
print(random_grid)

{'n_estimators': [150, 225, 300], 'max_features': ['sqrt'], 'max_depth': [50, 65, 80, 95, 110, None], 'min_samples_split': [2, 3, 4], 'min_samples_leaf': [2, 3], 'bootstrap': [True, False]}


In [42]:
# Optional randomized hyperparameter search; skipped unless tune_hyp_params
# is set to True.
if tune_hyp_params:
    # Seed the base estimator too, so repeated searches give the same result
    # (the search's own random_state only fixes which candidates are drawn).
    rf = RandomForestRegressor(random_state=2019)
    # NOTE(review): cv=5 splits the rows into contiguous folds without regard
    # to time order; for time-ordered data a forward-chaining split may be
    # more appropriate — confirm.
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid, n_iter=5,
                                   cv=5, verbose=2, random_state=42, n_jobs=-1)
    rf_random.fit(X_train, y_train)
    # A bare expression inside an `if` block is not displayed by Jupyter,
    # so the best parameters are printed explicitly.
    print(rf_random.best_params_)

### Training the Best Model

In [43]:
rf2 = RandomForestRegressor(random_state = 2019, n_estimators=150, 
                           min_samples_split=3,
                           min_samples_leaf= 2, 
                           max_features= 'sqrt',
                           max_depth= None, 
                           bootstrap= False)

In [44]:
rf2.fit(X_train,y_train)

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=2, min_samples_split=3,
           min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
           oob_score=False, random_state=2019, verbose=0, warm_start=False)

In [45]:
rf2.score(X_train,y_train)

0.9933729298498395

In [46]:
rf2.score(X_test,y_test)

0.8678606842804438

### Predict

In [47]:
comm_prediction = rf2.predict(X_test)
comm_prediction.shape

(2178, 24)

### Evaluate

In [48]:
get_rmse(y_test, comm_prediction)

0.0    38.416679
0.1    16.343590
0.2    28.553248
1.0     2.610056
1.1     5.264041
1.2     2.308935
1.3     7.063376
2.0     1.830884
2.1     1.400305
2.2     3.166412
2.3     1.228400
3.0    13.696948
3.1     3.108930
3.2     8.806441
4.0     0.003858
4.1     1.592959
4.2     3.443168
4.3     2.057263
4.4     0.398831
4.5     3.776520
5.0     0.320071
5.1     0.358834
5.2     0.287723
5.3     0.304863
dtype: float64

In [49]:
r2_score(y_test, comm_prediction, multioutput='variance_weighted')

0.8678606842804438

In [50]:
y_test.mean()

0.0    218.755280
0.1     65.337925
0.2    131.567952
1.0      4.525253
1.1     11.123508
1.2      3.013774
1.3     12.388889
2.0      2.239669
2.1      1.560606
2.2      6.719927
2.3      1.229568
3.0     41.160698
3.1      6.921028
3.2     32.717172
4.0      0.000000
4.1      1.878329
4.2      7.618916
4.3      2.679063
4.4      0.138659
4.5      7.906336
5.0      0.098255
5.1      0.120753
5.2      0.077594
5.3      0.082185
dtype: float64

### Edge Level Evaluation

In [51]:
comm_prediction.shape

(2178, 24)

In [52]:
edge_prediction_df = pd.DataFrame(comm_prediction)
edge_prediction_df.columns = y_test.columns
edge_prediction_df.head(2)

Unnamed: 0,0.0,0.1,0.2,1.0,1.1,1.2,1.3,2.0,2.1,2.2,...,4.0,4.1,4.2,4.3,4.4,4.5,5.0,5.1,5.2,5.3
0,417.225556,94.556667,93.518889,1.227778,1.938889,0.432222,1.86,1.814444,0.477778,2.383333,...,0.0,0.484444,6.89,0.606667,0.022222,1.501111,0.016667,0.033333,0.0,0.0
1,455.847778,130.477778,133.895556,0.756667,2.546667,0.874444,1.711111,0.963333,0.404444,2.557778,...,0.0,0.418889,6.287778,0.323333,0.012222,1.484444,0.0,0.018889,0.018889,0.005556


In [53]:
# Split each community-level prediction across its zones using the
# precomputed zone weights.
boroughs = list(edge_prediction_df.columns)

# Collect the new zone columns first and attach them in a single concat;
# inserting hundreds of columns one at a time fragments the frame and makes
# pandas emit a PerformanceWarning.
zone_columns = {}
for bor in boroughs:
    print(bor)

    weight_df = zone_weights[zone_weights.Borough == bor]

    print(len(weight_df.DOLocationID))

    for b_zone, z_weight in zip(weight_df.DOLocationID.values, weight_df.zone_weight.values):
        zone_columns[b_zone] = edge_prediction_df[bor] * z_weight

# Result keeps the community columns and gains one column per zone.
edge_prediction_df = pd.concat([edge_prediction_df, pd.DataFrame(zone_columns)], axis=1)

0.0
14
0.1
4
0.2
21
1.0
15
1.1
17
1.2
20
1.3
10
2.0
14
2.1
19
2.2
8
2.3
10
3.0
6
3.1
4
3.2
8
4.0
1
4.1
18
4.2
23
4.3
12
4.4
5
4.5
9
5.0
4
5.1
8
5.2
4
5.3
3


In [54]:
select_cols = [c for c in edge_prediction_df.columns if c not in boroughs]
edge_prediction_df = edge_prediction_df[select_cols]
edge_prediction_df.shape

(2178, 257)

In [55]:
edge_testData = edge_data[sep+maxlag:]
select_cols = [c for c in edge_testData.columns if c not in ['Date','Hour']]
edge_testData = edge_testData[select_cols]
edge_testData.shape

(2178, 257)

In [56]:
edge_prediction_df = edge_prediction_df[edge_testData.columns]
edge_prediction_df.head(2)

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,254,255,256,257,258,259,260,261,262,263
0,6.104064,0.000223,0.027294,2.201208,0.003238,0.0,0.428844,0.002251,0.014687,0.130305,...,0.03603,0.298004,0.263211,0.056355,0.086535,0.047702,0.090044,2.66104,4.416922,6.195094
1,6.669112,0.000204,0.023105,3.151576,0.001835,0.00234,0.424083,0.002226,0.012699,0.118916,...,0.0305,0.27415,0.242142,0.074021,0.078972,0.04038,0.04799,3.809941,3.882032,5.444867


In [57]:
edge_testData.head(2)

DOLocationID,1,2,3,4,5,6,7,8,9,10,...,254,255,256,257,258,259,260,261,262,263
6582,4,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,2,0,2
6583,4,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,1,1


In [58]:
get_rmse(edge_testData.values, edge_prediction_df.values)

2.7783091026153666

In [59]:
r2_score(edge_testData.values, edge_prediction_df.values, multioutput='variance_weighted')

0.5252507832704036