In [3]:
%cd ..

/home/urwa/Documents/side_projects/urban/urban_traffic_prediciton


In [1]:
import pandas as pd
from sklearn.decomposition import PCA
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
# from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.multioutput import MultiOutputRegressor

In [8]:
import torch
from pt_forest.random_forest import TorchRandomForestClassifier
from pt_forest.random_forest import TorchRandomForestRegressor

In [7]:
# Import of the model
# from Sklearn_PyTorch import TorchRandomForestClassifier


# Initialisation of the model
my_model = TorchRandomForestClassifier(nb_trees=100, nb_samples=3, max_depth=5, bootstrap=True)

# Definition of the input data

my_data = torch.FloatTensor([[0,1,2.4],[1,2,1],[4,2,0.2],[8,3,0.4], [4,1,0.4]])
my_label = torch.LongTensor([0,1,0,0,1])

# Fitting function
my_model.fit(my_data, my_label)

# Prediction function
my_vector = torch.FloatTensor([1,2,1.4])
my_result = my_model.predict(my_vector)

### Helper Functions

In [9]:
def loadData(file):
    """Read the hourly vehicle-count CSV and normalise its key columns.

    Parameters
    ----------
    file : str or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The parsed table, with ``Date`` converted through ``pd.to_datetime``
        and ``DOLocationID`` cast to ``str``.

    Side effects
    ------------
    Prints the raw shape and the number of distinct ``Date`` values.
    """
    frame = pd.read_csv(file)
    print('Raw shape: ', frame.shape)

    # Zone ids are kept as strings; dates become proper timestamps.
    frame['Date'] = pd.to_datetime(frame.Date)
    frame['DOLocationID'] = frame['DOLocationID'].astype(str)

    distinct_days = set(frame.Date)
    print('Days: ', len(distinct_days))
    return frame

In [10]:
def getTimeSeries(df):
    """Pivot long-format counts into a wide table with one column per zone.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Date``, ``Hour``, ``DOLocationID`` and
        ``vehicle_count``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(Date, Hour)``, one column per ``DOLocationID`` value,
        holding the summed ``vehicle_count``. Combinations absent from ``df``
        are filled with 0.
    """
    # Use the string alias rather than np.sum: recent pandas emits a
    # FutureWarning when a NumPy callable is passed as aggfunc, and the
    # alias selects the same grouped sum.
    return pd.pivot_table(
        df,
        values='vehicle_count',
        index=['Date', 'Hour'],
        columns=['DOLocationID'],
        aggfunc='sum',
        fill_value=0,
    )

In [11]:
def zscoreNormalizeSpatial(matrix):
    """Z-score every row of ``matrix`` independently.

    Each entry along the first axis is shifted by its own mean and divided by
    its own standard deviation (plus ``1e-10`` to avoid division by zero for
    constant rows).

    Parameters
    ----------
    matrix : array-like with at least 2 dimensions
        Input values; not modified.

    Returns
    -------
    numpy.ndarray
        Normalised copy with the same shape. Integer and boolean inputs are
        promoted to float so the z-scores are not truncated.

    Raises
    ------
    ValueError
        If ``matrix`` has fewer than 2 dimensions.
    """
    m = np.array(matrix)  # np.array copies, so the caller's data is untouched
    if m.ndim < 2:
        raise ValueError('matrix must have at least 2 dimensions')

    # Writing float z-scores into an integer array would silently truncate
    # them, so promote non-floating inputs first.
    if not np.issubdtype(m.dtype, np.inexact):
        m = m.astype(float)

    # Reduce over everything except the first axis: one mean/std per row.
    row_axes = tuple(range(1, m.ndim))
    row_mean = m.mean(axis=row_axes, keepdims=True)
    row_std = m.std(axis=row_axes, keepdims=True)
    return (m - row_mean) / (row_std + 1e-10)

In [12]:
def standardize(matrix):
    """Fit a ``StandardScaler`` on ``matrix`` and return it with the scaled data.

    Parameters
    ----------
    matrix : object exposing ``.copy()``
        Data to scale; the scaler works on a copy.

    Returns
    -------
    tuple
        ``(scaler, transformed)``: the fitted ``StandardScaler`` and the
        result of applying it to the copied data.
    """
    values = matrix.copy()
    fitted = StandardScaler().fit(values)
    return fitted, fitted.transform(values)

In [13]:
def inverse_standardize(matrix, scaler):
    """Undo a scaling by delegating to ``scaler.inverse_transform``.

    Parameters
    ----------
    matrix : object exposing ``.copy()``
        Scaled values; a copy is handed to the scaler.
    scaler : object exposing ``inverse_transform``
        The scaler to invert, e.g. the one returned by ``standardize``.

    Returns
    -------
    Whatever ``scaler.inverse_transform`` returns for the copied input.
    """
    return scaler.inverse_transform(matrix.copy())

In [14]:
def addLag(dataset, maxlag, lagColumns):
    """Append lagged copies of selected columns to ``dataset``.

    For every lag ``k`` in ``1..maxlag`` the columns in ``lagColumns`` are
    shifted down by ``k`` rows and added as ``<column>_lag_<k>``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Source frame, assumed to be ordered in time; not modified.
    maxlag : int
        Largest lag to generate; values below 1 add no lag columns.
    lagColumns : list
        Labels of the columns to lag. Non-string labels are accepted and are
        converted to strings when the suffix is added.

    Returns
    -------
    pandas.DataFrame
        ``dataset`` plus the lag columns, with every row that contains a
        missing value dropped. This removes the first ``maxlag`` rows and also
        any row that already held a NaN in ``dataset``.
    """
    # Select the lag columns once instead of shifting the whole frame for
    # every lag and throwing most of it away.
    lag_source = dataset[lagColumns]

    frames = [dataset]
    for lag in range(1, maxlag + 1):
        # add_suffix formats each label, so integer column labels work too.
        frames.append(lag_source.shift(lag).add_suffix('_lag_' + str(lag)))

    return pd.concat(frames, axis=1).dropna()

In [15]:
def get_rmse(matrix1, matrix2):
    """Root-mean-square error between two equally shaped inputs.

    The squared element-wise difference is averaged with ``np.mean`` and the
    square root of that average is returned. The averaging is delegated to
    ``np.mean``, so the shape of the result (scalar or per-column) follows
    whatever ``np.mean`` yields for the input type.
    """
    squared_residuals = np.power(matrix1 - matrix2, 2)
    return np.power(np.mean(squared_residuals), 0.5)

In [16]:
def get_weights(rawdata, zontoBorough):
    """Compute each zone's share of its borough's total vehicle count.

    Parameters
    ----------
    rawdata : pandas.DataFrame
        Must contain ``DOLocationID`` and ``vehicle_count``. It is left
        unmodified; no ``Borough`` column is added to the caller's frame.
    zontoBorough : mapping
        Maps every ``DOLocationID`` value to its borough. A missing key
        raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Borough``, ``DOLocationID`` and ``zone_weight``, where
        ``zone_weight`` is the zone total divided by the borough total.
    """
    # Borough labels are built as a standalone Series and attached to a new
    # frame, so the input DataFrame is not mutated as a hidden side effect.
    borough_of_row = rawdata['DOLocationID'].apply(lambda x: zontoBorough[x])
    borough_df = (
        rawdata[['vehicle_count']]
        .assign(Borough=borough_of_row)
        .groupby(by='Borough')
        .sum()
        .reset_index()
    )

    zone_df = (
        rawdata[['vehicle_count', 'DOLocationID']]
        .groupby(by='DOLocationID')
        .sum()
        .reset_index()
    )
    zone_df['Borough'] = zone_df['DOLocationID'].apply(lambda x: zontoBorough[x])

    # After the merge, vehicle_count_x is the borough total and
    # vehicle_count_y is the zone total.
    merged = pd.merge(borough_df, zone_df, on=['Borough'], how='inner')
    merged['zone_weight'] = merged.vehicle_count_y / merged.vehicle_count_x

    return merged[['Borough', 'DOLocationID', 'zone_weight']]

### Load Raw Data

In [17]:
hub = 'Jfk'
tune_hyp_params = False

In [18]:
dataDir = '/home/urwa/Documents/side_projects/urban/data/processedData/'
file = dataDir + hub + 'VehiceByHour.csv'

In [19]:
rawdata = loadData(file)

Raw shape:  (2260080, 4)
Days:  365


In [20]:
rawdata.head(2)

Unnamed: 0,DOLocationID,Date,Hour,vehicle_count
0,1,2018-01-01,0,1.0
1,2,2018-01-01,0,0.0


In [21]:
edge_data = getTimeSeries(rawdata)
edge_data = edge_data.reset_index()
edge_data.head(3)

DOLocationID,Date,Hour,1,10,100,101,102,106,107,108,...,90,91,92,93,94,95,96,97,98,99
0,2018-01-01,0,1,7,0,0,0,1,2,1,...,3,1,1,0,1,6,0,1,0,0
1,2018-01-01,1,0,4,0,1,0,1,4,0,...,3,4,1,0,0,2,0,0,0,0
2,2018-01-01,2,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Merge External Data Features

In [22]:
externalDataDir = "/home/urwa/Documents/side_projects/urban/data/HongData/"
extFile = externalDataDir + hub.upper() + ".csv"

In [23]:
extDf = pd.read_csv(extFile)
print(extDf.shape)
extDf.head(2)

(8760, 46)


Unnamed: 0,date,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,ifthu,iffri,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,18/1/1 0:00,6,263,174,437,1,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
1,18/1/1 1:00,6,138,133,271,1,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0


In [24]:
extDf['date'] = pd.to_datetime(extDf['date'], yearfirst=True)
extDf.head(2)

Unnamed: 0,date,arrival,fhv,yellow,vehicle,ifmon,iftue,ifwed,ifthu,iffri,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01 00:00:00,6,263,174,437,1,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
1,2018-01-01 01:00:00,6,138,133,271,1,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0


In [25]:
min(extDf.date), max(extDf.date)

(Timestamp('2018-01-01 00:00:00'), Timestamp('2018-12-31 23:00:00'))

In [26]:
extDf['Hour'] = extDf['date'].dt.hour
extDf['Dow'] = extDf['date'].dt.dayofweek
extDf['Date'] = extDf['date'].dt.date

In [27]:
extDf.columns

Index(['date', 'arrival', 'fhv', 'yellow', 'vehicle', 'ifmon', 'iftue',
       'ifwed', 'ifthu', 'iffri', 'ifsat', 'ifsun', 'if0', 'if1', 'if2', 'if3',
       'if4', 'if5', 'if6', 'if7', 'if8', 'if9', 'if10', 'if11', 'if12',
       'if13', 'if14', 'if15', 'if16', 'if17', 'if18', 'if19', 'if20', 'if21',
       'if22', 'if23', 'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow', 'Hour', 'Dow',
       'Date'],
      dtype='object')

In [28]:
selected_columns = ['Date', 'Hour', 'Dow', 'arrival','maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow']

In [29]:
# Keep only the merge keys and external features. The explicit copy makes
# extDf an independent frame, so later column assignments on it do not
# trigger pandas' SettingWithCopyWarning.
extDf = extDf[selected_columns].copy()

In [30]:
print(edge_data.shape)
print(extDf.shape)

(8760, 260)
(8760, 14)


In [31]:
edge_data['Date'] = pd.to_datetime(edge_data['Date'])
extDf['Date'] = pd.to_datetime(extDf['Date'])

In [32]:
edge_data = pd.merge(edge_data,extDf, on=['Date', 'Hour'], how='inner')
print(edge_data.shape)
edge_data['Date'] = edge_data['Date'].dt.date
edge_data.head()

(8760, 272)


Unnamed: 0,Date,Hour,1,10,100,101,102,106,107,108,...,maxtemp,mintemp,avgtemp,departure,hdd,cdd,participation,newsnow,snowdepth,ifSnow
0,2018-01-01,0,1,7,0,0,0,1,2,1,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
1,2018-01-01,1,0,4,0,1,0,1,4,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
2,2018-01-01,2,0,1,0,0,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
3,2018-01-01,3,0,0,0,0,0,0,0,0,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0
4,2018-01-01,4,0,3,0,0,1,0,0,1,...,18,7,12.5,-21.2,52,0,0.0,0.0,0,0


### Train/Test Split

In [33]:
sep = int(0.75*len(edge_data))
sep

6570

In [34]:
trainData = edge_data[:sep]
testData = edge_data[sep:]

In [35]:
trainData.shape

(6570, 272)

In [36]:
testData.shape

(2190, 272)

### Lag Variables

In [37]:
trainData.columns

Index(['Date', 'Hour', '1', '10', '100', '101', '102', '106', '107', '108',
       ...
       'maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd', 'cdd',
       'participation', 'newsnow', 'snowdepth', 'ifSnow'],
      dtype='object', length=272)

In [38]:
# Column groups used to build the feature matrix and the targets.
DateColumns = ['Date']

ext_columns = [
    'Dow', 'arrival', 'maxtemp', 'mintemp', 'avgtemp', 'departure',
    'hdd', 'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow',
]

# Every column that is not a date, the hour or an external feature is
# treated as a prediction target, in the frame's column order.
targetColumns = [
    c for c in trainData.columns
    if c not in ext_columns + DateColumns + ['Hour']
]

# Lags are generated for all targets and for 'arrival'.
lagColumns = targetColumns + ['arrival']

In [39]:
len(targetColumns)

258

In [40]:
maxlag = 2

dataset_train = addLag(trainData, maxlag, lagColumns)

dataset_train.shape

(6568, 790)

In [41]:
dataset_test = addLag(testData, maxlag, lagColumns)
dataset_test.shape

(2188, 790)

### Modelling

In [42]:
X_train = dataset_train.drop(targetColumns+DateColumns , axis = 1)
X_test = dataset_test.drop(targetColumns+DateColumns , axis = 1)
y_train = dataset_train[targetColumns]
y_test = dataset_test[targetColumns]

In [43]:
X_train.shape, X_test.shape

((6568, 531), (2188, 531))

In [44]:
y_train.shape, y_test.shape

((6568, 258), (2188, 258))

### Hyperparameter Tuning

In [45]:
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 150, stop = 300, num = 3)]
# # Number of features to consider at every split
# max_features = ['sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(50, 110, num = 5)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2,3,4]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [2,3]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}
# print(random_grid)

In [46]:
# if tune_hyp_params:
#     rf = RandomForestRegressor()
#     rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 5, \
#                                    cv = 5, verbose=2, random_state=42, n_jobs = -1)
#     rf_random.fit(X_train, y_train)
#     print(rf_random.best_params_)

### Training the Best Model

In [48]:
# Build the pt_forest regressor. The values appear to mirror the commented-out
# scikit-learn configuration kept in this notebook (150 trees, no bootstrap).
# NOTE(review): nb_samples=3 equals the old min_samples_split=3 - confirm that
# nb_samples has that meaning in pt_forest and is not a per-tree sample size.
# NOTE(review): max_depth=-1 presumably means "no depth limit" - confirm.
rf2 = TorchRandomForestRegressor( nb_trees=150, 
                           nb_samples=3,
                           max_depth=-1, 
                           bootstrap= False)

In [49]:
# rf2 = RandomForestRegressor(random_state = 2019, n_estimators=150, 
#                            min_samples_split=3,
#                            min_samples_leaf= 2, 
#                            max_features= 'sqrt',
#                            max_depth= None, 
#                            bootstrap= False)

In [53]:
x_train_tensor = torch.FloatTensor(X_train.values)
y_train_tensor = torch.FloatTensor(y_train.values)

In [55]:
# Fit the forest on the tensor versions of the training features and targets.
# NOTE(review): the stored output of this cell is a KeyboardInterrupt, so the
# saved notebook does not contain a completed fit of this model.
rf2.fit(x_train_tensor,y_train_tensor)

KeyboardInterrupt: 

In [42]:
# Training-set score of the fitted model.
# NOTE(review): rf2 is fitted on torch tensors, but this call passes pandas
# objects and relies on a scikit-learn-style score(); confirm that the
# pt_forest regressor supports this, otherwise the stored output was produced
# by a different rf2 and will not reproduce on Restart & Run All.
rf2.score(X_train,y_train)



0.8996653036132534

In [43]:
rf2.score(X_test,y_test)



0.5486398644609263

### Predict

In [44]:
# Predict all target columns for the held-out rows and show the result shape.
# NOTE(review): the classifier demo in this notebook calls predict() on a
# single tensor vector; confirm that the pt_forest regressor accepts a whole
# DataFrame here, otherwise this output came from a different rf2.
prediction = rf2.predict(X_test)
prediction.shape

(2188, 258)

### Evaluate

In [45]:
get_rmse(y_test, prediction)

1      1.124957
10     3.392780
100    3.310208
101    0.968098
102    0.910948
         ...   
95     2.714594
96     0.341477
97     2.218022
98     1.052829
99     0.067556
Length: 258, dtype: float64

In [46]:
r2_score(y_test, prediction, multioutput='variance_weighted')

0.5486398644609263

In [47]:
y_test.values.mean()

2.3208320933066906

In [48]:
y_test.values.std()

4.157138343329969

In [49]:
r2_score(y_test, prediction)

0.23791552311060307