In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
torch.manual_seed(2020)
np.random.seed(2020)

In [3]:
config = {'lr': 0.00034439316653688684,
 'layers': 3,
 'step_size': 11,
 'gamma': 0.761795969995615,
 'bptt': 19,
 'dropout': 0.1227497445640586}

In [4]:
config['lr'] = config['lr'] * config['gamma'] **4
config['lr']

0.00011598697153799081

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [6]:
dataset = pd.read_csv('/home/urwa/Documents/side_projects/urban/data/featureData/jfk.csv')

In [7]:
dataset.shape

(8757, 1049)

In [8]:
dataset.head(3)

Unnamed: 0,Date,Hour,1,10,100,101,102,106,107,108,...,91_lag_3,92_lag_3,93_lag_3,94_lag_3,95_lag_3,96_lag_3,97_lag_3,98_lag_3,99_lag_3,arrival_lag_3
0,2018-01-01,3,0,0,0,0,0,0,0,0,...,1.0,1.0,0.0,1.0,6.0,0.0,1.0,0.0,0.0,6.0
1,2018-01-01,4,0,3,0,0,1,0,0,1,...,4.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0
2,2018-01-01,5,0,4,0,0,1,2,3,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0


In [9]:
lag_columns = [c for c in dataset.columns if 'lag' in c]
len(lag_columns)

777

In [10]:
dataset = dataset[[c for c in dataset.columns if c not in lag_columns]]
dataset.shape

(8757, 272)

In [11]:
DateColumns = ['Date']

ext_columns = ['Dow', 'arrival','maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow']

targetColumns = [c for c in dataset.columns if c not in ext_columns and \
                c not in DateColumns and c not in lag_columns and c != 'Hour']
len(targetColumns)

258

In [12]:
features_cols = [c for c in dataset.columns if c not in targetColumns and c not in DateColumns]
len(features_cols)

13

In [13]:
class LSTM(nn.Module):
    def __init__(self, feat_size=1, hidden_layer_size=100, network_size=1, layers=1, communities=10, dropout=0, at_mat=None):
        super().__init__()
        
        # aggregation
        if at_mat != None:
            self.attachment_matrix = torch.nn.Parameter(at_mat)
            self.attachment_matrix.requires_grad = False
        else:
            self.attachment_matrix = torch.nn.Parameter(torch.randn(network_size,communities))
            self.attachment_matrix.requires_grad = True
        
        
        self.hidden_layer_size = hidden_layer_size
        
        self.hidden_cell = (torch.zeros(layers,1,self.hidden_layer_size),
                    torch.zeros(layers,1,self.hidden_layer_size))
        
        lstm_input = communities + feat_size
        self.lstm = nn.LSTM(input_size=lstm_input, hidden_size=hidden_layer_size, num_layers=layers, dropout=dropout)

        #disaggregation
#         self.linear_1 = nn.Linear(hidden_layer_size, hidden_layer_size)
        self.linear_2 = nn.Linear(hidden_layer_size, network_size)


    def forward(self, input_seq, feat):
        
        w = F.softmax(self.attachment_matrix, dim=1)
        x = torch.matmul(input_seq, self.attachment_matrix)
        x = torch.cat((x,feat),axis=1)

        
        lstm_out, self.hidden_cell = self.lstm(x.view(len(input_seq) ,1, -1), self.hidden_cell)
        
        predictions = self.linear_2(lstm_out.view(len(input_seq), -1))
#         predictions = F.relu(predictions)
#         predictions = self.linear_2(predictions)
        
        return predictions

In [14]:
def evaluate(model):
    model.eval()
    prediction = []
    with torch.no_grad():
        for feat,seq, labels in test_inout_seq:
            model.hidden_cell = (torch.zeros(layers, 1, model.hidden_layer_size).to(device),
                            torch.zeros(layers, 1, model.hidden_layer_size).to(device))
            prediction.append(model(seq,feat)[-1])

    y_test_ = torch.stack([labels[-1] for feat,seq, labels in test_inout_seq], axis=0).detach().cpu().numpy()
    y_pred_ = torch.stack(prediction).detach().cpu().numpy()

    res = y_pred_ - y_test_
    r2 = r2_score(y_test_, y_pred_, multioutput='variance_weighted')
    rmse = mean_squared_error(y_test_, y_pred_)
    mae = mean_absolute_error(y_test_, y_pred_)
#     print("r2: ",r2)
    return (res, r2, rmse, mae)

In [15]:
def get_at_mat(targetColumns):
    comms = pd.read_csv('/home/urwa/Documents/side_projects/urban/UrbanTemporalNetworks/Data/ZonetoComm.csv')  
    communities = list(set(comms.start_community))

    mapping = dict(zip(comms.start_id, comms.start_community))
    comm_to_index = dict(zip(communities,range(len(communities))))
    col_to_index = dict(zip(targetColumns,range(len(targetColumns))))

    attach = torch.zeros(len(targetColumns), len(communities))

    for t_c in targetColumns:
        com = mapping[int(t_c)]
        x_i = col_to_index[t_c]
        y_i = comm_to_index[com]

        attach[x_i,y_i] = 1

    return attach

In [16]:
def create_inout_sequences(x,y, tw):
    inout_seq = []
    L = len(x)
    for i in range(L-tw):
        train_seq_x = x[i:i+tw]
        train_seq_y = y[i:i+tw]
#         train_seq = torch.cat((train_seq_x,train_seq_y),axis=1)
        
#         train_label = y[i+tw:i+tw+1]
        train_label = y[i+1:i+tw+1]
        inout_seq.append((train_seq_x, train_seq_y ,train_label))
    return inout_seq

In [17]:
pretrained_weights = '/home/urwa/Documents/side_projects/urban/urban_traffic_prediciton/pipeline/fix_attachment/jfk.pt'

In [18]:
bptt = config['bptt']

R2List = []
residual_list = []

for m in range(2,13):
    month_index  = pd.to_datetime(dataset.Date).dt.month == m
    
    print('-------------------------------------------------')
    print('-------------------------------------------------')
    print("Month: ", m)


    trainData = dataset[~month_index]
    testData = dataset[month_index]

    print("\n train test split")
    print(trainData.shape)
    print(testData.shape)


    print("\n ")
    X_train = trainData[features_cols].values
    X_train = torch.tensor(X_train).float().to(device)
    print(X_train.shape)

    y_train = trainData[targetColumns].values
    y_train = torch.tensor(y_train).float().to(device)
    print(y_train.shape)

    X_test = testData[features_cols].values
    X_test = torch.tensor(X_test).float().to(device)
    print(X_test.shape)

    y_test = testData[targetColumns].values
    y_test = torch.tensor(y_test).float().to(device)
    print(y_test.shape)


    train_inout_seq = create_inout_sequences(X_train,y_train, bptt)
    test_inout_seq = create_inout_sequences(X_test,y_test, bptt)
    print("\n sequences")
    print(train_inout_seq[0][0].shape,train_inout_seq[0][1].shape, train_inout_seq[0][2].shape)

    at_mat = get_at_mat(targetColumns)
    print("\nattachment matrix")
    print(at_mat.shape)


    layers = config['layers']
    communities = 24
    network_size = len(targetColumns)
    feat_size = len(features_cols)
    dropout = config['dropout']

    model = LSTM(feat_size = feat_size, hidden_layer_size=communities,
                 network_size=network_size, layers=layers,
                communities=communities, dropout=dropout, at_mat=at_mat).to(device)
    
    

    loss_function = nn.L1Loss()   
    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'])
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=config['step_size'], gamma=config['gamma'])
    print("\n model inititalized")

    model.load_state_dict(torch.load(pretrained_weights))
    print("\n model loaded")
    
    residual, r2, rmse, mae = evaluate(model)
    print("Pretraining R2: ",r2)

    
    best_r2 = r2
    best_residual = residual
    torch.save(model.state_dict(), 'data/'+'jfk_'+str(m)+'.pt')
    np.save('data/'+'jfk_'+str(m)+'.npy', best_residual)

    epochs = 60
    
    print("\n Training model...")
    for i in range(epochs):
        model.train()
        for feat,seq, labels in train_inout_seq:
            optimizer.zero_grad()
            model.hidden_cell = (torch.zeros(layers, 1, model.hidden_layer_size).to(device),
                            torch.zeros(layers, 1, model.hidden_layer_size).to(device))

            y_pred = model(seq, feat)

            single_loss = loss_function(y_pred, labels)
            single_loss.backward()
            optimizer.step()

        scheduler.step()
    #     if i%1 == 1:
        residual, r2, rmse, mae = evaluate(model)
        print(f'epoch: {i:3} loss: {single_loss.item():10.8f} r2: {r2:5.3f} rmse: {rmse:5.3f} mae: {mae:5.3f}')

        if r2 > best_r2:
            best_r2 = r2
            best_residual = residual
            torch.save(model.state_dict(), 'data/'+'jfk_'+str(m)+'.pt')
            np.save('data/'+'jfk_'+str(m)+'.npy', best_residual)

    print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')
    print("bet_r2: ", best_r2)

    R2List.append(best_r2)
    residual_list.append(best_residual)

-------------------------------------------------
-------------------------------------------------
Month:  2

 train test split
(8085, 272)
(672, 272)

 
torch.Size([8085, 13])
torch.Size([8085, 258])
torch.Size([672, 13])
torch.Size([672, 258])

 sequences
torch.Size([19, 13]) torch.Size([19, 258]) torch.Size([19, 258])

attachment matrix
torch.Size([258, 24])

 model inititalized

 model loaded
Pretraining R2:  0.529468481632609

 Training model...
epoch:   0 loss: 1.45043051 r2: 0.482 rmse: 3.307 mae: 1.013
epoch:   1 loss: 1.31491649 r2: 0.501 rmse: 3.184 mae: 1.002
epoch:   2 loss: 1.28961730 r2: 0.514 rmse: 3.101 mae: 0.986
epoch:   3 loss: 1.30294001 r2: 0.530 rmse: 2.999 mae: 0.976
epoch:   4 loss: 1.29502463 r2: 0.516 rmse: 3.088 mae: 0.985
epoch:   5 loss: 1.29154372 r2: 0.524 rmse: 3.038 mae: 0.980
epoch:   6 loss: 1.28005242 r2: 0.535 rmse: 2.968 mae: 0.970
epoch:   7 loss: 1.34761381 r2: 0.520 rmse: 3.063 mae: 0.979
epoch:   8 loss: 1.29694831 r2: 0.516 rmse: 3.088 mae: 0

epoch:  59 loss: 1.25896513 r2: 0.568 rmse: 3.264 mae: 0.992
epoch:  59 loss: 1.2589651346
bet_r2:  0.5972399097890982
-------------------------------------------------
-------------------------------------------------
Month:  4

 train test split
(8037, 272)
(720, 272)

 
torch.Size([8037, 13])
torch.Size([8037, 258])
torch.Size([720, 13])
torch.Size([720, 258])

 sequences
torch.Size([19, 13]) torch.Size([19, 258]) torch.Size([19, 258])

attachment matrix
torch.Size([258, 24])

 model inititalized

 model loaded
Pretraining R2:  0.6083282515195536

 Training model...
epoch:   0 loss: 1.33773279 r2: 0.566 rmse: 3.282 mae: 1.020
epoch:   1 loss: 1.31826878 r2: 0.572 rmse: 3.241 mae: 1.009
epoch:   2 loss: 1.30559552 r2: 0.561 rmse: 3.325 mae: 1.017
epoch:   3 loss: 1.33061659 r2: 0.571 rmse: 3.248 mae: 1.007
epoch:   4 loss: 1.41485465 r2: 0.570 rmse: 3.251 mae: 1.020
epoch:   5 loss: 1.48819971 r2: 0.578 rmse: 3.195 mae: 1.011
epoch:   6 loss: 1.32695758 r2: 0.569 rmse: 3.258 mae: 1.0

epoch:  57 loss: 1.26664054 r2: 0.604 rmse: 3.156 mae: 0.996
epoch:  58 loss: 1.26752377 r2: 0.605 rmse: 3.150 mae: 0.995
epoch:  59 loss: 1.27900386 r2: 0.604 rmse: 3.154 mae: 0.996
epoch:  59 loss: 1.2790038586
bet_r2:  0.6267548650792119
-------------------------------------------------
-------------------------------------------------
Month:  6

 train test split
(8037, 272)
(720, 272)

 
torch.Size([8037, 13])
torch.Size([8037, 258])
torch.Size([720, 13])
torch.Size([720, 258])

 sequences
torch.Size([19, 13]) torch.Size([19, 258]) torch.Size([19, 258])

attachment matrix
torch.Size([258, 24])

 model inititalized

 model loaded
Pretraining R2:  0.5933234870001999

 Training model...
epoch:   0 loss: 1.32166719 r2: 0.571 rmse: 3.142 mae: 0.998
epoch:   1 loss: 1.32582808 r2: 0.572 rmse: 3.136 mae: 0.998
epoch:   2 loss: 1.41923749 r2: 0.574 rmse: 3.122 mae: 1.003
epoch:   3 loss: 1.32975638 r2: 0.576 rmse: 3.112 mae: 0.999
epoch:   4 loss: 1.30326009 r2: 0.557 rmse: 3.245 mae: 1.0

epoch:  55 loss: 1.28004336 r2: 0.533 rmse: 3.287 mae: 1.029
epoch:  56 loss: 1.27638471 r2: 0.532 rmse: 3.290 mae: 1.029
epoch:  57 loss: 1.25225246 r2: 0.534 rmse: 3.279 mae: 1.028
epoch:  58 loss: 1.27978718 r2: 0.534 rmse: 3.281 mae: 1.028
epoch:  59 loss: 1.29239357 r2: 0.532 rmse: 3.295 mae: 1.030
epoch:  59 loss: 1.2923935652
bet_r2:  0.5512411542818053
-------------------------------------------------
-------------------------------------------------
Month:  8

 train test split
(8013, 272)
(744, 272)

 
torch.Size([8013, 13])
torch.Size([8013, 258])
torch.Size([744, 13])
torch.Size([744, 258])

 sequences
torch.Size([19, 13]) torch.Size([19, 258]) torch.Size([19, 258])

attachment matrix
torch.Size([258, 24])

 model inititalized

 model loaded
Pretraining R2:  0.5207402905815595

 Training model...
epoch:   0 loss: 1.45408428 r2: 0.488 rmse: 3.455 mae: 1.078
epoch:   1 loss: 1.44571555 r2: 0.497 rmse: 3.395 mae: 1.069
epoch:   2 loss: 1.36047864 r2: 0.501 rmse: 3.367 mae: 1.0

epoch:  53 loss: 1.26068616 r2: 0.612 rmse: 3.341 mae: 1.031
epoch:  54 loss: 1.24865568 r2: 0.612 rmse: 3.336 mae: 1.030
epoch:  55 loss: 1.27210343 r2: 0.614 rmse: 3.319 mae: 1.028
epoch:  56 loss: 1.28986943 r2: 0.615 rmse: 3.316 mae: 1.027
epoch:  57 loss: 1.28985429 r2: 0.614 rmse: 3.318 mae: 1.028
epoch:  58 loss: 1.29340327 r2: 0.615 rmse: 3.312 mae: 1.026
epoch:  59 loss: 1.25854576 r2: 0.616 rmse: 3.306 mae: 1.027
epoch:  59 loss: 1.2585457563
bet_r2:  0.6349401338057566
-------------------------------------------------
-------------------------------------------------
Month:  10

 train test split
(8013, 272)
(744, 272)

 
torch.Size([8013, 13])
torch.Size([8013, 258])
torch.Size([744, 13])
torch.Size([744, 258])

 sequences
torch.Size([19, 13]) torch.Size([19, 258]) torch.Size([19, 258])

attachment matrix
torch.Size([258, 24])

 model inititalized

 model loaded
Pretraining R2:  0.6334234351737693

 Training model...
epoch:   0 loss: 1.41324997 r2: 0.602 rmse: 3.608 mae: 1.

epoch:  51 loss: 1.26965928 r2: 0.576 rmse: 3.830 mae: 1.053
epoch:  52 loss: 1.29015815 r2: 0.576 rmse: 3.826 mae: 1.053
epoch:  53 loss: 1.27964044 r2: 0.575 rmse: 3.836 mae: 1.053
epoch:  54 loss: 1.24988806 r2: 0.576 rmse: 3.832 mae: 1.053
epoch:  55 loss: 1.28018224 r2: 0.575 rmse: 3.842 mae: 1.054
epoch:  56 loss: 1.28104019 r2: 0.575 rmse: 3.841 mae: 1.054
epoch:  57 loss: 1.27064764 r2: 0.576 rmse: 3.834 mae: 1.053
epoch:  58 loss: 1.25706339 r2: 0.574 rmse: 3.844 mae: 1.054
epoch:  59 loss: 1.26212490 r2: 0.576 rmse: 3.832 mae: 1.053
epoch:  59 loss: 1.2621248960
bet_r2:  0.5775233248991875
-------------------------------------------------
-------------------------------------------------
Month:  12

 train test split
(8013, 272)
(744, 272)

 
torch.Size([8013, 13])
torch.Size([8013, 258])
torch.Size([744, 13])
torch.Size([744, 258])

 sequences
torch.Size([19, 13]) torch.Size([19, 258]) torch.Size([19, 258])

attachment matrix
torch.Size([258, 24])

 model inititalized

 mode

In [19]:
R2List

[0.5506309348043397,
 0.5972399097890982,
 0.6083282515195536,
 0.6267548650792119,
 0.5933234870001999,
 0.5512411542818053,
 0.5207402905815595,
 0.6349401338057566,
 0.6334234351737693,
 0.5775233248991875,
 0.5531640653233972]

In [24]:
np.mean(R2List)

0.586119077477989

In [20]:
residual_list

[array([[ 8.3843988e-01,  3.4259129e+00, -2.6372004e-01, ...,
          2.0339184e+00, -1.1042626e+00,  4.1553318e-05],
        [ 8.1632847e-01,  6.0170403e+00,  7.1888571e+00, ...,
         -5.9521580e+00, -1.5044886e-01,  1.9242421e-05],
        [ 7.4045676e-01,  5.5556431e+00,  1.5694609e+00, ...,
         -8.8799286e-01, -2.1621661e+00,  1.2608720e-05],
        ...,
        [-1.3746703e-01,  3.5684948e+00,  3.7512650e+00, ...,
         -2.1051121e+00, -1.0627986e+00,  1.2192690e-05],
        [ 6.9374144e-01,  1.7612743e+00,  1.6717048e+00, ...,
         -2.0053911e+00, -1.9521338e-01,  5.5752657e-06],
        [ 6.5782297e-01,  3.3353701e+00,  2.7849641e+00, ...,
          2.4795446e+00,  8.3695906e-01,  3.1513514e-06]], dtype=float32),
 array([[ 1.0036776e+00,  4.8286819e-01,  4.2021017e+00, ...,
          9.9893236e-01,  1.1398672e+00,  5.8864476e-05],
        [ 9.6979946e-01, -5.4186373e+00,  2.3234272e+00, ...,
         -1.7519712e+00,  1.2915612e+00,  5.1110939e-05],
        [-

In [None]:
0.530
0.5506309348043397
0.5972399097890982
0.6083282515195536
0.6267548650792119

In [54]:
bptt = config['bptt']

resdf_list = []

for m in range(1,13):
    month_index  = pd.to_datetime(dataset.Date).dt.month == m
    
    print('-------------------------------------------------')
    print('-------------------------------------------------')
    print("Month: ", m)

    testData = dataset[month_index]
    testData = testData[bptt:]
    date = testData['Date']
    hour = testData['Hour']
    
    print("\n test shape")
    print(testData.shape)

    residual = np.load('data/'+'jfk_'+str(m)+'.npy')
    print("\n residual")
    print(residual.shape)

    res_df = pd.DataFrame(residual)
    res_df.columns = targetColumns
    res_df['Date'] = testData['Date'].values
    res_df['Hour'] = testData['Hour'].values
    res_df = res_df[['Date', 'Hour'] + targetColumns]
    
    print(res_df.head())
    resdf_list.append(res_df)

-------------------------------------------------
-------------------------------------------------
Month:  1

 test shape
(722, 272)

 residual
(722, 258)
         Date  Hour         1        10       100       101       102  \
0  2018-01-01    22  1.073634 -3.288347  3.073706 -0.965400 -0.819268   
1  2018-01-01    23  0.250151 -3.579380 -0.048968  1.032399  1.154741   
2  2018-01-02     0 -1.733820  0.225997  0.682261 -1.058100 -0.916202   
3  2018-01-02     1 -0.902148 -3.739099  1.196124 -0.850154  0.225197   
4  2018-01-02     2 -0.000178 -0.117501  0.509563  0.001086 -0.989059   

        106        107       108  ...        90        91        92        93  \
0 -1.754588  -2.832455 -0.884940  ... -3.921993  0.348744  3.898218 -0.940414   
1 -0.800730 -15.068804  0.149996  ... -2.709341  0.696428 -5.188190  0.015907   
2  1.089349  -1.090286  0.160533  ...  0.753269  1.398265  0.725189 -0.990966   
3 -2.829922  -3.289001  0.039112  ... -1.984349  1.384919 -1.415277 -0.008739   


In [57]:
all_res_df = pd.concat(resdf_list)
all_res_df.head()

Unnamed: 0,Date,Hour,1,10,100,101,102,106,107,108,...,90,91,92,93,94,95,96,97,98,99
0,2018-01-01,22,1.073634,-3.288347,3.073706,-0.9654,-0.819268,-1.754588,-2.832455,-0.88494,...,-3.921993,0.348744,3.898218,-0.940414,0.002937,3.439336,0.000458,5.367305,0.71476,1e-05
1,2018-01-01,23,0.250151,-3.57938,-0.048968,1.032399,1.154741,-0.80073,-15.068804,0.149996,...,-2.709341,0.696428,-5.18819,0.015907,0.001502,0.811171,0.000237,2.080948,-0.590178,-9e-06
2,2018-01-02,0,-1.73382,0.225997,0.682261,-1.0581,-0.916202,1.089349,-1.090286,0.160533,...,0.753269,1.398265,0.725189,-0.990966,0.001376,1.753793,0.000151,2.080092,1.233203,4e-06
3,2018-01-02,1,-0.902148,-3.739099,1.196124,-0.850154,0.225197,-2.829922,-3.289001,0.039112,...,-1.984349,1.384919,-1.415277,-0.008739,-0.000585,0.979212,5.6e-05,-0.613677,-1.677565,-2e-06
4,2018-01-02,2,-0.000178,-0.117501,0.509563,0.001086,-0.989059,0.006134,-3.215868,0.000318,...,-3.467297,-1.637727,-0.818416,0.00146,-0.00024,-0.9633,1.5e-05,-1.661212,0.070346,-5e-06


In [58]:
all_res_df.to_csv('data/residual_jfk.csv')

In [21]:
# attachment = torch.argmax(F.softmax(model.attachment_matrix, dim=1), dim=1).detach().cpu().numpy()
# community_assignment = dict(zip(targetColumns, attachment))
# community_assignment

In [22]:
# 20 comm
# 0.505

In [23]:
# 50 comm
# 