In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import matplotlib.pyplot as plt
from sklearn.metrics import r2_score

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
dataset = pd.read_csv('/home/urwa/Documents/side_projects/urban/data/featureData/jfk.csv')

In [4]:
dataset.shape

(8757, 1049)

In [5]:
dataset.head(3)

Unnamed: 0,Date,Hour,1,10,100,101,102,106,107,108,...,91_lag_3,92_lag_3,93_lag_3,94_lag_3,95_lag_3,96_lag_3,97_lag_3,98_lag_3,99_lag_3,arrival_lag_3
0,2018-01-01,3,0,0,0,0,0,0,0,0,...,1.0,1.0,0.0,1.0,6.0,0.0,1.0,0.0,0.0,6.0
1,2018-01-01,4,0,3,0,0,1,0,0,1,...,4.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,6.0
2,2018-01-01,5,0,4,0,0,1,2,3,1,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0


In [6]:
lag_columns = [c for c in dataset.columns if 'lag' in c]
len(lag_columns)

777

In [7]:
dataset = dataset[[c for c in dataset.columns if c not in lag_columns]]
dataset.shape

(8757, 272)

In [8]:
DateColumns = ['Date']

ext_columns = ['Dow', 'arrival','maxtemp', 'mintemp', 'avgtemp', 'departure', 'hdd',
       'cdd', 'participation', 'newsnow', 'snowdepth', 'ifSnow']

targetColumns = [c for c in dataset.columns if c not in ext_columns and \
                c not in DateColumns and c not in lag_columns and c != 'Hour']
len(targetColumns)

258

In [9]:
features_cols = [c for c in dataset.columns if c not in targetColumns and c not in DateColumns]
len(features_cols)

13

In [36]:
means = dataset[targetColumns].mean()
freq_cols = [c for c,v in zip(means.index,means.values) if v >10]
print(len(freq_cols))
targetColumns = freq_cols

7


In [37]:
sep = int(0.75*len(dataset))
print(sep)


trainData = dataset[:sep]
testData = dataset[sep:]

print(trainData.shape)
print(testData.shape)

6567
(6567, 272)
(2190, 272)


In [38]:
X_train = trainData[features_cols].values
X_train = torch.tensor(X_train).float().to(device)
print(X_train.shape)

y_train = trainData[targetColumns].values
y_train = torch.tensor(y_train).float().to(device)
print(y_train.shape)

X_test = testData[features_cols].values
X_test = torch.tensor(X_test).float().to(device)
print(X_test.shape)

y_test = testData[targetColumns].values
y_test = torch.tensor(y_test).float().to(device)
print(y_test.shape)

torch.Size([6567, 13])
torch.Size([6567, 7])
torch.Size([2190, 13])
torch.Size([2190, 7])


In [39]:
def create_inout_sequences(x,y, tw):
    inout_seq = []
    L = len(x)
    for i in range(L-tw):
        train_seq_x = x[i:i+tw]
        train_seq_y = y[i:i+tw]
        train_seq = torch.cat((train_seq_x,train_seq_y),axis=1)
        
#         train_label = y[i+tw:i+tw+1]
        train_label = y[i+1:i+tw+1]
        inout_seq.append((train_seq ,train_label))
    return inout_seq

In [40]:
bptt = 6

In [41]:
train_inout_seq = create_inout_sequences(X_train,y_train, bptt)

In [42]:
train_inout_seq[0][0].shape,train_inout_seq[0][1].shape

(torch.Size([6, 20]), torch.Size([6, 7]))

In [43]:
test_inout_seq = create_inout_sequences(X_test,y_test, bptt)

In [44]:
class LSTM(nn.Module):
    def __init__(self, input_size=1, hidden_layer_size=100, output_size=1, layers=1):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_layer_size, num_layers=layers)

        self.linear = nn.Linear(hidden_layer_size, output_size)

        self.hidden_cell = (torch.zeros(layers,1,self.hidden_layer_size),
                            torch.zeros(layers,1,self.hidden_layer_size))

    def forward(self, input_seq):
        lstm_out, self.hidden_cell = self.lstm(input_seq.view(len(input_seq) ,1, -1), self.hidden_cell)
        predictions = self.linear(lstm_out.view(len(input_seq), -1))
#         return predictions[-1]
        return predictions

In [48]:
def evaluate():
    model.eval()
    prediction = []
    with torch.no_grad():
        for seq, labels in test_inout_seq:
            model.hidden = (torch.zeros(layers, 1, model.hidden_layer_size),
                            torch.zeros(layers, 1, model.hidden_layer_size))
            prediction.append(model(seq)[-1])

    y_test_ = torch.stack([labels[-1] for seq, labels in test_inout_seq], axis=0).detach().cpu().numpy()
    y_pred_ = torch.stack(prediction).detach().cpu().numpy()

    r2 = r2_score(y_test_, y_pred_, multioutput='variance_weighted')
#     print("r2: ",r2)
    return r2

In [49]:
layers = 1
output_size = len(targetColumns)
input_size = output_size + len(features_cols)


model = LSTM(input_size = input_size, hidden_layer_size=100, output_size=output_size, layers=layers).to(device)
loss_function = nn.L1Loss()   
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [50]:
epochs = 200


for i in range(epochs):
    model.train()
    for seq, labels in train_inout_seq:
        optimizer.zero_grad()
        model.hidden_cell = (torch.zeros(layers, 1, model.hidden_layer_size).to(device),
                        torch.zeros(layers, 1, model.hidden_layer_size).to(device))

        y_pred = model(seq)

        single_loss = loss_function(y_pred, labels)
        single_loss.backward()
        optimizer.step()

    if i%10 == 1:
        r2 = evaluate()
        print(f'epoch: {i:3} loss: {single_loss.item():10.8f} r2: ',r2)
        

print(f'epoch: {i:3} loss: {single_loss.item():10.10f}')

epoch:   1 loss: 8.22168922 r2:  0.5045373936739008
epoch:  11 loss: 8.90462875 r2:  0.6064681694237081
epoch:  21 loss: 8.58030987 r2:  0.5101528451357036
epoch:  31 loss: 7.44053316 r2:  0.5602433155698084
epoch:  41 loss: 8.34074879 r2:  0.496306511047332
epoch:  51 loss: 7.15865469 r2:  0.448940431881965
epoch:  61 loss: 8.31874371 r2:  0.525833694008165
epoch:  71 loss: 8.50218105 r2:  0.560724674129063
epoch:  81 loss: 7.94589233 r2:  0.5588401767522523
epoch:  91 loss: 7.87613726 r2:  0.5254090868288577
epoch: 101 loss: 8.01165009 r2:  0.5119268845434752
epoch: 111 loss: 8.20584679 r2:  0.49460093927951465
epoch: 121 loss: 7.97000408 r2:  0.5360012435624221
epoch: 131 loss: 7.05820465 r2:  0.5586343443332208
epoch: 141 loss: 7.21474075 r2:  0.5311458271360476
epoch: 151 loss: 7.45701599 r2:  0.5472722139090085
epoch: 161 loss: 7.21871471 r2:  0.5527091813003975
epoch: 171 loss: 7.17404032 r2:  0.5791202534385461
epoch: 181 loss: 7.02158022 r2:  0.5642774890940351
epoch: 191 loss

In [51]:
evaluate()

0.5725690134835787

0.5829984326364023

In [34]:
# 5: 35 zones
# 0.5829984326364023

In [None]:
# 10: 7 zones
# 0.5725690134835787