In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import scipy.stats as stats
from collections import defaultdict
from itertools import product
from sklearn.metrics import mean_absolute_error as mae
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch import optim
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [65]:
look_back = 72*3
n_features = 65
linear_node = 32
batch_size = 64

In [3]:
dat = pd.read_csv('train.csv', index_col='row_id')
def preprocess(dat):
    time_mapper = {}
    ii = 0
    for h in range(24):
        for mm in ['00','20','40']:
            hh = '{0:02d}'.format(h)
            time_mapper[hh+':'+mm] = ii
            ii += 1

    dat['unique'] = dat['x'].astype(str) + dat['y'].astype(str) + dat['direction']
    uniques = dat['unique'].unique()
    dat['day'] = pd.to_datetime(dat['time']).dt.weekday
    dat['time_stamp'] = dat['time'].apply(lambda x:time_mapper[x.split()[1][:5]])

    tmp = dat.groupby(['unique','day','time_stamp']).agg({'congestion':np.median})
    median_mapper = tmp.to_dict()['congestion']
    dat['median'] = dat.apply(lambda x: \
                              median_mapper[x['unique'],x['day'],x['time_stamp']], axis=1)
    dat['congestion-median'] = dat['congestion'] - dat['median']
    
    all_time = pd.DataFrame(pd.date_range('1991-04-01 00:00:00', '1991-09-30 11:40:00', freq='20Min'), columns=['time'])
    all_time['time'] = all_time['time'].astype(str)
    
    return uniques, median_mapper, time_mapper, all_time
uniques, median_mapper, time_mapper, all_time = preprocess(dat)

In [4]:
def getseries(unique):
    df = dat.loc[dat['unique']==unique, ['time', 'congestion-median']]
    df = pd.merge(all_time, df, left_on='time', right_on='time', how='outer')
    df = df.set_index('time')
    df['congestion-median'] = df['congestion-median'].fillna(0)
    ss = StandardScaler()
    df['congestion-median-normalized'] = ss.fit_transform(df['congestion-median'].values.reshape(-1,1)).reshape(-1)
    return df, ss

In [5]:
df,_ = getseries('00EB')

In [8]:
test_periods = [
    ['1991-09-16 12:00:00', '1991-09-16 24:00:00'],
    ['1991-09-23 12:00:00', '1991-09-23 24:00:00']]
id1 = df.index.to_list().index(test_periods[1][0])
id1, len(df)

(12636, 13140)

In [9]:
data_all = []
for unique in uniques:
    df, ss = getseries(unique)
    data_all.append(df['congestion-median-normalized'].values)
data_all = np.array(data_all).T

In [10]:
data_all.shape

(13140, 65)

In [58]:
def create_dataset(dataset, look_back=5):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back),:]
        dataX.append(a)
        dataY.append(dataset[i+1:i+look_back+1,:])
    return np.array(dataX), np.array(dataY)

In [59]:
X, Y = create_dataset(data_all[:10])
X.shape, Y.shape

((4, 5, 65), (4, 5, 65))

In [42]:
def assemble(dat):
    train_loaders, test_loaders = [], []
    
    # for train/test
    train = dat[:12636]
    test = dat[12636:]

    X, y = create_dataset(train, look_back=look_back)
    train_dataset = []
    for i in range(len(X)):
        train_dataset.append((torch.tensor(X[i],dtype=torch.float32),
                              torch.tensor(y[i],dtype=torch.float32)))
    train_loaders.append(DataLoader(train_dataset, batch_size=batch_size, drop_last=False))

    X, y = create_dataset(test, look_back=look_back)
    test_dataset = []
    for i in range(len(X)):
        test_dataset.append((torch.tensor(X[i],dtype=torch.float32),
                             torch.tensor(y[i],dtype=torch.float32)))
    test_loaders.append(DataLoader(test_dataset, batch_size=batch_size, drop_last=False))
    
    train = dat[:]
    X, y = create_dataset(train, look_back=look_back)
    train_dataset = []
    for i in range(len(X)):
        train_dataset.append((torch.tensor(X[i],dtype=torch.float32),
                              torch.tensor(y[i],dtype=torch.float32)))
    train_loaders.append(DataLoader(train_dataset, batch_size=batch_size, drop_last=False))
        
    return train_loaders, test_loaders

In [43]:
train_loaders, test_loaders = assemble(data_all)

In [44]:
X, Y = next(iter(train_loaders[0]))
X.shape, Y.shape

(torch.Size([32, 72, 65]), torch.Size([32, 65]))

In [45]:
X, Y = next(iter(test_loaders[0]))
X.shape, Y.shape

(torch.Size([32, 72, 65]), torch.Size([32, 65]))

In [27]:
criterion = nn.L1Loss()
class MyModel(nn.Module):
    def __init__(self, input_feature, hidden_size, output_feature, num_layers=1):
        super(MyModel, self).__init__()
        self.linear = nn.Linear(input_feature, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, batch_first=True, num_layers=num_layers, dropout=0.2)
        ''' gru input is (N,L,H_in=H_hidden), output is (N,L,H_hidden), hidden is (num_layers, h_hidden)'''
        self.linear_out = nn.Linear(hidden_size, output_feature)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
    
    def forward(self, input, hidden):
        ''' X is in the shape of (N,L,input_feature) '''
        output = F.relu(self.linear(input))
        output, hidden = self.gru(output, hidden)
        output = self.linear_out(F.relu(output))
        return output
    
    def initHidden(self, batch_size):
        return torch.zeros((self.num_layers, batch_size, self.hidden_size))

In [63]:
def evaluate(test_loader):
    model.eval()
    with torch.no_grad():
        loss = 0
        n = 0
        for batch, (x, y) in enumerate(test_loader):
            h0 = model.initHidden(len(x))
            output = model.forward(x, h0)
#             loss += criterion(output[:,-1,:],y).item() * len(x)
            loss += criterion(output,y).item() * len(x)
            n += len(x)
        loss /= n
    return loss

def train(n_epoches, train_loader, test_loader):
    optimizer = optim.Adam(model.parameters())

    best_test_loss = 100.0
    for epoch in range(n_epoches):

        curr_loss = 0.0
        model.train()

        for batch, (x, y) in enumerate(train_loader):
            h0 = model.initHidden(len(x))
            output = model.forward(x, h0)
            #print(output[-1,-1,:],y[-1])
#             loss = criterion(output[:,-1,:], y)
            loss = criterion(output, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            curr_loss += loss.item()*len(x)
#             print(f"{batch} {loss.item()}")

        curr_loss /= len(train_loader.dataset)
        test_loss = evaluate(test_loader)
#         if (epoch % 20 == 0):  print(f'current {epoch} training loss={loss.item()} test loss = {test_loss}')
        print(f'current {epoch} training loss={loss.item()} test loss = {test_loss}')
        if test_loss < best_test_loss:
            best_n_epoches = epoch + 1
            best_test_loss = test_loss
            print(f'updating best loss {epoch} training loss={loss.item()} test loss = {test_loss}')

        if epoch > best_n_epoches + 50:
            print('early stop')
            break
    return best_n_epoches

def retrain(n_epoches, train_loader):
    optimizer = optim.Adam(model.parameters())

    model.train()
    for epoch in range(n_epoches):

        curr_loss = 0.0
        for batch, (x, y) in enumerate(train_loader):
            h0 = model.initHidden(len(x))
            output = model.forward(x, h0)
#             loss = criterion(output[:,-1,:], y)
            loss = criterion(output, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            curr_loss += loss.item()*len(x)

    curr_loss /= len(train_loader.dataset)
    return curr_loss

In [66]:
train_loaders, test_loaders = assemble(data_all)

model = MyModel(n_features, linear_node, n_features, num_layers=3)    
best_n_epoches = train(300, train_loaders[0], test_loaders[0])
torch.save({'best_epoches': best_n_epoches,
            'model': model.state_dict()},
            'model_allgru_train.pickle')

model = MyModel(n_features, linear_node, n_features, num_layers=3)    
loss = retrain(best_n_epoches, train_loaders[1])
torch.save({'loss': loss,
            'best_epoches': best_n_epoches,
            'model': model.state_dict()},
            'model_allgru.pickle')

current 0 training loss=0.7065339684486389 test loss = 0.6783859962370338
updating best loss 0 training loss=0.7065339684486389 test loss = 0.6783859962370338
current 1 training loss=0.7038140892982483 test loss = 0.6750827041237196
updating best loss 1 training loss=0.7038140892982483 test loss = 0.6750827041237196
current 2 training loss=0.7022724151611328 test loss = 0.6743493736413297
updating best loss 2 training loss=0.7022724151611328 test loss = 0.6743493736413297
current 3 training loss=0.7011995315551758 test loss = 0.6734472149755897
updating best loss 3 training loss=0.7011995315551758 test loss = 0.6734472149755897
current 4 training loss=0.7005960941314697 test loss = 0.6725911743134156
updating best loss 4 training loss=0.7005960941314697 test loss = 0.6725911743134156
current 5 training loss=0.6997776031494141 test loss = 0.6719902728908154
updating best loss 5 training loss=0.6997776031494141 test loss = 0.6719902728908154
current 6 training loss=0.6982415318489075 tes

In [56]:
model

MyModel(
  (linear): Linear(in_features=65, out_features=16, bias=True)
  (gru): GRU(16, 16, num_layers=3, batch_first=True, dropout=0.2)
  (linear_out): Linear(in_features=16, out_features=65, bias=True)
)