In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import scipy.stats as stats
from collections import defaultdict
from itertools import product
from sklearn.metrics import mean_absolute_error as mae
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch import optim
from sklearn.preprocessing import StandardScaler

In [2]:
# look_back = 504
look_back = 72 * 3
batch_size = 512

In [3]:
dat = pd.read_csv('train.csv', index_col='row_id')

In [4]:
def preprocess(dat):
    time_mapper = {}
    ii = 0
    for h in range(24):
        for mm in ['00','20','40']:
            hh = '{0:02d}'.format(h)
            time_mapper[hh+':'+mm] = ii
            ii += 1

    dat['unique'] = dat['x'].astype(str) + dat['y'].astype(str) + dat['direction']
    uniques = dat['unique'].unique()
    dat['day'] = pd.to_datetime(dat['time']).dt.weekday
    dat['time_stamp'] = dat['time'].apply(lambda x:time_mapper[x.split()[1][:5]])

    tmp = dat.groupby(['unique','day','time_stamp']).agg({'congestion':np.median})
    median_mapper = tmp.to_dict()['congestion']
    dat['median'] = dat.apply(lambda x: \
                              median_mapper[x['unique'],x['day'],x['time_stamp']], axis=1)
    dat['congestion-median'] = dat['congestion'] - dat['median']
    
    all_time = pd.DataFrame(pd.date_range('1991-04-01 00:00:00', '1991-09-30 11:40:00', freq='20Min'), columns=['time'])
    all_time['time'] = all_time['time'].astype(str)
    
    return uniques, median_mapper, time_mapper, all_time

In [5]:
uniques, median_mapper, time_mapper, all_time = preprocess(dat)

In [6]:
def getseries(unique):
    df = dat.loc[dat['unique']==unique, ['time', 'congestion-median']]
    df = pd.merge(all_time, df, left_on='time', right_on='time', how='outer')
    df = df.set_index('time')
    df['congestion-median'] = df['congestion-median'].fillna(0)
    ss = StandardScaler()
    df['congestion-median-normalized'] = ss.fit_transform(df['congestion-median'].values.reshape(-1,1)).reshape(-1)
    return df, ss

In [7]:
def create_dataset(dataset, look_back=5):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back)]
        dataX.append(a)
        dataY.append(dataset[i+look_back])
    return np.array(dataX), np.array(dataY)

In [8]:
def assemble(dat):
    train_loaders, test_loaders = [], []
    for period in test_periods_with_lookback:
        train = dat.loc[dat.index < period[0], 'congestion-median-normalized'].values
        test = dat.loc[(dat.index >= period[0]) & (dat.index <= period[1]), 'congestion-median-normalized'].values
        print(test[0])
        
        X, y = create_dataset(train, look_back=look_back)
        train_dataset = []
        for i in range(len(X)):
            train_dataset.append((torch.tensor(X[i],dtype=torch.float32),
                                  torch.tensor(y[i].reshape(-1,),dtype=torch.float32)))
        train_loaders.append(DataLoader(train_dataset, batch_size=batch_size, drop_last=False))
        
        X, y = create_dataset(test, look_back=look_back)
        test_dataset = []
        for i in range(len(X)):
            test_dataset.append((torch.tensor(X[i],dtype=torch.float32),
                                 torch.tensor(y[i].reshape(-1,),dtype=torch.float32)))
        test_loaders.append(DataLoader(test_dataset, batch_size=batch_size, drop_last=False))
        
    return train_loaders, test_loaders

In [9]:
criterion = nn.L1Loss()

In [14]:
class MyModel(nn.Module):
    def __init__(self, input_feature, output_feature=1):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(input_feature, 64)
        self.fc2 = nn.Linear(64,16)
        self.fc3 = nn.Linear(16,output_feature)
        self.dropout = nn.Dropout(p=0.5)
    
    def forward(self, x):
        ''' X is in the shape of (N,L,input_feature) '''
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [11]:
def evaluate(test_loader):
    model.eval()
    with torch.no_grad():
        loss = 0
        n = 0
        for batch, (x, y) in enumerate(test_loader):
            output = model.forward(x)
            loss += criterion(output ,y).item() * len(x)
            n += len(x)
        loss /= n
    return loss

def train(n_epoches, train_loader, test_loader):
    optimizer = optim.Adam(model.parameters())
    
    best_test_loss = 100.0
    for epoch in range(n_epoches):
        
        curr_loss = 0.0
        model.train()
        
        n = 0
        for batch, (x, y) in enumerate(train_loader):
            output = model.forward(x)
            loss = criterion(output, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            curr_loss += loss*len(x)
            n += len(x)
        
        curr_loss /= len(train_loader.dataset)
        test_loss = evaluate(test_loader)
        if (epoch % 20 == 0):  print(f'current {epoch} training loss={loss.item()} test loss = {test_loss}')
        if test_loss < best_test_loss:
            best_n_epoches = epoch + 1
            best_test_loss = test_loss
            print(f'updating best loss {epoch} training loss={loss.item()} test loss = {test_loss}')
    return best_n_epoches

def retrain(n_epoches, train_loader):
    optimizer = optim.Adam(model.parameters())
    
    model.train()
    for epoch in range(n_epoches):
        for batch, (x, y) in enumerate(train_loader):
            output = model.forward(x)
            loss = criterion(output, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [15]:
test_periods = [
    ['1991-09-16 12:00:00', '1991-09-16 24:00:00'],
    ['1991-09-23 12:00:00', '1991-09-23 24:00:00']]

all_ss = {}
torch.manual_seed(123)
for unique in uniques[:32]:
    print(f"doing {unique}")
    
    df, ss = getseries(unique)
    print(ss.mean_, ss.scale_)
    all_ss[unique] = ss
    
    test_periods_with_lookback = []
    for period in test_periods:
        id1 = df.index.to_list().index(period[0])
        test_periods_with_lookback.append([df.index[id1-look_back], period[1]])
    
    model = MyModel(look_back)
    train_loaders, test_loaders = assemble(df)
    best_n_epoches = train(2000, train_loaders[0], test_loaders[0])
    
    model = MyModel(look_back)
    print('refitting with {best_n_epoches}')
    retrain(best_n_epoches, train_loaders[1])
    
    torch.save(model.state_dict(), 'model_'+unique+'.pickle')

doing 00EB
[0.95494673] [11.57954203]
2.0765115933378677
0.5220459717995566
current 0 training loss=0.8658768534660339 test loss = 0.831439197063446
updating best loss 0 training loss=0.8658768534660339 test loss = 0.831439197063446
updating best loss 1 training loss=0.851239800453186 test loss = 0.8220176696777344
updating best loss 2 training loss=0.8314199447631836 test loss = 0.8182315230369568
current 20 training loss=0.7292464375495911 test loss = 0.900221586227417
current 40 training loss=0.6794531345367432 test loss = 0.916517972946167
current 60 training loss=0.6488896012306213 test loss = 0.9022589921951294
current 80 training loss=0.6054600477218628 test loss = 0.8821374773979187
current 100 training loss=0.605268657207489 test loss = 0.8817000389099121
current 120 training loss=0.5959994196891785 test loss = 0.8917797803878784
current 140 training loss=0.5881227850914001 test loss = 0.8972421288490295
current 160 training loss=0.5820416808128357 test loss = 0.88736635446548

KeyboardInterrupt: 

In [None]:
test_periods

In [None]:
test = pd.read_csv('test.csv')
test

In [None]:
len(test[(test['x']==0) & (test['y']==0) & (test['direction']=='EB')])

In [None]:
ss

In [None]:
df

In [None]:
dat

In [None]:
with torch.no_grad():
    for unique in uniques[60:61]:
        print(unique)
        df, ss = getseries(unique)
        print(ss.mean_, ss.scale_)
        model.load_state_dict(torch.load('model_'+unique+'.pickle'))
        X, y = create_dataset(df['congestion-median-normalized'])
        print(X)
        predict = np.zeros(36)
        for i in range(36)[0:1]:
            X = torch.tensor(X, dtype=torch.float32).reshape(1,-1,1)
            print(X.shape)
            h0 = model.initHidden(1)
            print(model.forward(X,h0))

In [None]:
        X, y = create_dataset(df['congestion-median-normalized'], look_back=look_back)
        print(X.shape)

In [None]:
x = torch.tensor(X[0],dtype=torch.float32).reshape(1,-1,1)
h0 = model.initHidden(1)

In [None]:
x

In [None]:
y_target = torch.tensor(y[0],dtype=torch.float32).reshape(1,-1,1)
y_target

In [None]:
y_pred = model.forward(x,h0)
y_pred

In [None]:
criterion(y_pred, y_target)

In [None]:
# y_pred = y_pred.detach().numpy()
y_pred = y_pred.reshape((1,-1))

In [None]:
y_target = y_target.detach().numpy().reshape((1,-1))

In [None]:
plt.plot(y_pred.T, y_target.T, '.')

In [None]:
plt.plot(range(look_back), y_pred.T)
plt.plot(range(look_back), y_target.T,'r')

In [None]:
def evaluate(test_loader):
    model.eval()
    with torch.no_grad():
        loss = 0
        n = 0
        for batch, (x, y) in enumerate(test_loader):
            h0 = model.initHidden(len(x))
            output = model.forward(x, h0)
            loss += criterion(output[:,-10,:],y[:,-10,:]).item() * len(x)
            n += len(x)
        loss /= n
    return loss

def train(n_epoches, train_loader, test_loader):
    optimizer = optim.Adam(model.parameters())
    
    best_test_loss = 100.0
    for epoch in range(n_epoches):
        
        curr_loss = 0.0
        model.train()
        
        n = 0
        for batch, (x, y) in enumerate(train_loader):
            h0 = model.initHidden(len(x))
            output = model.forward(x, h0)
            print(output[-1,-10,:],y[-1,-10,:])
            loss = criterion(output[:,-10,:], y[:,-10,:])
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            curr_loss += loss*len(x)
            n += len(x)
        
        curr_loss /= len(train_loader.dataset)
        test_loss = evaluate(test_loader)
#         if (epoch % 20 == 0):  print(f'current {epoch} training loss={loss.item()} test loss = {test_loss}')
        print(f'current {epoch} training loss={loss.item()} test loss = {test_loss}')
        if test_loss < best_test_loss:
            best_n_epoches = epoch + 1
            best_test_loss = test_loss
            print(f'updating best loss {epoch} training loss={loss.item()} test loss = {test_loss}')
    return best_n_epoches

def retrain(n_epoches, train_loader):
    optimizer = optim.Adam(model.parameters())
    
    model.train()
    for epoch in range(n_epoches):
        for batch, (x, y) in enumerate(train_loader):
            h0 = model.initHidden(len(x))
            output = model.forward(x, h0)
            loss = criterion(output, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

In [None]:
test_periods = [
    ['1991-09-16 12:00:00', '1991-09-16 24:00:00'],
    ['1991-09-23 12:00:00', '1991-09-23 24:00:00']]

all_ss = {}
torch.manual_seed(123)
for unique in uniques[0:1]:
    print(f"doing {unique}")
    
    df, ss = getseries(unique)
    print(ss.mean_, ss.scale_)
    all_ss[unique] = ss
    
    test_periods_with_lookback = []
    for period in test_periods:
        id1 = df.index.to_list().index(period[0])
        test_periods_with_lookback.append([df.index[id1-look_back], period[1]])
    
    model = MyModel(1, linear_node, 1, num_layers=3)
    train_loaders, test_loaders = assemble(df)
    best_n_epoches = train(200, train_loaders[0], test_loaders[0])
    
    model = MyModel(1, linear_node, 1, num_layers=3)
    print('refitting with {best_n_epoches}')
    retrain(best_n_epoches, train_loaders[1])
    
    torch.save(model.state_dict(), 'model_'+unique+'.pickle')