In [None]:
import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader, SubsetRandomSampler

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import StratifiedKFold, KFold

from torchinfo import summary

from torch_lr_finder import LRFinder

from data import preprocessing, postprocessing
from modelling import LSTMModel, train_epoch, valid_epoch, L1Loss_masked
from func import get_timestamp

import timeit, copy

## Working with data and dataset

In [None]:
# Read the train and test tables from the local `data/` directory.
train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [None]:
# Keyword arguments forwarded to `preprocessing`: boolean feature switches
# plus the scaler instance to apply.
data_kwargs = dict(
    u_in_cumsum=True,
    u_in_lag12=True,
    u_in_lag_back12=True,
    u_in_diff12=True,
    area_true=False,
    u_in_mean=False,
    u_in_last=False,
    scaler=RobustScaler(),
)
# `preprocessing` hands back the processed frames together with `features`.
train, test, features = preprocessing(train, test, **data_kwargs)

In [None]:
# Display the preprocessed training frame as this cell's output.
train

In [None]:
# Keep the `pressure` column aside as the regression target.
target = train['pressure']

# Remove the identifier columns (and the target from the training frame) so
# that every remaining column is fed to the model.
id_columns = ['id', 'breath_id']
train = train.drop(columns=id_columns + ['pressure'])
test = test.drop(columns=id_columns)

# Number of remaining columns == per-timestep input width of the network.
input_size = train.shape[1]
features, input_size

In [None]:
# Creating dataset
def _to_sequences(table, width):
    """Turn a flat pandas object into a float32 tensor shaped (-1, 80, width)."""
    return torch.tensor(table.to_numpy()).reshape(-1, 80, width).float()


# Rows are regrouped into blocks of 80 consecutive timesteps.
train = _to_sequences(train, input_size)
test = _to_sequences(test, input_size)
target = _to_sequences(target, 1)

# Pair every input sequence with its target sequence.
train_dataset = TensorDataset(train, target)

## Setup

In [None]:
class Config:
    """Hyper-parameters and runtime settings read by the cells below.

    The class itself is used as a namespace (see `cfg = Config`); it is never
    instantiated.
    """
    # Model size: passed to LSTMModel as hidden dimension and layer count.
    hidden_dim = 128
    num_layers = 4
    # Learning rate handed to the Adam optimizer.
    lr = 1.42e-03

    # DataLoader settings.
    batch_size = 256
    num_workers = 4
    # Prefer the GPU, but fall back to the CPU so the notebook still runs on a
    # machine without CUDA instead of failing at the first `.to(device)`.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Number of training epochs per fold.
    num_epochs = 300
    # Number of cross-validation folds (also the number of checkpoints
    # iterated over at inference time).
    k = 5
    # NOTE(review): not referenced by any visible cell - confirm whether the
    # inference loop was meant to use this instead of `k`.
    infer_k = 5

cfg = Config

In [None]:
# Stage switches: each flag guards one of the `if <flag>:` cells further down
# (learning-rate range test, cross-validated training, test-set inference).
lr_finder, training, inference = False, True, False

## LR Finder

In [None]:
if lr_finder:
    # Learning-rate range test: start Adam at 1e-6 and sweep the rate up to
    # `end_lr` over `num_iter` iterations, then plot loss against rate.
    model = LSTMModel(input_size, cfg.hidden_dim, cfg.num_layers)
    criterion = nn.L1Loss()
    optimizer = optim.Adam(model.parameters(), lr=0.000001)
    train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, num_workers=cfg.num_workers)
    # Bind the finder to its own name: assigning it to `lr_finder` would
    # overwrite the boolean switch that guards this cell.
    range_finder = LRFinder(model, optimizer, criterion, device=cfg.device)
    range_finder.range_test(train_loader, end_lr=100, num_iter=100)
    range_finder.plot() # to inspect the loss-learning rate graph
    # Restore the model and optimizer to their state before the sweep.
    range_finder.reset()

## Training

In [None]:
# Shuffled K-fold split over the training sequences; the fixed seed keeps the
# fold membership identical between runs.
kf=KFold(n_splits=cfg.k,shuffle=True,random_state=42)

# Plain L1 loss drives the optimizer; the masked variant is used to score the
# validation split.
opt_criterion = nn.L1Loss()
val_criterion = L1Loss_masked()

device = cfg.device


def _new_model_and_optimizer():
    """Return a freshly initialised LSTMModel on `device` and its Adam optimizer."""
    fresh_model = LSTMModel(input_size,cfg.hidden_dim,cfg.num_layers)
    fresh_model.to(device)
    fresh_optimizer = optim.Adam(fresh_model.parameters(), lr=cfg.lr)
    return fresh_model, fresh_optimizer


if training:
    timestamp = get_timestamp()

    val_losses = []
    for fold, (train_idx,val_idx) in enumerate(kf.split(train)):
        print(f"Fold {fold + 1}", "\n")

        # Both loaders read from the same dataset; the samplers restrict each
        # one to the indices of its split.
        train_sampler = SubsetRandomSampler(train_idx)
        val_sampler = SubsetRandomSampler(val_idx)
        train_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, sampler=train_sampler, num_workers = cfg.num_workers)
        val_loader = DataLoader(train_dataset, batch_size=cfg.batch_size, sampler=val_sampler, num_workers = cfg.num_workers)

        model, optimizer = _new_model_and_optimizer()

        best_val_loss = float("inf")
        # state_dict() returns references to the live parameter tensors, so a
        # deep copy is required to obtain a snapshot that later optimizer
        # steps cannot modify.
        best_weights = copy.deepcopy(model.state_dict())
        # Loss threshold above which the run is considered diverged.
        restart = 15

        for epoch in range(cfg.num_epochs):
            start_time = timeit.default_timer()
            train_loss = train_epoch(model,device,train_loader,opt_criterion,optimizer)
            val_loss = valid_epoch(model,device,val_loader,val_criterion)
            end_time = timeit.default_timer()

            total = end_time - start_time

            # Reduce the values returned for the epoch to a single mean.
            train_loss = np.mean(np.array(train_loss))
            val_loss = np.mean(np.array(val_loss))

            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_weights = copy.deepcopy(model.state_dict())

            print(f"Epoch: {epoch + 1} | T loss: {train_loss:.4f} V loss: {val_loss:.4f} Best: {best_val_loss:.4f} Time: {total:.4f}")

            if train_loss > restart or val_loss > restart:
                # Training occasionally diverges to a loss of about 17 and
                # never recovers, so start over with a new model. The epoch
                # counter is not reset: the new model only gets the epochs
                # that remain.
                print("Restarting...")
                model, optimizer = _new_model_and_optimizer()

                # Reset the best-so-far tracking together, so the saved
                # weights and the reported loss always describe the same model.
                best_val_loss = float("inf")
                best_weights = copy.deepcopy(model.state_dict())

        val_losses.append(best_val_loss)
        torch.save(best_weights, f"models/fold_{fold}_{timestamp}.pth")
        # NOTE(review): this stops after the first fold, so only fold 0 is
        # trained and saved - confirm this is intentional before relying on
        # checkpoints for the remaining folds.
        break

    print("Avg final val loss", np.mean(val_losses))

## Submission

In [None]:
if inference:
    timestamp = get_timestamp()
    df = pd.read_csv('data/sample_submission.csv')

    # Timestamp suffix of the per-fold checkpoint files to load.
    checkpoint_stamp = '2021-10-14-10-18'

    # No targets exist for the test split, so the dataset wraps inputs only.
    # The loader does not shuffle, so predictions keep the row order of `test`.
    test_dataset = TensorDataset(test)
    test_loader = DataLoader(test_dataset, batch_size=cfg.batch_size, num_workers = cfg.num_workers)

    for fold in range(cfg.k):
        model = LSTMModel(input_size,cfg.hidden_dim,cfg.num_layers)
        model.to(device)
        filename = f'models/fold_{fold}_{checkpoint_stamp}.pth'
        model.load_state_dict(torch.load(filename, map_location=device))
        model.eval()

        y_preds = []
        with torch.no_grad():
            for (x,) in test_loader:
                x = x.to(device)
                # Flatten everything except the batch axis. A bare squeeze()
                # would also drop the batch axis when the final batch holds a
                # single sequence, and np.concatenate would then fail on
                # arrays of different rank.
                # Assumes the model emits one value per timestep - TODO confirm.
                y_pred = model(x).reshape(len(x), -1)
                y_preds.append(y_pred.cpu().numpy())

        # One flat prediction vector per fold, stored in the column named
        # after the fold index.
        y_preds = np.concatenate(y_preds, axis=0).ravel()
        df[fold] = y_preds

    # `postprocessing` receives the frame with one prediction column per fold.
    submission = postprocessing(df, cfg.k)
    submission.to_csv(f'submissions/{timestamp}.csv', index = False)
    print('Done')