In [1]:
import sys, os

# Add utility_scripts in the current path so that they can be imported directly just like in interactive mode
sys.path.append(os.path.abspath("../usr/lib/"))
for script_folder in os.listdir("../usr/lib/"):
    sys.path.append(os.path.abspath("../usr/lib/"+script_folder))

In [2]:
from datetime import date
from statistics import mean, median
import pickle

import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

from lastquerytransformer import Riiid
from riiidutils import RiiidDataset, riiid_collate_fn, riiid_collate_fn_right_padding

In [3]:
loc = os.environ.get('KAGGLE_KERNEL_RUN_TYPE','Localhost')
if loc == 'Interactive' or loc == 'Localhost':
    conf = {
        'batch_size': 8,
        'train_size': 40_000,
        'epochs': 15,
        'eval_steps': 500,
        'learning_rate': 2e-4
    }
# When it is run after an api push.
elif loc == 'Batch':
    conf = {
        'batch_size': 32,
        'train_size': 400_000,
        'epochs': 30,
        'eval_steps': 2500,
        'learning_rate': 4e-4
    }

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Introduction

Le fichier train.csv comprend un peu plus de 100 millions de lignes.  
Il y a un peu moins de 400 mille user_id uniques.  
Le modèle utilisé dans ce notebook prend en entrée une série par utilisateur.

# Features utilisées
Pour chaque question de la série d'apprentissage d'un utilisateur quelconque  

1. Question ID: correspond à content_id (lorsque l'élément est une question).
2. Question part: correspond à part dans question.csv pour l'élément dont question_id correspondant à content_id
3. Answer correctness: valeur de answered_correctly de l'exemple (ou target encoding de la question ?)
4. Current question elapsed time: prior question de la question suivante.
5. Timestamp difference: current question timestamp - timestamp of the last question from the same user

# Chargement des données et instanciation des datasets

In [5]:
with open('../input/riiid-sequences/users_y.pickle', 'rb') as f:
    users_y = pickle.load(f)
with open('../input/riiid-sequences/users_cat.pickle', 'rb') as f:
    users_cat = pickle.load(f)
with open('../input/riiid-sequences/users_cont.pickle', 'rb') as f:
    users_cont = pickle.load(f)

In [6]:
seed = 12
cat_train, cat_val, cont_train, cont_val, y_train, y_val = train_test_split(users_cat, users_cont, users_y, test_size=.05, random_state=seed)

In [7]:
cat_train = cat_train[:conf['train_size']]
cont_train = cont_train[:conf['train_size']]
y_train = y_train[:conf['train_size']]

In [8]:
print("Number of train examples:", len(y_train))
print("Number of valid examples:", len(y_val))
print("Train set answered_correctly average value:", "{:.3}".format(y_train.mean()))
print("Valid set answered_correctly average value:", "{:.3}".format(y_val.mean()))
print("Train set median sequence length:", "{:.0f}".format(median([user_seq.shape[0] for user_seq in cat_train])))
print("Valid set median sequence length:", "{:.0f}".format(median([user_seq.shape[0] for user_seq in cat_val])))

Number of train examples: 40000
Number of valid examples: 19683
Train set answered_correctly average value: 0.469
Valid set answered_correctly average value: 0.475
Train set median sequence length: 40
Valid set median sequence length: 41


## Batches
Dynamic Padding: ajout de padding batch par batch pour avoir une même longueur de séquence dans chaque batch.  
Uniform size batching: on trie les utilisateurs par longueur de séquence, afin d'avoir des longueurs plus proches dans chaque batch  

Afin de mettre en œuvre ces deux stratégies on va par simplicité trier au préalable et conjointement les listes batch_cat, batch_cont et batch_y par longueur des séquences dans batch_cat/batch_cont (c'est RiiidDataset qui s'en charge). Le DataLoader utilisera une fonction collate_fn permettant d'ajouter du padding dynamiquement batch par batch.

In [9]:
train_dataset = RiiidDataset(cat_train, cont_train, y_train, sort_sequences=True)
val_dataset = RiiidDataset(cat_val, cont_val, y_val, sort_sequences=True)

In [10]:
train_loader = DataLoader(train_dataset, batch_size=conf['batch_size'], shuffle=False, collate_fn=riiid_collate_fn_right_padding, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=4*conf['batch_size'], shuffle=False, collate_fn=riiid_collate_fn_right_padding)

Without dynamic batching

In [11]:
# cat_train = pad_sequence_left([torch.tensor(el) for el in cat_train], batch_first=True)
# cont_train = pad_sequence_left([torch.tensor(el, dtype=torch.float) for el in cont_train], batch_first=True)
# #y = torch.tensor(y, dtype=torch.float)
# cat_val = pad_sequence_left([torch.tensor(el) for el in cat_val], batch_first=True)
# cont_val = pad_sequence_left([torch.tensor(el, dtype=torch.float) for el in cont_val], batch_first=True)
# #y = torch.tensor(y, dtype=torch.float)

# train_dataset = RiiidDataset(cat_train, cont_train, y_train, sort_sequences=False)
# val_dataset = RiiidDataset(cat_val, cont_val, y_val, sort_sequences=False)

# train_loader = DataLoader(train_dataset, batch_size=conf['batch_size'], shuffle=False, drop_last=True)
# val_loader = DataLoader(val_dataset, batch_size=4*conf['batch_size'], shuffle=False)

# Modèle
Pour le modèle, l'auteur s'est inspiré de la solution arrivée 3è à la compétition data Science bowl 2019 ([discussion](https://www.kaggle.com/c/data-science-bowl-2019/discussion/127891), [code](https://www.kaggle.com/limerobot/dsb2019-v77-tr-dt-aug0-5-3tta))  
La procédure de création des embeddings est expliquée, on suppose que l'auteur s'est basé dessus.
## Embeddings
On utilise un embedding catégoriel pour les 3 premières variables et un embedding continu pour les 2 dernières.
Contrairement à la solution du lien ci-dessus on utilise un embedding catégoriel par variable catégorielle plutôt qu'un embedding commun, comme recommandé [ici](https://discuss.pytorch.org/t/categorical-embeddings-can-i-only-have-1-categorical-column-per-embedding-layer/104681/3)

In [12]:
maximums = {'question_id': 13523, 'part': 7, 'answered_correctly': 3}
model = Riiid(maximums, dropout=0).to(device)
optimizer = optim.AdamW(model.parameters(), lr=conf['learning_rate'])
criterion = nn.BCEWithLogitsLoss()

completed_epochs = 0

## Loading

In [13]:
# checkpoint = torch.load('../input/lastquerytransformer40ebundlefix/lqt-2021-04-18.pt')
# completed_epochs = checkpoint['epoch']
# model.load_state_dict(checkpoint['model_state_dict'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Training

In [14]:
def evaluate(model, criterion, dataloader):
    model.eval()
    losses = []
    accuracies = []
    y_true = []
    y_score = []
    with torch.no_grad():
        for i, batch in enumerate(dataloader):
            x_cat = batch['cat'].to(device)
            x_cont = batch['cont'].to(device)
            y = batch['y'].to(device)
            seq_lengths = batch['lengths']#.to(device)

            ypred = model(x_cat, x_cont, seq_lengths).squeeze(1)
            loss = criterion(ypred, y)

            losses.append(float(loss))
            accuracies.append((torch.round(torch.sigmoid(ypred)) == y).float().mean().item())
            y_true.extend(y.cpu().numpy())
            y_score.extend(torch.sigmoid(ypred).cpu().numpy())

    return mean(losses), mean(accuracies), roc_auc_score(y_true, y_score)

In [15]:
print(evaluate(model, criterion, train_loader))
print(evaluate(model, criterion, val_loader))

(0.6956996984362602, 0.46925, 0.5595199986155723)
(0.6953269601255269, 0.4745332792207792, 0.5573995389198056)


In [16]:
log_dir = "tensorboard"

In [17]:
if log_dir is not None:
    writer = SummaryWriter(log_dir)

step = 0
for e in range(completed_epochs, completed_epochs+conf['epochs']):
    print("Epoch ", e)
    for batch in train_loader:
        model.train()
        x_cat = batch['cat'].to(device)
        x_cont = batch['cont'].to(device)
        y = batch['y'].to(device)
        seq_lengths = batch['lengths']#.to(device)

        ypred = model(x_cat, x_cont, seq_lengths).squeeze(1)
        loss = criterion(ypred, y)

        model.zero_grad()
        loss.backward()
        
        #nn.utils.clip_grad_value_(model.parameters(), clip_value=1.0)
        #nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        step += 1
        if step % conf['eval_steps'] == 0:
            train_loss, train_acc, train_auc = evaluate(model, criterion, train_loader)
            print("Step", step, end="\n")
            print("Train loss:", "{:.3f}".format(train_loss), end=" ")
            print("Train accuracy:", "{:.3f}".format(train_acc), end=' ')
            print("Train AUC:", "{:.3f}".format(train_auc), end='\n')
            if log_dir is not None:
                writer.add_scalar("train/loss", train_loss, step)
                writer.add_scalar("train/accuracy", train_acc, step)
                writer.add_scalar("train/auc", train_auc, step)
            if val_loader is not None:
                val_loss, val_acc, val_auc = evaluate(model, criterion, val_loader)
                print("Valid loss:", "{:.3f}".format(val_loss), end=" ")
                print("Valid accuracy:", "{:.3f}".format(val_acc), end=" ")
                print("Valid AUC:", "{:.3f}".format(val_auc), end='\n')
                if log_dir is not None:
                    writer.add_scalar("eval/loss", val_loss, step)
                    writer.add_scalar("eval/acc", val_acc, step)
                    writer.add_scalar("eval/auc", val_auc, step)

if log_dir is not None:
    writer.close()


Epoch  0


RuntimeError: 'lengths' argument should be a 1D CPU int64 tensor, but got 1D cuda:0 Long tensor

In [17]:
torch.save({
            'epoch': e,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            #'loss': loss,
            }, "lqt-"+str(date.today())+".pt")