In [1]:
%reset -f

In [2]:
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl > /dev/null 2>&1

In [3]:
import gc
import random
from tqdm.notebook import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.utils.rnn as rnn_utils
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from pathlib import Path
import datatable as dt

In [4]:
import numpy as np
import pandas as pd

In [5]:
def añadir_features(df, features_dicts):
    """Append three "time since an earlier interaction" columns to ``df``.

    For every row, ``user_pause_timestamp_k`` (k = 1, 2, 3) is the row's
    ``timestamp`` minus the k-th most recent timestamp previously recorded for
    the same ``user_id``; it is NaN when fewer than k timestamps are recorded.
    Rows are processed in the order they appear in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``user_id`` and ``timestamp`` columns.
    features_dicts : dict
        Mapping ``user_id -> list`` of that user's most recent timestamps.
        It is updated in place (at most three entries are kept per user), so
        passing the same mapping to successive calls carries the history over.
        A plain ``dict`` or a ``defaultdict(list)`` both work.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the three float32 pause columns appended.
    """
    n_rows = len(df)
    # One output array per lag; NaN means "no such earlier interaction".
    pauses = [np.full(n_rows, np.nan, dtype=np.float32) for _ in range(3)]

    rows = df[['user_id', 'timestamp']].itertuples()
    for num, row in enumerate(tqdm(rows, total=n_rows)):
        # Use the mapping handed in by the caller, not a module-level global.
        history = features_dicts.setdefault(row.user_id, [])

        for lag, pause in enumerate(pauses, start=1):
            if len(history) >= lag:
                pause[num] = row.timestamp - history[-lag]

        # Remember this interaction and keep only the three latest timestamps.
        history.append(row.timestamp)
        del history[:-3]

    # Reuse df's index so that concat pairs rows by position even when df
    # does not carry a default RangeIndex.
    user_df = pd.DataFrame(
        {f'user_pause_timestamp_{lag}': pause
         for lag, pause in enumerate(pauses, start=1)},
        index=df.index,
    )

    return pd.concat([df, user_df], axis = 1)

In [6]:
# Directory under which the input datasets are read; the assert stops the
# notebook immediately when that directory does not exist.
path = Path('/kaggle/input')
assert path.exists()

In [7]:
%%time

# Columns read from train.csv, each paired with a dtype string.
# NOTE(review): only the keys are used here (as the fread column selection);
# the dtype strings themselves are never applied in this cell.
data_types_dict = dict(
    content_type_id='bool',
    timestamp='int64',
    user_id='int32',
    content_id='uint16',
    answered_correctly='uint8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='bool',
)
target = 'answered_correctly'
train_df = dt.fread(
    path/'riiid-test-answer-prediction/train.csv',
    columns = set(data_types_dict),
).to_pandas()

CPU times: user 1min 3s, sys: 10.7 s, total: 1min 14s
Wall time: 2min 16s


In [8]:
%%time

# Keep only the rows whose content_type_id is False.
# (Two separate statements on purpose: the unfiltered frame is released
# before the sort runs.)
train_df = train_df.loc[train_df['content_type_id'] == False]

# Order all rows by timestamp and renumber the index from 0.
train_df = train_df.sort_values(by='timestamp').reset_index(drop=True)

CPU times: user 25.4 s, sys: 8.74 s, total: 34.2 s
Wall time: 35.1 s


In [9]:
# These two columns are not read again below; drop them and reclaim memory.
train_df.drop(columns=['timestamp', 'content_type_id'], inplace=True)
gc.collect()

33

In [10]:
# Number of distinct content ids; it sizes the embedding tables further down.
# NOTE(review): using a count as an index bound presumes the ids are the
# contiguous range 0..n_skill-1 - confirm against the data.
n_skill = train_df.content_id.nunique()
print("number skills", n_skill)

number skills 13523


In [11]:
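# Create the per-user history dictionary (this feature step is disabled).
# Re-enabling it needs `from collections import defaultdict` and the
# `timestamp` column, which is deleted from train_df in an earlier cell.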
#timestamp_u = defaultdict(list)
#train_df = añadir_features(train_df, timestamp_u)

In [12]:
%%time

# One entry per user: a tuple (content_id array, answered_correctly array)
# in the frame's current row order.
group = (
    train_df[['user_id', 'content_id', 'answered_correctly']]
    .groupby('user_id')
    .apply(lambda user_rows: (user_rows['content_id'].values,
                              user_rows['answered_correctly'].values))
)

# The per-row frame is not needed once the per-user series exists.
del train_df
gc.collect()

CPU times: user 35.5 s, sys: 1.99 s, total: 37.5 s
Wall time: 38 s


0

In [13]:
# Hyper-parameters shared by the datasets, the model and the data loaders.
# The trailing values are the notebook author's own notes (presumably earlier
# settings - unconfirmed).
MAX_SEQ = 240                     # window length per sample (note: 210)
ACCEPTED_USER_CONTENT_SIZE = 2    # minimum interactions for a sample (note: 2)
EMBED_SIZE = 256                  # embedding width (note: 256)
BATCH_SIZE = 96                   # 64+32
DROPOUT = 0.1                     # default dropout of the model classes

In [14]:
class SAKTDataset(Dataset):
    """Fixed-length training samples cut from per-user answer histories.

    ``group`` is iterated as ``(user_id, (content_id, answered_correctly))``
    pairs. Histories shorter than ``ACCEPTED_USER_CONTENT_SIZE`` are skipped.
    Histories longer than ``max_seq`` are cut into consecutive windows of
    ``max_seq`` items; the leftover tail becomes one more sample when it is
    long enough. Every stored sample is keyed by a string id kept, in
    insertion order, in ``self.user_ids``.
    """

    def __init__(self, group, n_skill, max_seq = 100):
        super().__init__()
        self.n_skill = n_skill
        self.max_seq = max_seq
        self.samples = {}
        self.user_ids = []

        for position, (user_id, history) in enumerate(group.items()):
            if position % 10000 == 0:
                print(f'Processed {position} users')

            content_id, answered_correctly = history
            n_questions = len(content_id)
            if n_questions < ACCEPTED_USER_CONTENT_SIZE:
                continue

            # Short enough to fit in one window: store as-is.
            if n_questions <= self.max_seq:
                self._add_sample(f'{user_id}', content_id, answered_correctly)
                continue

            # Full windows first ...
            n_full = n_questions // self.max_seq
            for chunk in range(n_full):
                lo, hi = chunk * self.max_seq, (chunk + 1) * self.max_seq
                self._add_sample(f"{user_id}_{chunk}",
                                 content_id[lo:hi], answered_correctly[lo:hi])

            # ... then whatever is left after the last full window.
            tail_start = n_full * self.max_seq
            if len(content_id[tail_start:]) >= ACCEPTED_USER_CONTENT_SIZE:
                self._add_sample(f"{user_id}_{n_full + 1}",
                                 content_id[tail_start:], answered_correctly[tail_start:])

    def _add_sample(self, key, content_id, answered_correctly):
        """Register one sample under ``key``."""
        self.user_ids.append(key)
        self.samples[key] = (content_id, answered_correctly)

    def __len__(self):
        return len(self.user_ids)

    def __getitem__(self, index):
        """Return ``(x, target_id, label)``, each of length ``max_seq - 1``.

        The sample is right-aligned in zero-filled buffers of length
        ``max_seq``. ``x`` holds positions ``0..max_seq-2`` of the content ids,
        shifted by ``n_skill`` wherever the matching answer equals 1;
        ``target_id`` and ``label`` hold positions ``1..max_seq-1`` of the
        content ids and answers.
        """
        key = self.user_ids[index]
        content_id, answered_correctly = self.samples[key]
        n_items = len(content_id)

        content_id_seq = np.zeros(self.max_seq, dtype=int)
        answered_correctly_seq = np.zeros(self.max_seq, dtype=int)
        if n_items < self.max_seq:
            content_id_seq[-n_items:] = content_id
            answered_correctly_seq[-n_items:] = answered_correctly
        else:
            content_id_seq[:] = content_id[-self.max_seq:]
            answered_correctly_seq[:] = answered_correctly[-self.max_seq:]

        was_correct = answered_correctly_seq[:-1] == 1
        x = content_id_seq[:-1].copy()
        x += was_correct * self.n_skill

        return x, content_id_seq[1:], answered_correctly_seq[1:]

In [15]:
TEST_SIZE = 0.1   # fraction of the per-user entries held out for validation
SPLIT_SEED = 42   # fixed seed so the split (and the reported metrics) can be reproduced

# Split at the user level: each user's whole history lands on one side only.
train, val = train_test_split(group, test_size = TEST_SIZE, random_state = SPLIT_SEED)

In [16]:
# Build the training samples and a loader over them.
# NOTE(review): the loader keeps the samples in insertion order
# (shuffle=False), also across epochs - confirm this is intended.
train_dataset = SAKTDataset(group=train, n_skill=n_skill, max_seq=MAX_SEQ)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=8,
)

# The dataset holds its own references; the split series can go.
del train
gc.collect()

Processed 0 users
Processed 10000 users
Processed 20000 users
Processed 30000 users
Processed 40000 users
Processed 50000 users
Processed 60000 users
Processed 70000 users
Processed 80000 users
Processed 90000 users
Processed 100000 users
Processed 110000 users
Processed 120000 users
Processed 130000 users
Processed 140000 users
Processed 150000 users
Processed 160000 users
Processed 170000 users
Processed 180000 users
Processed 190000 users
Processed 200000 users
Processed 210000 users
Processed 220000 users
Processed 230000 users
Processed 240000 users
Processed 250000 users
Processed 260000 users
Processed 270000 users
Processed 280000 users
Processed 290000 users
Processed 300000 users
Processed 310000 users
Processed 320000 users
Processed 330000 users
Processed 340000 users
Processed 350000 users


0

In [17]:
# Build the validation samples and a loader over them (insertion order).
val_dataset = SAKTDataset(group=val, n_skill=n_skill, max_seq=MAX_SEQ)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=8,
)

# The dataset holds its own references; the split series can go.
del val
gc.collect()

Processed 0 users
Processed 10000 users
Processed 20000 users
Processed 30000 users


0

In [18]:
# Pull one batch and show the shape of each of its three parts as a sanity check.
sample_batch = next(iter(train_dataloader))
tuple(part.shape for part in sample_batch[:3])

(torch.Size([96, 239]), torch.Size([96, 239]), torch.Size([96, 239]))

In [19]:
class FFN(nn.Module):
    """Feed-forward sub-layer: Linear -> ReLU -> BatchNorm1d -> Linear -> Dropout.

    ``state_size`` is the input and output width, ``forward_expansion``
    multiplies the hidden width, and ``bn_size`` is the number of features
    handed to ``BatchNorm1d`` (by default ``MAX_SEQ - 1``, evaluated once when
    the class is defined).
    """

    def __init__(self, state_size = 200, forward_expansion = 1, bn_size=MAX_SEQ - 1, dropout=0.2):
        super().__init__()
        self.state_size = state_size
        hidden_size = forward_expansion * state_size

        # Sub-modules are registered in the order they are applied.
        self.lr1 = nn.Linear(state_size, hidden_size)
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm1d(bn_size)
        self.lr2 = nn.Linear(hidden_size, state_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        hidden = self.bn(self.relu(self.lr1(x)))
        return self.dropout(self.lr2(hidden))

FFN()

FFN(
  (lr1): Linear(in_features=200, out_features=200, bias=True)
  (relu): ReLU()
  (bn): BatchNorm1d(239, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (lr2): Linear(in_features=200, out_features=200, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [20]:
def future_mask(seq_length):
    """Return a ``seq_length x seq_length`` boolean tensor that is True strictly above the diagonal.

    Entry ``[row, col]`` is True exactly when ``col > row``.
    """
    positions = np.arange(seq_length)
    above_diagonal = positions[np.newaxis, :] > positions[:, np.newaxis]
    return torch.from_numpy(above_diagonal)

future_mask(5)

tensor([[False,  True,  True,  True,  True],
        [False, False,  True,  True,  True],
        [False, False, False,  True,  True],
        [False, False, False, False,  True],
        [False, False, False, False, False]])

In [21]:
class TransformerBlock(nn.Module):
    """Multi-head attention followed by the FFN, each with residual + LayerNorm + dropout."""

    def __init__(self, embed_dim, heads = 8, dropout = DROPOUT, forward_expansion = 1):
        super().__init__()
        self.multi_att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=heads, dropout=dropout)
        self.dropout = nn.Dropout(dropout)
        self.layer_normal = nn.LayerNorm(embed_dim)
        self.ffn = FFN(embed_dim, forward_expansion = forward_expansion, dropout=dropout)
        self.layer_normal_2 = nn.LayerNorm(embed_dim)

    def forward(self, value, key, query, att_mask):
        """Run the block and return ``(output, attention_weights)``.

        The three tensors are handed to ``nn.MultiheadAttention`` positionally
        in the order ``(value, key, query)``; that module's positional order is
        ``(query, key, value)``, so the argument named ``value`` here plays the
        role of the attention query (and is also the residual input).
        The output is permuted to put the second input axis first.
        """
        attended, att_weight = self.multi_att(value, key, query, attn_mask=att_mask)

        # First residual branch, then swap the two leading axes.
        normed = self.layer_normal(attended + value)
        hidden = self.dropout(normed).permute(1, 0, 2)  # [s_len, bs, embed] => [bs, s_len, embed]

        # Second residual branch around the feed-forward sub-layer.
        out = self.layer_normal_2(self.ffn(hidden) + hidden)
        return self.dropout(out).squeeze(-1), att_weight
    
class Encoder(nn.Module):
    """Embeds interactions and questions and runs them through the transformer blocks.

    Parameters
    ----------
    n_skill : int
        Sizes the embedding tables: ``2 * n_skill + 1`` rows for interactions,
        ``n_skill + 1`` rows for questions.
    max_seq : int
        The position table has ``max_seq - 1`` rows.
    embed_dim, dropout, forward_expansion, heads
        Forwarded to every ``TransformerBlock``.
    num_layers : int
        Number of stacked blocks.

    NOTE(review): ``TransformerBlock`` builds its ``FFN`` without a ``bn_size``,
    so the batch-norm width comes from the module-level ``MAX_SEQ`` rather than
    from ``max_seq`` given here - the two presumably have to match.
    """

    def __init__(self, n_skill, max_seq=100, embed_dim=128, dropout = DROPOUT, forward_expansion = 1, num_layers=1, heads = 8):
        super(Encoder, self).__init__()
        self.n_skill, self.embed_dim = n_skill, embed_dim
        self.embedding = nn.Embedding(2 * n_skill + 1, embed_dim)
        self.pos_embedding = nn.Embedding(max_seq - 1, embed_dim)
        self.e_embedding = nn.Embedding(n_skill+1, embed_dim)
        # Pass heads and dropout through; previously the blocks silently fell
        # back to their own defaults whatever the caller asked for.
        self.layers = nn.ModuleList([
            TransformerBlock(embed_dim, heads=heads, dropout=dropout, forward_expansion=forward_expansion)
            for _ in range(num_layers)
        ])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, question_ids):
        """Return ``(encoded, att_weight)``.

        ``encoded`` has the layout of the embedded input (``[bs, s_len, embed]``);
        ``att_weight`` is the attention weight of the last block, or ``None``
        when the encoder has no blocks.
        """
        device = x.device
        x = self.embedding(x)
        pos_id = torch.arange(x.size(1)).unsqueeze(0).to(device)
        pos_x = self.pos_embedding(pos_id)
        x = self.dropout(x + pos_x)
        x = x.permute(1, 0, 2) # x: [bs, s_len, embed] => [s_len, bs, embed]
        e = self.e_embedding(question_ids)
        e = e.permute(1, 0, 2)

        # The mask depends only on the sequence length, so build it once.
        att_mask = future_mask(e.size(0)).to(device)
        att_weight = None
        for layer in self.layers:
            # Question embeddings go in first, interaction embeddings second and third.
            x, att_weight = layer(e, x, x, att_mask=att_mask)
            x = x.permute(1, 0, 2)  # back to [s_len, bs, embed] for the next block
        x = x.permute(1, 0, 2)
        return x, att_weight

class SAKTModel(nn.Module):
    """Encoder plus a linear head that emits one logit per sequence position.

    All constructor arguments except ``enc_layers`` are forwarded unchanged to
    ``Encoder``; ``enc_layers`` becomes its ``num_layers``.
    """

    def __init__(self, n_skill, max_seq = 100, embed_dim = 128, dropout = DROPOUT, forward_expansion = 1, enc_layers=1, heads = 8):
        super(SAKTModel, self).__init__()
        # heads is now passed on; it used to be accepted and then dropped.
        self.encoder = Encoder(n_skill, max_seq, embed_dim, dropout, forward_expansion,
                               num_layers=enc_layers, heads=heads)
        self.pred = nn.Linear(embed_dim, 1)

    def forward(self, x, question_ids):
        """Return ``(logits, att_weight)``; the trailing size-1 axis of the logits is squeezed away."""
        x, att_weight = self.encoder(x, question_ids)
        x = self.pred(x)
        return x.squeeze(-1), att_weight

In [22]:
# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [23]:
# Main changes are possibility of forward expansion and stacking of encoding layers
def create_model():
    """Build a fresh ``SAKTModel`` from the notebook-level hyper-parameters."""
    hyper_params = dict(
        max_seq=MAX_SEQ,
        embed_dim=EMBED_SIZE,
        forward_expansion=1,
        enc_layers=1,
        heads=8,
        dropout=0.1,
    )
    return SAKTModel(n_skill, **hyper_params)
model = create_model()
model

SAKTModel(
  (encoder): Encoder(
    (embedding): Embedding(27047, 256)
    (pos_embedding): Embedding(239, 256)
    (e_embedding): Embedding(13524, 256)
    (layers): ModuleList(
      (0): TransformerBlock(
        (multi_att): MultiheadAttention(
          (out_proj): _LinearWithBias(in_features=256, out_features=256, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (layer_normal): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ffn): FFN(
          (lr1): Linear(in_features=256, out_features=256, bias=True)
          (relu): ReLU()
          (bn): BatchNorm1d(239, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (lr2): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (layer_normal_2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (pred): Linear(in_features=256, out_featu

In [24]:
# Settings read by Training(): peak learning rate, number of epochs and the
# file the model weights are written to.
LR = 0.002
EPOCHS = 12
MODEL_PATH = 'SAKT_model.pt'

In [25]:
def load_from_item(item):
    """Move one batch to the module-level ``device`` and derive the target mask.

    ``item[0]`` and ``item[1]`` are cast to long, ``item[2]`` to float.
    The returned mask is True wherever the target id is non-zero.
    """
    features, targets, answers = item[0], item[1], item[2]

    x = features.to(device).long()
    target_id = targets.to(device).long()
    label = answers.to(device).float()

    return x, target_id, label, (target_id != 0)

def update_stats(tbar, train_loss, loss, output, label, num_corrects, num_total, labels, outs):
    """Fold one batch into the running statistics.

    Appends the scalar loss to ``train_loss``, extends ``labels`` and ``outs``
    with the flattened labels and raw outputs, shows the loss on ``tbar`` and
    returns the updated ``(num_corrects, num_total)`` pair. A prediction counts
    as correct when ``sigmoid(output) >= 0.5`` agrees with the label.
    """
    train_loss.append(loss.item())

    hard_pred = (torch.sigmoid(output) >= 0.5).long()
    hits = (hard_pred == label).sum().item()

    labels.extend(label.detach().view(-1).cpu().numpy())
    outs.extend(output.detach().view(-1).cpu().numpy())

    tbar.set_description('loss - {:.4f}'.format(loss))
    return num_corrects + hits, num_total + len(label)

def train_epoch(model, dataloader, optim, criterion, scheduler, device="cpu"):
    """Run one optimisation pass over ``dataloader``.

    For every batch: forward pass, loss on the positions selected by the
    target mask, backward pass, optimiser step and scheduler step (the
    scheduler is stepped once per batch). The current loss is shown on the
    progress bar. Nothing is returned.

    ``device`` is accepted for signature compatibility only; batches are placed
    by ``load_from_item``, which uses the module-level device.
    """
    model.train()

    tbar = tqdm(dataloader)
    for item in tbar:
        x, target_id, label, target_mask = load_from_item(item)

        optim.zero_grad()
        output, _ = model(x, target_id)

        # Score only the positions the mask marks as real targets.
        output = torch.masked_select(output, target_mask)
        label = torch.masked_select(label, target_mask)

        loss = criterion(output, label)
        loss.backward()
        optim.step()
        scheduler.step()

        tbar.set_description('loss - {:.4f}'.format(loss.item()))

def val_epoch(model, val_iterator, criterion, device="cpu"):
    """Evaluate ``model`` on ``val_iterator`` and return ``(loss, acc, auc)``.

    ``loss`` is the average of the per-batch losses, ``acc`` the share of
    counted predictions that were correct and ``auc`` the ROC AUC over all
    masked positions. ``device`` is not read here; batches are placed by
    ``load_from_item``.
    """
    model.eval()

    batch_losses, all_labels, all_outputs = [], [], []
    n_correct = 0
    n_seen = 0

    progress = tqdm(val_iterator)
    for batch in progress:
        x, target_id, label, target_mask = load_from_item(batch)

        with torch.no_grad():
            logits, _ = model(x, target_id)

        # Keep only the positions the mask marks as real targets.
        logits = torch.masked_select(logits, target_mask)
        label = torch.masked_select(label, target_mask)

        batch_loss = criterion(logits, label)

        n_correct, n_seen = update_stats(
            progress, batch_losses, batch_loss, logits, label,
            n_correct, n_seen, all_labels, all_outputs,
        )

    acc = n_correct / n_seen
    auc = roc_auc_score(all_labels, all_outputs)
    loss = np.average(batch_losses)

    return loss, acc, auc

In [26]:
def Training():
    """Train the module-level ``model`` for ``EPOCHS`` epochs, validating after each one.

    Uses Adam with a one-cycle learning-rate schedule peaking at ``LR`` and
    ``BCEWithLogitsLoss``. The weights are written to ``MODEL_PATH`` only when
    the validation AUC improves, so the file holds the best epoch of this call.
    The best-AUC tracker restarts at 0.0 on every call.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    criterion = nn.BCEWithLogitsLoss()
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=LR,
                                                    steps_per_epoch=len(train_dataloader), epochs=EPOCHS)
    model.to(device)
    criterion.to(device)
    best_auc = 0.0
    for epoch in range(EPOCHS):
        train_epoch(model, train_dataloader, optimizer, criterion, scheduler, device)
        val_loss, val_acc, val_auc = val_epoch(model, val_dataloader, criterion, device)
        print(f"Epoca - {epoch + 1}   Validation_loss - {val_loss:.3f}   Accuracy - {val_acc:.4f}   AUC - {val_auc:.4f}")
        if best_auc < val_auc:
            print(f'Epoca - {epoch + 1} best model with Validation AUC: {val_auc}')
            best_auc = val_auc
            # Checkpoint inside the branch: saving after every epoch would
            # overwrite the best weights with a later, worse epoch.
            torch.save(model.state_dict(), MODEL_PATH)

In [27]:
Training()

HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 1   Validation_loss - 0.549   Accuracy - 0.7187   AUC - 0.7534
Epoca - 1 best model with Validation AUC: 0.7533643652807313


HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 2   Validation_loss - 0.544   Accuracy - 0.7229   AUC - 0.7592
Epoca - 2 best model with Validation AUC: 0.7591650253645812


HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 3   Validation_loss - 0.542   Accuracy - 0.7244   AUC - 0.7611
Epoca - 3 best model with Validation AUC: 0.7611063410910075


HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 4   Validation_loss - 0.540   Accuracy - 0.7259   AUC - 0.7633
Epoca - 4 best model with Validation AUC: 0.7633256093201465


HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 5   Validation_loss - 0.538   Accuracy - 0.7270   AUC - 0.7651
Epoca - 5 best model with Validation AUC: 0.7651458551849438


HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 6   Validation_loss - 0.537   Accuracy - 0.7279   AUC - 0.7665
Epoca - 6 best model with Validation AUC: 0.7664733254910177


HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 7   Validation_loss - 0.536   Accuracy - 0.7287   AUC - 0.7676
Epoca - 7 best model with Validation AUC: 0.7676180699264152


HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 8   Validation_loss - 0.536   Accuracy - 0.7290   AUC - 0.7683
Epoca - 8 best model with Validation AUC: 0.7683133159594803


HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 9   Validation_loss - 0.536   Accuracy - 0.7292   AUC - 0.7691
Epoca - 9 best model with Validation AUC: 0.769099419014581


HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 10   Validation_loss - 0.535   Accuracy - 0.7296   AUC - 0.7696
Epoca - 10 best model with Validation AUC: 0.7696472943014665


HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 11   Validation_loss - 0.535   Accuracy - 0.7299   AUC - 0.7700
Epoca - 11 best model with Validation AUC: 0.7700359225540041


HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 12   Validation_loss - 0.535   Accuracy - 0.7299   AUC - 0.7701
Epoca - 12 best model with Validation AUC: 0.7700808036475638


In [28]:
# Second, shorter training round at a tenth of the current learning rate.
# NOTE: this cell is not idempotent - every re-run divides LR by 10 again.
LR /= 10.
EPOCHS = 3

Training()

HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 1   Validation_loss - 0.535   Accuracy - 0.7296   AUC - 0.7697
Epoca - 1 best model with Validation AUC: 0.7696741050243949


HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 2   Validation_loss - 0.535   Accuracy - 0.7298   AUC - 0.7699
Epoca - 2 best model with Validation AUC: 0.7698910820978767


HBox(children=(FloatProgress(value=0.0, max=6574.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=726.0), HTML(value='')))


Epoca - 3   Validation_loss - 0.535   Accuracy - 0.7298   AUC - 0.7700
Epoca - 3 best model with Validation AUC: 0.769984999399729


In [29]:
class TestDataset(Dataset):
    """Inference-time samples: one item per row of ``test_df``.

    ``samples`` maps a user id to a ``(content_id, answered_correctly)`` pair
    of sequences (looked up through ``samples.index`` / ``samples[user_id]``).
    Each item combines that stored history with the row's ``content_id`` as
    the question to predict.
    """

    def __init__(self, samples, test_df, n_skill, max_seq = 100):
        super(TestDataset, self).__init__()
        self.samples, self.user_ids, self.test_df = samples, list(test_df["user_id"].unique()), test_df
        self.n_skill, self.max_seq = n_skill, max_seq

    def __len__(self):
        return self.test_df.shape[0]

    def __getitem__(self, index):
        """Return ``(x, questions)``, each of length ``max_seq - 1``.

        The user's history is right-aligned in zero-filled buffers of length
        ``max_seq`` (all zeros for an unknown user). ``x`` is the history
        without its first slot, shifted by ``n_skill`` wherever the answer
        equals 1; ``questions`` is the history without its first two slots
        followed by the row's ``content_id``.
        """
        test_info = self.test_df.iloc[index]

        user_id = test_info['user_id']
        target_id = test_info['content_id']

        content_id_seq = np.zeros(self.max_seq, dtype=int)
        answered_correctly_seq = np.zeros(self.max_seq, dtype=int)

        if user_id in self.samples.index:
            content_id, answered_correctly = self.samples[user_id]

            seq_len = len(content_id)

            if seq_len >= self.max_seq:
                # Copy into the buffers (as SAKTDataset does) instead of
                # rebinding them to slices of the stored arrays, so every item
                # has the same dtype whatever the history length.
                content_id_seq[:] = content_id[-self.max_seq:]
                answered_correctly_seq[:] = answered_correctly[-self.max_seq:]
            elif seq_len > 0:
                # An empty history leaves the buffers all-zero.
                content_id_seq[-seq_len:] = content_id
                answered_correctly_seq[-seq_len:] = answered_correctly

        x = content_id_seq[1:].copy()
        x += (answered_correctly_seq[1:] == 1) * self.n_skill

        questions = np.append(content_id_seq[2:], [target_id])

        return x, questions

In [30]:
# Competition API module: it supplies the environment object used for inference.
import riiideducation

# Create the environment and obtain the iterator that the inference loop
# consumes; each item it yields is unpacked there as (test_df, sample_prediction_df).
env = riiideducation.make_env()
iter_test = env.iter_test()

In [31]:
import ast
import psutil

model.eval()

# Previous batch, kept so its answers (delivered with the next batch) can be
# appended to the per-user histories in `group`.
prev_test_df = None

for (test_df, sample_prediction_df) in tqdm(iter_test):

    # Update the histories only while RAM usage is below 90 %.
    if prev_test_df is not None and psutil.virtual_memory().percent < 90:
        print(psutil.virtual_memory().percent)
        # The answers arrive as the text of a Python list; parse it as a
        # literal rather than executing it with eval().
        prev_test_df['answered_correctly'] = ast.literal_eval(test_df['prior_group_answers_correct'].iloc[0])
        prev_test_df = prev_test_df[prev_test_df.content_type_id == False]
        prev_group = prev_test_df[['user_id', 'content_id', 'answered_correctly']].groupby('user_id').apply(lambda r: (
            r['content_id'].values,
            r['answered_correctly'].values))
        for prev_user_id in prev_group.index:
            prev_group_content = prev_group[prev_user_id][0]
            prev_group_answered_correctly = prev_group[prev_user_id][1]
            if prev_user_id in group.index:
                group[prev_user_id] = (np.append(group[prev_user_id][0], prev_group_content),
                                       np.append(group[prev_user_id][1], prev_group_answered_correctly))
            else:
                group[prev_user_id] = (prev_group_content, prev_group_answered_correctly)

            # Keep at most the MAX_SEQ most recent interactions per user.
            if len(group[prev_user_id][0]) > MAX_SEQ:
                new_group_content = group[prev_user_id][0][-MAX_SEQ:]
                new_group_answered_correctly = group[prev_user_id][1][-MAX_SEQ:]
                group[prev_user_id] = (new_group_content, new_group_answered_correctly)

    prev_test_df = test_df.copy()
    # Work on an independent copy so the prediction column added below is not
    # written onto a slice of the original frame.
    test_df = test_df[test_df.content_type_id == False].copy()

    # One batch holding every remaining row of this test group.
    test_dataset = TestDataset(group, test_df, n_skill, max_seq = MAX_SEQ)
    test_dataloader = DataLoader(test_dataset, batch_size=len(test_df), shuffle=False)

    item = next(iter(test_dataloader))
    x = item[0].to(device).long()
    target_id = item[1].to(device).long()

    with torch.no_grad():
        output, _ = model(x, target_id)

    # The last position of each sequence is the prediction for the row's question.
    output = torch.sigmoid(output)
    output = output[:, -1]
    test_df['answered_correctly'] = output.cpu().numpy()
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

26.2
26.2
26.0



In [32]:
# Save the per-user histories as a pickle for use in other notebooks.
group.to_pickle('/kaggle/working/group.pkl')