In [43]:
import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence

# Introduction

Le fichier train.csv comprend un peu plus de 100 millions de lignes.  
Il y a un peu moins de 400 mille user_id uniques.  
Le modèle utilisé dans ce notebook prend en entrée une série par utilisateur.

# Features utilisées
Pour chaque question de la série d'apprentissage d'un utilisateur, les features suivantes sont utilisées :

1. Question ID: correspond à content_id (lorsque l'élément est une question).
2. Question part: correspond à `part` dans questions.csv, pour la ligne dont `question_id` correspond à `content_id`.
3. Answer correctness: valeur de answered_correctly de l'exemple (ou target encoding de la question ?)
4. Current question elapsed time: valeur de `prior_question_elapsed_time` de la question suivante.
5. Timestamp difference: current question timestamp - timestamp of the last question from the same user

# Chargement des données

In [2]:
# Load the question metadata, keeping only the `question_id` and `part` columns.
# The columns are selected by name rather than by position so the read does not
# silently pick the wrong fields if the CSV's column order ever differs.
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv',
    usecols=['question_id', 'part'],
)
questions_df

Unnamed: 0,question_id,part
0,0,1
1,1,1
2,2,1
3,3,1
4,4,1
...,...,...
13518,13518,5
13519,13519,5
13520,13520,5
13521,13521,5


In [3]:
# Mapping column name -> dtype for the columns of train.csv that are loaded;
# it is passed to pd.read_csv both as the column selection and as the dtypes.
# Currently disabled (not part of the mapping): row_id, task_container_id,
# user_answer, prior_question_had_explanation.
data_types_dict = dict(
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
)

In [4]:
# Load only the first million rows of train.csv.
# `nrows` makes the parser stop after that many rows; reading the whole file and
# slicing the resulting frame afterwards would parse (and hold in memory) every
# row just to throw most of them away.
train = pd.read_csv(
    '../input/riiid-test-answer-prediction/train.csv',
    usecols=list(data_types_dict),
    dtype=data_types_dict,
    nrows=1_000_000,
)

In [5]:
# Preview of the subset that was just loaded from train.csv.
train

Unnamed: 0,timestamp,user_id,content_id,content_type_id,answered_correctly,prior_question_elapsed_time
0,0,115,5692,0,1,
1,56943,115,5716,0,1,37000.0
2,118363,115,128,0,1,55000.0
3,131167,115,7860,0,1,19000.0
4,137965,115,7922,0,1,11000.0
...,...,...,...,...,...,...
999995,26482248,20949024,8803,0,1,14000.0
999996,26516686,20949024,4664,0,1,17000.0
999997,26537967,20949024,4108,0,0,18000.0
999998,26590240,20949024,5014,0,0,6000.0


In [6]:
# Keep only the rows whose content_type_id is 0 (the rows that are matched
# against the question table afterwards); the flag column is constant once the
# filter is applied, so it is dropped.
train = train.loc[train['content_type_id'].eq(0)].drop(columns=['content_type_id'])

In [7]:
# Attach each question's `part` by matching train.content_id against
# questions_df.question_id (inner join, which is the pandas default).
# `content_id` is dropped afterwards because `question_id` holds the same value.
train = pd.merge(
    train,
    questions_df,
    how='inner',
    left_on='content_id',
    right_on='question_id',
).drop(columns=['content_id'])

In [8]:
# Order the rows by user, then chronologically within each user, so that the
# per-user sequences built later follow the timestamp order.
train = train.sort_values(by=['user_id', 'timestamp'])

In [9]:
# Preview after filtering on content_type_id, attaching `part` and sorting by user and timestamp.
train

Unnamed: 0,timestamp,user_id,answered_correctly,prior_question_elapsed_time,question_id,part
0,0,115,1,,5692,5
337,56943,115,1,37000.0,5716,5
579,118363,115,1,55000.0,128,1
760,131167,115,1,19000.0,7860,1
963,137965,115,1,11000.0,7922,1
...,...,...,...,...,...,...
207187,26482248,20949024,1,14000.0,8803,5
89100,26516686,20949024,1,17000.0,4664,5
86033,26537967,20949024,0,18000.0,4108,5
283089,26590240,20949024,0,6000.0,5014,5


# Construction des entrées

Pour le moment, on conserve `prior_question_elapsed_time` tel quel au lieu de construire le current question elapsed time.

In [10]:
# Difference between a row's timestamp and the timestamp of the same user's
# previous row. The first row of every user has no predecessor, so the NaN
# produced by diff() is replaced by 0.
train = train.assign(
    timestamp_diff=lambda frame: frame.groupby('user_id')['timestamp'].diff().fillna(0)
)

In [11]:
# Remove the absolute `timestamp` column; only the derived `timestamp_diff`
# is kept from here on.
train = train.drop(columns=['timestamp'])

In [12]:
# Preview after replacing `timestamp` with the per-user `timestamp_diff` column.
train

Unnamed: 0,user_id,answered_correctly,prior_question_elapsed_time,question_id,part,timestamp_diff
0,115,1,,5692,5,0.0
337,115,1,37000.0,5716,5,56943.0
579,115,1,55000.0,128,1,61420.0
760,115,1,19000.0,7860,1,12804.0
963,115,1,11000.0,7922,1,6798.0
...,...,...,...,...,...,...
207187,20949024,1,14000.0,8803,5,25769.0
89100,20949024,1,17000.0,4664,5,34438.0
86033,20949024,0,18000.0,4108,5,21281.0
283089,20949024,0,6000.0,5014,5,52273.0


In [36]:
# Build one 2-D tensor per user (rows of that user x feature columns), in
# ascending user_id order (groupby sorts its keys by default). Rows keep the
# order they have in `train`, and `user_id` itself is not a feature.
#
# Two fixes compared with converting the raw mixed-dtype frame directly:
#   * the columns mix integer and float dtypes, so `.values` upcasts everything
#     to float64 and the tensors come out as double, whereas torch.nn modules
#     hold float32 parameters by default -> build float32 tensors explicitly;
#   * `prior_question_elapsed_time` is still NaN on a user's first question
#     (see the first row of the preview), and a NaN fed to a network propagates
#     to its outputs -> replace the remaining NaN values with 0.
# NOTE(review): 0 is a placeholder imputation (it is also the default padding
# value of pad_sequence) — confirm, or swap for a mean / dedicated flag.
batch = [
    torch.tensor(user_rows.drop(columns=['user_id']).values, dtype=torch.float32)
    for _, user_rows in train.fillna(0).groupby('user_id')
]

In [39]:
# Sanity check on the second user's sequence: (rows for that user, feature columns).
batch[1].shape

torch.Size([30, 5])

In [41]:
# Stack the variable-length user sequences into a single tensor, zero-padding
# the shorter ones at the end. With batch_first=False (the default, spelled out
# here) the result is laid out as (longest sequence, number of users, features).
x_train = pad_sequence(batch, batch_first=False, padding_value=0.0)

# Modèle

In [45]:
# Transformer encoder made of a single encoder layer with 8 attention heads
# operating on 512-dimensional token representations.
# NOTE(review): the per-user tensors carry 5 features per step (see
# batch[1].size()), so an embedding/projection to d_model=512 is presumably
# applied before this encoder — TODO confirm where that happens.
encoder_layer = nn.TransformerEncoderLayer(
    d_model=512,
    nhead=8,
)
transformer_encoder = nn.TransformerEncoder(
    encoder_layer=encoder_layer,
    num_layers=1,
)