In [1]:
import sys, os

# Make the utility scripts importable directly, just like in interactive mode:
# register the library root itself, then every entry found inside it.
lib_root = "../usr/lib/"
sys.path.append(os.path.abspath(lib_root))
sys.path.extend(
    os.path.abspath(lib_root + script_folder)
    for script_folder in os.listdir(lib_root)
)

In [2]:
import pickle 
import pandas as pd

# Chargement de train et jointure avec questions

In [3]:
# Read only the columns at positions 0, 1 and 3 of questions.csv, then preview the first rows.
questions_df = pd.read_csv(
    '../input/riiid-test-answer-prediction/questions.csv',
    usecols=[0, 1, 3],
)
questions_df.head(5)

Unnamed: 0,question_id,bundle_id,part
0,0,0,1
1,1,1,1
2,2,2,1
3,3,3,1
4,4,4,1


In [4]:
# Columns to read from train.csv, mapped to the compact dtype used for each.
# Deliberately left out: row_id, task_container_id, user_answer,
# prior_question_had_explanation.
data_types_dict = dict(
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
)

In [5]:
# Load just the columns declared in data_types_dict, with their compact dtypes.
# skipfooter is not used: it forces the python engine, which is much slower
# and too slow for this file.
train = pd.read_csv(
    '../input/riiid-test-answer-prediction/train.csv',
    usecols=data_types_dict.keys(),
    dtype=data_types_dict,
)

In [6]:
# Keep only the rows whose content_type_id is 0; the flag column is then no longer needed.
train = train.loc[train['content_type_id'] == 0].drop(columns='content_type_id')
# Attach the question metadata; content_id is redundant once question_id is present.
train = (
    train
    .merge(questions_df, left_on="content_id", right_on="question_id")
    .drop(columns='content_id')
)

In [8]:
# Order the rows by user, then chronologically within each user.
train = train.sort_values(by=['user_id', 'timestamp'])

In [9]:
# Preview the first five rows of the sorted frame.
train.head(5)

Unnamed: 0,timestamp,user_id,answered_correctly,prior_question_elapsed_time,question_id,bundle_id,part
0,0,115,1,,5692,5692,5
36514,56943,115,1,37000.0,5716,5716,5
60505,118363,115,1,55000.0,128,128,1
80124,131167,115,1,19000.0,7860,7860,1
101857,137965,115,1,11000.0,7922,7922,1


In [10]:
# Cache the subset on disk; reset_index() stores the previous index as an 'index' column.
train.reset_index(drop=False).to_feather(path='train-subset.feather')

# Construction des entrées et de sorties

In [3]:
# Reload the cached subset and discard the 'index' column that was written along with it.
train = pd.read_feather('train-subset.feather').drop(columns=['index'])
train

Unnamed: 0,timestamp,user_id,answered_correctly,prior_question_elapsed_time,question_id,bundle_id,part
0,0,115,1,,5692,5692,5
1,56943,115,1,37000.0,5716,5716,5
2,118363,115,1,55000.0,128,128,1
3,131167,115,1,19000.0,7860,7860,1
4,137965,115,1,11000.0,7922,7922,1
...,...,...,...,...,...,...,...
99271295,428564420,2147482888,1,18000.0,3586,3586,5
99271296,428585000,2147482888,1,14000.0,6341,6341,5
99271297,428613475,2147482888,1,14000.0,4212,4212,5
99271298,428649406,2147482888,0,22000.0,6343,6343,5


In [6]:
# Target per user: 'answered_correctly' of that user's last row.
# This reads the labels as loaded, i.e. before the later cell that adds 1 to them.
last_rows = train.groupby('user_id', sort=False).tail(1)
users_y = last_rows['answered_correctly'].to_numpy()
users_y

array([1, 0, 1, ..., 1, 1, 1], dtype=int8)

In [22]:
# Persist the per-user targets using pickle's default protocol.
with open('users_y.pickle', mode='wb') as target_file:
    pickle.dump(users_y, file=target_file)

In [4]:
# Check that each column's minimum is close to 0, so that no space is wasted in
# nn.Embedding, where vectors are allocated starting from index 0.
train.min(axis=0)

timestamp                        0.0
user_id                        115.0
answered_correctly               0.0
prior_question_elapsed_time      0.0
question_id                      0.0
bundle_id                        0.0
part                             1.0
dtype: float64

In [5]:
# Shift both categorical codes up by one so that 0 stays free for padding:
# an incorrect answer becomes 1 and a correct one becomes 2.
train['answered_correctly'] = train['answered_correctly'].add(1)
train['question_id'] = train['question_id'].add(1)

We offset the output embeddings by one position to make sure that predictions for position $i$ can depend only on the known features at positions less than $i$: 'answered_correctly' now means 'last question answered_correctly'. A look-ahead mask is not required because we only predict the last answer's correctness.

In [6]:
# Within each user, move the answers down by one row; the first row of every
# user receives 3, which plays the role of a <sos> token.
train['answered_correctly'] = (
    train
    .groupby('user_id', sort=False)['answered_correctly']
    .shift(periods=1, fill_value=3)
)

We offset 'prior_question_elapsed_time' by one position to get 'current_question_elapsed_time'

In [7]:
# Pull each user's next 'prior_question_elapsed_time' up by one row; the last
# row of every user has no successor and receives 0 (acting as an <eos> token).
next_prior_time = (
    train
    .groupby('user_id', sort=False)['prior_question_elapsed_time']
    .shift(-1, fill_value=0)
)

# After this shift, only the last question of a bundle holds the correct elapsed
# time; the other questions of the bundle still hold the prior elapsed time.
# Bundle boundaries are detected from row adjacency: a row closes a bundle when
# the next row belongs to another user or to another bundle. Grouping on
# (user_id, bundle_id) instead would pool every occurrence of a bundle that a
# user answers more than once, keep only the final occurrence, and back-fill the
# earlier occurrences with the elapsed time of whichever bundle follows them.
# NOTE(review): relies on rows being ordered by user and then chronologically,
# as the per-user shift above already does.
closes_bundle = (
    train['user_id'].ne(train['user_id'].shift(-1))
    | train['bundle_id'].ne(train['bundle_id'].shift(-1))
)

# Keep the value on bundle-closing rows only and broadcast it backwards over the
# rest of the bundle. The last row of a user always closes a bundle, so the
# backward fill cannot pull values across users.
train['current_question_elapsed_time'] = next_prior_time.where(closes_bundle).bfill()
train = train.drop(columns='prior_question_elapsed_time')

Timestamp_diff is the difference between the previous question's timestamp and the current question's timestamp, clipped to a maximum value of 3 days (2.592×10⁵ seconds).  
Questions belonging to the same bundle will have a timestamp diff of 0, except for the first question of the bundle. We apply a forward fill to broadcast the first question's value to the whole bundle.

In [8]:
# Per-user difference between consecutive timestamps (NaN on each user's first row).
raw_diff = train.groupby('user_id', sort=False)['timestamp'].diff()

# A diff of 0 marks a question that shares its timestamp with the previous row.
# Blank those zeros and forward-fill within each user so that such rows inherit
# the preceding non-zero diff. Rows with nothing to inherit (a user's first row
# and any zero-diff rows right after it) end up as 0.
# Series.replace(0, method='ffill') is avoided because its `method` keyword is deprecated.
filled_diff = (
    raw_diff
    .where(raw_diff != 0)
    .groupby(train['user_id'], sort=False)
    .ffill()
    .fillna(0)
)

# Cap the diff. The result is assigned back explicitly: calling
# clip(..., inplace=True) on train['timestamp_diff'] is a chained in-place call
# that does not update `train` when pandas Copy-on-Write is active.
# NOTE(review): the accompanying markdown describes this cap as 3 days expressed
# in seconds; confirm the unit of `timestamp` (if it is in milliseconds, this
# cap is only about 4.3 minutes).
train['timestamp_diff'] = filled_diff.clip(upper=2.592 * 1e5)
train = train.drop(columns='timestamp')

In [9]:
# Keep only these six columns, in this fixed order.
train = train.loc[:, [
    'user_id',
    'question_id',
    'part',
    'answered_correctly',
    'current_question_elapsed_time',
    'timestamp_diff',
]]

In [12]:
# Display the engineered frame (the notebook shows a truncated head/tail view).
train

Unnamed: 0,user_id,question_id,part,answered_correctly,current_question_elapsed_time,timestamp_diff
0,115,5693,5,3,37000.0,0.0
1,115,5717,5,2,55000.0,56943.0
2,115,129,1,2,19000.0,61420.0
3,115,7861,1,2,11000.0,12804.0
4,115,7923,1,2,5000.0,6798.0
...,...,...,...,...,...,...
99271295,2147482888,3587,5,2,14000.0,21548.0
99271296,2147482888,6342,5,2,14000.0,20580.0
99271297,2147482888,4213,5,2,22000.0,28475.0
99271298,2147482888,6344,5,2,29000.0,35931.0


In [13]:
# Inspect the dtype of every column before the frame is written to disk.
train.dtypes

user_id                            int32
question_id                        int64
part                               int64
answered_correctly                  int8
current_question_elapsed_time    float32
timestamp_diff                   float64
dtype: object

In [10]:
# Cache the engineered features; drop=True discards the old index instead of storing it as a column.
train.reset_index(drop=True).to_feather(path='train-engineered.feather')

# Séquence par utilisateur

In [4]:
# Reload the engineered features from the feather cache and display them.
train = pd.read_feather(path='train-engineered.feather')
train

Unnamed: 0,user_id,question_id,part,answered_correctly,current_question_elapsed_time,timestamp_diff
0,115,5693,5,3,37000.0,0.0
1,115,5717,5,2,55000.0,56943.0
2,115,129,1,2,19000.0,61420.0
3,115,7861,1,2,11000.0,12804.0
4,115,7923,1,2,5000.0,6798.0
...,...,...,...,...,...,...
99271295,2147482888,3587,5,2,14000.0,21548.0
99271296,2147482888,6342,5,2,14000.0,20580.0
99271297,2147482888,4213,5,2,22000.0,28475.0
99271298,2147482888,6344,5,2,29000.0,35931.0


In [5]:
# One 2-D array per user (users in order of first appearance) holding the
# categorical features of every row of that user.
categorical_columns = ['question_id', 'part', 'answered_correctly']
users_cat = [
    user_rows[categorical_columns].values
    for _, user_rows in train.groupby('user_id', sort=False)
]

In [9]:
# Save the per-user categorical arrays with pickle protocol 4.
with open('users_cat_4.pickle', mode='wb') as cat_file:
    pickle.dump(users_cat, file=cat_file, protocol=4)

In [None]:
# Save the same per-user categorical arrays with pickle protocol 5.
with open('users_cat_5.pickle', mode='wb') as cat_file:
    pickle.dump(users_cat, file=cat_file, protocol=5)

In [14]:
# One 2-D array per user (users in order of first appearance) holding the
# continuous features of every row of that user.
continuous_columns = ['current_question_elapsed_time', 'timestamp_diff']
users_cont = [
    user_rows[continuous_columns].values
    for _, user_rows in train.groupby('user_id', sort=False)
]

In [15]:
# Save the per-user continuous arrays with pickle protocol 4.
with open('users_cont_4.pickle', mode='wb') as cont_file:
    pickle.dump(users_cont, file=cont_file, protocol=4)