<a href="https://colab.research.google.com/github/nisarahamedk/kaggle-riid/blob/master/notebooks/train_user_state_dict_for_inference_encdec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install gcsfs
!pip install kaggle
!pip install datatable
!pip install -U modin[all] # -U for upgrade in case you have an older version

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Copy Kaggle API key
!mkdir -p ~/.kaggle && cp /content/drive/My\ Drive/Projects/Kaggle/api_key/kaggle.json ~/.kaggle/

In [4]:
!kaggle datasets download -d rohanrao/riiid-train-data-multiple-formats

Downloading riiid-train-data-multiple-formats.zip to /content
100% 4.16G/4.17G [02:10<00:00, 29.1MB/s]
100% 4.17G/4.17G [02:10<00:00, 34.3MB/s]


In [5]:
!unzip riiid-train-data-multiple-formats

Archive:  riiid-train-data-multiple-formats.zip
  inflating: riiid_train.feather     
  inflating: riiid_train.h5          
  inflating: riiid_train.jay         
  inflating: riiid_train.parquet     
  inflating: riiid_train.pkl.gzip    


In [2]:
import pickle
import math

import gcsfs
import modin.pandas as pd
import numpy as np
from tqdm import tqdm, trange
import tensorflow as tf
from datatable import dt, f, by, count


np.random.seed(42)
tf.random.set_seed(42)

In [7]:
SEQ_LEN = 512

In [8]:
table = dt.fread("riiid_train.jay")

In [10]:
table = table[f.content_type_id==0, :]
table.head()

   | row_id  timestamp  user_id  content_id  content_type_id  task_container_id  user_answer  answered_correctly    pr…  …
-- + ------  ---------  -------  ----------  ---------------  -----------------  -----------  ------------------  -----   
 0 |      0          0      115        5692                0                  1            3                   1     NA  …
 1 |      1      56943      115        5716                0                  2            2                   1  37000  …
 2 |      2     118363      115         128                0                  0            0                   1  55000  …
 3 |      3     131167      115        7860                0                  3            0                   1  19000  …
 4 |      4     137965      115        7922                0                  4            1                   1  11000  …
 5 |      5     157063      115         156                0                  5            2                   1   5000  …
 6 |      6     

Unnamed: 0_level_0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,0.0
2,2,118363,115,128,0,0,0,1,55000.0,0.0
3,3,131167,115,7860,0,3,0,1,19000.0,0.0
4,4,137965,115,7922,0,4,1,1,11000.0,0.0
5,5,157063,115,156,0,5,2,1,5000.0,0.0
6,6,176092,115,51,0,6,0,1,17000.0,0.0
7,7,194190,115,50,0,7,3,1,17000.0,0.0
8,8,212463,115,7896,0,8,2,1,16000.0,0.0
9,9,230983,115,7863,0,9,0,1,16000.0,0.0


In [11]:
user_table = table[-(SEQ_LEN-1):, :, by("user_id")] # last SEQ_LEN-1 rows of all the user ids, this is the history we need at inference time

In [13]:
user_table.head()

   | user_id  row_id  timestamp  content_id  content_type_id  task_container_id  user_answer  answered_correctly    pr…  …
-- + -------  ------  ---------  ----------  ---------------  -----------------  -----------  ------------------  -----   
 0 |     115       0          0        5692                0                  1            3                   1     NA  …
 1 |     115       1      56943        5716                0                  2            2                   1  37000  …
 2 |     115       2     118363         128                0                  0            0                   1  55000  …
 3 |     115       3     131167        7860                0                  3            0                   1  19000  …
 4 |     115       4     137965        7922                0                  4            1                   1  11000  …
 5 |     115       5     157063         156                0                  5            2                   1   5000  …
 6 |     115    

Unnamed: 0_level_0,user_id,row_id,timestamp,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪
0,115,0,0,5692,0,1,3,1,,
1,115,1,56943,5716,0,2,2,1,37000.0,0.0
2,115,2,118363,128,0,0,0,1,55000.0,0.0
3,115,3,131167,7860,0,3,0,1,19000.0,0.0
4,115,4,137965,7922,0,4,1,1,11000.0,0.0
5,115,5,157063,156,0,5,2,1,5000.0,0.0
6,115,6,176092,51,0,6,0,1,17000.0,0.0
7,115,7,194190,50,0,7,3,1,17000.0,0.0
8,115,8,212463,7896,0,8,2,1,16000.0,0.0
9,115,9,230983,7863,0,9,0,1,16000.0,0.0


In [4]:

dtypes_train = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'int8'
    }

dtypes_questions = {
    "question_id": "",
    "bundle_id": "",
    "correct_answer": "",
    "part": "int16",
    "tags": "",
}

dtypes_lectures = {
    "lecture_id": "",
    "part": "",
    "tag": "",
    "type_of": "",
}

In [15]:
# convert to pandas DF required cols
train_df = user_table[
                      f.content_type_id==0, 
                      [
                       f.timestamp, 
                       f.user_id, 
                       f.content_id, 
                       f.task_container_id, 
                       f.prior_question_elapsed_time, 
                       f.prior_question_had_explanation,
                       f.answered_correctly
                       ]
                      ].to_pandas()
train_df.head()

Unnamed: 0,timestamp,user_id,content_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,answered_correctly
0,0,115,5692,1,,,1
1,56943,115,5716,2,37000.0,False,1
2,118363,115,128,0,55000.0,False,1
3,131167,115,7860,3,19000.0,False,1
4,137965,115,7922,4,11000.0,False,1


In [22]:
del table
del user_table

In [16]:

DATA_PATH = 'gs://kds-e7d6db6554e83e3f4182aa828879e31bf5c122e568c9ee97ab5d891f'
questions_df = pd.read_csv(DATA_PATH + "/questions.csv", index_col="question_id")
questions_df.fillna("-1", inplace=True) # tags Nan filled with -1
questions_df.head()

Unnamed: 0_level_0,bundle_id,correct_answer,part,tags
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0,0,1,51 131 162 38
1,1,1,1,131 36 81
2,2,0,1,131 101 162 92
3,3,0,1,131 149 162 29
4,4,3,1,131 5 162 38


In [17]:
# join question for feaures
train_df = train_df.join(questions_df, on="content_id")

In [18]:
# 0, 1, 2 are special tokens, so increment 3
indicator_cols = ["content_id", "task_container_id", "part", "answered_correctly", "prior_question_had_explanation"]
for c in indicator_cols:
  train_df[c] = train_df[c] + 3
train_df.head()

Unnamed: 0,timestamp,user_id,content_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,answered_correctly,bundle_id,correct_answer,part,tags
0,0,115,5695,4,,,4,5692,3,8,151
1,56943,115,5719,5,37000.0,3.0,4,5716,2,8,168
2,118363,115,131,3,55000.0,3.0,4,128,0,4,131 149 92
3,131167,115,7863,6,19000.0,3.0,4,7860,0,4,131 104 81
4,137965,115,7925,7,11000.0,3.0,4,7922,1,4,131 149 92


In [19]:
# same treatment for the tags
train_df["tags"] = train_df["tags"].apply(lambda row: " ".join([str(int(x)+3) for x in row.split(" ")]))

In [20]:
train_df.head()

Unnamed: 0,timestamp,user_id,content_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,answered_correctly,bundle_id,correct_answer,part,tags
0,0,115,5695,4,,,4,5692,3,8,154
1,56943,115,5719,5,37000.0,3.0,4,5716,2,8,171
2,118363,115,131,3,55000.0,3.0,4,128,0,4,134 152 95
3,131167,115,7863,6,19000.0,3.0,4,7860,0,4,134 107 84
4,137965,115,7925,7,11000.0,3.0,4,7922,1,4,134 152 95


In [24]:
import gc
gc.collect()

227

In [5]:
from dask.distributed import Client, progress
client = Client(n_workers=2, threads_per_worker=8, memory_limit='4GB')
client

0,1
Client  Scheduler: tcp://127.0.0.1:38469  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 2  Cores: 16  Memory: 8.00 GB


In [6]:
import dask
import dask.dataframe as dd

In [23]:
train_df.to_csv("train_df.csv")

In [7]:
train_df = dd.read_csv("train_df.csv")

In [8]:
user_groups = train_df.groupby("user_id")

In [None]:
user_groups_arr = user_groups.apply(
  lambda rows: (
      rows["timestamp"].values.astype(dtypes_train["timestamp"]),
      rows["content_id"].values.astype(dtypes_train["content_id"]), 
      rows["task_container_id"].values.astype(dtypes_train["task_container_id"]), 
      rows["prior_question_elapsed_time"].shift(-1, fill_value=-1).values.astype(dtypes_train["prior_question_elapsed_time"]), # last question for the user doesnt have elapsed time. fill with -1
      rows["prior_question_had_explanation"].shift(-1, fill_value=2).values.astype(dtypes_train["prior_question_had_explanation"]), # last question for the user doesnt have "had_explanation". fill with 2
      rows["part"].values.astype(dtypes_questions["part"]),
      rows["tags"].values,
      rows["answered_correctly"].values.astype(dtypes_train["answered_correctly"]),
      ), meta=object
  ).compute()



In [None]:
user_groups_dict = user_groups.to_dict()

In [None]:
max_seq_len, c = 0, 0
for k, v in user_groups_dict.items():
  if v.shape[0] == 127:
    c += 1
  max_seq_len = max(max_seq_len, v.shape[0])

max_seq_len, c

(127, 30349)

In [None]:
import pickle
!mkdir data
with open("data/state_dict.pkl", "wb") as f:
  pickle.dump(user_groups_dict, f)

mkdir: cannot create directory ‘data’: File exists


In [None]:

state_dict = {}

with open("data/state_dict.pkl", "rb") as f:
  state_dict = pickle.load(f)

state_dict[115].shape

(46, 5)

#### Upload to Kaggle

In [None]:
!kaggle datasets init -p data/

Data package template written to: data/dataset-metadata.json


In [None]:

# id and title only alphanumeric and "-"
meta = """
{
  "licenses": [
    {
      "name": "CC0-1.0"
    }
  ], 
  "id": "nisarahamedk/riid-state-dict-0-1",
  "title": "riid-state-dict-0-1"
}
"""
with open("data/dataset-metadata.json", "w") as f:
  f.write(meta)

In [None]:
# create
!kaggle datasets create -p data/ --dir-mode tar -u

Starting upload for file state_dict.pkl
100% 974M/974M [00:14<00:00, 70.9MB/s]
Upload successful: state_dict.pkl (974MB)
Your public Dataset is being created. Please check progress at https://www.kaggle.com/nisarahamedk/riid-state-dict-0-1
