<a href="https://colab.research.google.com/github/nisarahamedk/kaggle-riid/blob/master/notebooks/train_user_state_dict_for_inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install gcsfs
!pip install kaggle
!pip install datatable

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Copy Kaggle API key
!mkdir -p ~/.kaggle && cp /content/drive/My\ Drive/Projects/Kaggle/api_key/kaggle.json ~/.kaggle/

In [4]:
!kaggle datasets download -d rohanrao/riiid-train-data-multiple-formats

riiid-train-data-multiple-formats.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
!unzip riiid-train-data-multiple-formats

In [1]:
import pickle
import math

import gcsfs
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import tensorflow as tf
from datatable import dt, f, by, count


np.random.seed(42)
tf.random.set_seed(42)

In [2]:
table = dt.fread("riiid_train.jay")

In [3]:
table

          |    row_id  timestamp     user_id  content_id  content_type_id  task_container_id  user_answer  answered_…  …
--------- + ---------  ---------  ----------  ----------  ---------------  -----------------  -----------  ----------   
        0 |         0          0         115        5692                0                  1            3           1  …
        1 |         1      56943         115        5716                0                  2            2           1  …
        2 |         2     118363         115         128                0                  0            0           1  …
        3 |         3     131167         115        7860                0                  3            0           1  …
        4 |         4     137965         115        7922                0                  4            1           1  …
        5 |         5     157063         115         156                0                  5            2           1  …
        6 |         6     176092

Unnamed: 0_level_0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000,0
2,2,118363,115,128,0,0,0,1,55000,0
3,3,131167,115,7860,0,3,0,1,19000,0
4,4,137965,115,7922,0,4,1,1,11000,0
5,5,157063,115,156,0,5,2,1,5000,0
6,6,176092,115,51,0,6,0,1,17000,0
7,7,194190,115,50,0,7,3,1,17000,0
8,8,212463,115,7896,0,8,2,1,16000,0
9,9,230983,115,7863,0,9,0,1,16000,0


In [4]:
user_table = table[-127:, :, by("user_id")] # last 127 rows of all the user ids

In [5]:
user_table

         |    user_id     row_id  timestamp  content_id  content_type_id  task_container_id  user_answer  answered_c…  …
-------- + ----------  ---------  ---------  ----------  ---------------  -----------------  -----------  -----------   
       0 |        115          0          0        5692                0                  1            3            1  …
       1 |        115          1      56943        5716                0                  2            2            1  …
       2 |        115          2     118363         128                0                  0            0            1  …
       3 |        115          3     131167        7860                0                  3            0            1  …
       4 |        115          4     137965        7922                0                  4            1            1  …
       5 |        115          5     157063         156                0                  5            2            1  …
       6 |        115          6

Unnamed: 0_level_0,user_id,row_id,timestamp,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪
0,115,0,0,5692,0,1,3,1,,
1,115,1,56943,5716,0,2,2,1,37000,0
2,115,2,118363,128,0,0,0,1,55000,0
3,115,3,131167,7860,0,3,0,1,19000,0
4,115,4,137965,7922,0,4,1,1,11000,0
5,115,5,157063,156,0,5,2,1,5000,0
6,115,6,176092,51,0,6,0,1,17000,0
7,115,7,194190,50,0,7,3,1,17000,0
8,115,8,212463,7896,0,8,2,1,16000,0
9,115,9,230983,7863,0,9,0,1,16000,0


In [6]:
dtypes_train = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean'
    }

dtypes_questions = {
    "question_id": "",
    "bundle_id": "",
    "correct_answer": "",
    "part": "int16",
    "tags": "",
}

dtypes_lectures = {
    "lecture_id": "",
    "part": "",
    "tag": "",
    "type_of": "",
}

In [7]:
# convert to pandas DF required cols
train_df = user_table[f.content_type_id==0, [f.user_id, f.content_id, f.task_container_id, f.prior_question_elapsed_time, f.answered_correctly]].to_pandas()
train_df.head()

Unnamed: 0,user_id,content_id,task_container_id,prior_question_elapsed_time,answered_correctly
0,115,5692,1,,1
1,115,5716,2,37000.0,1
2,115,128,0,55000.0,1
3,115,7860,3,19000.0,1
4,115,7922,4,11000.0,1


In [8]:
DATA_PATH = 'gs://kds-7cd35ed419a621f754ec32f0c3616d2e9282a698c5eeaabc814bd7f6'
questions_df = pd.read_csv(DATA_PATH + "/questions.csv", usecols=[0,3], index_col="question_id")
questions_df.head()

Unnamed: 0_level_0,part
question_id,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,1


In [9]:
# join question for feaures
train_df = train_df.join(questions_df, on="content_id")

In [10]:
# 0 used for padding so increment all indicator cols
indicator_cols = ["content_id", "task_container_id", "part", "answered_correctly"]
for c in indicator_cols:
    train_df[c] = train_df[c] + 1

In [11]:
# fillna and convert milliseconds to minutes.
train_df['prior_question_elapsed_time'] = train_df["prior_question_elapsed_time"].fillna(0).astype(np.float32) / 60000

In [12]:
train_df = train_df[["user_id", "content_id","task_container_id","prior_question_elapsed_time", "part", "answered_correctly"]].copy()
train_df.head()

Unnamed: 0,user_id,content_id,task_container_id,prior_question_elapsed_time,part,answered_correctly
0,115,5693,2,0.0,6,2
1,115,5717,3,0.616667,6,2
2,115,129,1,0.916667,2,2
3,115,7861,4,0.316667,2,2
4,115,7923,5,0.183333,2,2


In [13]:
user_groups = train_df.groupby("user_id").apply(
                lambda row: row.values[:, 1:] # exclude user_id
            )

In [14]:
user_groups.head()

user_id
115     [[5693.0, 2.0, 0.0, 6.0, 2.0], [5717.0, 3.0, 0...
124     [[7901.0, 1.0, 0.0, 2.0, 2.0], [7877.0, 2.0, 0...
2746    [[5274.0, 1.0, 0.0, 6.0, 1.0], [759.0, 2.0, 0....
5382    [[3945.0, 2.0, 0.4000000059604645, 6.0, 1.0], ...
8623    [[3916.0, 1.0, 0.0, 6.0, 2.0], [4751.0, 2.0, 0...
dtype: object

In [15]:
user_groups_dict = user_groups.to_dict()

In [16]:
max_seq_len, c = 0, 0
for k, v in user_groups_dict.items():
  if v.shape[0] == 127:
    c += 1
  max_seq_len = max(max_seq_len, v.shape[0])

max_seq_len, c

(127, 30349)

In [18]:
import pickle
!mkdir data
with open("data/state_dict.pkl", "wb") as f:
  pickle.dump(user_groups_dict, f)

mkdir: cannot create directory ‘data’: File exists


In [20]:

state_dict = {}

with open("data/state_dict.pkl", "rb") as f:
  state_dict = pickle.load(f)

state_dict[115].shape

(46, 5)

#### Upload to Kaggle

In [21]:
!kaggle datasets init -p data/

Data package template written to: data/dataset-metadata.json


In [22]:

# id and title only alphanumeric and "-"
meta = """
{
  "licenses": [
    {
      "name": "CC0-1.0"
    }
  ], 
  "id": "nisarahamedk/riid-state-dict-0-1",
  "title": "riid-state-dict-0-1"
}
"""
with open("data/dataset-metadata.json", "w") as f:
  f.write(meta)

In [23]:
# create
!kaggle datasets create -p data/ --dir-mode tar -u

Starting upload for file state_dict.pkl
100% 974M/974M [00:14<00:00, 70.9MB/s]
Upload successful: state_dict.pkl (974MB)
Your public Dataset is being created. Please check progress at https://www.kaggle.com/nisarahamedk/riid-state-dict-0-1
