<a href="https://colab.research.google.com/github/nisarahamedk/kaggle-riid/blob/master/notebooks/riid_tfrecords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
%%capture
!pip install gcsfs
!pip install kaggle
!pip install datatable

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [9]:
# Copy Kaggle API key
!mkdir -p ~/.kaggle && cp /content/drive/My\ Drive/Projects/Kaggle/api_key/kaggle.json ~/.kaggle/

In [10]:
!kaggle datasets download -d rohanrao/riiid-train-data-multiple-formats

Downloading riiid-train-data-multiple-formats.zip to /content
100% 4.16G/4.17G [00:52<00:00, 83.4MB/s]
100% 4.17G/4.17G [00:52<00:00, 84.5MB/s]


In [11]:
!unzip riiid-train-data-multiple-formats

Archive:  riiid-train-data-multiple-formats.zip
  inflating: riiid_train.feather     
  inflating: riiid_train.h5          
  inflating: riiid_train.jay         
  inflating: riiid_train.parquet     
  inflating: riiid_train.pkl.gzip    


In [12]:

import math

import gcsfs
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import tensorflow as tf
from datatable import dt, f, by, count


np.random.seed(42)
tf.random.set_seed(42)

In [13]:
table = dt.fread("riiid_train.jay")

In [14]:
table.head(4)

   | row_id  timestamp  user_id  content_id  content_type_id  task_container_id  user_answer  answered_correctly    pr…  …
-- + ------  ---------  -------  ----------  ---------------  -----------------  -----------  ------------------  -----   
 0 |      0          0      115        5692                0                  1            3                   1     NA  …
 1 |      1      56943      115        5716                0                  2            2                   1  37000  …
 2 |      2     118363      115         128                0                  0            0                   1  55000  …
 3 |      3     131167      115        7860                0                  3            0                   1  19000  …

[4 rows x 10 columns]


Unnamed: 0_level_0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,0.0
2,2,118363,115,128,0,0,0,1,55000.0,0.0
3,3,131167,115,7860,0,3,0,1,19000.0,0.0


In [15]:
users_df = table[:, [f.user_id, f.content_id]].to_pandas()
users_df.head()

Unnamed: 0,user_id,content_id
0,115,5692
1,115,5716
2,115,128
3,115,7860
4,115,7922


In [16]:
f"{len(users_df):,}"

'101,230,332'

In [17]:
user_groups = users_df.groupby("user_id")
len(user_groups)

393656

In [18]:
user_groups_counts = user_groups.count().cumsum().reset_index()

In [19]:
user_groups_counts.head()

Unnamed: 0,user_id,content_id
0,115,46
1,124,76
2,2746,96
3,5382,224
4,8623,336


#### Cross Validation

##### Simple K Fold

In [20]:
from sklearn.model_selection import KFold

# Split the per-user rows into 32 folds; rows are taken in their existing order (no shuffling).
kf = KFold(n_splits=32, shuffle=False) #, random_state=42)

# Only the validation indices of each split are needed: they tag every row with its fold number.
for fold_no, (_, valid_rows) in enumerate(kf.split(user_groups_counts)):
    user_groups_counts.loc[valid_rows, "fold"] = fold_no

user_groups_counts.head()

Unnamed: 0,user_id,content_id,fold
0,115,46,0.0
1,124,76,0.0
2,2746,96,0.0
3,5382,224,0.0
4,8623,336,0.0


In [21]:
user_groups_counts["fold"].value_counts()[:4]

23.0    12302
18.0    12302
2.0     12302
17.0    12302
14.0    12302
9.0     12302
4.0     12302
Name: fold, dtype: int64

In [23]:
folds_arr = user_groups_counts.groupby("fold").apply(lambda r: r["content_id"].values)
folds_arr[:4]

fold
0.0    [46, 76, 96, 224, 336, 353, 624, 1874, 8338, 8...
1.0    [3112067, 3112612, 3113940, 3113957, 3113987, ...
2.0    [6268883, 6268926, 6268970, 6269018, 6269078, ...
3.0    [9329606, 9329636, 9329726, 9329757, 9329778, ...
4.0    [12423772, 12424008, 12424038, 12424068, 12424...
dtype: object

In [24]:
folds_arr[0][0]

46

In [25]:
# Turn the cumulative per-user row counts into [start, end) row ranges, one per fold.
# Each fold ends at the last cumulative count it contains and the next fold starts there.
folds_partitions = {}
previous_end = 0
for fold in range(32):
  fold_end = int(folds_arr[fold][-1])
  folds_partitions[fold] = {"start": int(previous_end), "end": fold_end}
  previous_end = fold_end

folds_partitions

{0: {'end': 3111976, 'start': 0},
 1: {'end': 6268678, 'start': 3111976},
 2: {'end': 9329569, 'start': 6268678},
 3: {'end': 12423404, 'start': 9329569},
 4: {'end': 15699848, 'start': 12423404},
 5: {'end': 18835690, 'start': 15699848},
 6: {'end': 21951383, 'start': 18835690},
 7: {'end': 25203902, 'start': 21951383},
 8: {'end': 28361734, 'start': 25203902},
 9: {'end': 31461135, 'start': 28361734},
 10: {'end': 34743785, 'start': 31461135},
 11: {'end': 38007120, 'start': 34743785},
 12: {'end': 41263135, 'start': 38007120},
 13: {'end': 44518260, 'start': 41263135},
 14: {'end': 47748042, 'start': 44518260},
 15: {'end': 50878303, 'start': 47748042},
 16: {'end': 54003891, 'start': 50878303},
 17: {'end': 57139684, 'start': 54003891},
 18: {'end': 60255648, 'start': 57139684},
 19: {'end': 63311425, 'start': 60255648},
 20: {'end': 66455546, 'start': 63311425},
 21: {'end': 69592621, 'start': 66455546},
 22: {'end': 72718669, 'start': 69592621},
 23: {'end': 75794315, 'start': 72

#### Let's process one fold

In [38]:
fold = 3

fold_df = table[folds_partitions[fold]["start"]:folds_partitions[fold]["end"], :].to_pandas()
fold_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,9329569,0,202836341,4207,False,0,3,1,,
1,9329570,13323,202836341,4097,False,1,2,1,11000.0,False
2,9329571,27321,202836341,6659,False,2,1,0,9000.0,False
3,9329572,54416,202836341,6650,False,3,1,1,12000.0,False
4,9329573,74648,202836341,5653,False,4,2,0,24000.0,False


In [93]:
# Target dtypes for the columns of the training table.
dtypes_train = dict(
    row_id='int64',
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    content_type_id='int8',
    task_container_id='int16',
    user_answer='int8',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
    prior_question_had_explanation='boolean',
)

# Target dtypes for questions.csv; only "part" is filled in so far.
dtypes_questions = dict(
    question_id="",
    bundle_id="",
    correct_answer="",
    part="int16",
    tags="",
)

# Placeholder dtypes for the lectures table (none filled in yet).
dtypes_lectures = dict(
    lecture_id="",
    part="",
    tag="",
    type_of="",
)

In [39]:
DATA_PATH = 'gs://kds-0e01f6e460a5b44782a68afc7e7608a54fe969152a185f32b5dba119'
questions_df = pd.read_csv(DATA_PATH + "/questions.csv")
questions_df.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


In [40]:
# removing lecture rows.
fold_df = fold_df[fold_df.answered_correctly != -1]

In [41]:
fold_df = fold_df.join(questions_df, on="content_id")
fold_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part,tags
0,9329569,0,202836341,4207,False,0,3,1,,,4207,4207,3,5,89
1,9329570,13323,202836341,4097,False,1,2,1,11000.0,False,4097,4097,2,5,55
2,9329571,27321,202836341,6659,False,2,1,0,9000.0,False,6659,6659,3,5,53
3,9329572,54416,202836341,6650,False,3,1,1,12000.0,False,6650,6650,1,5,53
4,9329573,74648,202836341,5653,False,4,2,0,24000.0,False,5653,5653,1,5,96


In [42]:
# Columns removed before the per-user sequences are built.
drop_cols = [
    "row_id",
    "timestamp",
    "content_type_id",
    "user_answer",
    "prior_question_had_explanation",
    "question_id",
    "bundle_id",
    "correct_answer",
    "tags",
]
fold_df = fold_df.drop(columns=drop_cols)
fold_df.head()

Unnamed: 0,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,part
0,202836341,4207,0,1,,5
1,202836341,4097,1,1,11000.0,5
2,202836341,6659,2,0,9000.0,5
3,202836341,6650,3,1,12000.0,5
4,202836341,5653,4,0,24000.0,5


In [43]:
# 0 is reserved for padding, so every id-like column is shifted up by one.
indicator_cols = ["content_id", "task_container_id", "part"]
fold_df = fold_df.assign(**{col: fold_df[col] + 1 for col in indicator_cols})
fold_df.head()

Unnamed: 0,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,part
0,202836341,4208,1,1,,6
1,202836341,4098,2,1,11000.0,6
2,202836341,6660,3,0,9000.0,6
3,202836341,6651,4,1,12000.0,6
4,202836341,5654,5,0,24000.0,6


In [94]:
def _user_sequences(rows):
  """Return one user's rows as a 6-tuple of typed numpy arrays.

  Order: content_id, task_container_id, prior_question_elapsed_time, part,
  prev_answered_correctly, answered_correctly.
  """
  answered = rows["answered_correctly"]
  # Previous answer as a feature: shift by one, fill the first slot with 2 (the first
  # question has no previous answer), then add 1 because 0 is reserved for padding.
  prev_answered = answered.shift(fill_value=2) + 1
  return (
      rows["content_id"].values.astype(dtypes_train["content_id"]),
      rows["task_container_id"].values.astype(dtypes_train["task_container_id"]),
      rows["prior_question_elapsed_time"].values.astype(dtypes_train["prior_question_elapsed_time"]),
      rows["part"].values.astype(dtypes_questions["part"]),
      prev_answered.values.astype(dtypes_train["answered_correctly"]),
      answered.values.astype(dtypes_train["answered_correctly"]),
  )

user_groups = fold_df.groupby("user_id")
user_groups_arr = user_groups.apply(_user_sequences)
user_groups_arr

user_id
202836341    ([4208, 4098, 6660, 6651, 5654, 9694, 5653, 52...
202838719    ([7901, 7877, 176, 1279, 2065, 2066, 2064, 336...
202840484    ([7901, 7877, 176, 1279, 2064, 2066, 2065, 336...
202846170    ([6669, 285, 5962, 3659, 5301, 176, 218, 840, ...
202846211    ([6266, 4661, 5499, 4555, 9700, 4881, 4089, 66...
                                   ...                        
270318132    ([7901, 7877, 176, 1279, 2065, 2064, 2066, 336...
270336415    ([7901, 7877, 176, 1279, 2066, 2064, 2065, 336...
270342519    ([7901, 7877, 176, 1279, 2065, 2066, 2064, 336...
270348524    ([4519, 326, 489, 218, 5845, 9301, 4124, 8208,...
270349461    ([4517, 5024, 218, 4083, 6375, 4047, 6361, 424...
Length: 12302, dtype: object

In [134]:
user_groups_arr[202836341][0] # content_id

array([ 4208,  4098,  6660,  6651,  5654,  9694,  5653,   527,   658,
         386, 10688,  1375,   912,   219, 10687,   885,  1151,  5718,
        3874,  4674,  8355,  6676,  3999,  5457,  8905,  4261,  4979,
        6475,  9372,  6252,  6454,  3683,  9452,  3802,  5583,  4459,
        9598], dtype=int16)

In [138]:
user_groups_arr[202836341][1] # task_container_id

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37], dtype=int16)

In [95]:
user_groups_arr[202836341][4] # prev_answered_correctly

array([3, 2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2,
       2, 2, 2, 2, 1, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2], dtype=int8)

In [96]:
user_groups_arr[202836341][5] + 1 # answered_correctly +1 just for comparing with above

array([2, 2, 1, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 1, 2, 1, 1, 2, 2, 2, 2, 1], dtype=int8)

In [112]:
for i in range(6):
  print(user_groups_arr[202836341][i].dtype)

int16
int16
float32
int16
int8
int8


In [113]:
def _bytes_feature(value):
  """Wrap a string / bytes value in a `tf.train.Feature` holding a one-element BytesList."""
  eager_tensor_type = type(tf.constant(0))
  # BytesList cannot unpack an EagerTensor, so pull the raw value out of it first.
  payload = value.numpy() if isinstance(value, eager_tensor_type) else value
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[payload]))

def _float_feature(value):
  """Wrap a float / double in a `tf.train.Feature` holding a one-element FloatList."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Wrap a bool / enum / int / uint in a `tf.train.Feature` holding a one-element Int64List."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)

In [114]:
def serialize_example(user_arr):
  """Serialize one user's arrays into a `tf.train.Example` byte string.

  Args:
    user_arr: indexable with at least six entries, in the order listed in
      `feature_names` below. Each entry is serialized with
      `tf.io.serialize_tensor` and stored as a bytes feature.

  Returns:
    The serialized `tf.train.Example`.
  """
  feature_names = (
      "content_id",
      "task_container_id",
      "prior_question_elapsed_time",
      "part",
      "prev_answered_correctly",
      "answered_correctly",
  )
  feature = {
      name: _bytes_feature(tf.io.serialize_tensor(user_arr[position]))
      for position, name in enumerate(feature_names)
  }
  features = tf.train.Features(feature=feature)
  return tf.train.Example(features=features).SerializeToString()

In [115]:
# Parsing spec for the records written by `serialize_example`: every feature is a single
# scalar string that holds a serialized tensor.
feature_desc = {
    name: tf.io.FixedLenFeature([], tf.string)
    for name in (
        "content_id",
        "task_container_id",
        "prior_question_elapsed_time",
        "part",
        "prev_answered_correctly",
        "answered_correctly",
    )
}

In [145]:
def create_tfrecords_from_user_groups(user_groups_arr, part):
  """Write one serialized example per entry of `user_groups_arr` to `tfrec_{part}.tfrec`.

  Args:
    user_groups_arr: iterable of per-user array tuples accepted by `serialize_example`.
    part: value interpolated into the output file name.
  """
  output_path = f"tfrec_{part}.tfrec"
  with tf.io.TFRecordWriter(output_path) as writer:
    for serialized in map(serialize_example, user_groups_arr):
      writer.write(serialized)

In [117]:
create_tfrecords_from_user_groups(user_groups_arr, 0)

### Read TFRecords and Create TF Dataset

In [118]:
SEQ_LEN = 200

In [119]:
def parse_example(example):
  """Decode one serialized example into a single float32 tensor with six stacked rows.

  Each feature is parsed back from its serialized-tensor string using the dtype listed
  below, cast to float32, and stacked in this order: content_id, task_container_id,
  prior_question_elapsed_time, part, prev_answered_correctly, answered_correctly.
  """
  parsed = tf.io.parse_single_example(example, feature_desc)

  # (feature name, dtype used to decode it), in output row order
  stored_dtypes = (
      ("content_id", tf.int16),
      ("task_container_id", tf.int16),
      ("prior_question_elapsed_time", tf.float32),
      ("part", tf.int16),
      ("prev_answered_correctly", tf.int8),
      ("answered_correctly", tf.int8),
  )
  rows = [
      tf.cast(tf.io.parse_tensor(parsed[name], stored_dtype), tf.float32)
      for name, stored_dtype in stored_dtypes
  ]
  return tf.stack(rows)

In [120]:
filenames = tf.io.gfile.glob("*.tfrec")
filenames

['./tfrecoreds.tfrec']

In [121]:
dataset = tf.data.TFRecordDataset(filenames)

In [122]:
dataset = dataset.map(parse_example)

In [123]:
@tf.function
def pad(a, seq_len, max_seq_len):
  """Pad `a` at the front of its second axis with `max_seq_len - seq_len` entries.

  The first axis is left untouched; `tf.pad` is called with its default fill value.
  """
  missing = max_seq_len - seq_len
  # paddings == [[0, 0], [missing, 0]]: nothing on axis 0, `missing` entries before axis 1
  paddings = tf.stack([
      tf.constant([0, 0]),
      tf.stack([missing, tf.constant(0)]),
  ])
  return tf.pad(a, paddings)

@tf.function
def trim(a, seq_len,  max_seq_len):
  """Randomly crop `a` along its second axis to a window of `max_seq_len` columns.

  Args:
    a: rank-2 tensor; the whole first axis is kept and the crop is taken on axis 1.
    seq_len: current length of axis 1. Expected to be greater than `max_seq_len`
      (`pad_or_trim` only routes longer sequences here).
    max_seq_len: width of the window to keep.

  Returns:
    The slice `a[:, start:start + max_seq_len]` for a uniformly drawn `start`.
  """
  # Valid window starts are 0 .. seq_len - max_seq_len inclusive. `maxval` of
  # tf.random.uniform is exclusive, hence the +1; without it the final window
  # (the most recent interactions) can never be selected.
  start = tf.random.uniform((), maxval=seq_len - max_seq_len + 1, dtype=tf.int32)
  # tf.slice takes (begin, size): start at row 0 / column `start`,
  # keep every row and `max_seq_len` columns.
  begin = tf.stack([tf.constant(0), start])
  size = tf.stack([tf.shape(a)[0], max_seq_len])

  return tf.slice(a, begin, size)

@tf.function
def pad_or_trim(a):
  """Bring the last axis of `a` to exactly `SEQ_LEN` entries.

  Sequences of length <= SEQ_LEN go through `pad`; longer ones go through `trim`.
  """
  length = tf.shape(a)[-1]
  limit = SEQ_LEN

  def _pad_branch():
    return pad(a, length, limit)

  def _trim_branch():
    return trim(a, length, limit)

  return tf.cond(tf.less_equal(length, limit), _pad_branch, _trim_branch)

In [124]:
dataset = dataset.map(pad_or_trim) # every sample is padded if len < SEQ_LEN or randomly trimmed to SEQ_LEN

In [135]:
for i in dataset.take(1):
  print(i.shape)

(6, 200)


In [136]:
i[0]

<tf.Tensor: shape=(200,), dtype=float32, numpy=
array([    0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.,     0.,     0.,     0.,     0.,     0.,
           0.,     0.,     0.

In [137]:
i[1]

<tf.Tensor: shape=(200,), dtype=float32, numpy=
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  2.,  3.,  4.,  5.,  6.,
        7.,  8.,

In [139]:
data_batch = dataset.batch(128, drop_remainder=True)

for item in data_batch.take(5):
  print(item.shape)

(128, 6, 200)
(128, 6, 200)
(128, 6, 200)
(128, 6, 200)
(128, 6, 200)


#### Let's process all the folds

In [143]:
!rm -f riiid_train*.* riiid-train*.* *.tfrec

In [None]:
# Columns removed before the per-user sequences are built (same list for every fold).
drop_cols = [
    "row_id",
    "timestamp",
    "content_type_id",
    "user_answer",
    "prior_question_had_explanation",
    "question_id",
    "bundle_id",
    "correct_answer",
    "tags",
]
# 0 is used for padding, so these columns are incremented by 1
indicator_cols = ["content_id", "task_container_id", "part"]

# Repeat the single-fold pipeline for every fold and write one TFRecord file per fold.
for fold, partition in folds_partitions.items():
  fold_df = table[partition["start"]:partition["end"], :].to_pandas()
  fold_df = (
      fold_df[fold_df.answered_correctly != -1]
      .join(questions_df, on="content_id")
      .drop(drop_cols, axis=1)
  )

  for c in indicator_cols:
    fold_df[c] = fold_df[c] + 1

  user_groups = fold_df.groupby("user_id")
  user_groups_arr = user_groups.apply(
      lambda rows: (
          rows["content_id"].values.astype(dtypes_train["content_id"]),
          rows["task_container_id"].values.astype(dtypes_train["task_container_id"]),
          rows["prior_question_elapsed_time"].values.astype(dtypes_train["prior_question_elapsed_time"]),
          rows["part"].values.astype(dtypes_questions["part"]),
          # previous answer: shifted by one, 2 as fill for the first question, +1 because 0 is padding
          (rows["answered_correctly"].shift(fill_value=2) + 1).values.astype(dtypes_train["answered_correctly"]),
          rows["answered_correctly"].values.astype(dtypes_train["answered_correctly"]),
      )
  )
  create_tfrecords_from_user_groups(user_groups_arr, fold)


#### Padded Batching

The `padded_shapes` argument specifies, for each component of a dataset element, which axes are padded and to what length.  

For example, `([None], [None], [None], [None])` is used when the dataset returns a tuple of 4 variable-length vectors and each of them should be padded to the maximum length in its batch.  


https://stackoverflow.com/a/49848103/7812715

In [None]:
# `dataset` yields a single rank-2 tensor per element (stacked by `parse_example`, then
# passed through `pad_or_trim`), so `padded_shapes` has to describe one rank-2 component
# rather than a tuple of four vectors. `None` pads an axis to the longest length in the batch.
d = dataset.padded_batch(32, padded_shapes=[None, None], drop_remainder=True)


In [None]:
for e in d.take(1):
  print(e)

## Sequence Example

In [None]:
# Use a user that is present in the current `user_groups_arr`. A hard-coded id such as 115
# is only valid for the fold that contains it and raises a KeyError for any other fold.
sample_user_id = int(user_groups_arr.index[0])

# Context: per-sequence scalar features (here just the user id).
context = tf.train.Features(
    feature = {
        "user_id": _int64_feature(sample_user_id)
    }
)

# One Int64 feature per time step, built from the user's content_id array (entry 0 of the tuple).
# `int(x)` converts the numpy scalar to a plain Python int before it goes into the proto.
content_ids = [_int64_feature(int(x)) for x in user_groups_arr.loc[sample_user_id][0]]

In [None]:
# Assemble a SequenceExample: `context` carries the per-sequence features and
# the "content_id" feature list carries one feature per step from `content_ids`.
sequence_example = tf.train.SequenceExample(
    context = context,
    feature_lists = tf.train.FeatureLists(feature_list={
        "content_id": tf.train.FeatureList(feature=content_ids)
    })
)

In [None]:
serialized = sequence_example.SerializeToString()

In [None]:
serialized

In [None]:
# Parsing spec for the context part: a scalar int64 "user_id" that defaults to 0 when absent.
context_feat_desc = {
    "user_id": tf.io.FixedLenFeature([], tf.int64, default_value=0),
}

# Parsing spec for the sequence part: "content_id" is read as a variable-length int64 feature.
sequence_feat_desc = {
    "content_id": tf.io.VarLenFeature(tf.int64)
}

In [None]:
# Parse the serialized SequenceExample back into its context features and sequence features.
# NOTE: this rebinds `context`, which previously held the tf.train.Features used to build the example.
context, seq = tf.io.parse_single_sequence_example(serialized, context_feat_desc, sequence_feat_desc)

In [None]:
print(seq["content_id"].values)