<a href="https://colab.research.google.com/github/nisarahamedk/kaggle-riid/blob/master/notebooks/riid_tfrecords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [229]:
%%capture
!pip install gcsfs

In [230]:

import math

import gcsfs
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import tensorflow as tf

In [231]:
# Root location of the input data; train.csv is read from directly beneath it.
# Local-path alternative (presumably for running inside a Kaggle kernel):
# DATA_PATH = "/kaggle/input/riiid-test-answer-prediction/"
DATA_PATH = "gs://kds-e80dfc3d272252bbf34c627d756f891826dab0c19f30ec0fc3ac1979"

In [261]:
# Column -> dtype for the two columns needed to count rows and users.
dtypes_train = dict(
    user_id='int32',
    content_id='int16',
)

In [263]:
# Read the whole train.csv (no row limit), restricted to the columns in dtypes_train.
df = pd.read_csv(
    f"{DATA_PATH}/train.csv",
    usecols=dtypes_train.keys(),
    dtype=dtypes_train,
)
df.head()

Unnamed: 0,user_id,content_id
0,115,5692
1,115,5716
2,115,128
3,115,7860
4,115,7922


In [267]:
# Total number of rows, rendered with thousands separators.
format(len(df), ",")

'101,230,332'

In [266]:
# Group the rows by user; len() of the groupby object is the number of groups,
# i.e. the number of distinct user_id values.
user_groups = df.groupby("user_id")
len(user_groups)

393656

In [232]:
# Column -> dtype for the train.csv columns that are actually loaded.
# Commented-out entries are columns deliberately left out of the read
# (the dict keys are also passed as `usecols`).
dtypes_train = {
#     'row_id': 'int64',
#     'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
#     'content_type_id': 'int8',
    'task_container_id': 'int16',
#     'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
#     'prior_question_had_explanation': 'boolean'
    }

# NOTE(review): every dtype below is an empty-string placeholder and this dict
# is not used anywhere in the visible cells; fill the dtypes in before passing
# it to a reader.
dtypes_questions = {
    "question_id": "",
    "bundle_id": "",
    "correct_answer": "",
    "part": "",
    "tags": "",
}

# NOTE(review): placeholder dtypes as well (empty strings); unused in the
# visible cells.
dtypes_lectures = {
    "lecture_id": "",
    "part": "",
    "tag": "",
    "type_of": "",
}

In [233]:
# Load only the first million rows of train.csv, restricted to the columns in
# dtypes_train. `nrows` is given as an int (the type read_csv documents)
# instead of the float literal 1e6.
train_df = pd.read_csv(
    DATA_PATH + "/train.csv",
    dtype=dtypes_train,
    usecols=dtypes_train.keys(),
    nrows=1_000_000,
)
train_df.head()

Unnamed: 0,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time
0,115,5692,1,1,0.0
1,115,5716,2,1,0.616667
2,115,128,0,1,0.916667
3,115,7860,3,1,0.316667
4,115,7922,4,1,0.183333


In [234]:
def _collect_user_arrays(rows):
  """Return one user's rows as a tuple of per-column NumPy arrays.

  Tuple order: content_id, task_container_id, prior_question_elapsed_time,
  answered_correctly. The "part" column is not collected.
  """
  column_order = (
      "content_id",
      "task_container_id",
      "prior_question_elapsed_time",
      "answered_correctly",
  )
  return tuple(rows[column].values for column in column_order)


# One entry per user_id, each holding that user's tuple of arrays.
user_groups = train_df.groupby("user_id")
user_groups_arr = user_groups.apply(_collect_user_arrays)
# Peek at the content_id array of user 115.
user_groups_arr[115][0]

array([5692, 5716,  128, 7860, 7922,  156,   51,   50, 7896, 7863,  152,
        104,  108, 7900, 7901, 7971,   25,  183, 7926, 7927,    4, 7984,
         45,  185,   55, 7876,    6,  172, 7898,  175,  100, 7859,   57,
       7948,  151,  167, 7897, 7882, 7962, 1278, 2065, 2064, 2063, 3363,
       3365, 3364], dtype=int16)

In [235]:
def _bytes_feature(value):
  """Wrap a single string / bytes value in a tf.train.Feature (bytes_list)."""
  eager_tensor_type = type(tf.constant(0))
  if isinstance(value, eager_tensor_type):
    # BytesList cannot unpack an EagerTensor, so extract its raw value first.
    value = value.numpy()
  wrapped = tf.train.BytesList(value=[value])
  return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
  """Wrap a single float / double value in a tf.train.Feature (float_list)."""
  wrapped = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
  """Wrap a single bool / enum / int / uint value in a tf.train.Feature (int64_list)."""
  wrapped = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=wrapped)

In [236]:
def serialize_example(user_arr):
  """Serialize one user's arrays into a tf.train.Example byte string.

  Args:
    user_arr: indexable whose positions 0..3 hold the content_id,
      task_container_id, prior_question_elapsed_time and answered_correctly
      sequences, in that order.

  Returns:
    The serialized Example. Each sequence is stored as a single bytes feature
    produced by tf.io.serialize_tensor.
  """
  feature_names = (
      "content_id",
      "task_container_id",
      "prior_question_elapsed_time",
      "answered_correctly",
  )
  feature = {
      name: _bytes_feature(tf.io.serialize_tensor(user_arr[position]))
      for position, name in enumerate(feature_names)
  }
  features = tf.train.Features(feature=feature)
  return tf.train.Example(features=features).SerializeToString()

In [237]:
# Every feature is stored as one scalar string holding a serialized tensor,
# so all four entries share the same parsing spec.
feature_desc = {
    name: tf.io.FixedLenFeature([], tf.string)
    for name in (
        "content_id",
        "task_container_id",
        "prior_question_elapsed_time",
        "answered_correctly",
    )
}

In [238]:
# Round-trip check: serialize user 115, parse it back, and decode content_id
# with the dtype it was loaded as (int16).
example = serialize_example(user_groups_arr[115])
tf.io.parse_tensor(tf.io.parse_single_example(example, feature_desc)["content_id"], tf.int16)

<tf.Tensor: shape=(46,), dtype=int16, numpy=
array([5692, 5716,  128, 7860, 7922,  156,   51,   50, 7896, 7863,  152,
        104,  108, 7900, 7901, 7971,   25,  183, 7926, 7927,    4, 7984,
         45,  185,   55, 7876,    6,  172, 7898,  175,  100, 7859,   57,
       7948,  151,  167, 7897, 7882, 7962, 1278, 2065, 2064, 2063, 3363,
       3365, 3364], dtype=int16)>

In [239]:
def create_tfrecords_from_user_groups(user_groups_arr, path="tfrecoreds.tfrec"):
  """Write one serialized tf.train.Example per user to a TFRecord file.

  Args:
    user_groups_arr: iterable of per-user array tuples, each in the form
      accepted by serialize_example().
    path: destination TFRecord file. The default is the file name that used
      to be hard-coded here, so existing callers and the "*.tfrec" glob keep
      working unchanged.
  """
  with tf.io.TFRecordWriter(path) as writer:
    for user_arr in user_groups_arr:
      writer.write(serialize_example(user_arr))

In [240]:
# Write all per-user examples to a TFRecord file in the working directory.
create_tfrecords_from_user_groups(user_groups_arr)

### Read TFRecords and Create TF Dataset

In [241]:
# Target length of the sequence axis; pad_or_trim() pads or crops every sample to this.
SEQ_LEN = 200

In [242]:
def parse_example(example):
  """Decode one serialized Example into a single stacked float32 tensor.

  Each feature is deserialized with the dtype it was written with, cast to
  float32, and the four sequences are stacked along a new leading axis in the
  order content_id, task_container_id, prior_question_elapsed_time,
  answered_correctly.
  """
  parsed = tf.io.parse_single_example(example, feature_desc)

  # (feature name, dtype the tensor was serialized with)
  stored_dtypes = (
      ("content_id", tf.int16),
      ("task_container_id", tf.int16),
      ("prior_question_elapsed_time", tf.float32),
      ("answered_correctly", tf.int8),
  )
  rows = [
      tf.cast(tf.io.parse_tensor(parsed[name], stored_dtype), tf.float32)
      for name, stored_dtype in stored_dtypes
  ]
  return tf.stack(rows)

In [243]:
# Collect every TFRecord file (*.tfrec) in the current working directory.
filenames = tf.io.gfile.glob("*.tfrec")
filenames

['./tfrecoreds.tfrec']

In [244]:
# Dataset of raw serialized records read from the files found above.
dataset = tf.data.TFRecordDataset(filenames)

In [245]:
# Decode each record into one stacked float32 tensor via parse_example().
dataset = dataset.map(parse_example)

In [254]:
@tf.function
def pad(a, seq_len, max_seq_len):
  """Zero-pad the front of axis 1 of a rank-2 tensor up to max_seq_len.

  Args:
    a: tensor to pad; axis 0 is left untouched.
    seq_len: current length of axis 1.
    max_seq_len: length of axis 1 after padding.
  """
  missing = max_seq_len - seq_len
  no_padding = tf.constant([0, 0])
  front_padding = tf.stack([missing, tf.constant(0)])
  # paddings == [[0, 0], [missing, 0]]
  paddings = tf.stack([no_padding, front_padding])

  return tf.pad(a, paddings)

@tf.function
def trim(a, seq_len,  max_seq_len):
  """Crop a random window of max_seq_len steps from axis 1 of a rank-2 tensor.

  Args:
    a: tensor to crop; all of axis 0 is kept.
    seq_len: current length of axis 1 (expected to be >= max_seq_len).
    max_seq_len: length of axis 1 after cropping.

  Returns:
    The slice a[:, start:start + max_seq_len] for a uniformly drawn start.
  """
  # tf.random.uniform excludes `maxval`, so the +1 makes the last valid start
  # (seq_len - max_seq_len) reachable. Without it the final time step could
  # never be part of a crop, and seq_len == max_seq_len would give an empty range.
  start = tf.random.uniform((), maxval=seq_len - max_seq_len + 1, dtype=tf.int32)
  begin = tf.stack([tf.constant(0), start])
  size = tf.stack([tf.shape(a)[0], max_seq_len])

  return tf.slice(a, begin, size)

@tf.function
def pad_or_trim(a):
  """Bring the last axis of `a` to exactly SEQ_LEN steps.

  Samples whose length is at most SEQ_LEN go through pad(); longer samples
  go through trim().
  """
  seq_len = tf.shape(a)[-1]
  fits = tf.less_equal(seq_len, SEQ_LEN)
  return tf.cond(
      fits,
      true_fn=lambda: pad(a, seq_len, SEQ_LEN),
      false_fn=lambda: trim(a, seq_len, SEQ_LEN),
  )

In [255]:
dataset = dataset.map(pad_or_trim) # samples with len <= SEQ_LEN are zero-padded at the front; longer ones are randomly cropped to SEQ_LEN

In [258]:
# Sanity check: the first few samples should all share one fixed shape.
for sample in dataset.take(9):
  print(sample.shape)

(4, 200)
(4, 200)
(4, 200)
(4, 200)
(4, 200)
(4, 200)
(4, 200)
(4, 200)
(4, 200)


In [260]:
# Fixed-size batches of 128 samples; an incomplete final batch is dropped.
data_batch = dataset.batch(128, drop_remainder=True)

for batch in data_batch:
  print(batch.shape)

(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)
(128, 4, 200)


#### Padded Batching

The `padded_shapes` argument specifies, for each component of a dataset element, which axes to pad and to what length.  

`([None], [None], [None], [None])` – our dataset returns a tuple of 4, and we want each of them to be padded to the maximum length in its batch.  


https://stackoverflow.com/a/49848103/7812715

In [None]:
# NOTE(review): by this point `dataset` has been mapped through parse_example()
# and pad_or_trim(), so each element is one stacked tensor rather than a tuple
# of four sequences. The four-entry padded_shapes below likely no longer
# matches that structure. TODO: confirm before running this cell.
d = dataset.padded_batch(32, padded_shapes=([None], [None], [None], [None]), drop_remainder=True)


In [None]:
# Inspect the first padded batch.
for batch in d.take(1):
  print(batch)

## Sequence Example

In [None]:
# Context (per-sequence) features: just the user id.
context = tf.train.Features(feature={"user_id": _int64_feature(115)})

# One int64 Feature per step of user 115's content_id sequence.
content_ids = list(map(_int64_feature, user_groups_arr[115][0]))

In [None]:
# Combine the context features with a single feature list, "content_id",
# holding one Feature per step.
sequence_example = tf.train.SequenceExample(
    context = context,
    feature_lists = tf.train.FeatureLists(feature_list={
        "content_id": tf.train.FeatureList(feature=content_ids)
    })
)

In [None]:
# Serialize the SequenceExample proto to a byte string.
serialized = sequence_example.SerializeToString()

In [None]:
# Display the raw serialized bytes.
serialized

In [None]:
# Parsing spec for the context part: a scalar int64 user_id, 0 if absent.
context_feat_desc = {
    "user_id": tf.io.FixedLenFeature([], tf.int64, default_value=0),
}

# Parsing spec for the sequence part: variable-length int64 content_id entries.
sequence_feat_desc = {
    "content_id": tf.io.VarLenFeature(tf.int64)
}

In [None]:
# Bind the parsed context to its own name. Rebinding `context` here would
# overwrite the tf.train.Features proto that the SequenceExample cell passes as
# `context=`, so that cell would fail if it were run again afterwards.
parsed_context, seq = tf.io.parse_single_sequence_example(serialized, context_feat_desc, sequence_feat_desc)

In [None]:
# Show the content_id values recovered from the parsed sequence features.
print(seq["content_id"].values)