<a href="https://colab.research.google.com/github/nisarahamedk/kaggle-riid/blob/master/notebooks/riid_tfrecords_for_encoder_decoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
!pip install gcsfs
!pip install kaggle
!pip install datatable

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
# Copy Kaggle API key
!mkdir -p ~/.kaggle && cp /content/drive/My\ Drive/Projects/Kaggle/api_key/kaggle.json ~/.kaggle/

In [4]:
!kaggle datasets download -d rohanrao/riiid-train-data-multiple-formats

Downloading riiid-train-data-multiple-formats.zip to /content
100% 4.17G/4.17G [00:54<00:00, 36.6MB/s]
100% 4.17G/4.17G [00:54<00:00, 82.4MB/s]


In [5]:
!unzip riiid-train-data-multiple-formats

Archive:  riiid-train-data-multiple-formats.zip
  inflating: riiid_train.feather     
  inflating: riiid_train.h5          
  inflating: riiid_train.jay         
  inflating: riiid_train.parquet     
  inflating: riiid_train.pkl.gzip    


In [3]:
import pickle
import math

import gcsfs
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import tensorflow as tf
from datatable import dt, f, by, count


np.random.seed(42)
tf.random.set_seed(42)

In [4]:
table = dt.fread("riiid_train.jay")

In [5]:
table.head(4)

   | row_id  timestamp  user_id  content_id  content_type_id  task_container_id  user_answer  answered_correctly    pr…  …
-- + ------  ---------  -------  ----------  ---------------  -----------------  -----------  ------------------  -----   
 0 |      0          0      115        5692                0                  1            3                   1     NA  …
 1 |      1      56943      115        5716                0                  2            2                   1  37000  …
 2 |      2     118363      115         128                0                  0            0                   1  55000  …
 3 |      3     131167      115        7860                0                  3            0                   1  19000  …

[4 rows x 10 columns]


Unnamed: 0_level_0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,0.0
2,2,118363,115,128,0,0,0,1,55000.0,0.0
3,3,131167,115,7860,0,3,0,1,19000.0,0.0


Filter lectures

In [6]:
table = table[f.content_type_id==0, :]

In [7]:
table.head(4)

   | row_id  timestamp  user_id  content_id  content_type_id  task_container_id  user_answer  answered_correctly    pr…  …
-- + ------  ---------  -------  ----------  ---------------  -----------------  -----------  ------------------  -----   
 0 |      0          0      115        5692                0                  1            3                   1     NA  …
 1 |      1      56943      115        5716                0                  2            2                   1  37000  …
 2 |      2     118363      115         128                0                  0            0                   1  55000  …
 3 |      3     131167      115        7860                0                  3            0                   1  19000  …

[4 rows x 10 columns]


Unnamed: 0_level_0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
Unnamed: 0_level_1,▪▪▪▪,▪▪▪▪▪▪▪▪,▪▪▪▪,▪▪▪▪,▪,▪▪▪▪,▪▪▪▪,▪▪▪▪,▪▪▪▪▪▪▪▪,▪
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,0.0
2,2,118363,115,128,0,0,0,1,55000.0,0.0
3,3,131167,115,7860,0,3,0,1,19000.0,0.0


Embedding sizes

In [8]:
max_dict = table[:, [dt.max(f.content_id), dt.max(f.task_container_id)]].to_dict()

In [9]:
embed_size_dict = {k: v[0] + 3 for k, v in max_dict.items()} # +3 (0-pad, 1-start token, 2-fill token)
embed_size_dict

{'content_id': 13525, 'task_container_id': 10002}

In [10]:
embed_size_dict["had_explanation"] = 5
embed_size_dict["answered_correctly"] = 5
embed_size_dict

{'answered_correctly': 5,
 'content_id': 13525,
 'had_explanation': 5,
 'task_container_id': 10002}

In [None]:
# # check NA - MEMORY ERROR
# df = table.to_pandas()
# df.describe()

In [None]:
del df

In [11]:
users_df = table[:, [f.user_id, f.content_id]].to_pandas()
users_df.head()

Unnamed: 0,user_id,content_id
0,115,5692
1,115,5716
2,115,128
3,115,7860
4,115,7922


In [12]:
f"{len(users_df):,}"

'99,271,300'

In [13]:
user_groups = users_df.groupby("user_id")
len(user_groups)

393656

In [14]:
user_groups_counts = user_groups.count().cumsum().reset_index()

In [15]:
user_groups_counts.head()

Unnamed: 0,user_id,content_id
0,115,46
1,124,76
2,2746,95
3,5382,220
4,8623,329


#### Cross Validation

##### Simple K Fold

In [16]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=32, shuffle=False) #, random_state=42)

for fold, (idx_train, idx_valid) in enumerate(kf.split(X=user_groups_counts)):
    user_groups_counts.loc[idx_valid, "fold"] = fold
    
user_groups_counts.head()

Unnamed: 0,user_id,content_id,fold
0,115,46,0.0
1,124,76,0.0
2,2746,95,0.0
3,5382,220,0.0
4,8623,329,0.0


In [17]:
user_groups_counts["fold"].value_counts()[:4]

23.0    12302
18.0    12302
2.0     12302
17.0    12302
14.0    12302
9.0     12302
4.0     12302
Name: fold, dtype: int64

In [18]:
folds_arr = user_groups_counts.groupby("fold").apply(lambda r: r["content_id"].values)
folds_arr[:4]

fold
0.0    [46, 76, 95, 220, 329, 346, 611, 1854, 8137, 8...
1.0    [3050494, 3051035, 3052324, 3052341, 3052371, ...
2.0    [6147261, 6147304, 6147348, 6147396, 6147456, ...
3.0    [9147234, 9147264, 9147354, 9147385, 9147404, ...
4.0    [12181787, 12182019, 12182049, 12182079, 12182...
dtype: object

In [19]:
folds_arr[0][0]

46

In [20]:
folds_partitions = {}
fold_start = 0
for fold in range(32):
  folds_partitions[fold] = {}
  folds_partitions[fold]["start"] = int(fold_start)
  
  end = folds_arr[fold][-1]
  folds_partitions[fold]["end"] = int(end)
  
  fold_start = end
  
folds_partitions

{0: {'end': 3050404, 'start': 0},
 1: {'end': 6147066, 'start': 3050404},
 2: {'end': 9147197, 'start': 6147066},
 3: {'end': 12181423, 'start': 9147197},
 4: {'end': 15393592, 'start': 12181423},
 5: {'end': 18469283, 'start': 15393592},
 6: {'end': 21523288, 'start': 18469283},
 7: {'end': 24715578, 'start': 21523288},
 8: {'end': 27810532, 'start': 24715578},
 9: {'end': 30850557, 'start': 27810532},
 10: {'end': 34069534, 'start': 30850557},
 11: {'end': 37270224, 'start': 34069534},
 12: {'end': 40463742, 'start': 37270224},
 13: {'end': 43655954, 'start': 40463742},
 14: {'end': 46822744, 'start': 43655954},
 15: {'end': 49892293, 'start': 46822744},
 16: {'end': 52958176, 'start': 49892293},
 17: {'end': 56034812, 'start': 52958176},
 18: {'end': 59091702, 'start': 56034812},
 19: {'end': 62087479, 'start': 59091702},
 20: {'end': 65171416, 'start': 62087479},
 21: {'end': 68245278, 'start': 65171416},
 22: {'end': 71312347, 'start': 68245278},
 23: {'end': 74329239, 'start': 71

#### Lets process 1 Fold

In [21]:
fold = 3

fold_df = table[folds_partitions[fold]["start"]:folds_partitions[fold]["end"], :].to_pandas()
fold_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,9329569,0,202836341,4207,False,0,3,1,,
1,9329570,13323,202836341,4097,False,1,2,1,11000.0,False
2,9329571,27321,202836341,6659,False,2,1,0,9000.0,False
3,9329572,54416,202836341,6650,False,3,1,1,12000.0,False
4,9329573,74648,202836341,5653,False,4,2,0,24000.0,False


In [57]:
dtypes_train = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'int8'
    }

dtypes_questions = {
    "question_id": "",
    "bundle_id": "",
    "correct_answer": "",
    "part": "int16",
    "tags": "",
}

dtypes_lectures = {
    "lecture_id": "",
    "part": "",
    "tag": "",
    "type_of": "",
}

##### Loading the Question DF for features

In [27]:
DATA_PATH = 'gs://kds-e7d6db6554e83e3f4182aa828879e31bf5c122e568c9ee97ab5d891f'
questions_df = pd.read_csv(DATA_PATH + "/questions.csv")
questions_df.fillna("-1", inplace=True) # tags Nan filled with -1
questions_df.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


In [28]:
questions_df.describe()

Unnamed: 0,question_id,bundle_id,correct_answer,part
count,13523.0,13523.0,13523.0,13523.0
mean,6761.0,6760.510907,1.455298,4.264956
std,3903.89818,3903.857783,1.149707,1.652553
min,0.0,0.0,0.0,1.0
25%,3380.5,3379.5,0.0,3.0
50%,6761.0,6761.0,1.0,5.0
75%,10141.5,10140.0,3.0,5.0
max,13522.0,13522.0,3.0,7.0


In [29]:
questions_df["part"].unique()

array([1, 2, 3, 4, 5, 6, 7])

In [30]:
# embedding size for part
embed_size_dict["part"] = max(questions_df["part"]) + 3
embed_size_dict

{'answered_correctly': 5,
 'content_id': 13525,
 'had_explanation': 5,
 'part': 10,
 'task_container_id': 10002}

In [31]:
# embedding size for tags/skills
from collections import Counter
skill_count = Counter()

for idx, row in questions_df.iterrows():
  skill_count.update([int(x) for x in row["tags"].split(" ")])

skill_count.most_common()[:5]

[(92, 2269), (38, 2256), (81, 1969), (29, 1707), (136, 1033)]

In [35]:
max(skill_count.keys()) # -1 is there, needs to be replaced with FILL TOKEN

187

In [32]:
embed_size_dict["tags"] = max(skill_count.keys()) + 3
embed_size_dict

{'answered_correctly': 5,
 'content_id': 13525,
 'had_explanation': 5,
 'part': 10,
 'tags': 190,
 'task_container_id': 10002}

In [37]:
# storing embedding size info for uploading to kaggle.
!mkdir data
with open("data/emb_sz.pkl", "wb") as f:
  pickle.dump(embed_size_dict, f)

In [38]:
len(fold_df)

3034226

In [39]:
# removing lecture rows. # already filtered in the table.
fold_df = fold_df[fold_df.answered_correctly != -1]

In [40]:
len(fold_df)

3034226

In [41]:
fold_df = fold_df.join(questions_df, on="content_id")
fold_df.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,correct_answer,part,tags
0,9329569,0,202836341,4207,False,0,3,1,,,4207,4207,3,5,89
1,9329570,13323,202836341,4097,False,1,2,1,11000.0,False,4097,4097,2,5,55
2,9329571,27321,202836341,6659,False,2,1,0,9000.0,False,6659,6659,3,5,53
3,9329572,54416,202836341,6650,False,3,1,1,12000.0,False,6650,6650,1,5,53
4,9329573,74648,202836341,5653,False,4,2,0,24000.0,False,5653,5653,1,5,96


In [42]:
drop_cols = [
             "row_id", 
             "content_type_id", 
             "user_answer", 
             "question_id", 
             "bundle_id", 
             "correct_answer", 
             ]
fold_df = fold_df.drop(drop_cols, axis=1)
fold_df.head()

Unnamed: 0,timestamp,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,part,tags
0,0,202836341,4207,0,1,,,5,89
1,13323,202836341,4097,1,1,11000.0,False,5,55
2,27321,202836341,6659,2,0,9000.0,False,5,53
3,54416,202836341,6650,3,1,12000.0,False,5,53
4,74648,202836341,5653,4,0,24000.0,False,5,96


In [43]:
# 0, 1, 2 are special tokens, so increment 3
indicator_cols = ["content_id", "task_container_id", "part", "answered_correctly", "prior_question_had_explanation"]
for c in indicator_cols:
  fold_df[c] = fold_df[c] + 3
fold_df.head()

Unnamed: 0,timestamp,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,part,tags
0,0,202836341,4210,3,4,,,8,89
1,13323,202836341,4100,4,4,11000.0,3.0,8,55
2,27321,202836341,6662,5,3,9000.0,3.0,8,53
3,54416,202836341,6653,6,4,12000.0,3.0,8,53
4,74648,202836341,5656,7,3,24000.0,3.0,8,96


In [49]:
fold_df["tags"] = fold_df["tags"].apply(lambda row: " ".join([str(int(x)+3) for x in row.split(" ")]))
fold_df.head()

Unnamed: 0,timestamp,user_id,content_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,part,tags
0,0,202836341,4210,3,4,,,8,92
1,13323,202836341,4100,4,4,11000.0,3.0,8,58
2,27321,202836341,6662,5,3,9000.0,3.0,8,56
3,54416,202836341,6653,6,4,12000.0,3.0,8,56
4,74648,202836341,5656,7,3,24000.0,3.0,8,99


In [None]:
# convert milliseconds to minutes.
# fold_df['prior_question_elapsed_time'] = fold_df["prior_question_elapsed_time"].fillna(0).astype(np.float32) / 60000

In [53]:
fold_df["prior_question_elapsed_time"].shift(-1, fill_value=-1)

0          11000.0
1           9000.0
2          12000.0
3          24000.0
4          18000.0
            ...   
3034221    30000.0
3034222    22000.0
3034223    16000.0
3034224    24000.0
3034225       -1.0
Name: prior_question_elapsed_time, Length: 3034226, dtype: float64

In [264]:
user_groups = fold_df.groupby("user_id")
user_groups_arr = user_groups.apply(
    lambda rows: (
        rows["timestamp"].values.astype(dtypes_train["timestamp"]),
        rows["content_id"].values.astype(dtypes_train["content_id"]), 
        rows["task_container_id"].values.astype(dtypes_train["task_container_id"]), 
        rows["prior_question_elapsed_time"].shift(-1, fill_value=-1).values.astype(dtypes_train["prior_question_elapsed_time"]), # last question for the user doesnt have elapsed time. fill with -1
        rows["prior_question_had_explanation"].shift(-1, fill_value=2).values.astype(dtypes_train["prior_question_had_explanation"]), # last question for the user doesnt have "had_explanation". fill with 2
        rows["part"].values.astype(dtypes_questions["part"]),
        rows["tags"].values,
        rows["answered_correctly"].values.astype(dtypes_train["answered_correctly"]),
        )
    )
user_groups_arr

user_id
202836341    ([0, 13323, 27321, 54416, 74648, 98153, 124582...
202838719    ([0, 20765, 34470, 53561, 143508, 143508, 1435...
202840484    ([0, 25519, 51157, 69732, 142880, 142880, 1428...
202846170    ([0, 46330283, 46370059, 46405738, 46463023, 4...
202846211    ([0, 18215, 56847, 97923, 119309, 130868, 1593...
                                   ...                        
270318132    ([0, 23337, 47339, 67200, 144138, 144138, 1441...
270336415    ([0, 26062, 54909, 74292, 160420, 160420, 1604...
270342519    ([0, 24870, 42015, 59003, 2351562520, 23515625...
270348524    ([0, 54456, 74629, 96746, 113878, 155274, 1697...
270349461    ([0, 33316, 60163, 84567, 115328, 153216, 1728...
Length: 12302, dtype: object

In [265]:
# Inserting START TOKEN before every feature
START_TOKEN = 1
user_groups_arr  = user_groups_arr.apply(
    lambda row: (
        np.insert(row[0], 0, START_TOKEN), # timestamp
        np.insert(row[1], 0, START_TOKEN), # content_id
        np.insert(row[2], 0, START_TOKEN), # task_container_id
        np.insert(row[3], 0, START_TOKEN), # prior_question_elapsed_time
        np.insert(row[4], 0, START_TOKEN), # prior_question_had_explanation
        np.insert(row[5], 0, START_TOKEN), # part
        np.insert(row[6], 0, str(START_TOKEN)), # tags
        np.insert(row[7], 0, START_TOKEN), # answered_correctly
        )
    )
user_groups_arr,

(user_id
 202836341    ([1, 0, 13323, 27321, 54416, 74648, 98153, 124...
 202838719    ([1, 0, 20765, 34470, 53561, 143508, 143508, 1...
 202840484    ([1, 0, 25519, 51157, 69732, 142880, 142880, 1...
 202846170    ([1, 0, 46330283, 46370059, 46405738, 46463023...
 202846211    ([1, 0, 18215, 56847, 97923, 119309, 130868, 1...
                                    ...                        
 270318132    ([1, 0, 23337, 47339, 67200, 144138, 144138, 1...
 270336415    ([1, 0, 26062, 54909, 74292, 160420, 160420, 1...
 270342519    ([1, 0, 24870, 42015, 59003, 2351562520, 23515...
 270348524    ([1, 0, 54456, 74629, 96746, 113878, 155274, 1...
 270349461    ([1, 0, 33316, 60163, 84567, 115328, 153216, 1...
 Length: 12302, dtype: object,)

In [266]:
user_groups_arr[202836341][0] # timestamp, # notice the START TOKEN 1

array([        1,         0,     13323,     27321,     54416,     74648,
           98153,    124582,    441080,    529206,    614479,    659124,
          725671,    797211,    856950,    947630,   1013574,   1050703,
         1138335,   1190778,   1268457,   1328690,   1407451,   1490639,
         1530949,   1597977,   1661693,   1737007,  22591286,  22681611,
        22747917, 146599675, 146671224, 146708991, 146739735, 146771695,
       146801927, 146836636])

In [267]:
user_groups_arr[202836341][1] # content_id

array([    1,  4210,  4100,  6662,  6653,  5656,  9696,  5655,   529,
         660,   388, 10690,  1377,   914,   221, 10689,   887,  1153,
        5720,  3876,  4676,  8357,  6678,  4001,  5459,  8907,  4263,
        4981,  6477,  9374,  6254,  6456,  3685,  9454,  3804,  5585,
        4461,  9600], dtype=int16)

In [268]:
user_groups_arr[202836341][2] # task_container_id

array([ 1,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
       36, 37, 38, 39], dtype=int16)

In [269]:
user_groups_arr[202836341][3] # prior_question_elapsed_time

array([ 1.0e+00,  1.1e+04,  9.0e+03,  1.2e+04,  2.4e+04,  1.8e+04,
        2.1e+04,  2.4e+04,  2.8e+04,  1.7e+04,  3.9e+04,  1.7e+04,
        2.0e+04,  1.6e+04,  1.7e+04,  2.2e+04,  1.6e+04,  1.4e+04,
        9.0e+03,  1.7e+04,  2.2e+04,  1.2e+04,  1.5e+04,  1.7e+04,
        1.2e+04,  1.3e+04,  1.5e+04,  2.1e+04,  3.9e+04,  2.5e+04,
        1.2e+04,  1.4e+04,  2.3e+04,  1.1e+04,  1.5e+04,  1.7e+04,
        1.7e+04, -1.0e+00], dtype=float32)

In [270]:
user_groups_arr[202836341][4] # prior_question_had_explanation

array([1, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2], dtype=int8)

In [271]:
user_groups_arr[202836341][5] # part

array([1, 8, 8, 8, 8, 8, 8, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8], dtype=int16)

In [272]:
user_groups_arr[202836341][6] # tags

array(['1', '92', '58', '56', '56', '99', '58', '99',
       '158 166 165 95 105', '65 93 103 165 41 95', '141 44 95 32',
       '65 140 145 84 95', '72 140 91 84 32', '146 23 41 32',
       '140 91 32 105', '65 140 145 41 32', '132 146 33 41 84',
       '146 23 84 32', '88', '126', '82', '76', '128', '169', '112', '99',
       '46', '169', '176', '99', '162', '52', '58', '112', '11', '57',
       '131', '76'], dtype=object)

In [273]:
user_groups_arr[202836341][7] # answered_correctly

array([1, 4, 4, 3, 4, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 3, 3, 4, 4,
       4, 4, 4, 4, 3, 4, 4, 3, 4, 3, 3, 4, 4, 4, 4, 3], dtype=int8)

In [274]:
for i in range(8):
  print(user_groups_arr[202836341][i].dtype)

int64
int16
int16
float32
int8
int16
object
int8


In [275]:
def _bytes_feature(value):
  """Returns a bytes_list from a string / byte."""
  if isinstance(value, type(tf.constant(0))):
    value = value.numpy() # BytesList won't unpack a string from an EagerTensor.
  return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Returns a float_list from a float / double."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
  """Returns an int64_list from a bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

In [276]:
def serialize_example(user_arr):
  feature = {
      "timestamp": _bytes_feature(tf.io.serialize_tensor(user_arr[0])),
      "content_id": _bytes_feature(tf.io.serialize_tensor(user_arr[1])),
      "task_container_id": _bytes_feature(tf.io.serialize_tensor(user_arr[2])),
      "elapsed_time": _bytes_feature(tf.io.serialize_tensor(user_arr[3])),
      "had_explanation": _bytes_feature(tf.io.serialize_tensor(user_arr[4])),
      "part": _bytes_feature(tf.io.serialize_tensor(user_arr[5])),
      "tags": _bytes_feature(tf.io.serialize_tensor(user_arr[6])),
      "answered_correctly": _bytes_feature(tf.io.serialize_tensor(user_arr[7])),
  }
  proto_example = tf.train.Example(features=tf.train.Features(feature=feature))
  serialized_example = proto_example.SerializeToString()
  return serialized_example

In [277]:
feature_desc = {
    "timestamp": tf.io.FixedLenFeature([], tf.string),
    "content_id": tf.io.FixedLenFeature([], tf.string),
    "task_container_id": tf.io.FixedLenFeature([], tf.string),
    "elapsed_time": tf.io.FixedLenFeature([], tf.string),
    "had_explanation": tf.io.FixedLenFeature([], tf.string),
    "part": tf.io.FixedLenFeature([], tf.string),
    "tags": tf.io.FixedLenFeature([], tf.string),
    "answered_correctly": tf.io.FixedLenFeature([], tf.string),
}

In [278]:
def create_tfrecords_from_user_groups(user_groups_arr, part):

  with tf.io.TFRecordWriter(f"data/tfrec_{part}.tfrec") as writer:
    for user_arr in user_groups_arr:
      example = serialize_example(user_arr)

      writer.write(example)

In [279]:
create_tfrecords_from_user_groups(user_groups_arr, 0)

### Read TFRecords and Create TF Dataset

In [280]:
SEQ_LEN = 200

In [281]:
"""
dtypes_train = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'int8'
    }

dtypes_questions = {
    "question_id": "",
    "bundle_id": "",
    "correct_answer": "",
    "part": "int16",
    "tags": "",
}

dtypes_lectures = {
    "lecture_id": "",
    "part": "",
    "tag": "",
    "type_of": "",
}
"""

def parse_example(example):
  example = tf.io.parse_single_example(example, feature_desc)

  timestamp = tf.io.parse_tensor(example["timestamp"], tf.int64)
  content_id = tf.io.parse_tensor(example["content_id"], tf.int16)
  task_container_id = tf.io.parse_tensor(example["task_container_id"], tf.int16)
  elapsed_time = tf.io.parse_tensor(example["elapsed_time"], tf.float32)
  had_explanation = tf.io.parse_tensor(example["had_explanation"], tf.int8)
  part = tf.io.parse_tensor(example["part"], tf.int16)
  answered_correctly = tf.io.parse_tensor(example["answered_correctly"], tf.int8)

  # tags
  tags = tf.io.parse_tensor(example["tags"], tf.string) # as string, one q can have multiple tags.
  tags = tf.strings.to_number(tf.strings.split(tags), out_type=tf.int32) # will produce a ragged tensor with tags for each q
  # ragged tensor of tags [[2], [3, 4]] is converted to one hot like [[0,0,1..], [[0,0,0,1,..], [0,0,0,0,1...]]]
  # then sumed along axis 1, so for each question there will be 1 for all the tags associated with it.
  tags = tf.reduce_sum(tf.one_hot(tags, depth=190), axis=1) # shape [seq_len, 190]
  tags = tf.transpose(tags, (1, 0)) # shape [190, seq_len] to stack with other features.
  
  features = tf.stack([
      tf.cast(timestamp, tf.float32),
      tf.cast(content_id, tf.float32),
      tf.cast(task_container_id, tf.float32),
      tf.cast(elapsed_time, tf.float32),
      tf.cast(had_explanation, tf.float32),
      tf.cast(part, tf.float32),
      tf.cast(answered_correctly, tf.float32),
  ])

  # add tags
  return tf.concat([
      features,
      tf.cast(tags, tf.float32),
  ], axis=0)

In [314]:
filenames = tf.io.gfile.glob("data/*.tfrec")
filenames

['data/tfrec_15.tfrec',
 'data/tfrec_1.tfrec',
 'data/tfrec_30.tfrec',
 'data/tfrec_12.tfrec',
 'data/tfrec_0.tfrec',
 'data/tfrec_27.tfrec',
 'data/tfrec_18.tfrec',
 'data/tfrec_5.tfrec',
 'data/tfrec_4.tfrec',
 'data/tfrec_24.tfrec',
 'data/tfrec_23.tfrec',
 'data/tfrec_25.tfrec',
 'data/tfrec_9.tfrec',
 'data/tfrec_13.tfrec',
 'data/tfrec_7.tfrec',
 'data/tfrec_26.tfrec',
 'data/tfrec_8.tfrec',
 'data/tfrec_21.tfrec',
 'data/tfrec_17.tfrec',
 'data/tfrec_20.tfrec',
 'data/tfrec_19.tfrec',
 'data/tfrec_16.tfrec',
 'data/tfrec_14.tfrec',
 'data/tfrec_29.tfrec',
 'data/tfrec_28.tfrec',
 'data/tfrec_6.tfrec',
 'data/tfrec_10.tfrec',
 'data/tfrec_11.tfrec',
 'data/tfrec_2.tfrec',
 'data/tfrec_3.tfrec',
 'data/tfrec_31.tfrec',
 'data/tfrec_22.tfrec']

In [315]:
dataset = tf.data.TFRecordDataset(filenames)

In [316]:
dataset = dataset.map(parse_example)

In [317]:
for features in dataset.take(1):
  pass

In [318]:
# normal features, tags associated with each question - sparse array with 1s
features[0:7].shape, features[7:, :].shape

(TensorShape([7, 14]), TensorShape([190, 14]))

In [319]:
tf.where(features[7:, 8]==1) # tags starts from index 7,  for 7 th q

<tf.Tensor: shape=(3, 1), dtype=int64, numpy=
array([[ 32],
       [106],
       [139]])>

In [320]:
@tf.function
def pad(a, seq_len, max_seq_len):
  s = max_seq_len - seq_len
  # making [[0, 0], [s, 0]]
  r = tf.stack([s, tf.constant(0)])
  t = tf.stack([tf.constant([0, 0]), r])
  
  return tf.pad(a, t) # ,1 to debug

@tf.function
def trim(a, seq_len,  max_seq_len):
  start = tf.squeeze(tf.random.uniform((1,), maxval=(seq_len-max_seq_len), dtype=tf.int32))
  # https://www.quora.com/How-does-tf-slice-work-in-TensorFlow
  begin = tf.stack([tf.constant(0), start])
  size = tf.stack([tf.shape(a)[0], max_seq_len])
  
  return tf.slice(a, begin, size) # , start - to debug

@tf.function
def pad_or_trim(a):
  seq_len = tf.shape(a)[-1]
  max_seq_len = SEQ_LEN + 1 # accomodate for the start token
  fn = tf.cond(tf.less_equal(seq_len, max_seq_len), lambda: pad(a, seq_len, max_seq_len), lambda: trim(a, seq_len, max_seq_len))
  return fn

In [321]:
dataset = dataset.map(pad_or_trim) # every sample is padded if len < SEQ_LEN or randomly trimmed to SEQ_LEN

In [322]:
for i in dataset.take(1):
  print(i.shape)

(197, 201)


In [307]:
i[0] # timestamp

<tf.Tensor: shape=(201,), dtype=float32, numpy=
array([0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.00000

In [308]:
i[1] # content_id

<tf.Tensor: shape=(201,), dtype=float32, numpy=
array([0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
       0.0000e

In [323]:
data_batch = dataset.batch(128, drop_remainder=True)

for features in data_batch.take(1):
  print(features.shape)

(128, 197, 201)


#### Lets process all the folds

In [311]:
!rm -f data/riiid_train*.* data/riiid-train*.* data/*.tfrec

In [312]:
!mkdir data

mkdir: cannot create directory ‘data’: File exists


In [313]:
for fold, parition in folds_partitions.items():
  fold_df = table[folds_partitions[fold]["start"]:folds_partitions[fold]["end"], :].to_pandas()
  fold_df = fold_df[fold_df.answered_correctly != -1]
  fold_df = fold_df.join(questions_df, on="content_id")
  drop_cols = [
              "row_id", 
              "content_type_id", 
              "user_answer", 
              "question_id", 
              "bundle_id", 
              "correct_answer", 
              ]
  fold_df = fold_df.drop(drop_cols, axis=1)

  
  # 0, 1, 2 are special tokens, so increment 3
  indicator_cols = ["content_id", "task_container_id", "part", "answered_correctly", "prior_question_had_explanation"]
  for c in indicator_cols:
    fold_df[c] = fold_df[c] + 3
  
  fold_df["tags"] = fold_df["tags"].apply(lambda row: " ".join([str(int(x)+3) for x in row.split(" ")]))

  user_groups = fold_df.groupby("user_id")
  user_groups_arr = user_groups.apply(
    lambda rows: (
        rows["timestamp"].values.astype(dtypes_train["timestamp"]),
        rows["content_id"].values.astype(dtypes_train["content_id"]), 
        rows["task_container_id"].values.astype(dtypes_train["task_container_id"]), 
        rows["prior_question_elapsed_time"].shift(-1, fill_value=-1).values.astype(dtypes_train["prior_question_elapsed_time"]), # last question for the user doesnt have elapsed time. fill with -1
        rows["prior_question_had_explanation"].shift(-1, fill_value=2).values.astype(dtypes_train["prior_question_had_explanation"]), # last question for the user doesnt have "had_explanation". fill with 2
        rows["part"].values.astype(dtypes_questions["part"]),
        rows["tags"].values,
        rows["answered_correctly"].values.astype(dtypes_train["answered_correctly"]),
        )
    )
  
  
  # Inserting START TOKEN before every feature
  START_TOKEN = 1
  user_groups_arr  = user_groups_arr.apply(
      lambda row: (
          np.insert(row[0], 0, START_TOKEN), # timestamp
          np.insert(row[1], 0, START_TOKEN), # content_id
          np.insert(row[2], 0, START_TOKEN), # task_container_id
          np.insert(row[3], 0, START_TOKEN), # prior_question_elapsed_time
          np.insert(row[4], 0, START_TOKEN), # prior_question_had_explanation
          np.insert(row[5], 0, START_TOKEN), # part
          np.insert(row[6], 0, str(START_TOKEN)), # tags
          np.insert(row[7], 0, START_TOKEN), # answered_correctly
          )
      )

  create_tfrecords_from_user_groups(user_groups_arr, fold)


#### Upload to Kaggle

In [324]:
!kaggle datasets init -p data/

Data package template written to: data/dataset-metadata.json


In [325]:
# id and title only alphanumeric and "-"
meta = """
{
  "licenses": [
    {
      "name": "CC0-1.0"
    }
  ], 
  "id": "nisarahamedk/riid-0-2",
  "title": "riid-0-2"
}
"""
with open("data/dataset-metadata.json", "w") as f:
  f.write(meta)

In [326]:
# create
!kaggle datasets create -p data/ --dir-mode tar -u

Starting upload for file tfrec_15.tfrec
100% 88.7M/88.7M [00:01<00:00, 49.6MB/s]
Upload successful: tfrec_15.tfrec (89MB)
Starting upload for file tfrec_1.tfrec
100% 89.5M/89.5M [00:01<00:00, 47.4MB/s]
Upload successful: tfrec_1.tfrec (89MB)
Starting upload for file tfrec_30.tfrec
100% 91.1M/91.1M [00:01<00:00, 62.9MB/s]
Upload successful: tfrec_30.tfrec (91MB)
Starting upload for file tfrec_12.tfrec
100% 92.0M/92.0M [00:02<00:00, 46.5MB/s]
Upload successful: tfrec_12.tfrec (92MB)
Starting upload for file tfrec_0.tfrec
100% 88.1M/88.1M [00:01<00:00, 48.1MB/s]
Upload successful: tfrec_0.tfrec (88MB)
Starting upload for file tfrec_27.tfrec
100% 91.9M/91.9M [00:01<00:00, 63.4MB/s]
Upload successful: tfrec_27.tfrec (92MB)
Starting upload for file tfrec_18.tfrec
100% 88.0M/88.0M [00:02<00:00, 45.7MB/s]
Upload successful: tfrec_18.tfrec (88MB)
Starting upload for file emb_sz.pkl
100% 132/132 [00:00<00:00, 591B/s]
Upload successful: emb_sz.pkl (132B)
Starting upload for file tfrec_5.tfrec
100

In [None]:
# Update
!kaggle datasets version -p data/ -m "adding embed sizes"

Starting upload for file tfrec_6.tfrec
100% 37.9M/37.9M [00:07<00:00, 5.01MB/s]
Upload successful: tfrec_6.tfrec (38MB)
Starting upload for file tfrec_21.tfrec
100% 38.1M/38.1M [00:07<00:00, 5.27MB/s]
Upload successful: tfrec_21.tfrec (38MB)
Starting upload for file tfrec_12.tfrec
100% 39.5M/39.5M [00:08<00:00, 4.77MB/s]
Upload successful: tfrec_12.tfrec (40MB)
Starting upload for file tfrec_4.tfrec
100% 39.7M/39.7M [00:08<00:00, 4.93MB/s]
Upload successful: tfrec_4.tfrec (40MB)
Starting upload for file tfrec_20.tfrec
100% 38.3M/38.3M [00:08<00:00, 4.53MB/s]
Upload successful: tfrec_20.tfrec (38MB)
Starting upload for file tfrec_23.tfrec
100% 37.5M/37.5M [00:07<00:00, 4.98MB/s]
Upload successful: tfrec_23.tfrec (37MB)
Starting upload for file tfrec_18.tfrec
100% 37.9M/37.9M [00:10<00:00, 3.64MB/s]
Upload successful: tfrec_18.tfrec (38MB)
Starting upload for file tfrec_26.tfrec
100% 37.5M/37.5M [00:08<00:00, 4.50MB/s]
Upload successful: tfrec_26.tfrec (38MB)
Starting upload for file tfr

#### Padded Batching

`padded_shapes` argument for specifying for each item in the dataset which axis and pad length.  

[None], [None], [None], [None] - Since our dataset is returning a tuple of 4. and we want each of them to be padded to the maximum length in a batch.  


https://stackoverflow.com/a/49848103/7812715

In [None]:
d = dataset.padded_batch(32, padded_shapes=([None], [None], [None], [None]), drop_remainder=True)


In [None]:
for e in d.take(1):
  print(e)

## Sequence Example

In [None]:
context = tf.train.Features(
    feature = {
        "user_id": _int64_feature(115)
    }
)

content_ids = [_int64_feature(x) for x in user_groups_arr[115][0]]

In [None]:
sequence_example = tf.train.SequenceExample(
    context = context,
    feature_lists = tf.train.FeatureLists(feature_list={
        "content_id": tf.train.FeatureList(feature=content_ids)
    })
)

In [None]:
serialized = sequence_example.SerializeToString()

In [None]:
serialized

In [None]:
context_feat_desc = {
    "user_id": tf.io.FixedLenFeature([], tf.int64, default_value=0),
}

sequence_feat_desc = {
    "content_id": tf.io.VarLenFeature(tf.int64)
}

In [None]:
context, seq = tf.io.parse_single_sequence_example(serialized, context_feat_desc, sequence_feat_desc)

In [None]:
print(seq["content_id"].values)