**Otto Recommender System:** Converting the train.jsonl file to the TFRecord file format. The idea is to use the resulting TFRecord files later on, and to learn TensorFlow Recommenders (TFRS) while implementing this use case.

The following resources gave me a good understanding of the WHAT, WHY and HOW of the TFRecord file format.

* https://www.tensorflow.org/tutorials/load_data/tfrecord
* https://keras.io/examples/keras_recipes/creating_tfrecords/
* https://www.kaggle.com/code/ryanholbrook/tfrecords-basics/notebook

As I am just starting to learn TensorFlow, your remarks and comments will certainly help me improve my knowledge.

In [1]:
import numpy as np 
import pandas as pd 
import os
import zipfile
import json
from tqdm import tqdm
import tensorflow as tf

In [2]:
# Directory holding the competition input files, and the two JSON-lines
# files (train / test) located inside it.
INPUT_DIR = '../input/otto-recommender-system'
TRAIN_FILE = INPUT_DIR + '/train.jsonl'
TEST_FILE = INPUT_DIR + '/test.jsonl'

In [3]:
def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in an int64_list tf.train.Feature.

    Args:
        value: The scalar to store; it becomes the only element of the list.

    Returns:
        A tf.train.Feature whose int64_list holds exactly ``[value]``.
    """
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

def serialize_example(session, aid, ts, typ):
    """Serialize one event into a tf.train.Example byte string.

    Each argument is wrapped with _int64_feature and stored under the
    feature name 'session', 'aid', 'ts' or 'typ' respectively.

    Args:
        session: Value stored under the 'session' feature.
        aid: Value stored under the 'aid' feature.
        ts: Value stored under the 'ts' feature.
        typ: Value stored under the 'typ' feature.

    Returns:
        The serialized tf.train.Example, ready to be written to a TFRecord file.
    """
    # Pair every feature name with its raw value, then convert them all
    # to tf.train.Example-compatible features in one pass.
    named_values = (
        ('session', session),
        ('aid', aid),
        ('ts', ts),
        ('typ', typ),
    )
    feature_map = {name: _int64_feature(raw) for name, raw in named_values}

    # Build the Features message and wrap it in an Example.
    features_msg = tf.train.Features(feature=feature_map)
    return tf.train.Example(features=features_msg).SerializeToString()

In [4]:
def features_list(train_df):
    """Flatten a chunk of session rows into four parallel, event-level lists.

    Args:
        train_df: DataFrame with a 'session' column and an 'events' column.
            Each 'events' cell is an iterable of dicts that provide the
            keys 'aid', 'ts' and 'type'.

    Returns:
        A tuple ``(session, aid, ts, typ)`` of equal-length lists with one
        entry per event, in row order and then event order. ``typ`` holds
        the integer code of the event type (clicks=1, carts=2, orders=3).

    Raises:
        KeyError: If an event's 'type' is not 'clicks', 'carts' or 'orders'.
    """
    session = []
    aid = []
    ts = []
    typ = []
    codes = {'clicks': 1, 'carts': 2, 'orders': 3}

    # Walk the two columns in lockstep instead of doing a label lookup
    # (train_df[col][index]) per row: this avoids the per-row indexing
    # overhead and also works when the index is not unique, where a label
    # lookup would return a Series instead of a single value.
    for session_id, events in zip(train_df['session'], train_df['events']):
        for event in events:
            session.append(session_id)
            aid.append(event['aid'])
            ts.append(event['ts'])
            typ.append(codes[event['type']])
    return session, aid, ts, typ

def write_tfrecord(file_num, session, aid, ts, typ):
    """Write one chunk of events to a ZLIB-compressed TFRecord file and archive it.

    The file is named ``train_chunk_{file_num}.tfrecord``. After writing, it
    is handed to zip_tfrecord(), which adds it to the zip archive and removes
    the standalone file.

    Args:
        file_num: Chunk number used in the output file name.
        session: Sequence of session values, one per event.
        aid: Sequence of aid values, parallel to ``session``.
        ts: Sequence of ts values, parallel to ``session``.
        typ: Sequence of typ values, parallel to ``session``.

    Raises:
        ValueError: If the four sequences do not all have the same length.
    """
    # The sequences are consumed in lockstep; refuse mismatched input
    # instead of silently dropping or mis-pairing events.
    if not (len(session) == len(aid) == len(ts) == len(typ)):
        raise ValueError('session, aid, ts and typ must have the same length')

    filename = f'train_chunk_{file_num}.tfrecord'
    # zip_tfrecord() looks for the file under /kaggle/working, so write it
    # there explicitly rather than relying on the current working directory.
    output_path = os.path.join('/kaggle/working', filename)
    options = tf.io.TFRecordOptions(compression_type='ZLIB')

    with tf.io.TFRecordWriter(output_path, options=options) as writer:
        for row in zip(session, aid, ts, typ):
            writer.write(serialize_example(*row))
    zip_tfrecord(filename)

def zip_tfrecord(filename):
    """Append a file from the output directory to the zip archive, then delete it.

    Args:
        filename: Name of the file, relative to the output directory
            '/kaggle/working/', to add to 'otto_tfrecord.zip'.
    """
    OUTPUT_DIR = '/kaggle/working/'
    archive_path = f'{OUTPUT_DIR}/otto_tfrecord.zip'
    source_path = f'{OUTPUT_DIR}/{filename}'

    # Mode 'a' creates the archive on first use and appends afterwards.
    with zipfile.ZipFile(archive_path, mode='a') as archive:
        archive.write(source_path)

    # The archived copy is the one that is kept; drop the standalone file.
    os.remove(source_path)

In [5]:
%%time
# Count the lines of the training file. The context manager makes sure the
# file handle is closed once counting is done.
with open(TRAIN_FILE) as train_file:
    num_lines = sum(1 for _ in train_file)
print('Number of lines: ', num_lines)

# Number of lines read per chunk, and the resulting number of chunks
# (rounded up so a final partial chunk is counted).
chunk_size = 100000
num_chunks = int(np.ceil(num_lines / chunk_size))
print('Number of chunks: ', num_chunks)

Number of lines:  12899779
Number of chunks:  129
CPU times: user 10 s, sys: 5.81 s, total: 15.8 s
Wall time: 2min 11s


In [6]:
%%time
# Read the training file lazily, chunk_size lines at a time.
chunks = pd.read_json(TRAIN_FILE, lines=True, chunksize=chunk_size)

# Wrap the reader itself (not enumerate) and pass the chunk count computed
# earlier, so tqdm can show a percentage and a remaining-time estimate
# instead of a bare iteration counter.
for e, chunk in enumerate(tqdm(chunks, total=num_chunks)):
    print('Processing chunk # : ', e)
    # Flatten the chunk to event-level lists and write them as one TFRecord file.
    session, aid, ts, typ = features_list(chunk)
    write_tfrecord(e, session, aid, ts, typ)

0it [00:00, ?it/s]

Processing chunk # :  0


1it [04:03, 243.77s/it]

Processing chunk # :  1


2it [07:59, 239.05s/it]

Processing chunk # :  2


3it [11:26, 224.58s/it]

Processing chunk # :  3


4it [14:33, 209.53s/it]

Processing chunk # :  4


5it [17:37, 200.52s/it]

Processing chunk # :  5


6it [20:29, 190.68s/it]

Processing chunk # :  6


7it [23:17, 183.44s/it]

Processing chunk # :  7


8it [25:58, 176.18s/it]

Processing chunk # :  8


9it [28:40, 171.61s/it]

Processing chunk # :  9


10it [31:12, 165.63s/it]

Processing chunk # :  10


11it [33:39, 159.89s/it]

Processing chunk # :  11


12it [35:58, 153.58s/it]

Processing chunk # :  12


13it [38:39, 155.70s/it]

Processing chunk # :  13


14it [41:03, 152.35s/it]

Processing chunk # :  14


15it [43:18, 146.95s/it]

Processing chunk # :  15


16it [45:24, 140.64s/it]

Processing chunk # :  16


17it [47:31, 136.54s/it]

Processing chunk # :  17


18it [49:31, 131.60s/it]

Processing chunk # :  18


19it [51:33, 128.72s/it]

Processing chunk # :  19


20it [53:28, 124.71s/it]

Processing chunk # :  20


21it [55:20, 120.75s/it]

Processing chunk # :  21


22it [57:17, 119.68s/it]

Processing chunk # :  22


23it [59:17, 119.76s/it]

Processing chunk # :  23


24it [1:01:03, 115.73s/it]

Processing chunk # :  24


25it [1:02:49, 112.67s/it]

Processing chunk # :  25


26it [1:04:34, 110.47s/it]

Processing chunk # :  26


27it [1:06:16, 108.03s/it]

Processing chunk # :  27


28it [1:07:56, 105.57s/it]

Processing chunk # :  28


29it [1:09:33, 102.91s/it]

Processing chunk # :  29


30it [1:11:06, 100.14s/it]

Processing chunk # :  30


31it [1:12:52, 101.83s/it]

Processing chunk # :  31


32it [1:14:24, 98.78s/it] 

Processing chunk # :  32


33it [1:15:55, 96.56s/it]

Processing chunk # :  33


34it [1:17:25, 94.47s/it]

Processing chunk # :  34


35it [1:18:55, 93.05s/it]

Processing chunk # :  35


36it [1:20:22, 91.45s/it]

Processing chunk # :  36


37it [1:21:48, 89.82s/it]

Processing chunk # :  37


38it [1:23:21, 90.61s/it]

Processing chunk # :  38


39it [1:24:48, 89.67s/it]

Processing chunk # :  39


40it [1:26:15, 88.69s/it]

Processing chunk # :  40


41it [1:27:43, 88.65s/it]

Processing chunk # :  41


42it [1:29:09, 87.92s/it]

Processing chunk # :  42


43it [1:30:37, 87.87s/it]

Processing chunk # :  43


44it [1:32:06, 88.20s/it]

Processing chunk # :  44


45it [1:33:29, 86.63s/it]

Processing chunk # :  45


46it [1:34:49, 84.61s/it]

Processing chunk # :  46


47it [1:36:17, 85.58s/it]

Processing chunk # :  47


48it [1:37:38, 84.20s/it]

Processing chunk # :  48


49it [1:39:06, 85.39s/it]

Processing chunk # :  49


50it [1:40:29, 84.58s/it]

Processing chunk # :  50


51it [1:41:49, 83.22s/it]

Processing chunk # :  51


52it [1:43:05, 81.26s/it]

Processing chunk # :  52


53it [1:44:22, 79.96s/it]

Processing chunk # :  53


54it [1:45:43, 80.02s/it]

Processing chunk # :  54


55it [1:47:00, 79.26s/it]

Processing chunk # :  55


56it [1:48:15, 78.02s/it]

Processing chunk # :  56


57it [1:49:31, 77.31s/it]

Processing chunk # :  57


58it [1:50:46, 76.71s/it]

Processing chunk # :  58


59it [1:52:02, 76.47s/it]

Processing chunk # :  59


60it [1:53:12, 74.53s/it]

Processing chunk # :  60


61it [1:54:21, 72.78s/it]

Processing chunk # :  61


62it [1:55:26, 70.55s/it]

Processing chunk # :  62


63it [1:56:36, 70.51s/it]

Processing chunk # :  63


64it [1:57:44, 69.58s/it]

Processing chunk # :  64


65it [1:58:50, 68.50s/it]

Processing chunk # :  65


66it [1:59:55, 67.48s/it]

Processing chunk # :  66


67it [2:01:01, 67.19s/it]

Processing chunk # :  67


68it [2:02:09, 67.43s/it]

Processing chunk # :  68


69it [2:03:15, 66.73s/it]

Processing chunk # :  69


70it [2:04:18, 65.87s/it]

Processing chunk # :  70


71it [2:05:20, 64.70s/it]

Processing chunk # :  71


72it [2:06:23, 64.22s/it]

Processing chunk # :  72


73it [2:07:24, 63.02s/it]

Processing chunk # :  73


74it [2:08:22, 61.74s/it]

Processing chunk # :  74


75it [2:09:28, 62.97s/it]

Processing chunk # :  75


76it [2:10:27, 61.78s/it]

Processing chunk # :  76


77it [2:11:28, 61.44s/it]

Processing chunk # :  77


78it [2:12:25, 60.18s/it]

Processing chunk # :  78


79it [2:13:21, 59.02s/it]

Processing chunk # :  79


80it [2:14:17, 57.84s/it]

Processing chunk # :  80


81it [2:15:16, 58.28s/it]

Processing chunk # :  81


82it [2:16:16, 58.80s/it]

Processing chunk # :  82


83it [2:17:14, 58.72s/it]

Processing chunk # :  83


84it [2:18:10, 57.88s/it]

Processing chunk # :  84


85it [2:19:04, 56.67s/it]

Processing chunk # :  85


86it [2:19:57, 55.54s/it]

Processing chunk # :  86


87it [2:20:55, 56.12s/it]

Processing chunk # :  87


88it [2:21:50, 55.83s/it]

Processing chunk # :  88


89it [2:22:44, 55.37s/it]

Processing chunk # :  89


90it [2:23:36, 54.31s/it]

Processing chunk # :  90


91it [2:24:26, 53.09s/it]

Processing chunk # :  91


92it [2:25:22, 53.81s/it]

Processing chunk # :  92


93it [2:26:16, 53.99s/it]

Processing chunk # :  93


94it [2:27:09, 53.78s/it]

Processing chunk # :  94


95it [2:28:04, 53.93s/it]

Processing chunk # :  95


96it [2:28:59, 54.51s/it]

Processing chunk # :  96


97it [2:29:52, 53.83s/it]

Processing chunk # :  97


98it [2:30:42, 52.90s/it]

Processing chunk # :  98


99it [2:31:36, 53.04s/it]

Processing chunk # :  99


100it [2:32:26, 52.19s/it]

Processing chunk # :  100


101it [2:33:16, 51.50s/it]

Processing chunk # :  101


102it [2:34:07, 51.42s/it]

Processing chunk # :  102


103it [2:34:57, 51.00s/it]

Processing chunk # :  103


104it [2:35:45, 50.21s/it]

Processing chunk # :  104


105it [2:36:36, 50.36s/it]

Processing chunk # :  105


106it [2:37:25, 49.97s/it]

Processing chunk # :  106


107it [2:38:14, 49.57s/it]

Processing chunk # :  107


108it [2:39:04, 49.72s/it]

Processing chunk # :  108


109it [2:39:55, 50.07s/it]

Processing chunk # :  109


110it [2:40:42, 49.30s/it]

Processing chunk # :  110


111it [2:41:27, 48.00s/it]

Processing chunk # :  111


112it [2:42:15, 48.01s/it]

Processing chunk # :  112


113it [2:43:07, 48.98s/it]

Processing chunk # :  113


114it [2:43:51, 47.75s/it]

Processing chunk # :  114


115it [2:44:37, 47.16s/it]

Processing chunk # :  115


116it [2:45:20, 45.91s/it]

Processing chunk # :  116


117it [2:46:04, 45.31s/it]

Processing chunk # :  117


118it [2:46:47, 44.62s/it]

Processing chunk # :  118


119it [2:47:29, 43.88s/it]

Processing chunk # :  119


120it [2:48:12, 43.46s/it]

Processing chunk # :  120


121it [2:48:53, 42.66s/it]

Processing chunk # :  121


122it [2:49:34, 42.31s/it]

Processing chunk # :  122


123it [2:50:16, 42.18s/it]

Processing chunk # :  123


124it [2:50:58, 42.04s/it]

Processing chunk # :  124


125it [2:51:40, 42.12s/it]

Processing chunk # :  125


126it [2:52:20, 41.62s/it]

Processing chunk # :  126


127it [2:53:03, 42.02s/it]

Processing chunk # :  127


128it [2:53:44, 41.62s/it]

Processing chunk # :  128


129it [2:54:20, 81.09s/it]

CPU times: user 2h 51min 21s, sys: 1min 17s, total: 2h 52min 38s
Wall time: 2h 54min 20s



