**Otto Recommender System:** Converting the train.jsonl file to the TFRecord file format. The idea is to use the TFRecord files later on to learn TensorFlow Recommenders (TFRS) while implementing this use case.  

The following resources gave me a good understanding of the what, why, and how of the TFRecord file format.

* https://www.tensorflow.org/tutorials/load_data/tfrecord
* https://keras.io/examples/keras_recipes/creating_tfrecords/
* https://www.kaggle.com/code/ryanholbrook/tfrecords-basics/notebook

I am just starting to learn TensorFlow, so your remarks and comments will certainly help me improve my knowledge.

In [None]:
import numpy as np 
import pandas as pd 
import os
import zipfile
import json
from tqdm import tqdm
import tensorflow as tf

In [None]:
# Directory holding the OTTO recommender-system input files.
INPUT_DIR = '../input/otto-recommender-system'
# Training data in JSON Lines format; this is the file converted to TFRecord below.
TRAIN_FILE = f'{INPUT_DIR}/train.jsonl'
# Test data path; not referenced by the cells visible in this section.
TEST_FILE = f'{INPUT_DIR}/test.jsonl'

In [None]:
def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in an int64_list ``tf.train.Feature``."""
    # Int64List expects a sequence, so the scalar is placed in a one-element list.
    wrapped = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=wrapped)

def serialize_example(session, aid, ts, typ):
    """Build a ``tf.train.Example`` from one event and return it serialized.

    The four values are stored as int64 features under the keys
    ``'session'``, ``'aid'``, ``'ts'`` and ``'typ'``.

    Returns:
        The serialized protocol-buffer message, ready to be written to a
        TFRecord file.
    """
    raw_values = {'session': session, 'aid': aid, 'ts': ts, 'typ': typ}
    # Map every feature name to its tf.train.Example-compatible wrapper.
    feature = {key: _int64_feature(val) for key, val in raw_values.items()}
    features_msg = tf.train.Features(feature=feature)
    return tf.train.Example(features=features_msg).SerializeToString()

In [None]:
def preprocess(df):
    """Summarise each session row of ``df`` into flat lists and per-type aid lists.

    ``df`` must provide a ``'session'`` column and an ``'events'`` column whose
    cells are non-empty lists of dicts with ``'aid'``, ``'ts'`` and ``'type'``.

    Returns:
        A ``(features, initial_data)`` pair. ``features`` holds three parallel
        lists: the session ids as strings, the aid of each session's first
        event as a string, and that first event's timestamp. ``initial_data``
        maps each session id string to its aids (as strings) grouped under
        ``'clicks'``, ``'carts'`` and ``'orders'``.
    """
    session_list, item_A_list, ts_list = [], [], []
    initial_data = {}

    for row_label in df.index:
        sid = str(df['session'][row_label])
        session_list.append(sid)

        events = df['events'][row_label]
        first_event = events[0]
        item_A_list.append(str(first_event['aid']))
        ts_list.append(first_event['ts'])

        grouped = {'clicks': [], 'carts': [], 'orders': []}
        for ev in events:
            kind = ev['type']
            # Anything that is neither a click nor a cart is counted as an order.
            bucket = kind if kind in ('clicks', 'carts') else 'orders'
            grouped[bucket].append(str(ev['aid']))
        initial_data[sid] = grouped

    features = {
        'session_list': session_list,
        'item_A_list': item_A_list,
        'ts_list': ts_list,
    }
    return features, initial_data

def features_list(train_df):
    """Flatten the per-session event lists of ``train_df`` into four parallel lists.

    ``train_df`` must provide a ``'session'`` column and an ``'events'`` column
    whose cells are lists of dicts with ``'aid'``, ``'ts'`` and ``'type'``.

    Returns:
        ``(session, aid, ts, typ)`` with one entry per event. ``typ`` holds the
        event type encoded as 1 (clicks), 2 (carts) or 3 (orders).

    Raises:
        KeyError: If an event has a type other than the three above.
    """
    type_codes = {'clicks': 1, 'carts': 2, 'orders': 3}

    # Collect one (session, aid, ts, typ) row per event, then transpose.
    rows = []
    for row_label in train_df.index:
        sid = train_df['session'][row_label]
        for ev in train_df['events'][row_label]:
            rows.append((sid, ev['aid'], ev['ts'], type_codes[ev['type']]))

    if not rows:
        # zip(*[]) yields nothing to unpack, so return the empty lists directly.
        return [], [], [], []

    session, aid, ts, typ = (list(column) for column in zip(*rows))
    return session, aid, ts, typ

def write_tfrecord(file_num, session, aid, ts, typ, output_dir='/kaggle/working/'):
    """Write one chunk of events to a ZLIB-compressed TFRecord file and archive it.

    Each position across the four parallel sequences becomes one serialized
    example in ``train_chunk_{file_num}.tfrecord``. The finished file is then
    handed to ``zip_tfrecord``, which adds it to the zip archive and deletes it.

    Args:
        file_num: Chunk number used in the output file name.
        session: Session id of every event.
        aid: Article id of every event.
        ts: Timestamp of every event.
        typ: Integer type code of every event.
        output_dir: Directory the TFRecord file and the zip archive live in.
    """
    filename = f'train_chunk_{file_num}.tfrecord'
    # Write into output_dir explicitly: zip_tfrecord looks for the file there,
    # so relying on the current working directory only works when both coincide.
    record_path = os.path.join(output_dir, filename)
    options = tf.io.TFRecordOptions(compression_type='ZLIB')

    with tf.io.TFRecordWriter(record_path, options=options) as writer:
        # The sequences are expected to have equal length; zip stops at the shortest.
        for record in zip(session, aid, ts, typ):
            writer.write(serialize_example(*record))
    zip_tfrecord(filename, output_dir)


def zip_tfrecord(filename, output_dir='/kaggle/working/'):
    """Append ``filename`` from ``output_dir`` to ``otto_tfrecord.zip`` and delete it.

    Args:
        filename: Bare name of the file to archive, located in ``output_dir``.
        output_dir: Directory holding both the file and the zip archive.
    """
    zip_file = os.path.join(output_dir, 'otto_tfrecord.zip')
    source_path = os.path.join(output_dir, filename)

    # Mode 'a' creates the archive on the first call and appends on later ones.
    with zipfile.ZipFile(zip_file, 'a') as zf:
        # No arcname is given, so the member is stored under source_path minus
        # its leading path separator.
        zf.write(source_path)
    # Reached only if archiving did not raise, so the data is never lost.
    os.remove(source_path)

In [None]:
%%time
# Count the lines of the training file up front so the number of chunks is known.
# The with-block closes the file handle as soon as counting is done.
with open(TRAIN_FILE) as train_fh:
    num_lines = sum(1 for _ in train_fh)
print('Number of lines: ', num_lines)

# Number of lines read per chunk; the last chunk may be smaller, hence the ceil.
chunk_size = 100000
num_chunks = int(np.ceil(num_lines / chunk_size))
print('Number of chunks: ', num_chunks)

In [None]:
%%time
# Read the training file chunk_size lines at a time; the with-block closes the
# underlying reader once every chunk has been converted.
with pd.read_json(TRAIN_FILE, lines=True, chunksize=chunk_size) as chunks:
    # enumerate() has no length, so give tqdm the precomputed chunk count to
    # let the progress bar show a total.
    for e, chunk in tqdm(enumerate(chunks), total=num_chunks):
        print('Processing chunk # : ', e)
        session, aid, ts, typ = features_list(chunk)
        write_tfrecord(e, session, aid, ts, typ)