<a href="https://colab.research.google.com/github/nisarahamedk/kaggle-riid/blob/master/notebooks/RIID_TF_Transformer_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### RIID TF Transformer Inference

In [1]:
%%capture
!rm -f /opt/conda/lib/python3.7/site-packages/llvmlite-0.31.0-py3.7.egg-info
!pip install ../input/sparse-package-1/numpy-1.19.4-cp37-cp37m-manylinux2010_x86_64.whl
!pip install ../input/sparse-package-1/setuptools-51.1.1-py3-none-any.whl
!pip install ../input/sparse-package-1/scipy-1.6.0-cp37-cp37m-manylinux1_x86_64.whl
!pip install ../input/sparse-package-1/llvmlite-0.35.0-cp37-cp37m-manylinux2010_x86_64.whl
!pip install ../input/sparse-package-1/numba-0.52.0-cp37-cp37m-manylinux2014_x86_64.whl
!pip install ../input/sparse-package-1/sparse-0.11.2-py2.py3-none-any.whl

In [2]:
import itertools
import pickle
from datetime import datetime

import tensorflow as tf
import tensorflow.keras as keras
import pandas as pd
import numpy as np
import sparse

# Seed NumPy's and TensorFlow's global RNGs. Python's `random` module, which
# FakeDataGenerator.build_from_train also draws from, is not seeded here.
np.random.seed(42)
tf.random.set_seed(42)

In [None]:
# Sentinel that lets tf.data choose buffer sizes / parallelism at runtime.
# Not referenced by any other cell visible in this notebook.
AUTOTUNE = tf.data.experimental.AUTOTUNE

### Model

#### Positional Encoding

In [None]:
def get_angles(pos, i, d_model):
  """Return the sinusoid arguments pos / 10000^(2*(i//2)/d_model).

  Args:
    pos: position index/indices (scalar or array).
    i: embedding-dimension index/indices; each pair (2k, 2k+1) shares a rate.
    d_model: embedding width used to normalise the exponent.

  Returns:
    `pos` multiplied by the per-dimension angle rate (NumPy broadcasting applies).
  """
  # consecutive even/odd dimensions map to the same exponent
  exponent = (2 * (i // 2)) / np.float32(d_model)
  rate_per_dim = 1 / np.power(10000, exponent)
  return pos * rate_per_dim

In [None]:
def positional_encoding(position, d_model):
  """Build the sinusoidal positional-encoding table.

  Args:
    position: number of positions (rows) to encode.
    d_model: embedding width (columns).

  Returns:
    float32 tensor of shape (1, position, d_model); even columns hold
    sin(angle), odd columns hold cos(angle).
  """
  row_positions = np.arange(position)[:, np.newaxis]
  column_dims = np.arange(d_model)[np.newaxis, :]
  table = get_angles(row_positions, column_dims, d_model)

  # even dimensions (2i) -> sine
  table[:, 0::2] = np.sin(table[:, 0::2])
  # odd dimensions (2i+1) -> cosine
  table[:, 1::2] = np.cos(table[:, 1::2])

  # leading axis of size 1 so the table broadcasts over the batch
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

#### Look ahead mask


In [None]:
def create_look_ahead_mask(size):
  """Return a (size, size) mask with 1 above the diagonal and 0 elsewhere.

  A 1 at [row, col] marks a position `col` that lies after `row`; the
  attention function multiplies these entries by -1e9 before the softmax.
  """
  # band_part(..., -1, 0) keeps the lower triangle including the diagonal
  lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - lower_triangle  # (seq_len, seq_len)

#### Scaled Dot Product Attention

In [None]:
def scaled_dot_product_attention(q, k, v, mask):
  """Compute softmax(q·kᵀ / sqrt(depth) + mask * -1e9) · v.

  q, k and v must share their leading dimensions, and k and v must share
  their penultimate dimension (seq_len_k == seq_len_v). The mask must be
  broadcastable to the logits so it can be added to them.

  Args:
    q: query, shape (..., seq_len_q, depth)
    k: key, shape (..., seq_len_k, depth)
    v: value, shape (..., seq_len_v, depth_v)
    mask: float tensor broadcastable to (..., seq_len_q, seq_len_k), or None
      to skip masking.

  Returns:
    (output, attention_weights) where output is (..., seq_len_q, depth_v)
    and attention_weights is (..., seq_len_q, seq_len_k).
  """
  # raw similarity scores, (..., seq_len_q, seq_len_k)
  scores = tf.matmul(q, k, transpose_b=True)

  # divide by sqrt of the key depth
  key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
  logits = scores / tf.math.sqrt(key_depth)

  # masked positions receive a large negative logit
  if mask is not None:
    logits = logits + (mask * -1e9)

  # normalise over the key axis so each query's weights sum to 1
  attention_weights = tf.nn.softmax(logits, axis=-1)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)
  return output, attention_weights

#### Multi Head Attention

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention: project q/k/v, attend per head, merge the heads.

  Args:
    d_model: width of the q/k/v and output projections; must be divisible
      by `num_heads`.
    num_heads: number of attention heads.
  """

  def __init__(self, d_model, num_heads):
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    # width processed by each individual head
    self.depth = d_model // self.num_heads

    # NOTE: sub-layers are created in the same order as before (wq, wk, wv,
    # dense) so weight ordering is unchanged.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape (batch, seq, d_model) into (batch, num_heads, seq, depth)."""
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Return (output, attention_weights) for value/key/query inputs.

    output is (batch, seq_len_q, d_model); attention_weights is
    (batch, num_heads, seq_len_q, seq_len_k).
    """
    n_batch = tf.shape(q)[0]

    # linear projections, each (batch, seq_len, d_model), then
    # split into heads: (batch, num_heads, seq_len, depth)
    q = self.split_heads(self.wq(q), n_batch)
    k = self.split_heads(self.wk(k), n_batch)
    v = self.split_heads(self.wv(v), n_batch)

    # per_head: (batch, num_heads, seq_len_q, depth)
    per_head, attention_weights = scaled_dot_product_attention(q, k, v, mask)

    # back to (batch, seq_len_q, num_heads, depth), then merge the heads
    heads_last = tf.transpose(per_head, perm=[0, 2, 1, 3])
    merged = tf.reshape(heads_last, (n_batch, -1, self.d_model))

    projected = self.dense(merged)  # (batch, seq_len_q, d_model)
    return projected, attention_weights

#### Pointwise FeedForward Network

In [None]:
def point_wise_feed_forward_network(d_model, dff):
  """Return a two-layer position-wise MLP: Dense(dff, relu) -> Dense(d_model)."""
  expand = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
  project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([expand, project])

#### EncoderLayer

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention and feed-forward sub-layers, each
  followed by dropout, a residual connection and layer normalisation.

  Args:
    d_model: model width.
    num_heads: number of attention heads.
    dff: hidden width of the feed-forward sub-layer.
    rate: dropout rate applied after each sub-layer.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    # creation order is kept as-is so weight ordering is unchanged
    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    """Apply the block to x of shape (batch_size, input_seq_len, d_model)."""
    # self-attention sub-layer (v, k, q are all x); attention weights unused
    attended, _ = self.mha(x, x, x, mask)
    attended = self.dropout1(attended, training=training)
    after_attention = self.layernorm1(x + attended)

    # position-wise feed-forward sub-layer
    transformed = self.ffn(after_attention)
    transformed = self.dropout2(transformed, training=training)

    return self.layernorm2(after_attention + transformed)

#### Encoder

In [None]:
class Encoder(tf.keras.layers.Layer):
  """Embeds five per-step features, adds positional encoding and applies a
  stack of `EncoderLayer`s.

  Args:
    num_layers: number of encoder blocks.
    d_model: model / embedding width.
    num_heads: attention heads per block.
    dff: feed-forward hidden width per block.
    maximum_position_encoding: number of positions in the positional table.
    embed_size_dict: mapping with keys "content_id", "task_container_id"
      and "part"; each value + 2 is used as that embedding's vocabulary size.
    rate: dropout rate.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, maximum_position_encoding, embed_size_dict, rate):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    # creation order is kept as-is so weight ordering is unchanged
    self.content_id_emb = tf.keras.layers.Embedding(embed_size_dict["content_id"] + 2, d_model)
    self.task_container_id_emb = tf.keras.layers.Embedding(embed_size_dict["task_container_id"] + 2, d_model)
    self.part_emb = tf.keras.layers.Embedding(embed_size_dict["part"] + 2, d_model)
    # the elapsed-time feature is projected with a Dense layer, not looked up
    self.prior_question_elapsed_time_emb = tf.keras.layers.Dense(d_model, use_bias=True)
    self.prev_answered_emb = tf.keras.layers.Embedding(4, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

    self.enc_layers = [
        EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)
    ]

    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    """Encode x whose last axis holds, in order: content_id,
    task_container_id, prior_question_elapsed_time, part, prev_answered.

    Returns a (batch_size, input_seq_len, d_model) tensor.
    """
    n_steps = tf.shape(x)[1]

    # one d_model-wide vector per feature, each (batch_size, input_seq_len, d_model)
    content_vec = self.content_id_emb(x[..., 0])
    container_vec = self.task_container_id_emb(x[..., 1])
    elapsed_vec = self.prior_question_elapsed_time_emb(tf.expand_dims(x[..., 2], axis=-1))
    part_vec = self.part_emb(x[..., 3])
    prev_answer_vec = self.prev_answered_emb(x[..., 4])
    x = content_vec + container_vec + elapsed_vec + part_vec + prev_answer_vec

    # scale the summed embedding, then add the positional table for these steps
    x = x * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[:, :n_steps, :]

    x = self.dropout(x, training=training)

    for layer_idx in range(self.num_layers):
      x = self.enc_layers[layer_idx](x, training, mask)

    return x

In [None]:
class TransformerSeq2SeqClassifier(keras.models.Model):
  """Encoder with a look-ahead mask followed by a per-step sigmoid unit.

  Constructor arguments are forwarded unchanged to `Encoder`.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, maximum_position_encoding, embed_size_dict, rate=0.1):
    super().__init__()

    self.encoder = Encoder(num_layers, d_model, num_heads, dff, maximum_position_encoding, embed_size_dict, rate)
    self.out = tf.keras.layers.Dense(1, activation="sigmoid")

  def call(self, x):
    """Return per-step outputs of shape [batch_size, input_seq_len, 1]."""
    # mask sized to the current sequence length
    n_steps = tf.shape(x)[1]
    causal_mask = create_look_ahead_mask(n_steps)

    hidden = self.encoder(x, mask=causal_mask)
    return self.out(hidden)

#### Embedding sizes

In [None]:
# Root directory of the preprocessed dataset; emb_sz.pkl is read from here.
# The commented-out line is an alternative gs:// bucket location.
# DATA_PATH = 'gs://kds-f48a9c4d95386273c0ef508e337abd3f874b82a454a6c3d0e035839c'
DATA_PATH = "/kaggle/input/riid-0-1"

In [None]:
# Mapping used by Encoder to size its embeddings (keys "content_id",
# "task_container_id" and "part" are read there).
# NOTE(security): unpickling executes arbitrary code; only load trusted files.
embed_sizes = pickle.loads(tf.io.read_file(DATA_PATH + "/emb_sz.pkl").numpy())
embed_sizes

In [None]:
# Instantiate the classifier and build it for 5 input features per step.
# NOTE(review): the positional table only covers 128 positions, while get_x
# can emit up to SEQ_LEN + 1 steps — confirm before enabling predict().
# NOTE(review): get_x produces 197 features per step, but the model is built
# for (and Encoder.call reads) 5 — confirm the intended input layout.
model = TransformerSeq2SeqClassifier(
      num_layers=1,
      d_model=512,
      num_heads=8,
      dff=1024,
      maximum_position_encoding=128,
      embed_size_dict=embed_sizes
  )
model.build(input_shape=(128, 128, 5)) # input_shape - [batch_size, seq_len, features]
model.summary()

In [None]:
# Restore trained weights into the freshly built model.
model.load_weights("/kaggle/input/riid-model-1/best-model.h5")

### Dataset

#### Question df

In [3]:
# Question metadata indexed by question_id; preprocess() joins it onto the
# test frame via content_id to obtain the "part" and "tags" columns.
questions_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/questions.csv", index_col="question_id")

#### Test API

In [4]:
# Column -> dtype mapping, presumably meant for reading the train set with
# pandas; it is not used by any cell visible in this notebook.
dtypes_train = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean'
    }

# NOTE(review): only "part" has a dtype; the empty strings look like
# unfinished placeholders. Also unused in the visible cells.
dtypes_questions = {
    "question_id": "",
    "bundle_id": "",
    "correct_answer": "",
    "part": "int16",
    "tags": "",
}

In [5]:
# Number of history steps per user; get_x allocates SEQ_LEN + 1 rows
# (history plus the current interaction).
SEQ_LEN = 512

#### Submission

In [6]:
import cProfile, pstats, io
from pstats import SortKey

### Fake API

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import os
import random
from copy import deepcopy
import _pickle as pickle

def save(file, name, folder=""):
    """Pickle `file` to `<name>.pickle`, optionally under `./<folder>/`.

    Args:
        file: the object to serialise.
        name: target path without the ".pickle" suffix.
        folder: optional sub-directory of the working directory; when given,
            the target is "./<folder>/<name>.pickle".
    """
    if folder != "":
        path = './' + folder + '/' + name + '.pickle'
    else:
        path = name + '.pickle'
    # The original referenced `outfile.close` without calling it, so the
    # handle was never closed explicitly; `with` closes (and flushes) it.
    with open(path, 'wb') as outfile:
        pickle.dump(file, outfile, protocol=4)
    
def load(name, folder=""):
    """Unpickle and return the object stored at `<name>.pickle`.

    Args:
        name: source path without the ".pickle" suffix.
        folder: optional sub-directory of the working directory; when given,
            the source is "./<folder>/<name>.pickle".

    Returns:
        The unpickled object.
    """
    if folder != "":
        path = './' + folder + '/' + name + '.pickle'
    else:
        path = name + '.pickle'
    # NOTE(security): unpickling executes arbitrary code; only load trusted files.
    # The original referenced `outfile.close` without calling it; `with`
    # guarantees the handle is closed.
    with open(path, 'rb') as infile:
        return pickle.load(infile)

In [8]:
class FakeDataGenerator:
    """Replays slices of the train set in batches, mimicking the test API.

    Attributes (all None until `load` or `build_from_train` has run):
        data: dict mapping batch number -> DataFrame for that batch.
        all_rows: array of the train row_ids used by the generator.
        data_index: array of the batch numbers available in `data`.
    """

    def __init__(self):
        self.data = None
        self.all_rows = None
        self.data_index = None

    def __getitem__(self, idx):
        """Return `(sample, sub)` for batch `idx`.

        `sample` is the stored batch frame; `sub` is a copy of its
        'row_id' and 'group_num' columns plus an 'answered_correctly'
        column filled with 0.5.

        Raises:
            StopIteration: when `idx` is past the last stored batch. Kept
                (instead of IndexError) for backward compatibility; iterating
                the object through `__getitem__` stops on it.
        """
        if idx > self.data_index[-1]:
            raise StopIteration
        sample = self.data[idx]
        sub = sample[['row_id', 'group_num']].copy()
        sub['answered_correctly'] = np.full(sub.shape[0], 0.5)
        return (sample, sub)

    def load(self, save_name):
        """Restore `data` and `all_rows` from a pickle written by `build_from_train`."""
        self.data, self.all_rows = load(save_name)
        self.data_index = np.array(list(self.data.keys()))

    def build_from_train(self, train, n_users, beginner_rate=0.3, save_name='fake_train_generator'):
        """Build the batches from a train frame, store them on self and pickle them.

        Args:
            train: the loaded training set (needs user_id, row_id,
                task_container_id, content_type_id, user_answer and
                answered_correctly columns). It is not modified.
            n_users: number of distinct users to sample (capped at the number
                of users available).
            beginner_rate: fraction of sampled users that start at their first
                task container; the others start at a random container.
            save_name: name passed to `save` for the pickled result.
        """
        ## Sampling a restricted list of users
        user_list = train['user_id'].unique()
        # Sample without replacement: a user drawn twice would duplicate
        # every one of their rows (and row_ids) in the generated data.
        n_users = min(n_users, len(user_list))
        test_user_list = np.random.choice(user_list, size=n_users, replace=False)
        # Boolean selection instead of re-indexing `train`, so the caller's
        # frame is left untouched.
        test_data_non_filter = train[train['user_id'].isin(test_user_list)].reset_index(drop=True)

        ## building a dictionary with all the rows and container ids of a user
        dico_user = {}

        print("Generating user dictionnary")
        for user, frame in tqdm(test_data_non_filter.groupby('user_id'), total=test_data_non_filter['user_id'].nunique()):
            if frame.shape[0] == 0:
                continue

            min_indice = int(frame['task_container_id'].min())
            max_indice = int(frame['task_container_id'].max())

            r = random.uniform(0, 1)
            if r < beginner_rate:
                current_indice = min_indice
            else:
                # Users spanning fewer than 3 containers would make the upper
                # bound smaller than the lower one and crash randint.
                current_indice = random.randint(min_indice, max(min_indice, max_indice - 2))

            # task_container_id -> list of row_ids in that container
            row_ids = frame.groupby('task_container_id')['row_id'].apply(list).to_dict()

            dico_user[user] = {
                'min_indice': min_indice,
                'max_indice': max_indice,
                'current_indice': current_indice,
                'row_ids': row_ids,
            }

        ## Choosing batch_data to generate
        work_dico = deepcopy(dico_user)
        batches = {}

        all_rows = []
        batch_number = 0

        print('Creating batches')
        # Loop ends once at most one user is left in the pool.
        while len(work_dico) > 1:

            size = min(random.randint(20, 500), len(work_dico))

            batch = []

            users = np.random.choice(np.array(list(work_dico.keys())), replace=False, size=size)

            for u in users:
                user_state = work_dico[u]
                try:
                    rows = user_state['row_ids'][user_state['current_indice']]
                except KeyError:
                    # No rows for this container id: retire the user.
                    work_dico.pop(u)
                    continue

                batch.extend(rows)
                all_rows.extend(rows)
                user_state['current_indice'] += 1
                if user_state['current_indice'] == user_state['max_indice']:
                    work_dico.pop(u)

            batches[batch_number] = batch
            batch_number += 1

        ## building data

        data = {}
        saved_answer = None
        saved_correct_answer = None

        print("Building dataset")
        test_data_non_filter.index = test_data_non_filter['row_id']
        for i in tqdm(batches):
            # .copy() so the column assignments below act on an independent frame
            current_data = test_data_non_filter.loc[np.array(batches[i])].copy()
            current_data['group_num'] = i

            n_rows = current_data.shape[0]
            # object dtype so the first row can hold the stringified lists
            current_data['prior_group_answers_correct'] = np.full(n_rows, np.nan, dtype=object)
            current_data['prior_group_responses'] = np.full(n_rows, np.nan, dtype=object)

            if i != 0 and n_rows > 0:
                # Single-step positional assignment; the original chained
                # `df[col].iloc[0] = ...` may write to a temporary copy.
                correct_col = current_data.columns.get_loc('prior_group_answers_correct')
                responses_col = current_data.columns.get_loc('prior_group_responses')
                current_data.iloc[0, correct_col] = saved_correct_answer
                current_data.iloc[0, responses_col] = saved_answer

            # Answers of this batch's questions (content_type_id == 0) are
            # handed to the next batch, then hidden from this one.
            questions_only = current_data[current_data['content_type_id'] == 0]
            saved_answer = str(list(questions_only['user_answer'].values))
            saved_correct_answer = str(list(questions_only['answered_correctly'].values))
            current_data = current_data.drop(columns=['user_answer', 'answered_correctly'])

            data[i] = current_data

        save((data, np.array(all_rows)), save_name)

        self.data = data
        self.all_rows = np.array(all_rows)
        self.data_index = np.array(list(data.keys()))
        print('finished')

In [9]:
# Local stand-in for the competition test API.
env = FakeDataGenerator()

In [10]:
# Load pre-generated batches (the ".pickle" suffix is appended by load()).
env.load("/kaggle/input/fake-api-generation-riid-preprocessed/fake_train_generator")

In [11]:
# The generator is iterated directly; each item is a (test_df, sample_prediction) pair.
iter_test = env

In [23]:
def time(func):
    """Decorator that prints how long each call to `func` takes.

    The wrapped function returns whatever `func` returns. Nothing is printed
    when `func` raises.

    Note: this name shadows the standard-library `time` module within the
    notebook namespace.
    """
    import functools  # local import keeps this cell self-contained

    # wraps() preserves func's __name__/__doc__ on the wrapper, so stacked
    # decorators and introspection see the real function name.
    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        t0 = datetime.now()

        ret = func(*args, **kwargs)

        dt = datetime.now() - t0
        # ":.2" formats the duration with two significant digits
        print(f"==> {func.__name__} took {dt.total_seconds():.2} sec")
        return ret
    return wrapped

def profile(func):
    """Decorator that runs `func` under cProfile and prints the top entries.

    After a successful call, the statistics are sorted by cumulative time and
    the first 2% of entries are printed. The wrapped function returns
    whatever `func` returns; exceptions propagate unchanged.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        pr = cProfile.Profile()
        pr.enable()
        try:
            ret = func(*args, **kwargs)
        finally:
            # always stop profiling, even if func raises; otherwise the
            # profiler stays active for the rest of the session
            pr.disable()

        s = io.StringIO()
        sortby = SortKey.CUMULATIVE
        ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
        ps.print_stats(0.02) # print only 2% of the data.
        print(s.getvalue())
        return ret
    return wrapped

@time
def preprocess(test_dfm, prev_test_df):
    """Record the previous batch's answers in `state_dict` and build the
    feature frame for the current batch.

    Args:
        test_dfm: current batch (questions only). A constant
            'answered_correctly' column is added to it in place.
        prev_test_df: frame returned by the previous call, or None for the
            first batch. It gains an 'answered_correctly' column in place.

    Returns:
        DataFrame with columns user_id, timestamp, content_id,
        task_container_id, prior_question_elapsed_time,
        prior_question_had_explanation, part, answered_correctly, tags.

    Side effects:
        Appends one 197-wide row per previous-batch interaction to the
        module-level `state_dict[user]`.
    """
    # we have prev df's answers, so store the prev_test_df in state_dict
    if prev_test_df is not None:
        oh_tags = one_hot_tags(prev_test_df["tags"])
        # NOTE(security): eval() runs arbitrary code. The value comes from the
        # data feed; a literal parser would be safer if the format allows it.
        # NOTE(review): this reads the first row of the already-filtered
        # frame; if the batch's original first row was dropped by the caller,
        # the answers may be missing here — confirm.
        prior_answers = eval(test_dfm['prior_group_answers_correct'].iloc[0])
        # -1 entries are dropped so only real answers remain
        prev_test_df['answered_correctly'] = [a for a in prior_answers if a != -1]
        prev_test_df['answered_correctly'] += 3

        for i, row in enumerate(prev_test_df.values):
            user = row[0]
            # this state
            s = np.zeros(197)
            s[:7] = row[1:8] # [1:8] - skip user_id and tags.
            s[7:] = oh_tags[i]
            if state_dict.get(user, None) is None: # new user, add to the state dict.
                # stored as (1, 197) so shape[0] is always the number of
                # recorded steps, also for users with a single step
                state_dict[user] = s[np.newaxis, :]
            else: # existing user
                state_dict[user] = np.vstack([state_dict[user], s]) # append to prev features.

    # dummy "answered_correctly" - fill token
    test_dfm["answered_correctly"] = 2

    # join question for features
    test_df = test_dfm.join(questions_df, on="content_id")

    # -- process the current df.
    # selecting required cols; .copy() so the assignments below act on an
    # independent frame rather than a slice of the joined one
    test_df = test_df[["user_id", "timestamp", "content_id", "task_container_id", "prior_question_elapsed_time", "prior_question_had_explanation", "part", "answered_correctly", "tags"]].copy()

    # 0, 1, 2 are special tokens, so increment 3
    indicator_cols = ["content_id", "task_container_id", "part", "prior_question_had_explanation"]
    for c in indicator_cols:
        test_df[c] = test_df[c] + 3

    # same treatment for the tags; a missing (non-string) tags value becomes
    # an empty string instead of crashing on .split
    test_df["tags"] = test_df["tags"].apply(
        lambda row: " ".join([str(int(x) + 3) for x in row.split(" ")]) if isinstance(row, str) else ""
    )

    # FIXME: unseen ids - content_id > 13525, 'task_container_id'> 10002, 'part'> 10
    test_df.loc[test_df["content_id"] > 13525, "content_id"] = 0
    test_df.loc[test_df["task_container_id"] > 10002, "task_container_id"] = 0
    test_df.loc[test_df["part"] > 10, "part"] = 0
    # test_df.fillna(2, inplace=True) # FILL TOKEN

    return test_df

@time
def one_hot_tags(tags):
    """Turn space-separated tag-id strings into multi-hot uint8 rows.

    Args:
        tags: sequence of strings such as "139 70 95", one per question.

    Returns:
        NumPy uint8 array of shape [len(tags), 190]; entry [q, t] counts how
        often tag id t appears for question q.
    """
    # splitting yields a ragged tensor, e.g. [[2], [3, 4]]
    tag_ids = tf.strings.to_number(tf.strings.split(tags), out_type=tf.int32)
    # one-hot every id, then sum over each question's tags (axis 1)
    # NOTE(review): ids >= 190 would one-hot to all zeros — confirm the
    # shifted tag ids never reach that range.
    multi_hot = tf.reduce_sum(tf.one_hot(tag_ids, depth=190), axis=1)
    return multi_hot.numpy().astype(np.uint8)

@time
def get_x(test_df):
    """Assemble one history window per row of `test_df`.

    Each window has 197 features per step (7 scalar features followed by 190
    tag indicators) and ends with the current interaction. Earlier steps come
    from `state_dict` (test-time history) and, when the user is known to
    `train_state_dict`, from the stored train features.

    Args:
        test_df: frame returned by `preprocess` (user_id first, tags last).

    Returns:
        Array of shape (len(test_df), max_seq_len, 197), where max_seq_len is
        the longest window needed in this batch (at least 2, at most
        SEQ_LEN + 1).
    """
    full_len = SEQ_LEN + 1  # history steps plus the current interaction
    max_seq_len = 2
    xb = []
    oh_tags = one_hot_tags(test_df["tags"])
    start_tokens = np.zeros(197)
    start_tokens[:7] = np.ones(7)
    for idx, row in enumerate(test_df.values):
        user = row[0]
        x = np.zeros((full_len, 197))

        # current state always occupies the last step
        x[-1, :7] = row[1:8] # [1:8] - skip user_id and tags.
        x[-1, 7:] = oh_tags[idx]

        # Index into the train feature arrays. Compared against None
        # explicitly because index 0 is a valid (but falsy) value.
        train_idx = train_state_dict.get(user, None)
        test_state = state_dict.get(user, None)

        if test_state is None: # new user
            if train_idx is None:
                x[-2, :] = start_tokens
            else:
                # start tokens
                x[0, :] = start_tokens

                # prev states from train set
                x[1:-1, 0] = int64_feat[train_idx] # timestamp
                x[1:-1, 1:3] = int16_feat[train_idx] # content_id, task_container_id
                x[1:-1, 3] = int32_feat[train_idx] # prior_question_elapsed_time
                x[1:-1, 4:7] = int8_feat[train_idx] # prior_q_had_exp, part, answered_correctly
                x[1:-1, 7:] = tags_feat[train_idx].todense()
                # the whole window is populated, so it must not be cropped
                max_seq_len = full_len
        else: # existing user, get prev states and build time series.
            # prev state from test set. atleast_2d: a single stored step may
            # be a flat (197,) vector, whose shape[0] is not a step count.
            # Only the most recent SEQ_LEN steps fit in the window.
            prev_state = np.atleast_2d(test_state)[-SEQ_LEN:]
            seq_len = prev_state.shape[0] + 1
            x[-seq_len:-1, :] = prev_state
            max_seq_len = max(seq_len, max_seq_len)

            # prev states from train set fill whatever room is left in front
            len_to_fill = full_len - seq_len
            if train_idx is not None and len_to_fill > 0:
                x[:len_to_fill, 0] = int64_feat[train_idx][-len_to_fill:] # timestamp
                x[:len_to_fill, 1:3] = int16_feat[train_idx][-len_to_fill:] # content_id, task_container_id
                x[:len_to_fill, 3] = int32_feat[train_idx][-len_to_fill:] # prior_question_elapsed_time
                x[:len_to_fill, 4:7] = int8_feat[train_idx][-len_to_fill:] # prior_q_had_exp, part, answered_correctly
                x[:len_to_fill, 7:] = tags_feat[train_idx][-len_to_fill:].todense()
                max_seq_len = full_len

        xb.append(x)
    # crop every window to the longest one actually needed in this batch
    x = np.stack(xb, axis=0)[:, -max_seq_len:, :]
    return x

@time
def predict(x):
    """Run the model in inference mode and return the last step's outputs.

    Returns a flat NumPy array with one value per batch element.
    """
    # keep only the final time step of every sequence
    last_step = model(x, training=False)[:, -1, :]
    return last_step.numpy().flatten()

In [13]:
# Test-time history per user; filled by preprocess() and read by get_x().
state_dict = {}
# user -> index into the train feature arrays below (see get_x()).
train_state_dict = dict()
# NOTE(security): unpickling executes arbitrary code; only load trusted files.
with open("/kaggle/input/riid-state-dict-512-0-7/user_index.pkl", "rb") as f:
    train_state_dict = pickle.load(f)
    
# Train-set history features, grouped by storage dtype and densified up
# front. get_x() indexes them with train_state_dict[user].
int64_feat = sparse.load_npz("/kaggle/input/riid-state-dict-512-0-7/int64_feat.npz").todense()
int32_feat = sparse.load_npz("/kaggle/input/riid-state-dict-512-0-7/int32_feat.npz").todense()
int16_feat = sparse.load_npz("/kaggle/input/riid-state-dict-512-0-7/int16_feat.npz").todense()
int8_feat = sparse.load_npz("/kaggle/input/riid-state-dict-512-0-7/int8_feat.npz").todense()
# Tag indicators stay sparse; get_x() densifies one user's slice at a time.
tags_feat = sparse.load_npz("/kaggle/input/riid-state-dict-512-0-7/tags_feat.npz")

In [24]:
# Timing / debugging run over the first 10 batches of the fake API.
# Prediction and submission are currently commented out, so this only
# exercises preprocess() and get_x().
start = datetime.now()
test_dfs = [] # for debug

# frame returned by preprocess() for the previous batch (None on the first one)
prev_test_df = None

for test_dfm, sample_pred in itertools.islice(iter_test, 10):
    print("\n*\n*\n--- iteration ---")
    print(len(test_dfm))
    t0 = datetime.now()
    
    # filtering only questions, removing lectures.
    # NOTE(review): preprocess() reads prior_group_answers_correct from the
    # first row of the filtered frame — confirm that row is never a lecture.
    test_dfm = test_dfm[test_dfm.content_type_id == False].copy()
    
    test_df = preprocess(test_dfm, prev_test_df)

    test_dfs.append(test_df.copy())
    
    # kept so the next iteration can attach this batch's answers to it
    prev_test_df = test_df.copy()
    
    x = get_x(test_df)
    
    # final_preds = predict(x)
        
    # submit
    # test_dfm["answered_correctly"] = final_preds
    
    # env.predict(test_dfm[["row_id", "answered_correctly"]])
    print(f"iteration took: {(datetime.now() - t0).total_seconds():.3}s")
    
total_time = datetime.now() - start
print(f"===== Total time: {total_time.total_seconds():.3} sec")


*
*
--- iteration ---
561
==> preprocess took 0.014 sec
==> one_hot_tags took 0.006 sec
==> get_x took 0.95 sec
iteration took: 0.969s

*
*
--- iteration ---
582
==> one_hot_tags took 0.006 sec
==> preprocess took 0.032 sec
==> one_hot_tags took 0.0056 sec
==> get_x took 0.79 sec
iteration took: 0.827s

*
*
--- iteration ---
54
==> one_hot_tags took 0.0072 sec
==> preprocess took 0.026 sec
==> one_hot_tags took 0.005 sec
==> get_x took 0.067 sec
iteration took: 0.0965s

*
*
--- iteration ---
175
==> one_hot_tags took 0.0063 sec
==> preprocess took 0.019 sec
==> one_hot_tags took 0.0049 sec
==> get_x took 0.24 sec
iteration took: 0.263s

*
*
--- iteration ---
238
==> one_hot_tags took 0.0058 sec
==> preprocess took 0.02 sec
==> one_hot_tags took 0.0048 sec
==> get_x took 0.33 sec
iteration took: 0.349s

*
*
--- iteration ---
345
==> one_hot_tags took 0.0055 sec
==> preprocess took 0.021 sec
==> one_hot_tags took 0.0053 sec
==> get_x took 0.47 sec
iteration took: 0.498s

*
*
--- iterati

In [60]:
# Sanity check: first row of the most recently preprocessed batch.
test_df.iloc[0]

user_id                           1500802804
timestamp                             215339
content_id                              2949
task_container_id                          9
prior_question_elapsed_time            15000
prior_question_had_explanation             3
part                                       7
answered_correctly                         2
tags                               139 70 95
Name: 70526899, dtype: object

In [62]:
# The 7 scalar features of the current (last) step for the first row of x;
# they should match the row shown above.
x[0, -1, :7]

array([2.15339e+05, 2.94900e+03, 9.00000e+00, 1.50000e+04, 3.00000e+00,
       7.00000e+00, 2.00000e+00])

In [63]:
# Positions set to 1 in the tag part of that step; compare with the row's "tags" string.
np.where(x[0, -1, 7:] == 1) # one hot tags

(array([ 70,  95, 139]),)

### Without train_df user states

tf.stack - 500 - 512-sec

np.stack - 500 - 443 sec

np.stack - 500 - GPU - 82 sec

tf.stack - 500 - GPU 142 sec

### With train_df user states.

np.stack 500 GPU - 248 sec