<a href="https://colab.research.google.com/github/nisarahamedk/kaggle-riid/blob/master/notebooks/RIID_TF_Transformer_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### RIID TF Transformer Inference

In [1]:
import ast
import itertools
import pickle
from datetime import datetime

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras

# Seed the global NumPy and TensorFlow random generators with a fixed value.
np.random.seed(42)
tf.random.set_seed(42)

In [2]:
# tf.data autotune sentinel. NOTE(review): not referenced by any other cell
# visible in this notebook.
AUTOTUNE = tf.data.experimental.AUTOTUNE

### Model

#### Positional Encoding

In [3]:
def get_angles(pos, i, d_model):
  """Return the raw (pre-sin/cos) positional-encoding angles.

  Args:
    pos: position index (or array of indices).
    i: embedding-dimension index (or array of indices).
    d_model: embedding width used to normalise the exponent.

  Returns:
    pos / 10000 ** (2 * (i // 2) / d_model), broadcast over pos and i.
  """
  # Dimensions 2k and 2k+1 share one frequency, hence the floor division.
  exponent = (2 * (i//2)) / np.float32(d_model)
  return pos * (1 / np.power(10000, exponent))

In [4]:
def positional_encoding(position, d_model):
  """Build the sinusoidal positional-encoding table.

  Args:
    position: number of positions (rows) to encode.
    d_model: embedding width (columns).

  Returns:
    float32 tensor of shape (1, position, d_model).
  """
  positions = np.arange(position)[:, np.newaxis]
  dims = np.arange(d_model)[np.newaxis, :]
  table = get_angles(positions, dims, d_model)

  even_cols = np.s_[:, 0::2]
  odd_cols = np.s_[:, 1::2]

  # sin on even embedding indices (2i), cos on odd ones (2i+1)
  table[even_cols] = np.sin(table[even_cols])
  table[odd_cols] = np.cos(table[odd_cols])

  # leading axis of size 1 so the table broadcasts over the batch
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

#### Look ahead mask


In [5]:
def create_look_ahead_mask(size):
  """Return a (size, size) mask with 1s strictly above the diagonal.

  Entry [i, j] is 1 when j > i (a later position that must be hidden from
  position i) and 0 otherwise.
  """
  # band_part(..., -1, 0) keeps the lower triangle including the diagonal.
  visible = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - visible  # (seq_len, seq_len)

#### Scaled Dot Product Attention

In [6]:
def scaled_dot_product_attention(q, k, v, mask=None):
  """Calculate the attention weights and the attended values.

  q, k, v must have matching leading dimensions.
  k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
  The mask has different shapes depending on its type (padding or look ahead)
  but it must be broadcastable for addition.

  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable
          to (..., seq_len_q, seq_len_k); entries equal to 1 are masked out.
          Defaults to None (no masking).

  Returns:
    output, attention_weights
  """

  logits = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

  # scale by sqrt(depth of the keys)
  depth = tf.cast(tf.shape(k)[-1], tf.float32)
  logits = logits / tf.math.sqrt(depth)

  # Masked positions receive a large negative logit, so softmax assigns them
  # a weight of (numerically) zero.
  if mask is not None:
    logits += (mask * -1e9)

  # softmax is normalized on the last axis (seq_len_k) so that the scores
  # add up to 1.
  attention_weights = tf.nn.softmax(logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

#### Multi Head Attention

In [7]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention: project q/k/v, attend per head, merge, project.

  d_model must be divisible by num_heads; each head works on
  depth = d_model // num_heads features.
  """

  def __init__(self, d_model, num_heads):
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads

    # Input projections for query, key and value.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    # Output projection applied after the heads are concatenated.
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape (batch_size, seq_len, d_model) into
    (batch_size, num_heads, seq_len, depth).
    """
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Return (output, attention_weights) for the given value/key/query."""
    n_batch = tf.shape(q)[0]

    # project to (batch_size, seq_len, d_model), then split into
    # (batch_size, num_heads, seq_len, depth)
    q = self.split_heads(self.wq(q), n_batch)
    k = self.split_heads(self.wk(k), n_batch)
    v = self.split_heads(self.wv(v), n_batch)

    # per_head.shape == (batch_size, num_heads, seq_len_q, depth)
    # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
    per_head, attention_weights = scaled_dot_product_attention(q, k, v, mask)

    # back to (batch_size, seq_len_q, num_heads, depth) ...
    per_head = tf.transpose(per_head, perm=[0, 2, 1, 3])
    # ... and merge the heads: (batch_size, seq_len_q, d_model)
    merged = tf.reshape(per_head, (n_batch, -1, self.d_model))

    return self.dense(merged), attention_weights

#### Pointwise FeedForward Network

In [8]:
def point_wise_feed_forward_network(d_model, dff):
  """Two-layer position-wise MLP: d_model -> dff (ReLU) -> d_model."""
  expand = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
  project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([expand, project])

#### EncoderLayer

In [9]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention and a feed-forward network, each
  followed by dropout, a residual connection and layer normalisation.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    """Apply the block to x of shape (batch_size, input_seq_len, d_model)."""
    # Self-attention sub-block; the attention weights are not needed here.
    attended, _ = self.mha(x, x, x, mask)
    attended = self.dropout1(attended, training=training)
    after_attention = self.layernorm1(x + attended)

    # Feed-forward sub-block.
    transformed = self.ffn(after_attention)
    transformed = self.dropout2(transformed, training=training)

    return self.layernorm2(after_attention + transformed)  # (batch_size, input_seq_len, d_model)

#### Encoder

In [10]:
class Encoder(tf.keras.layers.Layer):
  """Embeds the five per-interaction features, adds positional encoding and
  runs the result through a stack of EncoderLayer blocks.

  The last axis of the input is read as:
    0 content_id, 1 task_container_id, 2 prior_question_elapsed_time,
    3 part, 4 previous answer.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, maximum_position_encoding, embed_size_dict, rate):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    # Vocabulary sizes come from embed_size_dict plus the offsets below.
    self.content_id_emb = tf.keras.layers.Embedding(embed_size_dict["content_id"] + 1, d_model)
    self.task_container_id_emb = tf.keras.layers.Embedding(embed_size_dict["task_container_id"] + 1, d_model)
    self.part_emb = tf.keras.layers.Embedding(embed_size_dict["part"] + 2, d_model)
    # The elapsed time is continuous, so it is projected with a Dense layer.
    self.prior_question_elapsed_time_emb = tf.keras.layers.Dense(d_model, use_bias=True)
    self.prev_answered_emb = tf.keras.layers.Embedding(4, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding,
                                            self.d_model)

    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    """Encode x of shape (batch_size, input_seq_len, 5)."""
    seq_len = tf.shape(x)[1]

    # One (batch_size, input_seq_len, d_model) embedding per feature.
    content = self.content_id_emb(x[..., 0])
    container = self.task_container_id_emb(x[..., 1])
    elapsed = self.prior_question_elapsed_time_emb(tf.expand_dims(x[..., 2], axis=-1))
    part = self.part_emb(x[..., 3])
    prev_answer = self.prev_answered_emb(x[..., 4])
    hidden = content + container + elapsed + part + prev_answer

    # Scale the summed embeddings, then add the positional encoding.
    hidden = hidden * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    hidden = hidden + self.pos_encoding[:, :seq_len, :]

    hidden = self.dropout(hidden, training=training)

    for block in self.enc_layers[:self.num_layers]:
      hidden = block(hidden, training, mask)  # (batch_size, input_seq_len, d_model)

    return hidden

In [11]:
class TransformerSeq2SeqClassifier(keras.models.Model):
  """Causal transformer encoder with a per-position sigmoid output.

  Args:
    num_layers: number of EncoderLayer blocks.
    d_model: embedding / hidden width.
    num_heads: attention heads per block.
    dff: hidden width of the feed-forward sub-block.
    maximum_position_encoding: number of positions in the encoding table.
    embed_size_dict: dict with "content_id", "task_container_id" and "part"
      sizes, forwarded to Encoder.
    rate: dropout rate.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, maximum_position_encoding, embed_size_dict, rate=0.1):
    super(TransformerSeq2SeqClassifier, self).__init__()

    self.encoder = Encoder(num_layers, d_model, num_heads, dff, maximum_position_encoding, embed_size_dict, rate)
    self.out = tf.keras.layers.Dense(1, activation="sigmoid")

  def call(self, x, training=None):
    """Return one sigmoid output per position.

    Args:
      x: tensor of shape (batch_size, input_seq_len, 5).
      training: dropout switch. It is forwarded explicitly to the encoder
        instead of relying on Keras to inject it from the call context;
        None leaves the decision to Keras.

    Returns:
      Tensor of shape (batch_size, input_seq_len, 1).
    """
    seq_len = tf.shape(x)[1]
    # Each position may only attend to itself and earlier positions.
    look_ahead_mask = create_look_ahead_mask(seq_len)
    encoded = self.encoder(x, training=training, mask=look_ahead_mask)

    out = self.out(encoded)
    return out # [batch_size, input_seq_len, 1]

#### Embedding sizes

In [12]:
# Directory from which the next cell reads emb_sz.pkl.
# Alternative (bucket) location, kept for reference:
# DATA_PATH = 'gs://kds-f48a9c4d95386273c0ef508e337abd3f874b82a454a6c3d0e035839c'
DATA_PATH = "/kaggle/input/riid-0-1"

In [13]:
# Embedding vocabulary sizes, keyed by feature name.
# NOTE(review): pickle.loads can execute arbitrary code embedded in the file;
# this is only acceptable if emb_sz.pkl comes from a trusted source - confirm.
embed_sizes = pickle.loads(tf.io.read_file(DATA_PATH + "/emb_sz.pkl").numpy())
embed_sizes

{'content_id': 32736, 'task_container_id': 9999, 'part': 7}

In [14]:
# Instantiate the network. NOTE(review): these hyper-parameters presumably have
# to match the ones used when the weights loaded in the next cell were saved.
model = TransformerSeq2SeqClassifier(
      num_layers=1,
      d_model=512,
      num_heads=8,
      dff=1024,
      maximum_position_encoding=128,
      embed_size_dict=embed_sizes
  )
# Build with a concrete input shape so the variables exist before summary().
model.build(input_shape=(128, 128, 5)) # input_shape - [batch_size, seq_len, features]
model.summary()

Model: "transformer_seq2seq_classifier"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
encoder (Encoder)            multiple                  23991808  
_________________________________________________________________
dense_7 (Dense)              multiple                  513       
Total params: 23,992,321
Trainable params: 23,992,321
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Restore weights from the HDF5 file; the model was built in the previous cell,
# so its variables already exist.
model.load_weights("/kaggle/input/riid-model-0/best-model.h5")

### Dataset

#### Question df

In [16]:
# Question metadata, indexed by question_id. usecols selects columns by
# position (0 and 3); presumably these are question_id and part, since the
# submission loop reads a "part" column after joining on content_id - verify
# against the CSV header.
questions_df = pd.read_csv("/kaggle/input/riiid-test-answer-prediction/questions.csv", usecols=[0,3], index_col="question_id")

#### Test API

In [17]:
# NOTE(review): riiideducation is presumably only importable inside the
# competition environment, which would explain the import sitting in this
# cell rather than with the other imports - confirm.
import riiideducation

# You can only call make_env() once, so don't lose it!
env = riiideducation.make_env()

In [18]:
# You can only iterate through a result from `env.iter_test()` once
# so be careful not to lose it once you start iterating.
# The submission loop unpacks each item as (test dataframe, sample prediction).
iter_test = env.iter_test()

In [19]:
# Column -> dtype mapping for the training table.
# NOTE(review): neither dict in this cell is referenced by the visible cells.
dtypes_train = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean'
    }

# Column -> dtype mapping for the questions table. Only "part" has a dtype;
# the empty strings look like unfilled placeholders and are not valid dtypes.
dtypes_questions = {
    "question_id": "",
    "bundle_id": "",
    "correct_answer": "",
    "part": "int16",
    "tags": "",
}

In [20]:
# Maximum number of interactions kept per user and fed to the model as one
# sequence (used by pad_or_trim and by the submission loop).
SEQ_LEN = 128

In [21]:
@tf.function
def pad(a, seq_len, max_seq_len):
  """Left-pad the last axis of the rank-2 tensor `a` with zeros.

  Args:
    a: rank-2 tensor whose second axis currently has length seq_len.
    seq_len: current length of the second axis.
    max_seq_len: target length of the second axis.

  Returns:
    `a` with (max_seq_len - seq_len) zeros prepended along the second axis.
  """
  missing = max_seq_len - seq_len
  # paddings == [[0, 0], [missing, 0]]: nothing on axis 0, front-only on axis 1
  no_padding = tf.constant([0, 0])
  front_padding = tf.stack([missing, tf.constant(0)])
  paddings = tf.stack([no_padding, front_padding])

  return tf.pad(a, paddings)

@tf.function
def trim(a, seq_len, max_seq_len):
  """Cut a random window of max_seq_len steps out of the rank-2 tensor `a`.

  Args:
    a: rank-2 tensor whose second axis has length seq_len.
    seq_len: current length of the second axis (>= max_seq_len).
    max_seq_len: length of the window to keep.

  Returns:
    a[:, start:start + max_seq_len] for a uniformly drawn start.
  """
  # Valid starts are 0 .. seq_len - max_seq_len inclusive. For integer dtypes
  # tf.random.uniform excludes maxval, hence the + 1; without it the last
  # window could never be drawn and seq_len == max_seq_len would ask for an
  # empty range.
  start = tf.random.uniform((), maxval=seq_len - max_seq_len + 1, dtype=tf.int32)
  # tf.slice takes a per-axis begin offset and a per-axis size.
  begin = tf.stack([tf.constant(0), start])
  size = tf.stack([tf.shape(a)[0], max_seq_len])

  return tf.slice(a, begin, size)

@tf.function
def pad_or_trim(a):
  """Bring the last axis of `a` to exactly SEQ_LEN steps.

  Sequences of at most SEQ_LEN steps go through pad(); longer ones go
  through trim().
  """
  current_len = tf.shape(a)[-1]
  target_len = SEQ_LEN

  def _pad_branch():
    return pad(a, current_len, target_len)

  def _trim_branch():
    return trim(a, current_len, target_len)

  return tf.cond(tf.less_equal(current_len, target_len), _pad_branch, _trim_branch)

In [22]:
@tf.function
def split_x_mask(x):
  """Transpose x to [seq_len, n_features] and derive a padding mask.

  Returns:
    (features, mask): mask has shape [seq_len, 1] and holds 1.0 for steps
    with at least one non-zero feature, 0.0 for all-zero steps.
  """
  features = tf.transpose(x, (1, 0))  # [seq_len, n_features]
  has_data = tf.math.reduce_any(tf.math.not_equal(features, 0), axis=-1)
  mask = tf.cast(has_data, dtype=tf.float32)
  return features, tf.expand_dims(mask, axis=-1)

#### Submission

In [23]:
# Value fed as "previous answer" when it is not known (first step of a
# sequence, or the row currently being predicted). Stored answers are
# answered_correctly + 1 and 0 is the padding value.
UNKNOWN_PREV_ANSWER = 3
N_FEATURES = 5  # content_id, task_container_id, elapsed time, part, prev answer
ID_COLS = ["content_id", "task_container_id", "part"]


def _parse_prior_answers(raw_group_df):
    """Return the previous group's answers with the -1 entries removed.

    Must be given the *unfiltered* group: the value is read from the group's
    first row, which can be a lecture row that the question filter drops.
    ast.literal_eval is used instead of eval so the API-supplied string can
    only ever be parsed as a literal.
    """
    raw = raw_group_df["prior_group_answers_correct"].iloc[0]
    return [ans for ans in ast.literal_eval(raw) if ans != -1]


def _update_state(state_dict, prev_df, prior_answers):
    """Append the now-answered rows of the previous group to each user's history.

    Raises ValueError if the number of answers does not match the number of
    question rows in prev_df.
    """
    # +1 because 0 is reserved for padding.
    answered = prev_df.assign(
        answered_correctly=np.asarray(prior_answers, dtype=np.int64) + 1
    )
    for user, user_rows in answered.groupby("user_id"):
        history = user_rows.values[:, 1:]  # drop user_id
        if user in state_dict:
            history = np.vstack([state_dict[user], history])
        state_dict[user] = history[-SEQ_LEN:]  # history beyond SEQ_LEN is never used


def _make_features(questions_only_df):
    """Turn one group of question rows into the model's feature columns."""
    feats = questions_only_df[
        ["user_id", "content_id", "task_container_id", "prior_question_elapsed_time"]
    ]
    # join questions for the "part" feature
    feats = feats.join(questions_df, on="content_id")

    # 0 is used for padding, so shift every id column by one
    for col in ID_COLS:
        feats[col] = feats[col] + 1

    # fillna and convert milliseconds to minutes
    feats["prior_question_elapsed_time"] = (
        feats["prior_question_elapsed_time"].fillna(0).astype(np.float32) / 60000
    )

    # FIXME: ids beyond the vocabulary the embeddings were sized for are
    # mapped to 0, i.e. they become indistinguishable from padding.
    for col in ID_COLS:
        feats.loc[feats[col] > embed_sizes[col], col] = 0

    return feats.fillna(0)


def _build_batch(feats, state_dict):
    """Build the (batch, seq_len, N_FEATURES) model input for one group."""
    longest = 1
    batch = []
    for _, row in feats.iterrows():
        current = np.append(row.values[1:], UNKNOWN_PREV_ANSWER)  # [1:] - skip user_id
        window = current[np.newaxis, :]

        history = state_dict.get(row["user_id"])
        if history is not None:
            try:
                # Keep only the last SEQ_LEN steps: a full history plus the
                # current row would otherwise not fit into the input array.
                window = np.vstack([history, current])[-SEQ_LEN:]
            except ValueError as err:  # best effort: predict without history
                print(f"[warn] ignoring stored history for user {row['user_id']}: {err!r}")

        seq_len = window.shape[0]
        longest = max(longest, seq_len)

        x = np.zeros((SEQ_LEN, N_FEATURES))
        x[-seq_len:, :-1] = window[:, :-1]  # everything except prev_answer
        # Shift answers by one step so each position sees the previous answer;
        # the placeholder of the current row rolls round to the first position.
        x[-seq_len:, -1] = np.roll(window[:, -1], shift=1)
        batch.append(x)

    # Drop leading columns that are padding for every row of the batch.
    return tf.stack(batch, axis=0)[:, -longest:, :]


prev_test_df = None
state_dict = {}
start = datetime.now()

for test_dfm, sample_pred in iter_test:

    # The answers to the previous group arrive with this group, so the
    # previous group's rows can now be stored in state_dict.
    if prev_test_df is not None:
        try:
            _update_state(state_dict, prev_test_df, _parse_prior_answers(test_dfm))
        except Exception as err:
            # Best effort: a failed update must not abort the submission, but
            # it should not go unnoticed either.
            print(f"[warn] could not update user state from the previous group: {err!r}")

    # filtering only questions, removing lectures (copy: a column is added below)
    test_dfm = test_dfm[test_dfm.content_type_id == False].copy()

    test_df = _make_features(test_dfm)
    prev_test_df = test_df.copy()

    if test_df.empty:
        # Nothing to predict for a lectures-only group.
        # NOTE(review): assumes env.predict accepts an empty frame - confirm.
        preds = np.zeros(0, dtype=np.float32)
    else:
        x = _build_batch(test_df, state_dict)
        # The last position of each sequence is the row being predicted.
        preds = model(x, training=False)[:, -1, :].numpy().flatten()
        # preds = model.predict(x)[:, -1, :].flatten() # batched prediction. could be slower.

    # submit
    test_dfm["answered_correctly"] = preds
    env.predict(test_dfm[["row_id", "answered_correctly"]])

total_time = datetime.now() - start
print(f"===== Total time: {total_time.total_seconds():.3} sec")

===== Total time: 0.734 sec


#### Profile

In [24]:
# def time(func):
#     def wrapped(*args, **kwargs):
#         t0 = datetime.now()
        
#         ret = func(*args, **kwargs)
        
#         dt = datetime.now() - t0
#         print(f"==> {func.__name__} took {dt.total_seconds():.2} sec")    
#         return ret
#     return wrapped

# # @time
# def preprocess(test_dfm, prev_test_df):
    
#     # we have prev df's answers, so store the prev_test_df in state_dict
#     if prev_test_df is not None:
#         prev_test_df['answered_correctly'] = list(filter(lambda x: x !=-1, eval(test_dfm['prior_group_answers_correct'].iloc[0])))
#         prev_test_df['answered_correctly'] += 1
#         prev_user_group = prev_test_df.groupby("user_id").apply(
#             lambda row: row.values[:, 1:] # exclude user_id
#         )
#         for user in prev_user_group.index:
#             if state_dict.get(user, None) is None: # new user, add to the state dict.
#                 state_dict[user] = prev_user_group[user]
#             else: # existing user
#                 state = np.vstack([state_dict[user], prev_user_group[user]]) # append to prev features.
#                 if state.shape[0] > SEQ_LEN: # we dont need history beyond SEQ_LEN
#                     state = state[-SEQ_LEN:, :]
#                 state_dict[user] = state
        
        
#     # -- process the current df.
#     # selecting required cols
#     test_df = test_dfm[["user_id", "content_id","task_container_id","prior_question_elapsed_time"]]
    
#     # join question for feaures
#     test_df = test_df.join(questions_df, on="content_id")
    
#     # 0 used for padding so increment all indicator cols
#     indicator_cols = ["content_id", "task_container_id", "part"]
#     for c in indicator_cols:
#         test_df[c] = test_df[c] + 1
        
#     # fillna and convert milliseconds to minutes.
#     test_df['prior_question_elapsed_time'] = test_df["prior_question_elapsed_time"].fillna(0).astype(np.float32) / 60000
    
#     # FIXME: unseen ids - content_id > 32736, 'task_container_id'> 9999, 'part'> 7
#     test_df.loc[test_df["content_id"] > 32736, "content_id"] = 0
#     test_df.loc[test_df["task_container_id"] > 9999, "task_container_id"] = 0
#     test_df.loc[test_df["part"] > 7, "part"] = 0
#     test_df.fillna(0, inplace=True)
    
#     return test_df

# # @time
# def get_x(test_df):
#     max_seq_len = 1
#     xb = []
#     for idx, row in test_df.iterrows():
#         user = row["user_id"]
#         x = np.zeros((SEQ_LEN, 5))
#         if state_dict.get(user, None) is None: # new user
#             row["prev_ans"] = 3
#             x[-1, :] = row.values[1:] # [1:] - skip user_id
#         else: # existing user, get prev states and build time series.
#             prev_state = state_dict[user]
#             row["prev_ans"] = 3 # this will be rolled to the first pos
#             curr_state = np.vstack([prev_state, row.values[1:]]) # [1:] - skip user_id
            
#             seq_len = curr_state.shape[0]
#             max_seq_len = max(seq_len, max_seq_len)
            
#             x[-seq_len:, :-1] = curr_state[:, :-1] # everything except prev_answer
#             x[-seq_len:, -1] = np.roll(curr_state[:, -1], shift=1) # rolled answer as prev_answer
#         xb.append(x)
            
#     return tf.stack(xb, axis=0)[:, -max_seq_len:, :]

# # @time
# def predict(x):
    
#     # predict
#     preds = model(x, training=False)[:, -1, :].numpy().flatten()
#     # preds = model.predict(x)[:, -1, :].flatten()
    
#     return preds

# start = datetime.now()
# # test_dfs = [] # for debug

# prev_test_df = None
# state_dict = {}

# for test_dfm, sample_pred in iter_test:
# #     print("\n*\n*\n--- iteration ---")
# #     t0 = datetime.now()
    
#     # filtering only questions, removing lectures.
#     test_dfm = test_dfm[test_dfm.content_type_id == False]
    
#     test_df = preprocess(test_dfm, prev_test_df)
# #     test_dfs.append(test_df.copy())
    
#     prev_test_df = test_df.copy()
    
#     x = get_x(test_df)
    
#     final_preds = predict(x)
    
    
#     # submit
#     test_dfm["answered_correctly"] = final_preds
#     env.predict(test_dfm[["row_id", "answered_correctly"]])
# #     print(f"iteration took: {(datetime.now() - t0).total_seconds():.3}s")
    
# total_time = datetime.now() - start
# print(f"===== Total time: {total_time.total_seconds():.3} sec")

Done.