<a href="https://colab.research.google.com/github/nisarahamedk/kaggle-riid/blob/master/notebooks/RIID_TF_EncoderDecoder_Transformers_TPU_WandB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### RIID Transformer on TPU

In [1]:
%%capture
!pip install watermark

!pip uninstall -y wandb
!pip uninstall -y numpy
!pip uninstall -y pandas
!pip uninstall -y tensorflow

!pip install --upgrade wandb==0.10.12
!pip install --upgrade pandas==1.1.5
!pip install --upgrade numpy==1.19.4
!pip install --upgrade tensorflow==2.4.0
!pip install cloud_tpu_client

# In Kaggle, always restart the kernel after installing libs.
# With the focus on a cell, press Ctrl+Shift+P and choose "confirm restart kernel"

In [2]:
%reload_ext autoreload
%reload_ext watermark
%autoreload 2
%matplotlib inline

import pickle

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import backend as K
from tensorflow.python.keras.utils import tf_utils
from tensorflow.python.autograph.impl import api as autograph
from tensorflow.python.autograph.core import ag_ctx
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import wandb
from wandb.keras import WandbCallback
from cloud_tpu_client import Client


np.random.seed(42)
tf.random.set_seed(42)

%watermark -iv

tensorflow       2.4.0
wandb            0.10.12
numpy            1.19.4
pandas           1.1.5
tensorflow.keras 2.4.0




### *Device* Settings - This needs to be at the TOP

In [3]:
Client().configure_tpu_version(tf.__version__, restart_type='ifNeeded')

In [4]:
# Try to resolve a TPU cluster; `tpu` is left as None when resolving it
# (or querying its master address) raises ValueError.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # no argument needed when the TPU_NAME env variable is set, which is the case on Kaggle
    print("Running on TPU: ", tpu.master())
except ValueError:
    tpu = None

Running on TPU:  grpc://10.9.143.10:8470


In [5]:
# Pick the distribution strategy: the default one for the available hardware
# when no TPU was resolved above, otherwise a TPUStrategy on the initialised TPU system.
if not tpu:
    strategy = tf.distribute.get_strategy()
else:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.TPUStrategy(tpu)

# number of replicas the strategy keeps in sync (8 in the recorded TPU run)
REPLICAS = strategy.num_replicas_in_sync
print("REPLICAS: ", REPLICAS)

INFO:tensorflow:Initializing the TPU system: grpc://10.9.143.10:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.9.143.10:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


REPLICAS:  8


#### WandB Experiment Config

In [6]:
resume = False

In [7]:
experiment_config = {
    "dataset_args": {
        "tfrec_gcs_path": 'gs://kds-9c9a89c1a0d17bcb15230bb2e47c4641386843a62d12638c765e35cb',
        "folds": 10,
        "fold": 1,
        "batch_size": 256,
        "seq_len": 128,
    },
    "model_args": {
        "num_layers": 1,
        "d_model": 512,
        "num_heads": 8,
        "dff": 1024,
    },
    "training_args": {
        "epochs": 30,
    },
    "env_args": {
        "is_kaggle": False,
        "upload_model_to_kaggle": False
    }
}

In [8]:
# Authenticate with Weights & Biases WITHOUT embedding the API key in the notebook.
# wandb.login() picks up the WANDB_API_KEY environment variable or an existing
# netrc entry, and otherwise prompts for the key interactively.
# NOTE(security): the key that used to be hard-coded on this line is exposed in the
# notebook history and must be revoked/rotated in the W&B account settings.
wandb.login()

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [9]:
notes = "test run"

In [10]:
wandb_run = wandb.init(project="kaggle-riid", notes=notes, resume=resume)

[34m[1mwandb[0m: Currently logged in as: [33mnisarahamedk[0m (use `wandb login --relogin` to force relogin)


In [11]:
api = wandb.Api()
run = api.run("nisarahamedk/kaggle-riid/" + wandb.run.id)
run

<Run nisarahamedk/kaggle-riid/2xrlsyxj (running)>

In [12]:
if resume:
  experiment_config = run.config

### Datasets

In [13]:
DATA_PATH = experiment_config["dataset_args"]["tfrec_gcs_path"]
DATA_PATH

'gs://kds-9c9a89c1a0d17bcb15230bb2e47c4641386843a62d12638c765e35cb'

In [14]:
n_train_files = len(tf.io.gfile.glob(DATA_PATH + "/tfrec*"))
n_train_files

32

In [15]:
FOLDS = experiment_config["dataset_args"]["folds"]

kfold = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
folds_list = list(kfold.split(np.arange(n_train_files)))
folds_list[:2]

[(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 16, 18,
         19, 20, 21, 22, 23, 25, 26, 27, 28, 30, 31]),
  array([15, 17, 24, 29])),
 (array([ 0,  1,  2,  3,  4,  5,  6,  7, 10, 11, 12, 13, 14, 15, 16, 17, 18,
         19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 31]),
  array([ 8,  9, 25, 30]))]

In [16]:
FOLD = experiment_config["dataset_args"]["fold"]

train_folds, valid_folds = folds_list[FOLD]
len(train_folds), len(valid_folds)

(28, 4)

In [17]:
experiment_config["dataset_args"].update({"train_folds": str(list(train_folds)), "valid_folds": str(list(valid_folds))})

In [18]:
train_files = tf.io.gfile.glob([DATA_PATH + "/tfrec_%d.tfrec" % idx for idx in train_folds])
valid_files = tf.io.gfile.glob([DATA_PATH + "/tfrec_%d.tfrec" % idx for idx in valid_folds])

In [19]:
len(train_files), len(valid_files)

(28, 4)

#### Load TFRecord Datasets

In [20]:
AUTOTUNE = tf.data.experimental.AUTOTUNE

In [21]:
# Schema of one TFRecord example: every feature is a single scalar string
# (a serialized tensor) that parse_example decodes with tf.io.parse_tensor.
feature_desc = {
    name: tf.io.FixedLenFeature([], tf.string)
    for name in (
        "timestamp",
        "content_id",
        "task_container_id",
        "elapsed_time",
        "had_explanation",
        "part",
        "tags",
        "answered_correctly",
    )
}

def parse_example(example):
  """Decode one serialized TFRecord example into a single float32 feature matrix.

  Args:
    example: a scalar string tensor holding one serialized example matching `feature_desc`.

  Returns:
    A float32 tensor of shape [7 + 190, seq_len]: rows 0-6 are timestamp, content_id,
    task_container_id, elapsed_time, had_explanation, part and answered_correctly
    (in that order), followed by 190 rows of multi-hot tag indicators.
  """
  parsed = tf.io.parse_single_example(example, feature_desc)

  # (feature name, dtype the tensor was serialized with), in output row order
  per_step_features = (
      ("timestamp", tf.int64),
      ("content_id", tf.int16),
      ("task_container_id", tf.int16),
      ("elapsed_time", tf.float32),
      ("had_explanation", tf.int8),
      ("part", tf.int16),
      ("answered_correctly", tf.int8),
  )
  features = tf.stack([
      tf.cast(tf.io.parse_tensor(parsed[name], dtype), tf.float32)
      for name, dtype in per_step_features
  ])

  # Tags are stored as one string per question because a question can carry several tags.
  # Splitting gives a ragged tensor such as [[2], [3, 4]]; one-hot encoding each tag and
  # summing over the tag axis yields one multi-hot vector per question.
  tag_strings = tf.io.parse_tensor(parsed["tags"], tf.string)
  tag_ids = tf.strings.to_number(tf.strings.split(tag_strings), out_type=tf.int32)
  tags_multi_hot = tf.reduce_sum(tf.one_hot(tag_ids, depth=190), axis=1) # shape [seq_len, 190]
  tags_multi_hot = tf.transpose(tags_multi_hot, (1, 0)) # shape [190, seq_len] to line up with the other rows

  # [features, seq_len] # TODO make it [seq_len, features] so that we dont have to reshape in transformer
  return tf.concat([features, tf.cast(tags_multi_hot, tf.float32)], axis=0)

In [22]:
def load_dataset_from_tfrecord(filenames, ds_type="train", cache_to=None):
  """Build a cached tf.data pipeline of parsed examples from TFRecord files.

  Args:
    filenames: TFRecord file path(s) handed to tf.data.TFRecordDataset.
    ds_type: "train" turns off deterministic ordering (files are read in parallel
      and the order of training samples does not matter); any other value keeps
      the default dataset options.
    cache_to: optional file path for the cache. When falsy, the raw records are
      cached in memory instead.

  Returns:
    A tf.data.Dataset yielding the output of `parse_example` for every record.
  """
  options = tf.data.Options()
  if ds_type == "train":
    options.experimental_deterministic = False

  dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=AUTOTUNE)
  # cache the raw (still serialized) records: in RAM by default, on disk when a path is given
  dataset = dataset.cache(cache_to) if cache_to else dataset.cache()
  # with_options() returns a NEW dataset; the result has to be kept, otherwise the
  # options (and with them the non-deterministic read order) are silently dropped.
  dataset = dataset.with_options(options)
  # NOTE: the dataset is intentionally not repeated here.
  return dataset.map(parse_example, num_parallel_calls=AUTOTUNE)

In [23]:
dataset = load_dataset_from_tfrecord(train_files)

In [24]:
SEQ_LEN = experiment_config["dataset_args"]["seq_len"]
SEQ_LEN

128

In [25]:
@tf.function
def pad(a, seq_len, max_seq_len):
  """Zero-pad `a` at the front of its second axis until it is `max_seq_len` long.

  Args:
    a: rank-2 tensor; only the second axis is padded.
    seq_len: current length of the second axis of `a`.
    max_seq_len: target length of the second axis.

  Returns:
    `a` with (max_seq_len - seq_len) zeros prepended along the second axis.
  """
  missing = max_seq_len - seq_len
  # paddings == [[0, 0], [missing, 0]]: first axis untouched, zeros only in front of the second
  paddings = tf.stack([tf.constant([0, 0]), tf.stack([missing, tf.constant(0)])])

  return tf.pad(a, paddings) # ,1 to debug

@tf.function
def trim(a, seq_len,  max_seq_len):
  """
  Randomly crop `a` along its second axis to a window of `max_seq_len` steps.

  Args:
    a: rank-2 tensor; the first axis is kept whole, the second one is cropped.
    seq_len: current length of the second axis of `a`.
    max_seq_len: length of the window to keep (expected to be <= seq_len).

  Returns:
    A tensor of shape [tf.shape(a)[0], max_seq_len].

  TODO: trimming actually get rid of the start token since we are trimming randomly in the model.
  Is this going to be a problem?, i think it should not.
  """
  # Valid start offsets are 0 .. (seq_len - max_seq_len) inclusive. `maxval` is an
  # exclusive bound, hence the +1: without it the window could never reach the last
  # step of the sequence, so the final step of every long sequence was never sampled.
  start = tf.squeeze(tf.random.uniform((1,), maxval=(seq_len - max_seq_len + 1), dtype=tf.int32))
  # https://www.quora.com/How-does-tf-slice-work-in-TensorFlow
  begin = tf.stack([tf.constant(0), start])
  size = tf.stack([tf.shape(a)[0], max_seq_len])

  return tf.slice(a, begin, size) # , start - to debug

@tf.function
def pad_or_trim(a):
  """Bring the last axis of `a` to exactly SEQ_LEN + 1 steps.

  Sequences of at most SEQ_LEN + 1 steps go through `pad`; longer ones go through `trim`.
  """
  seq_len = tf.shape(a)[-1]
  max_seq_len = SEQ_LEN + 1 # one extra step to accommodate the start token
  return tf.cond(
      tf.less_equal(seq_len, max_seq_len),
      true_fn=lambda: pad(a, seq_len, max_seq_len),
      false_fn=lambda: trim(a, seq_len, max_seq_len),
  )

In [26]:
dataset = dataset.map(pad_or_trim, num_parallel_calls=AUTOTUNE) # every sample is front-padded to SEQ_LEN + 1 steps if shorter, or randomly trimmed to SEQ_LEN + 1 steps if longer

In [27]:
for item in dataset.take(1):
  print(item.shape)

(197, 129)


In [28]:
@tf.function
def create_padding_mask(seq):
  """Flag positions whose entries along the last axis are all zero.

  Returns a float32 mask (1.0 where every value on the last axis equals 0, else 0.0)
  with two singleton axes inserted after the first axis so that it can be added to
  attention logits. Presumably `seq` is (batch_size, seq_len, n_features) -- confirm with callers.
  """
  all_zero = tf.reduce_all(tf.math.equal(seq, 0), axis=-1)
  padding = tf.cast(all_zero, tf.float32)

  # extra axes so the mask broadcasts against the attention logits
  return padding[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)

@tf.function
def split_x_y_mask(xy):
  """Split one [n_features, seq_len] sample into model input, target and loss mask.

  Returns:
    x: the sample transposed to [seq_len, n_features]; it still contains
       'answered_correctly' because the model consumes it as a feature.
    y: row 6 of `xy` minus 3, without the first step, shape [seq_len - 1, 1].
    mask: 1.0 for steps that are neither all-zero padding nor a start token
       (first 7 features all equal to 1), without the first step, shape [seq_len - 1, 1].
  """
  x = tf.transpose(xy, (1, 0)) # [seq_len, n_features]
  # per the original note, 3 was added to the label to use it as a feature; undo it to get 0/1
  y = xy[6, :] - 3

  has_data = tf.math.reduce_any(tf.math.not_equal(x, 0), axis=-1)
  is_start_token = tf.math.reduce_all(tf.math.equal(x[:, :7], 1), axis=-1)
  keep = tf.math.logical_and(has_data, tf.math.logical_not(is_start_token))
  final_mask = tf.cast(keep, dtype=tf.float32)

  # [1:] for y and mask, because the first value belongs to the start token
  target = tf.expand_dims(y[1:], axis=-1)
  target_mask = tf.expand_dims(final_mask[1:], axis=-1)
  return x, target, target_mask

In [29]:
dataset = dataset.map(split_x_y_mask, num_parallel_calls=AUTOTUNE) # x and y

In [30]:
for x, y, mask in dataset.take(1):
  print(x.shape)
  print(y.shape)
  print(mask.shape)

(129, 197)
(128, 1)
(128, 1)


In [31]:
tf.squeeze(mask, axis=-1)

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)>

In [32]:
tf.squeeze(y, axis=-1)

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([-3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3.,
       -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3.,
       -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3.,
       -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3.,
       -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3.,
       -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3., -3.,
       -3., -3., -3., -2.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  0.,  0.,
        0.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,
        1.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  1.],
      dtype=float32)>

In [33]:
tf.squeeze(y, axis=-1) * tf.squeeze(mask, axis=-1) #  applying mask

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([-0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
       -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
       -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
       -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
       -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
       -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0., -0.,
       -0., -0., -0., -0.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  0.,  1.,  1.,  0.,  0.,
        0.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  1.,  0.,  1.,  1.,  1.,
        1.,  0.,  1.,  1.,  1.,  1.,  1.,  0.,  0.,  0.,  1.],
      dtype=float32)>

In [34]:
dataset = dataset.shuffle(int(1024 * REPLICAS))

In [35]:
BATCH_SIZE = experiment_config["dataset_args"]["batch_size"]
BATCH_SIZE

256

In [36]:
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [37]:
for xb, yb, mb in dataset.take(1):
  print(xb.shape)
  print(yb.shape)
  print(mb.shape)

(256, 129, 197)
(256, 128, 1)
(256, 128, 1)


In [38]:
# answered_correctly, # notice for trimmed sequence, there is no start token, instead a previous interaction is used.
xb[0, :, 6]

<tf.Tensor: shape=(129,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 4., 3., 3.,
       4., 4., 3., 4., 3., 3., 3., 3., 3., 3., 3., 3., 4., 3., 3., 3., 3.,
       3., 3., 4., 3., 3., 3., 3., 3., 3., 3.], dtype=float32)>

In [39]:
# corresponding mask
mb[0, :, 0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)>

In [40]:
dataset = dataset.prefetch(AUTOTUNE)

#### Validation dataset

In [41]:
# Validation pipeline: same preprocessing as training, but no shuffling and a
# batch twice as large.
valid_dataset = load_dataset_from_tfrecord(valid_files, ds_type="valid")
# every sample is front-padded to SEQ_LEN + 1 steps if shorter, or randomly trimmed to SEQ_LEN + 1 steps if longer
# NOTE(review): the random trim makes validation samples differ between passes -- confirm this is intended.
valid_dataset = valid_dataset.map(pad_or_trim, num_parallel_calls=AUTOTUNE)
# parallel map, consistent with the training pipeline (element order is still preserved by default)
valid_dataset = valid_dataset.map(split_x_y_mask, num_parallel_calls=AUTOTUNE) # x, y and mask
valid_dataset = valid_dataset.batch(BATCH_SIZE * 2, drop_remainder=True)
valid_dataset = valid_dataset.prefetch(AUTOTUNE)

In [42]:
for vx, vy, vmask in valid_dataset.take(1):
  print(vx.shape)
  print(vy.shape)
  print(vmask.shape)

(512, 129, 197)
(512, 128, 1)
(512, 128, 1)


In [43]:
# answered_correctly
vx[0, :, 6]

<tf.Tensor: shape=(129,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 4., 4., 4.,
       4., 3., 4., 4., 4., 4., 3., 4., 4., 4., 4., 4., 4., 3., 4., 4., 3.,
       4., 4., 4., 4., 4., 4., 4., 4., 4., 4.], dtype=float32)>

In [44]:
# corresponding mask
vmask[0, :, 0]

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)>

In [45]:
vx[0, 1:, 6] * vmask[0, :, 0] # checking the mask

<tf.Tensor: shape=(128,), dtype=float32, numpy=
array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 4., 4., 4., 4.,
       3., 4., 4., 4., 4., 3., 4., 4., 4., 4., 4., 4., 3., 4., 4., 3., 4.,
       4., 4., 4., 4., 4., 4., 4., 4., 4.], dtype=float32)>

### Model

##### Positional Encoding

In [46]:
def get_angles(pos, i, d_model):
  """Angle arguments of the sinusoidal positional encoding.

  Computes pos / 10000^(2 * (i // 2) / d_model), broadcasting `pos` against `i`.

  Args:
    pos: position index/indices.
    i: embedding dimension index/indices.
    d_model: embedding width.
  """
  exponent = (2 * (i // 2)) / np.float32(d_model)
  return pos * (1 / np.power(10000, exponent))

In [47]:
def positional_encoding(position, d_model):
  """Build the sinusoidal positional-encoding table.

  Args:
    position: number of positions to encode.
    d_model: embedding width.

  Returns:
    A float32 tensor of shape (1, position, d_model) with sin on the even
    embedding indices (2i) and cos on the odd ones (2i+1).
  """
  positions = np.arange(position)[:, np.newaxis]
  dims = np.arange(d_model)[np.newaxis, :]
  table = get_angles(positions, dims, d_model)

  table[:, 0::2] = np.sin(table[:, 0::2])  # even indices: 2i
  table[:, 1::2] = np.cos(table[:, 1::2])  # odd indices: 2i+1

  # leading axis of size 1 so the table broadcasts over the batch
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

##### Look ahead mask

In [48]:
def create_look_ahead_mask(size):
  """Return a (size, size) float mask with 1 strictly above the diagonal and 0 elsewhere.

  A 1 at [row, col] marks a later position (col > row).
  """
  lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - lower_triangle  # (seq_len, seq_len)

##### Scaled Dot Product Attention

In [49]:
def scaled_dot_product_attention(q, k, v, mask=None):
  """Calculate the attention weights.
  q, k, v must have matching leading dimensions.
  k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
  The mask has different shapes depending on its type(padding or look ahead) 
  but it must be broadcastable for addition.

  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable 
          to (..., seq_len_q, seq_len_k), with 1 at the positions to hide.
          Defaults to None, in which case no masking is applied.

  Returns:
    output, attention_weights
  """

  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

  # scale matmul_qk by sqrt(depth)
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # masked positions get a large negative logit so that softmax gives them ~0 weight
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)  

  # softmax is normalized on the last axis (seq_len_k) so that the scores
  # add up to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

##### Multi Head Attention

In [50]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head scaled dot-product attention.

  q, k and v are each projected to `d_model` features, split into `num_heads`
  heads of `d_model // num_heads` features, attended per head, then merged and
  passed through a final dense projection.
  """

  def __init__(self, d_model, num_heads):
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    # every head must receive the same number of features
    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads

    # input projections for query, key and value
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    # output projection applied to the merged heads
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape (batch_size, seq_len, d_model) into (batch_size, num_heads, seq_len, depth)."""
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Attend from `q` over `k`/`v`; note the (v, k, q, mask) argument order.

    Returns:
      output: (batch_size, seq_len_q, d_model)
      attention_weights: (batch_size, num_heads, seq_len_q, seq_len_k)
    """
    batch_size = tf.shape(q)[0]

    # project to d_model, then split: (batch_size, num_heads, seq_len_*, depth)
    q = self.split_heads(self.wq(q), batch_size)
    k = self.split_heads(self.wk(k), batch_size)
    v = self.split_heads(self.wv(v), batch_size)

    # per_head.shape == (batch_size, num_heads, seq_len_q, depth)
    per_head, attention_weights = scaled_dot_product_attention(q, k, v, mask)

    # merge the heads back: (batch_size, seq_len_q, num_heads, depth) -> (batch_size, seq_len_q, d_model)
    merged = tf.reshape(
        tf.transpose(per_head, perm=[0, 2, 1, 3]),
        (batch_size, -1, self.d_model))

    return self.dense(merged), attention_weights

##### Pointwise FeedForward Network

In [51]:
def point_wise_feed_forward_network(d_model, dff):
  """Two-layer feed-forward block: Dense(dff, relu) followed by Dense(d_model)."""
  expand = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
  project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([expand, project])

##### EncoderLayer

In [52]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention and a feed-forward network, each followed
  by dropout, a residual connection and layer normalization."""

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    """Apply the block to `x`; every intermediate is (batch_size, input_seq_len, d_model)."""
    # self-attention sub-layer (attention weights are not needed here)
    attended, _ = self.mha(x, x, x, mask)
    attended = self.dropout1(attended, training=training)
    normed_attention = self.layernorm1(x + attended)

    # feed-forward sub-layer
    transformed = self.dropout2(self.ffn(normed_attention), training=training)
    return self.layernorm2(normed_attention + transformed)

##### Encoder

In [53]:
class Encoder(tf.keras.layers.Layer):
  """Encoder stack over the per-step interaction features.

  Every step is embedded as the sum of question-related embeddings (content id,
  task container, part, tags) and response-related embeddings (bucketized
  timestamp, bucketized elapsed time, answered_correctly), scaled by sqrt(d_model),
  combined with the positional encoding and passed through `num_layers` EncoderLayers.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, maximum_position_encoding, embed_size_dict, rate):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    # timestamp: 1000 evenly spaced boundaries over [0, 87425772049], i.e. ~87.5M units per
    # bucket (roughly one day per bucket if timestamps are in milliseconds -- TODO confirm units)
    self.timestamp_buckets = list(np.linspace(0, 87425772049, num=1000))
    self.timestamp_emb = tf.keras.layers.Embedding(len(self.timestamp_buckets) + 1, d_model)
    # question id
    self.content_id_emb = tf.keras.layers.Embedding(embed_size_dict["content_id"] + 1, d_model)
    # task container
    self.task_container_id_emb = tf.keras.layers.Embedding(embed_size_dict["task_container_id"] + 1, d_model)
    # part
    self.part_emb = tf.keras.layers.Embedding(embed_size_dict["part"] + 1, d_model)
    # tags/skills: a plain weight matrix, multiplied with the multi-hot tag vector in call()
    self.tags_emb = tf.Variable(tf.random.uniform([embed_size_dict["tags"], d_model]))
    # elapsed_time: 10 evenly spaced boundaries over [0, 300000], i.e. ~33k units per bucket
    # (about 33 seconds if elapsed time is in milliseconds -- TODO confirm units)
    self.elapsed_time_buckets = list(np.linspace(0, 300000, num=10))
    self.elapsed_time_emb = tf.keras.layers.Embedding(len(self.elapsed_time_buckets) + 1, d_model)
    # answered_correctly
    self.answered_correctly_emb = tf.keras.layers.Embedding(embed_size_dict["answered_correctly"], d_model)
    # fixed sinusoidal position encoding
    self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, input, training, mask):
    """Encode `input`, returning a (batch_size, input_seq_len, d_model) tensor.

    Column layout of `input` along the last axis:
      0 timestamp, 1 content_id, 2 task_container_id, 3 elapsed_time,
      4 had_explanation, 5 part, 6 answered_correctly, 7+ multi-hot tags.
    """
    seq_len = tf.shape(input)[1]

    # --- question related embeddings, each (batch_size, input_seq_len, d_model)
    x = self.content_id_emb(input[..., 1])
    x = x + self.task_container_id_emb(input[..., 2])
    x = x + self.part_emb(input[..., 5])
    # a question can have several tags; the matmul sums the embeddings of all its tags
    x = x + tf.linalg.matmul(input[:, :, 7:], self.tags_emb)

    # --- response related embeddings
    timestamp_bucket_ids = tf.raw_ops.Bucketize(input=input[..., 0], boundaries=self.timestamp_buckets)
    x = x + self.timestamp_emb(timestamp_bucket_ids)
    elapsed_time_bucket_ids = tf.raw_ops.Bucketize(input=input[..., 3], boundaries=self.elapsed_time_buckets)
    x = x + self.elapsed_time_emb(elapsed_time_bucket_ids)
    x = x + self.answered_correctly_emb(input[..., 6])

    # scale the summed embeddings, then add the position information
    x = x * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x + self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x, training, mask) # (batch_size, input_seq_len, d_model)

    return x

##### Decoder Layer

In [54]:
class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: masked self-attention, attention over the encoder output
  and a feed-forward network, each followed by dropout, a residual connection
  and layer normalization."""

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training, look_ahead_mask):
    """Apply the block.

    Args:
      x: decoder input, (batch_size, target_seq_len, d_model).
      enc_output: encoder output, (batch_size, input_seq_len, d_model).
      training: forwarded to the dropout layers.
      look_ahead_mask: mask applied in BOTH attention blocks.

    Returns:
      (output, self-attention weights, encoder-decoder attention weights).
    """
    # masked self-attention over the decoder input
    self_attended, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
    out1 = self.layernorm1(self.dropout1(self_attended, training=training) + x)

    # attention over the encoder output: values and keys from the encoder, queries from out1
    cross_attended, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, look_ahead_mask)
    out2 = self.layernorm2(self.dropout2(cross_attended, training=training) + out1)

    # feed-forward sub-layer
    transformed = self.dropout3(self.ffn(out2), training=training)
    out3 = self.layernorm3(transformed + out2)  # (batch_size, target_seq_len, d_model)

    return out3, attn_weights_block1, attn_weights_block2

##### Decoder

In [55]:
class Decoder(tf.keras.layers.Layer):
  """Decoder stack over the exercise-side features.

  Sums the content_id, task_container_id, part and tag embeddings, scales by
  sqrt(d_model), adds the positional encoding, applies dropout and runs the
  result through `num_layers` DecoderLayer blocks. The answered_correctly
  column is not embedded here.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, maximum_position_encoding, embed_size_dict, rate):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    def id_embedding(feature):
      # One extra row on top of the size recorded in embed_size_dict.
      return tf.keras.layers.Embedding(embed_size_dict[feature] + 1, d_model)

    self.content_id_emb = id_embedding("content_id")
    self.task_container_id_emb = id_embedding("task_container_id")
    self.part_emb = id_embedding("part")
    # Tags use a plain weight matrix (one row per tag) so several tags can be
    # combined with a single matmul in call().
    self.tags_emb = tf.Variable(tf.random.uniform([embed_size_dict["tags"], d_model]))
    self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, input, enc_output, training, look_ahead_mask):
    """Embed `input`, then apply every decoder layer in turn.

    Returns:
      (x, attention_weights): x is (batch_size, target_seq_len, d_model);
      attention_weights maps 'decoder_layer{i}_block{1,2}' to the weights
      returned by layer i (1-based).
    """
    seq_len = tf.shape(input)[1]

    # Feature layout along the last axis (all stored as float32):
    #   0 timestamp, 1 content_id, 2 task_container_id, 3 elapsed_time,
    #   4 had_explanation, 5 part, 6 answered_correctly, 7.. tags
    x = self.content_id_emb(input[..., 1])  # (batch_size, input_seq_len, d_model)
    x = x + self.task_container_id_emb(input[..., 2])
    x = x + self.part_emb(input[..., 5])

    # tags/skills: the matmul sums the embedding rows of every active tag
    # (presumably the tag columns are multi-hot - confirm against the dataset).
    # Disabled alternative: softmax over (tags * 100) so the weights add up
    # to 1 and non-skills go to ~0; lower the factor to give non-skills weight.
    tag_columns = input[:, :, 7:]
    x = x + tf.linalg.matmul(tag_columns, self.tags_emb)

    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = x * scale
    x = x + self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)

    attention_weights = {}
    for layer_idx in range(self.num_layers):
      x, self_attn_w, enc_attn_w = self.dec_layers[layer_idx](x, enc_output, training, look_ahead_mask)

      attention_weights['decoder_layer{}_block1'.format(layer_idx + 1)] = self_attn_w
      attention_weights['decoder_layer{}_block2'.format(layer_idx + 1)] = enc_attn_w

    # x.shape == (batch_size, target_seq_len, d_model)
    return x, attention_weights

In [56]:
class TransformerSeq2SeqClassifier(keras.models.Model):
  """Encoder-decoder transformer emitting one sigmoid probability per step.

  The input carries SEQ_LEN + 1 steps: the encoder reads steps
  [0, SEQ_LEN) and the decoder reads steps [1, end), i.e. the same sequence
  shifted by one. Both use the same look-ahead mask.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, maximum_position_encoding, embed_size_dict, rate=0.1):
    super().__init__()

    # Encoder and decoder are configured identically.
    shared_args = (num_layers, d_model, num_heads, dff, maximum_position_encoding, embed_size_dict, rate)
    self.encoder = Encoder(*shared_args)
    self.decoder = Decoder(*shared_args)

    self.out = tf.keras.layers.Dense(1, activation="sigmoid")

  def call(self, x, training):
    """Forward pass.

    Args:
      x: features of shape (batch_size, seq_len, n_features); the sample
        batch in this notebook is (256, 129, 197).
      training: forwarded to the encoder and decoder.

    Returns:
      Tensor of shape [batch_size, input_seq_len, 1].
    """
    causal_mask = create_look_ahead_mask(SEQ_LEN)

    # encoder: first SEQ_LEN steps
    enc_output = self.encoder(x[:, :SEQ_LEN, :], training=training, mask=causal_mask)

    # decoder: steps shifted by one; the attention weights are not used here
    dec_output, _ = self.decoder(x[:, 1:, :], enc_output, training=training, look_ahead_mask=causal_mask)

    return self.out(dec_output)


###### Embedding Sizes

In [57]:
# Load the per-feature embedding sizes (a pickled dict) from DATA_PATH.
# NOTE(review): pickle.loads can execute arbitrary code contained in the
# payload - only point DATA_PATH at a location you trust.
embed_sizes = pickle.loads(tf.io.read_file(DATA_PATH + "/emb_sz.pkl").numpy())

In [58]:
embed_sizes

{'answered_correctly': 5,
 'content_id': 13525,
 'had_explanation': 5,
 'part': 10,
 'tags': 190,
 'task_container_id': 10002}

In [59]:
experiment_config["model_args"].update({"embed_sizes": embed_sizes})

In [60]:
experiment_config

{'dataset_args': {'batch_size': 256,
  'fold': 1,
  'folds': 10,
  'seq_len': 128,
  'tfrec_gcs_path': 'gs://kds-9c9a89c1a0d17bcb15230bb2e47c4641386843a62d12638c765e35cb',
  'train_folds': '[0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 26, 27, 28, 29, 31]',
  'valid_folds': '[8, 9, 25, 30]'},
 'env_args': {'is_kaggle': False, 'upload_model_to_kaggle': False},
 'model_args': {'d_model': 512,
  'dff': 1024,
  'embed_sizes': {'answered_correctly': 5,
   'content_id': 13525,
   'had_explanation': 5,
   'part': 10,
   'tags': 190,
   'task_container_id': 10002},
  'num_heads': 8,
  'num_layers': 1},
 'training_args': {'epochs': 30}}

In [61]:
# Instantiate the model inside the distribution strategy scope. Hyper-parameters
# come from experiment_config["model_args"]; `rate` is not passed, so the
# constructor default (0.1) applies.
with strategy.scope():
  model = TransformerSeq2SeqClassifier(
      num_layers=experiment_config["model_args"]["num_layers"],
      d_model=experiment_config["model_args"]["d_model"],
      num_heads=experiment_config["model_args"]["num_heads"],
      dff=experiment_config["model_args"]["dff"],
      maximum_position_encoding=SEQ_LEN,
      embed_size_dict=embed_sizes
  )

In [62]:
# Pull a single batch to inspect tensor shapes. xb, yb and mb stay defined
# after the loop and are reused by later cells.
for xb, yb, mb in dataset.take(1):
  print(xb.shape, yb.shape, mb.shape)

(256, 129, 197) (256, 128, 1) (256, 128, 1)


In [63]:
# Smoke-test a forward pass on the sample batch with training=False.
with strategy.scope():
  y_pred = model(xb, training=False)

In [64]:
y_pred.shape

TensorShape([256, 128, 1])

#### Training

In [65]:
class CustomAUC(keras.metrics.Metric):
  """Thin wrapper around `keras.metrics.AUC`.

  All state lives in the wrapped AUC metric; this class only forwards
  `update_state`, `result` and `reset_states` to it.
  """

  def __init__(self, **kwargs):
    super().__init__(**kwargs)

    self.auc = keras.metrics.AUC()

  def update_state(self, y_true, y_pred, sample_weight=None):
    """Accumulate a batch.

    `sample_weight` defaults to None to match the Keras `Metric.update_state`
    contract, so the metric also works when no weights are supplied. The
    inner update op is returned as the base-class contract expects.
    """
    return self.auc.update_state(y_true, y_pred, sample_weight)

  def result(self):
    return self.auc.result()

  def reset_states(self):
    return self.auc.reset_states()

  def get_config(self):
    # Return this wrapper's own config (name/dtype). The inner AUC config
    # contains keys such as num_thresholds that __init__ forwards to
    # Metric.__init__, which rejects them, so from_config(get_config())
    # could not rebuild the metric.
    return super().get_config()

  @property
  def thresholds(self):
    return self.auc.thresholds

In [66]:
# Just subclassing Loss doesn't let us control how the per-sample losses are weighted,
# hence the re-implementation of the __call__ method.
class MaskedBCELoss(keras.losses.Loss):
  """Binary cross-entropy averaged over the non-masked entries only.

  The mask is supplied as `sample_weight`: entries weighted 0 contribute
  nothing to the sum and are excluded from the denominator. `__call__` is
  overridden so that `sample_weight` reaches `call` instead of being applied
  by the base class reduction.
  """

  def __init__(self, **kwargs):
    super().__init__(**kwargs)

    # Reduction.NONE keeps the per-entry losses so they can be averaged
    # manually in call().
    self.bce = keras.losses.BinaryCrossentropy(reduction=keras.losses.Reduction.NONE)

  def call(self, y_true, y_pred, sample_weight=None):
    """Return the scalar mean of the weighted BCE over non-zero entries.

    Returns 0 (not NaN) when every entry is masked.
    """
    normal_bce_loss = self.bce(y_true, y_pred, sample_weight) # gives 0 where masked.

    # count # of non masked entries in the batch
    unmasked_count = tf.math.reduce_sum(tf.cast(tf.math.not_equal(normal_bce_loss, 0), tf.float32))

    # sum the unmasked entries.
    unmasked_sum = tf.math.reduce_sum(normal_bce_loss)

    # A fully masked batch gives 0 / 0; divide_no_nan yields 0 there instead
    # of a NaN that would poison the gradients.
    average_loss = tf.math.divide_no_nan(unmasked_sum, unmasked_count)

    return average_loss

  def __call__(self, y_true, y_pred, sample_weight=None):
    # Mirrors the base Loss.__call__ scaffolding (graph context, name scope,
    # autograph conversion) but forwards sample_weight to call().
    graph_ctx = tf_utils.graph_context_for_symbolic_tensors(
        y_true, y_pred, sample_weight)
    with K.name_scope(self._name_scope), graph_ctx:
      ag_call = autograph.tf_convert(self.call, ag_ctx.control_status_ctx())
      losses = ag_call(y_true, y_pred, sample_weight)
      return losses


In [67]:
# Sanity-check the loss on the sample batch, passing mb as the sample weights.
loss = MaskedBCELoss()
loss(yb, y_pred, mb)

<tf.Tensor: shape=(), dtype=float32, numpy=0.9950002>

In [68]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Warm-up then inverse-square-root decay learning-rate schedule.

  lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)

  Args:
    d_model: model width used to scale the rate.
    warmup_steps: number of steps over which the rate grows linearly.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    # Keep the original value for get_config(); self.d_model is a tensor.
    self._d_model_config = d_model
    self.d_model = tf.cast(d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    # rsqrt is only defined for floating-point tensors, while optimizers may
    # pass their integer iteration counter.
    step = tf.cast(step, tf.float32)

    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

  def get_config(self):
    # Required for serializing an optimizer that uses this schedule.
    return {"d_model": self._d_model_config, "warmup_steps": self.warmup_steps}

In [69]:
# Training objects, created inside the distribution strategy scope:
# Adam driven by the warm-up schedule, the masked BCE loss, and the callbacks.
with strategy.scope():
  learning_rate = CustomSchedule(experiment_config["model_args"]["d_model"])
  optimizer = keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)
  # loss = keras.losses.BinaryCrossentropy()
  loss = MaskedBCELoss()
  # reduce_lr_cb is created but not passed to fit() below.
  reduce_lr_cb = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=4, min_lr=1e-7, verbose=1) # cannot be used with custom scheduler
  # Keeps only the weights of the epoch with the lowest val_loss.
  model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint(
    filepath="model/best-model.h5",
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True)
  # NOTE: wandb_cb is a one-element list, not a bare callback.
  wandb_cb = [WandbCallback(monitor="val_loss", mode="min", save_weights_only=True, verbose=1)]

In [70]:
# update the wandb config
wandb.config.update(experiment_config)

In [71]:
# Compile with the masked loss and a plain (unweighted) AUC metric.
with strategy.scope():
  model.compile(loss=loss, optimizer=optimizer, metrics=[keras.metrics.AUC()]) #, weighted_metrics=[CustomAUC()]) # "weighted_metrics" not supported on TPU with tf.data

In [72]:
## OVERFIT SINGLE BATCH
# with strategy.scope():
#   x, y, mask = next(iter(dataset.take(1))) # can't just use take(), because that returns a different batch every time
#   print(x.shape, y.shape, mask.shape)
#   model.fit(x, y, epochs=100, sample_weight=mask)

In [73]:
!mkdir model

In [None]:
# Train for the configured number of epochs, validating on valid_dataset.
# wandb_cb is itself a list, so the callbacks argument is a nested list; the
# training log below shows the W&B callback still runs.
with strategy.scope():
  history = model.fit(dataset, validation_data=valid_dataset, epochs=experiment_config["training_args"]["epochs"], callbacks=[model_checkpoint_cb, wandb_cb])

Epoch 1/30
Epoch 00000: val_loss improved from inf to 0.61583, saving model to /content/wandb/run-20201229_190124-2xrlsyxj/files/model-best.h5
Epoch 2/30

#### Validation - Latest Model

In [None]:
# AUC of the model as it stands after the last epoch, accumulated batch by
# batch over valid_dataset; valid_mb is passed as the sample weight.
with strategy.scope():
  metric = keras.metrics.AUC()

  for valid_xb, valid_yb, valid_mb in valid_dataset:
    valid_y_pred = model(valid_xb, training=False)
    metric.update_state(valid_yb, valid_y_pred, valid_mb)

metric.result().numpy()

#### Validation - Best Model

In [None]:
# Fresh model with the same hyper-parameters, built explicitly so its
# variables exist before the checkpoint weights are loaded.
with strategy.scope():
  new_model = TransformerSeq2SeqClassifier(
        num_layers=experiment_config["model_args"]["num_layers"],
        d_model=experiment_config["model_args"]["d_model"],
        num_heads=experiment_config["model_args"]["num_heads"],
        dff=experiment_config["model_args"]["dff"],
        maximum_position_encoding=SEQ_LEN,
        embed_size_dict=embed_sizes
    )
  # 197 matches the feature dimension of the sample batch printed earlier
  # (256, 129, 197); the batch size given here is 128.
  new_model.build(input_shape=(128, SEQ_LEN+1, 197)) # input_shape - [batch_size, seq_len, features]
  new_model.summary()

In [None]:
# Restore the best weights; same path as the ModelCheckpoint callback's filepath.
new_model.load_weights("model/best-model.h5")

In [None]:
# AUC of the restored best checkpoint over valid_dataset; valid_mb is passed
# as the sample weight. The result is kept in `auc` for logging.
with strategy.scope():
  metric = keras.metrics.AUC()

  for valid_xb, valid_yb, valid_mb in valid_dataset:
    valid_y_pred = new_model(valid_xb, training=False)
    metric.update_state(valid_yb, valid_y_pred, valid_mb)

auc = metric.result().numpy()
auc

In [None]:
wandb.log({"best_model_auc": auc})

In [None]:
wandb_run.finish()

#### Upload to kaggle

In [None]:
%%capture
!pip install kaggle

In [None]:
if experiment_config["env_args"]["is_kaggle"]:
  from google.colab import drive
  drive.mount("/content/drive")

  # Copy Kaggle API key
  !mkdir -p ~/.kaggle && cp /content/drive/My\ Drive/Projects/Kaggle/api_key/kaggle.json ~/.kaggle/

  !kaggle datasets init -p model/

  # id and title only alphanumeric and "-"
  meta = """
  {
    "licenses": [
      {
        "name": "CC0-1.0"
      }
    ], 
    "id": "nisarahamedk/riid-model-2",
    "title": "riid-model-2"
  }
  """
  with open("model/dataset-metadata.json", "w") as f:
    f.write(meta)

  # create
  !kaggle datasets create -p model/ --dir-mode tar -u