In [None]:
# Mount Google Drive so that the data and model files stored there can be
# read through the /content/drive mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


# **Install packages**

In [None]:
# Install the third-party packages this notebook imports into the runtime.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve
# different releases than the ones this notebook was written against.
!pip install transformers
!pip install tokenizers
!pip install tensorflow
!pip install tensorflow_addons

# **Import packages**

In [None]:
import tensorflow as tf
from tensorflow_addons.optimizers import AdamW
from tensorflow.python.client import device_lib

import matplotlib.pyplot as plt
import warnings
from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.trainers import BpeTrainer
from transformers import PreTrainedTokenizerFast, BertForPreTraining
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from numba import cuda 
from transformers import pipeline, set_seed, GPT2Model, GPT2Config, GPT2Tokenizer, TFGPT2LMHeadModel, TFBertForSequenceClassification, AdamW, BertConfig, TFBertForMaskedLM

from sklearn.model_selection import train_test_split, ShuffleSplit, KFold
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import logging 
logging.basicConfig(level=logging.ERROR)
import random
from math import *
from tqdm import tqdm, notebook
import pandas as pd
import numpy as np

import os

# Seed every random number generator the notebook has access to, so that a
# Restart & Run All reproduces the same results.
RANDOM_STATE = 123
random.seed(RANDOM_STATE)         # Python's built-in RNG
np.random.seed(RANDOM_STATE)      # NumPy's global RNG
tf.random.set_seed(RANDOM_STATE)  # TensorFlow's global seed

Check for an available GPU and select the compute device

In [None]:
# Select the compute device: use the first GPU when one is visible to
# TensorFlow, otherwise fall back to the CPU.
gpu_devices = tf.config.list_physical_devices('GPU')
local_devices = device_lib.list_local_devices()  # queried once and reused below

if gpu_devices:
    # Tell TensorFlow to use the GPU. TensorFlow device strings look like
    # "/GPU:0"; "cuda" is a PyTorch device name.
    device = tf.device("/GPU:0")
    print('There are %d GPU(s) available.' % len(gpu_devices))
    # Find the GPU entry by its device type instead of assuming it is at index 1.
    device_name = next((d.name for d in local_devices if d.device_type == 'GPU'),
                       '/device:GPU:0')
    print('We will use the GPU:', device_name)
else:
    print('No GPU available, using the CPU instead.')
    device = tf.device("/CPU:0")
    device_name = next((d.name for d in local_devices if d.device_type == 'CPU'),
                       '/device:CPU:0')
# NOTE(review): `device` is a context manager; it only affects op placement when
# used as `with device:` - no such use is visible in this notebook.

There are 1 GPU(s) available.
We will use the GPU: /device:GPU:0


# **Import and view data**

In [None]:
# Locations of every input and output file used by the notebook.
path_names = {
    # Basket files read with pandas
    "train_data": "/content/drive/MyDrive/Seminar QM/Data Cleaned/data_train_cln.txt",
    "test_data": "/content/drive/MyDrive/Seminar QM/Data/data_test.txt",
    # Saved tokenizer file and pre-trained BERT checkpoint
    "tokenizer_file": "/content/drive/MyDrive/Seminar QM/Models/TF/Tokenizer/hamsbertje_wordlevel_cln.json",
    "BERT_pretrained_model": "/content/drive/MyDrive/Seminar QM/Models/TF/220306 BERT pre-trained AH (REDDING)/bert pre-trained",
    # Pickle files holding the hyperparameter-search results
    "results_preck": "results_precK_BERTbase",
    "results_hyperparams": "results_best-params_BERTbase",
}

Importing and Showing Data

In [None]:
# Read the basket files from Drive. Every row is loaded as a string into the
# single column "prod_series", which is then turned into a plain Python list.
n_rows_train = 1000   # number of train baskets to load in
df_org = pd.read_csv(
    path_names['train_data'],
    names=["prod_series"],
    dtype=str,
    nrows=n_rows_train,
)
baskets = df_org['prod_series'].tolist()

n_rows_test = 1000   # number of test baskets to load in
df_test = pd.read_csv(
    path_names['test_data'],
    names=["prod_series"],
    dtype=str,
    nrows=n_rows_test,
)
baskets_test = df_test['prod_series'].tolist()

Importing Tokenizer and Tokenization


In [None]:
# Build the fast tokenizer from the saved tokenizer file and declare the
# special tokens it should use.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=path_names['tokenizer_file'], # serialized tokenizer definition to load
    eos_token= "</s>",
    bos_token= "<s>",
    unk_token= "<unk>",
    pad_token= "<pad>",
    mask_token= "<mask>",
    truncation_side = 'left'
)

# **BERT BASE CASE**

# BERT Hyperparameters

In [None]:
# Search space explored by the Bayesian optimisation:
#   lr     - sampled uniformly between 1e-7 and 6e-5
#   epochs - sampled from 3..5 in steps of 1 (cast to int where it is used)
bayes_parameters = {
    'lr': hp.uniform('lr', 1e-7, 6e-5),
    'epochs': hp.quniform('epochs', 3, 5, 1),
}

# Fixed settings that are not part of the hyperparameter search.
params = dict(
    max_tokenized_length=60,  # length every tokenized basket is padded/truncated to
    batch_size=12,            # sequences per batch
    n_seqs=6,                 # K used for the top-K / precision@K checks
    n_folds=3,                # number of cross-validation folds
)

Prepare data

In [None]:
# Tokenize the train and test baskets into fixed-length TensorFlow tensors.
dataset = tokenizer(
    baskets,
    max_length=params['max_tokenized_length'],
    truncation=True,
    padding='max_length',
    return_tensors='tf',
)

dataset_test = tokenizer(
    baskets_test,
    max_length=params['max_tokenized_length'],
    truncation=True,
    padding='max_length',
    return_tensors='tf',
)

In [None]:
# Append labels, masking and missing items (train).
#
# For every sequence, the token directly before the end-of-sequence token is
# hidden: it is stored as the label / missing item and replaced by the mask token.
# NOTE(review): 2 and 4 are the ids hard-coded in the original code; they
# presumably equal tokenizer.eos_token_id and tokenizer.mask_token_id - confirm.
eos_id, mask_id = 2, 4

ids_np = dataset.input_ids.numpy()
row_ids = np.arange(ids_np.shape[0])

eos_rows, eos_cols = np.where(ids_np == eos_id)
# Each row must contain exactly one EOS token; otherwise the i-th match would
# not belong to the i-th row and rows would silently get the wrong column.
if not np.array_equal(eos_rows, row_ids):
    raise ValueError("Expected exactly one token with id %d in every train sequence." % eos_id)
# An EOS at column 0 has no preceding token (index -1 would wrap to the last column).
if np.any(eos_cols == 0):
    raise ValueError("Found a train sequence that starts with token id %d; nothing to mask." % eos_id)
indices_to_mask = eos_cols - 1

missing_np = ids_np[row_ids, indices_to_mask]
# Re-running this cell on already-masked input would store the mask id as label.
if np.any(missing_np == mask_id):
    raise ValueError("Train input_ids are already masked; re-run the tokenization cell first.")

# Labels: -100 everywhere except at the hidden position.
labels_np = np.full(ids_np.shape, -100, dtype=np.int32)
labels_np[row_ids, indices_to_mask] = missing_np

# Boolean mask that is True only at the hidden position.
mask_np = np.zeros(ids_np.shape, dtype=bool)
mask_np[row_ids, indices_to_mask] = True

labels_arr = tf.convert_to_tensor(labels_np)
missing_item_arr = tf.convert_to_tensor(missing_np[:, None])  # shape (n_rows, 1)
mask_arr = tf.convert_to_tensor(mask_np)

dataset['labels'] = labels_arr
dataset['missing_items'] = missing_item_arr
dataset['input_ids'] = tf.where(~mask_arr, dataset.input_ids, mask_id)
dataset['masked_index'] = tf.convert_to_tensor(indices_to_mask)

100%|██████████| 1000/1000 [00:03<00:00, 291.82it/s]


In [None]:
# Append labels, masking and missing items (test).
#
# For every sequence, the token directly before the end-of-sequence token is
# hidden: it is stored as the label / missing item and replaced by the mask token.
# NOTE(review): 2 and 4 are the ids hard-coded in the original code; they
# presumably equal tokenizer.eos_token_id and tokenizer.mask_token_id - confirm.
eos_id_test, mask_id_test = 2, 4

ids_np_test = dataset_test.input_ids.numpy()
row_ids_test = np.arange(ids_np_test.shape[0])

eos_rows_test, eos_cols_test = np.where(ids_np_test == eos_id_test)
# Each row must contain exactly one EOS token; otherwise the i-th match would
# not belong to the i-th row and rows would silently get the wrong column.
if not np.array_equal(eos_rows_test, row_ids_test):
    raise ValueError("Expected exactly one token with id %d in every test sequence." % eos_id_test)
# An EOS at column 0 has no preceding token (index -1 would wrap to the last column).
if np.any(eos_cols_test == 0):
    raise ValueError("Found a test sequence that starts with token id %d; nothing to mask." % eos_id_test)
indices_to_mask_test = eos_cols_test - 1

missing_np_test = ids_np_test[row_ids_test, indices_to_mask_test]
# Re-running this cell on already-masked input would store the mask id as label.
if np.any(missing_np_test == mask_id_test):
    raise ValueError("Test input_ids are already masked; re-run the tokenization cell first.")

# Labels: -100 everywhere except at the hidden position.
labels_np_test = np.full(ids_np_test.shape, -100, dtype=np.int32)
labels_np_test[row_ids_test, indices_to_mask_test] = missing_np_test

# Boolean mask that is True only at the hidden position.
mask_np_test = np.zeros(ids_np_test.shape, dtype=bool)
mask_np_test[row_ids_test, indices_to_mask_test] = True

labels_arr_test = tf.convert_to_tensor(labels_np_test)
missing_item_arr_test = tf.convert_to_tensor(missing_np_test[:, None])  # shape (n_rows, 1)
mask_arr_test = tf.convert_to_tensor(mask_np_test)

dataset_test['labels'] = labels_arr_test
dataset_test['missing_items'] = missing_item_arr_test
dataset_test['input_ids'] = tf.where(~mask_arr_test, dataset_test.input_ids, mask_id_test)
dataset_test['masked_index'] = tf.convert_to_tensor(indices_to_mask_test)

100%|██████████| 1000/1000 [00:01<00:00, 572.59it/s]


Initialize pre-trained models

In [None]:
# Module-level objects shared by the cells below. Names assigned at the top
# level of a cell are already global, so no `global` declaration is needed.

# Categorical cross-entropy on probabilities (from_logits=False).
# NOTE(review): `criterion` and `bad_words` are not referenced by any cell
# visible in this notebook - confirm whether they are still needed.
criterion = tf.keras.losses.CategoricalCrossentropy(from_logits=False, axis=-1)
# Ids of the EOS, BOS and PAD special tokens, each wrapped in its own list.
bad_words = [[tokenizer.eos_token_id],[tokenizer.bos_token_id], [tokenizer.pad_token_id]]

# Shuffled, seeded K-fold splitter used for cross-validation.
kfold = KFold(n_splits=params['n_folds'], shuffle=True, random_state=RANDOM_STATE)

## TF BERT as Base Case

Hyperparameter optimization

In [None]:
def train_step(batch, optim,BERT):
    """Run one optimisation step of the masked-LM model on a single batch.

    Args:
        batch: indexable batch whose entries 0, 1 and 2 are the input ids, the
            attention mask and the labels. Any further entries are ignored.
        optim: optimizer; its ``minimize`` is called with the mean loss, the
            model's trainable variables and the gradient tape.
        BERT: model whose ``call`` returns an object exposing ``loss`` and
            ``attentions``.

    Returns:
        Tuple ``(loss, attentions)``: the mean of the model loss over the
        batch and the ``attentions`` attribute of the model output.
    """
    input_ids = batch[0]
    attention_mask = batch[1]
    labels = batch[2]

    # Record the forward pass so gradients of the loss can be taken.
    with tf.GradientTape() as tape:
      res = BERT.call(input_ids, attention_mask=attention_mask, labels=labels, training=True)
      # Reduce the model loss to a single scalar for the optimizer.
      loss = tf.math.reduce_mean(res.loss)
    optim.minimize(loss, var_list=BERT.trainable_variables, tape=tape)

    return loss, res.attentions

In [None]:
def _count_topk_hits(logits, masked_index, missing_items, k):
  """Count sequences whose held-out item is among the k highest-scoring tokens.

  Args:
    logits: model logits indexed as [sequence, position, token].
    masked_index: per-sequence position of the masked token.
    missing_items: per-sequence id of the held-out token, shape (batch, 1).
    k: number of top-scoring tokens to consider.

  Returns:
    Number of sequences in the batch with a hit, as a Python int.
  """
  # Pick, for every sequence, the logits at its masked position -> (batch, vocab).
  masked_logits = tf.gather(logits, masked_index, batch_dims=1)
  probs = tf.nn.softmax(masked_logits)
  top_k_items = tf.nn.top_k(probs, k=k, sorted=False).indices
  # Broadcast (batch, k) against (batch, 1): a row is a hit if any entry matches.
  hits = tf.reduce_any(
      tf.equal(top_k_items, tf.cast(missing_items, top_k_items.dtype)), axis=-1)
  return int(tf.math.count_nonzero(hits).numpy())


def BERT_Base_finetuning(space):
  """Hyperopt objective: cross-validated fine-tuning of the pre-trained BERT.

  For every fold of ``kfold`` a fresh pre-trained model is fine-tuned for
  ``space['epochs']`` epochs with Adam at learning rate ``space['lr']`` and
  evaluated with precision@K (K = ``params['n_seqs']``) on the validation part.

  Args:
    space: dict with the sampled hyperparameters ``'lr'`` and ``'epochs'``.

  Returns:
    Dict with ``'loss'`` (negative mean over folds of the validation
    precision@K reached after the last epoch), ``'lossG'`` (training losses of
    the last epoch of the last fold) and ``'status'``.
  """
  lr = space['lr']
  num_epochs = int(space['epochs'])
  batch_size = params['batch_size']
  feature_keys = ('input_ids', 'attention_mask', 'labels', 'missing_items', 'masked_index')
  precK_ls = []   # one entry per fold: precision@K after the final epoch
  losses = []

  for train_index, val_index in kfold.split(dataset['input_ids']):

    # Every fold starts from the same pre-trained weights and a fresh optimizer.
    BERT = TFBertForMaskedLM.from_pretrained(path_names['BERT_pretrained_model'],
                                             output_attentions=True)
    optim = tf.keras.optimizers.Adam(lr)

    train_basket = {key: tf.gather(dataset[key], indices=tf.constant(train_index))
                    for key in feature_keys}
    val_basket = {key: tf.gather(dataset[key], indices=tf.constant(val_index))
                  for key in feature_keys}

    # The batched datasets do not change between epochs, so build them once.
    loader_train = tf.data.Dataset.from_tensor_slices(
        tuple(train_basket[key] for key in feature_keys)
    ).batch(batch_size, drop_remainder=True)
    loader_val = tf.data.Dataset.from_tensor_slices(
        tuple(val_basket[key] for key in feature_keys)
    ).batch(batch_size, drop_remainder=True)

    train_step_fn = None   # traced once per fold, on the first training batch
    precK_val = None       # stays None if the fold yields no validation batch

    for epoch in range(num_epochs):
      #-----------------------------------------------------------------------------
      # ~~~ Training ~~~
      #-----------------------------------------------------------------------------
      losses = []
      loop_train = notebook.tqdm(loader_train, position=0, leave=True, colour='green')
      for k, batch in enumerate(loop_train):
        if train_step_fn is None:
          train_step_fn = tf.function(train_step).get_concrete_function(batch, optim, BERT)

        loss, _ = train_step_fn(batch)

        losses.append(loss.numpy())
        # k is zero-based, so k + 1 losses have been collected so far.
        avg_loss = sum(losses) / (k + 1)

        loop_train.set_postfix(loss=avg_loss, data="train", epochs=num_epochs, lr=lr)

      #-----------------------------------------------------------------------------
      # ~~~ Validation ~~~
      #-----------------------------------------------------------------------------
      loop_val = notebook.tqdm(loader_val, position=0, leave=True, colour='yellow')
      correct_val = 0

      for v, batch_val in enumerate(loop_val):
        input_ids, attention_mask, labels, missing_items, masked_index = batch_val

        outputBERT_logits = BERT.call(input_ids, attention_mask=attention_mask, labels=labels,training=False).logits

        correct_val += _count_topk_hits(outputBERT_logits, masked_index,
                                        missing_items, params['n_seqs'])
        # Running precision@K over the validation batches seen so far.
        precK_val = correct_val / ((v + 1) * batch_size)

        loop_val.set_postfix(PrecisionAtK =precK_val, Correct=correct_val,  data="validation")

    # Score the fold by the full-validation precision@K of the final epoch only,
    # not by the partial running values of every batch of every epoch.
    if precK_val is not None:
      precK_ls.append(precK_val)

  precK_avg = np.mean(np.array(precK_ls))

  return {'loss': -precK_avg, 'lossG': losses, 'status': STATUS_OK}


In [None]:
# Run Bayesian Optimization
best_params, loss = dict(), dict()
trials = Trials()
# return_argmin=False makes fmin return the best hyperparameter values
# themselves instead of their indices in the search space.
# NOTE(review): recent hyperopt releases expect a np.random.Generator for
# `rstate` - confirm against the installed version.
best_params = fmin(fn=BERT_Base_finetuning,
          space=bayes_parameters,
          algo=tpe.suggest,
          trials=trials,
          return_argmin=False, 
          max_evals=15,
          rstate= np.random.RandomState(RANDOM_STATE))

trial_results = [x['result'] for x in trials.trials]
# The objective returns the negative precision@K as its loss.
precisionk = [-r['loss'] for r in trial_results]
losses = [r['loss'] for r in trial_results]
best_idx = np.argmax(precisionk)
# The objective's result dict has no 'model' entry, so indexing it directly
# raised KeyError. .get() keeps `best_model` defined (None when absent).
best_model = trial_results[best_idx].get('model')

Saving hyperparameter optimisation results

In [None]:
import pickle

# Write the search results to disk: first the precision@K of every trial,
# then the best hyperparameters found.
for out_path, payload in (
    (path_names['results_preck'], precisionk),
    (path_names['results_hyperparams'], best_params),
):
    with open(out_path, "wb") as fp:
        pickle.dump(payload, fp)

# **Train Full**

Open hyperparameter optimization results

In [None]:
# Read the stored hyperparameter-search results back in for the final run.
# NOTE: pickle.load executes arbitrary code from the file - only load files
# written by this notebook.
import pickle


def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, "rb") as fp:   # Unpickling
        return pickle.load(fp)


precK_load = _load_pickle(path_names['results_preck'])
best_params_load = _load_pickle(path_names['results_hyperparams'])

In [None]:
# Fine-tune the pre-trained model on the complete training set with the best
# hyperparameters found by the search.
BERT = TFBertForMaskedLM.from_pretrained(path_names['BERT_pretrained_model'])
lr=best_params_load['lr']
# The stored epoch count comes from hp.quniform and may be a float; range()
# only accepts integers.
num_epochs = int(best_params_load['epochs'])
optim = tf.keras.optimizers.Adam(lr,)
losses = []
correct_train = 0

# The batched dataset is identical in every epoch, so build it once.
loader_train = tf.data.Dataset.from_tensor_slices((dataset['input_ids'],
                                                  dataset['attention_mask'],
                                                  dataset['labels'],
                                                  dataset['missing_items'],
                                                  dataset['masked_index']
                                                  )).batch(params['batch_size'],drop_remainder=True)

#-----------------------------------------------------------------------------
# ~~~ Training ~~~
#-----------------------------------------------------------------------------

for epoch in range(num_epochs):
  correct_train = 0
  avg_loss = 0
  losses = []

  loop_train = notebook.tqdm(loader_train, position=0, leave=True, colour='green')
  for k, batch in enumerate(loop_train):
    input_ids, attention_mask, labels, missing_items, masked_index = batch

    with tf.GradientTape() as tape:
      res = BERT.call(input_ids, attention_mask=attention_mask, labels=labels, training=True)
      loss=tf.math.reduce_mean(res.loss)

    optim.minimize(loss, var_list = BERT.trainable_variables, tape=tape)

    # Logits at each sequence's masked position -> (batch, vocab).
    masked_logits = tf.gather(res.logits, masked_index, batch_dims=1)
    outputs_val_softmax = tf.nn.softmax(masked_logits)

    top_k_items = tf.nn.top_k(outputs_val_softmax,k=params['n_seqs'],sorted=False).indices

    # A sequence counts as correct when its held-out item is among the top K.
    hits = tf.reduce_any(
        tf.equal(top_k_items, tf.cast(missing_items, top_k_items.dtype)), axis=-1)
    correct_train += int(tf.math.count_nonzero(hits).numpy())

    precK_train = correct_train/((k+1)*params['batch_size'])

    losses.append(loss.numpy())
    # k is zero-based, so k + 1 losses have been collected so far.
    avg_loss = sum(losses)/(k+1)

    loop_train.set_postfix(loss=avg_loss,precK_train = precK_train, data="train", epochs=num_epochs, lr=lr)

# **Test on test set**

In [None]:
#-----------------------------------------------------------------------------
# ~~~ Test ~~~
#-----------------------------------------------------------------------------
# Evaluate the fine-tuned model on the test baskets. No training happens in
# this cell, so a single pass over the data is enough.
batch_size = params['batch_size']
correct_test = 0
avg_loss = 0
pr_ls, precK_ls = [], []

loader_test = tf.data.Dataset.from_tensor_slices((dataset_test['input_ids'],
                                          dataset_test['attention_mask'],
                                          dataset_test['labels'],
                                          dataset_test['missing_items'],
                                          dataset_test['masked_index']
                                          )).batch(batch_size,drop_remainder=True)

loop_test = notebook.tqdm(loader_test, position=0, leave=True, colour='yellow')
for t, batch_test in enumerate(loop_test):
    input_ids, attention_mask, labels, missing_items, masked_index = batch_test

    outputBERT_logits = BERT.call(input_ids, attention_mask=attention_mask, labels=labels,training=False).logits

    # Logits at each sequence's masked position -> (batch, vocab).
    masked_logits = tf.gather(outputBERT_logits, masked_index, batch_dims=1)
    outputs_test_softmax = tf.nn.softmax(masked_logits)

    # Percentile rank: share of the vocabulary scored strictly below the
    # held-out item of each sequence.
    target_prob = tf.gather(outputs_test_softmax, missing_items, batch_dims=1)  # (batch, 1)
    n_lower = tf.reduce_sum(
        tf.cast(tf.math.less(outputs_test_softmax, target_prob), tf.float32), axis=-1)
    pr_ls.extend((n_lower.numpy().astype(np.float64) / tokenizer.vocab_size).tolist())

    # A sequence counts as correct when its held-out item is among the top K.
    top_k_items = tf.nn.top_k(outputs_test_softmax,k=params['n_seqs'],sorted=False).indices
    hits = tf.reduce_any(
        tf.equal(top_k_items, tf.cast(missing_items, top_k_items.dtype)), axis=-1)
    correct_test += int(tf.math.count_nonzero(hits).numpy())

    # Running mean percentile rank and precision@K over the batches seen so far.
    mpr_test = np.array(pr_ls).sum()/((t+1)*batch_size)
    precK_test = correct_test/((t+1)*batch_size)

    loop_test.set_postfix(PrecisionAtK =precK_test, MPR=mpr_test, Correct=correct_test,  data="test")