**BERT BASE CASE**



In [None]:
#Mount Drive for remote data access
# Colab-only step: exposes Google Drive under /content/drive, which is the
# prefix of the data, tokenizer and model paths configured in `path_names`.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Install packages**

In [None]:
!pip install transformers
!pip install tokenizers
!pip install tensorflow
!pip install tensorflow_addons
!pip install numba

# **Import packages**

In [None]:
import tensorflow as tf
from tensorflow.python.client import device_lib
import matplotlib.pyplot as plt
import warnings
from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.trainers import BpeTrainer
from transformers import PreTrainedTokenizerFast, BertForPreTraining
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from transformers import pipeline, set_seed, GPT2Model, GPT2Config, GPT2Tokenizer, TFGPT2LMHeadModel, TFBertForSequenceClassification, AdamW, BertConfig, TFBertForMaskedLM
from sklearn.model_selection import train_test_split, ShuffleSplit, KFold
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import logging 
logging.basicConfig(level=logging.ERROR)
import random
from math import *
from tqdm import tqdm, notebook
import pandas as pd
import numpy as np
import os
import json

# **Import and view data**

Connect to the GPU if one is available.


In [None]:
# Pick the compute device: the first GPU TensorFlow reports, otherwise the CPU.
# Devices are selected by their `device_type` instead of by their position in
# the list returned by `list_local_devices()`, whose ordering is not guaranteed.
local_devices = device_lib.list_local_devices()
gpu_device_names = [d.name for d in local_devices if d.device_type == 'GPU']

if gpu_device_names:
    device_name = gpu_device_names[0]
    print('There are %d GPU(s) available.' % len(gpu_device_names))
    print('We will use the GPU:', device_name)
else:
    print('No GPU available, using the CPU instead.')
    device_name = next(d.name for d in local_devices if d.device_type == 'CPU')

# tf.device takes a TensorFlow device string (e.g. "/device:GPU:0");
# "cuda" is a PyTorch device name, not a TensorFlow one.
device = tf.device(device_name)

Importing and Showing Data

In [None]:
# Locations of every input/output artefact used by this notebook.
path_names = dict(
    train_data="/content/drive/MyDrive/Seminar QM/Data Cleaned/data_train_cln.txt",
    test_data="/content/drive/MyDrive/Seminar QM/Data/data_test.txt",
    tokenizer_file="/content/drive/MyDrive/Seminar QM/Models/TF/Tokenizer/hamsbertje_wordlevel_cln.json",
    BERT_pretrained_model="/content/drive/MyDrive/Seminar QM/Models/TF/220306 BERT pre-trained AH (REDDING)/BERT pre-trained",
    results_preck="results_precK_BERTbase",
    results_hyperparams="results_best-params_BERTbase",
    taxonomy="/content/drive/MyDrive/Seminar QM/Data/product_taxonomy_paths.csv",
    productID_toName="/content/drive/MyDrive/Seminar QM/Data/productIDs_names.txt",
)

# Evaluation settings read by the cells below.
params = dict(
    max_tokenized_length=60,  # tokenised sequence length
    batch_size=24,            # batch size of the test loader
    n_seqs=6,                 # the "k" in precision@k
    ngram_sim=0.8,            # Dice threshold for the adjusted precision
)

In [None]:
# Test baskets, read into a single string column named "prod_series".
# The dictionary key must be the string 'test_data' (a bare name is a NameError).
df_test = pd.read_csv(path_names['test_data'], names=["prod_series"], dtype=str)
baskets_test = df_test['prod_series'].to_list()

# Lookup table from product ID to product name, stored as JSON text.
with open(path_names['productID_toName']) as f:
    productIDs_names = json.load(f)

# Seed Python's and TensorFlow's random number generators.
RANDOM_STATE = 123
random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

In [None]:
# create baskets for sequential metrics
# A basket with n products contributes its prefixes of length 2, 3, ..., n
# (as lists of product tokens); baskets with fewer than 2 products contribute nothing.
basket_trial = []
for basket in df_test['prod_series']:
  items = basket.split()  # split once per basket instead of once per prefix
  basket_trial.extend(items[:end] for end in range(2, len(items) + 1))

# The same prefixes as strings. Every product is preceded by a single space,
# so each string starts with a space (kept as in the previous implementation).
baskets_trial = ["".join(" " + str(item) for item in prefix) for prefix in basket_trial]

df_trial = pd.DataFrame(baskets_trial)

In [None]:
# loading tokenizer from the saved tokenizer file ; take old tokenizer
# Padding and truncation are both applied on the left, so the end of a basket
# is preserved. The truncation side is set with `truncation_side`; the former
# `truncation="left"` keyword is not that option.
# NOTE(review): this changes which tokens are kept for baskets longer than the
# maximum length -- confirm it matches how the model was pre-trained.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=path_names['tokenizer_file'],
    eos_token="</s>",
    bos_token="<s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
    padding_side="left",
    truncation_side="left",
)

26722


# ***BERT BASE CASE***

Initialize pre-trained models

In [None]:
# Baskets to evaluate. Replace with `baskets_trial` for the sequential data.
baskets_dataset = baskets_test
# Tokenise the *selected* baskets so that `dataset_test` and `baskets_dataset`
# always describe the same examples; the length comes from the shared config.
dataset_test = tokenizer(baskets_dataset, return_tensors='tf',
               max_length=params['max_tokenized_length'], truncation=True, padding='max_length')

In [None]:
#Append labels, masking, missing items
#Test
# For every row, the token directly before the end-of-sequence token is held
# out: it is stored as the label / missing item and replaced by the mask token.
# NOTE: the cell overwrites dataset_test['input_ids'], so it must be run once
# per freshly tokenised `dataset_test`.
EOS_TOKEN_ID = 2    # presumably tokenizer.eos_token_id -- TODO confirm against the tokenizer file
MASK_TOKEN_ID = 4   # presumably tokenizer.mask_token_id -- TODO confirm
IGNORE_LABEL = -100  # label value for every position that is not held out

input_ids_np = dataset_test['input_ids'].numpy()
n_rows, seq_len = input_ids_np.shape
row_idx = np.arange(n_rows)

# Exactly one EOS per row is required; otherwise the positions found below
# would be attributed to the wrong rows.
eos_rows, eos_cols = np.where(input_ids_np == EOS_TOKEN_ID)
if not np.array_equal(eos_rows, row_idx):
  raise ValueError("every tokenised basket must contain exactly one end-of-sequence token")
indices_to_mask_test = eos_cols - 1

# held-out token of each row
missing_np = input_ids_np[row_idx, indices_to_mask_test]

# labels: IGNORE_LABEL everywhere except at the held-out position
labels_np = np.full((n_rows, seq_len), IGNORE_LABEL, dtype=np.int32)
labels_np[row_idx, indices_to_mask_test] = missing_np

# boolean mask: True only at the held-out position
mask_np = np.zeros((n_rows, seq_len), dtype=bool)
mask_np[row_idx, indices_to_mask_test] = True

labels_arr_test = tf.convert_to_tensor(labels_np)
missing_item_arr_test = tf.convert_to_tensor(missing_np[:, None])  # shape (rows, 1)
mask_arr_test = tf.convert_to_tensor(mask_np)

dataset_test['labels'] = labels_arr_test
dataset_test['missing_items'] = missing_item_arr_test
dataset_test['input_ids'] = tf.where(~mask_arr_test, dataset_test['input_ids'], MASK_TOKEN_ID)
dataset_test['masked_index'] = tf.convert_to_tensor(np.where(labels_np != IGNORE_LABEL)[1])

100%|██████████| 24999/24999 [00:56<00:00, 446.24it/s] 


In [None]:
# Special-token ids (EOS, BOS, PAD), each wrapped in its own single-element list.
special_token_ids = (tokenizer.eos_token_id, tokenizer.bos_token_id, tokenizer.pad_token_id)
bad_words = [[token_id] for token_id in special_token_ids]

# Pre-trained masked-language model used for all predictions below.
BERT = TFBertForMaskedLM.from_pretrained(path_names['BERT_pretrained_model'])

In [None]:
#returns nothing when product name not found
def getProductNames(A):
  """Translate token ids into product names, skipping unknown products.

  A rank-0/1 input is decoded with the tokenizer, split on single spaces and
  every piece that is a key of `productIDs_names` is replaced by its name;
  pieces without an entry are dropped. Inputs of higher rank are processed
  element by element along their first axis, giving a nested list.
  """
  if tf.rank(A) > 1:
    return [getProductNames(A[row]) for row in range(len(A))]
  # rank <= 1: tokenizer.decode only handles a flat sequence of ids
  decoded = tokenizer.decode(A).split(" ")
  return [productIDs_names[piece] for piece in decoded if piece in productIDs_names.keys()]

# returns "NOT AVAILABLE" when product name not found
def getProductNames_Real(A):
  """Translate token ids into product names, keeping one entry per decoded piece.

  Works like `getProductNames`, except that a decoded piece without an entry
  in `productIDs_names` yields the placeholder "NOT AVAILABLE" instead of
  being dropped, so the result stays aligned with the decoded pieces.
  Inputs of rank > 1 are processed element by element along the first axis.
  """
  if (tf.rank(A) <= 1): # check if rank 1, otherwise tokenizer.decode doesn't work
    decoded = tokenizer.decode(A).split(" ")
    names = [productIDs_names.get(piece, "NOT AVAILABLE") for piece in decoded]
  else:
    # Recurse into this function (not getProductNames) so that the
    # placeholders are kept for nested inputs as well.
    names = [getProductNames_Real(A[i]) for i in range(len(A))]
  return names

In [None]:
def dice_coef(set1, set2):
  """Dice coefficient 2*|overlap| / (len(set1) + len(set2)) of two collections.

  The overlap counts every element of `set1` (duplicates included) that also
  occurs in `set2`. When both collections are empty the coefficient is
  undefined; 0.0 is returned so that two items without any n-gram are never
  treated as similar (this used to raise ZeroDivisionError).
  """
  total = len(set1) + len(set2)
  if total == 0:
    return 0.0
  shared = sum(1 for element in set1 if element in set2)
  return 2*shared/total

def ngram(product):
    """Return the character trigrams of every whitespace-separated word of `product`.

    Trigrams are listed word by word, left to right; words shorter than three
    characters contribute nothing.
    """
    return [word[start:start + 3]
            for word in product.split()
            for start in range(len(word) - 2)]

def standardError(bootstraps):
  """Bootstrap standard error: the sample standard deviation (divisor B-1)
  of the B bootstrap replicates.

  `bootstraps` may be any sequence of numbers (list, tuple or NumPy array).
  Raises ValueError when fewer than two replicates are given, because the
  estimate is undefined in that case.
  """
  values = np.asarray(bootstraps, dtype=float)
  B = len(values)
  if B < 2:
    raise ValueError("standardError needs at least two bootstrap replicates")
  deviations = values - np.mean(values)
  return np.sqrt(np.sum(np.square(deviations), axis=0) / (B-1))

TEST

In [None]:
# Batched test set. Each element is a 5-tuple, in this order:
# (input_ids, attention_mask, labels, missing_items, masked_index).
# drop_remainder=True discards a final batch smaller than params['batch_size'].
loader_fields = ('input_ids', 'attention_mask', 'labels', 'missing_items', 'masked_index')
loader_tensors = tuple(dataset_test[field] for field in loader_fields)
loader_test = tf.data.Dataset.from_tensor_slices(loader_tensors).batch(
    params['batch_size'], drop_remainder=True)

In [None]:
#-------------------------------------------------------------------------------
# ~~~ TEST for bootstrapping
#-------------------------------------------------------------------------------
# Scores every batch of `loader_test` with BERT and accumulates
#   * precision@1 and precision@n_seqs on exact token matches,
#   * "adjusted" precision, where a prediction also counts as correct when the
#     trigram Dice similarity between its product name and the name of the
#     held-out product is at least params['ngram_sim'],
#   * the percentile rank of the held-out item (averaged into the MPR),
#   * top_k_all / missing_items_all, which the bootstrap cells resample.

test_size = len(baskets_dataset)
num_specialtok = 5  # the first 5 vocabulary ids are excluded from prediction (presumably the special tokens -- TODO confirm)
top_1_count,top_k_count = 0,0
top_1_count_adj, top_k_count_adj = 0,0
output = {}
total_batches = 0  # number of baskets scored so far (not the number of batches)
pr_ls_test = []
prec_at = []
prec_at_adj = []
loop_test = notebook.tqdm(loader_test, position=0, leave=True, colour='yellow')

#create top_k matrix for predictions in test set
top_k_all = np.zeros((test_size,80),dtype=np.int32)
missing_items_all = np.zeros((test_size),dtype=np.int32)

for k, batch_test in enumerate(loop_test):
  batch_size = len(batch_test[0])
  offset = total_batches  # position of this batch's first basket in the *_all arrays
  total_batches += batch_size
  non_predict = tf.fill((batch_size,num_specialtok),0.0)

  incomplete_ids_test = batch_test[0]
  attention_mask_test = batch_test[1]
  labels = batch_test[2]
  missing_items_test = batch_test[3]
  incomplete_mask_test = batch_test[4]

  outputBERT_logits = BERT.call(incomplete_ids_test, attention_mask=attention_mask_test, labels=labels,training=False).logits
  # keep only the logits at each basket's masked position
  logits = [outputBERT_logits[m,mask_ind,:] for m, mask_ind in enumerate(incomplete_mask_test.numpy())]
  # softmax over the non-excluded tokens; zero-probability columns are put back
  # in front so that column indices are token ids again
  outputG = tf.convert_to_tensor(logits)[:,num_specialtok:]
  outputG_softmax = tf.concat((non_predict, tf.nn.softmax(outputG)), axis=-1)

  # held-out token id of every basket in the batch, as a flat array
  missing_items_test = missing_items_test.numpy()[:, 0]

  # No tf.squeeze here: it would drop the batch axis of a single-basket batch.
  top_k_items = tf.nn.top_k(outputG_softmax, k=80, sorted=True)[1]

  top_k_all[offset:offset+batch_size,:] = top_k_items.numpy()
  missing_items_all[offset:offset+batch_size] = missing_items_test

  # calculate (normal) prec@k
  for i in range(0,batch_size):
    top_1_count += int(top_k_items[i][0] == missing_items_test[i])
    for j in range(params['n_seqs']):
      if (top_k_items[i][j] == missing_items_test[i]):
        top_k_count += 1
        prec_at.append(j)
        break

  # calculate adj. prec@k
  top_k_names = getProductNames(top_k_items)
  real_names = getProductNames_Real(missing_items_test)
  for i in range(batch_size):
    real_name = real_names[i]
    top_k_item = top_k_names[i]
    if real_name == "NOT AVAILABLE":
      # no name for the held-out product: fall back to exact token matching
      real_token = missing_items_test[i]
      top_k_token = top_k_items[i]
      if real_token == top_k_token[0]:
        top_1_count_adj +=1
      for j in range(params['n_seqs']):
        if real_token == top_k_token[j]:
          prec_at_adj.append(j)
          top_k_count_adj += 1
          break
    else:
      # Build up to n_seqs predicted names that are mutually dissimilar
      # (every pairwise Dice similarity below the threshold).
      predict_names, ngrams = [], dict()
      q, t = 0,0
      ngram_correct = ngram(real_name)
      short=False
      while len(predict_names)!=params['n_seqs'] and q!=len(top_k_item) and short!=True:
        if len(top_k_item)<=params['n_seqs']:
          # too few named candidates to filter: use all of them
          short=True
          predict_names = top_k_item
          for w, item in enumerate(predict_names):
            ngram_item = ngram(item)
            ngrams.update({w : ngram_item})
        else:
          item = top_k_item[q]
          ngram_item = ngram(item)
          if q==0:
            predict_names.append(item)
            ngrams.update({t : ngram_item})
            t+=1
          else:
            sim = [dice_coef(ngram_item, value) for value in ngrams.values()]
            if all(z<params['ngram_sim'] for z in sim):
              ngrams.update({t : ngram_item})
              predict_names.append(item)
              t+=1

          q+=1

      # the first kept name that is similar enough to the real name is the hit
      for key, value in ngrams.items():
        ngram_sim = dice_coef(ngram_correct, value)
        if ngram_sim >= params['ngram_sim']:
          output_want = [real_name] + predict_names
          output[offset+i] = output_want
          top_k_count_adj+=1
          prec_at_adj.append(key)
          if key==0:
            top_1_count_adj+=1
          break

  # percentile rank: share of the vocabulary scored strictly below the held-out item
  for i in range(0,batch_size):
    pr = tf.math.reduce_sum(tf.where(tf.math.less(outputG_softmax[i,:], outputG_softmax[i,missing_items_test[i]]), 1.0, 0.0)).numpy()/tokenizer.vocab_size
    pr_ls_test.append(pr)
  precK_test = top_k_count/total_batches

  mpr_test = np.array(pr_ls_test).sum()/total_batches

  loop_test.set_postfix(PrecisionAtK =round(precK_test,4), Correct=top_k_count,  data="Test", MPR = mpr_test)

# The loader can yield fewer baskets than len(baskets_dataset) (a final partial
# batch is dropped when it is built with drop_remainder=True). Restrict
# test_size and the *_all arrays to the baskets that were actually scored, so
# that the metrics are not diluted and the bootstrap cells never resample
# rows that were left at their zero initial value.
if total_batches == 0:
  raise ValueError("loader_test did not yield any batch")
test_size = total_batches
top_k_all = top_k_all[:test_size]
missing_items_all = missing_items_all[:test_size]

#print metrics
mpr_test = np.array(pr_ls_test).sum()/test_size
prec1 = (top_1_count/test_size) * 100
prec6 = (top_k_count/test_size) * 100
prec1_adj = (top_1_count_adj/test_size) * 100
prec6_adj = ((top_k_count_adj)/test_size) * 100
print(f"   Mean Percentile Rank: {mpr_test}")
print(f"     precision at 1 (%): {prec1}")
print(f"     precision at 6 (%): {prec6}")
print(f"adj. precision at 1 (%): {prec1_adj}")
print(f"adj. precision at 6 (%): {prec6_adj}")

In [None]:
#plot graphs for precision at k, different k values
# prec_at / prec_at_adj hold, for every correctly predicted basket, the
# 0-based rank at which the held-out item was found.

prec_array = np.array(prec_at)
prec_array_adj = np.array(prec_at_adj)

cum_value_adj,prec_cum_adj, cum_value, prec_cum = [],[],[],[]
for i in range(params['n_seqs']):
    # Count the entries equal to i. (np.count_nonzero(np.where(...)) would
    # count non-zero *indices* and therefore miss a match stored at position 0.)
    value = np.count_nonzero(prec_array==i)
    value_adj = np.count_nonzero(prec_array_adj==i)
    cum_value.append(value)
    prec_cum.append(sum(cum_value)/test_size)
    cum_value_adj.append(value_adj)
    prec_cum_adj.append(sum(cum_value_adj)/test_size)

# rank i corresponds to k = i + 1 on the x axis
k_values = range(1, params['n_seqs'] + 1)
plt.plot(k_values, prec_cum)
plt.plot(k_values, prec_cum_adj)
plt.title("Precision@k on the test set")
plt.xlabel("@k")
plt.ylabel("Precision")
plt.legend(["Precision@k", "Adj. Precision@k"])
plt.show()

In [None]:
#bootstrap SEs for (normal) precision@k
# Whether a basket is a hit does not depend on the resample, so the hit flags
# are computed once and each bootstrap replicate only resamples them.

B = 200
np.random.seed(100)
n_seqs = params['n_seqs']
prec1_bootstraps, preck_bootstraps = np.zeros(B),  np.zeros(B)

# hit_matrix[i, j] is True when the prediction ranked j for basket i equals
# the basket's held-out item
hit_matrix = top_k_all[:test_size, :n_seqs] == missing_items_all[:test_size, None]
hit_at_1 = hit_matrix[:, 0]
hit_at_k = hit_matrix.any(axis=1)

for b in tqdm(range(B),position=0, leave=True):
  #create bootstrap
  idx_boot = np.random.choice(test_size, size=test_size, replace=True)

  # calculate bootstrap stats
  prec1_bootstraps[b] = (np.count_nonzero(hit_at_1[idx_boot]) / test_size)*100
  preck_bootstraps[b] = (np.count_nonzero(hit_at_k[idx_boot]) / test_size)*100

#calculate SEs
prec1_se = standardError(prec1_bootstraps)
preck_se = standardError(preck_bootstraps)
print('\n')
print(f'prec@1 se (%): {prec1_se}')
print(f'prec@6 se (%): {preck_se}')

100%|██████████| 200/200 [04:59<00:00,  1.50s/it]



prec@1 se (%): 0.0274015016598457
prec@6 se (%): 0.05109871341913614





In [None]:
#bootstrap SEs for adjusted precision@k
# Whether a basket is an adjusted hit depends only on that basket's own
# predictions, so the name lookups and similarity checks are done once per
# basket; every bootstrap replicate then only resamples the resulting flags.

def _adjusted_hit_flags(real_name, real_token, top_k_token, top_k_item, n_seqs, sim_threshold):
  """Return (hit_at_1, hit_at_k) for one basket under the adjusted criterion.

  real_name     -- name of the held-out product, or "NOT AVAILABLE"
  real_token    -- token id of the held-out product
  top_k_token   -- ranked predicted token ids of the basket
  top_k_item    -- names of the ranked predictions that have a name
  n_seqs        -- number of predictions that are considered (the k)
  sim_threshold -- Dice similarity from which two names count as similar
  """
  if real_name == "NOT AVAILABLE":
    # no name for the held-out product: fall back to exact token matching
    hit_at_1 = bool(real_token == top_k_token[0])
    hit_at_k = any(real_token == top_k_token[j] for j in range(n_seqs))
    return hit_at_1, hit_at_k

  # Keep up to n_seqs predicted names that are mutually dissimilar; with
  # n_seqs or fewer named candidates all of them are kept unfiltered.
  predict_names, ngrams = [], {}
  if len(top_k_item) <= n_seqs:
    predict_names = top_k_item
    ngrams = {w: ngram(item) for w, item in enumerate(top_k_item)}
  else:
    for item in top_k_item:
      if len(predict_names) == n_seqs:
        break
      ngram_item = ngram(item)
      if all(dice_coef(ngram_item, value) < sim_threshold for value in ngrams.values()):
        ngrams[len(predict_names)] = ngram_item
        predict_names.append(item)

  # the first kept name that is similar enough to the real name is the hit
  ngram_correct = ngram(real_name)
  for key, value in ngrams.items():
    if dice_coef(ngram_correct, value) >= sim_threshold:
      return key == 0, True
  return False, False


B = 200
np.random.seed(100)
prec1_bootstraps, preck_bootstraps = np.zeros(B),  np.zeros(B)

# the 20 best-ranked predictions of every basket are candidates
top_k_candidates = top_k_all[:test_size, 0:20]
top_k_names = getProductNames(top_k_candidates)
real_names = getProductNames_Real(missing_items_all[:test_size])

hit_at_1 = np.zeros(test_size, dtype=bool)
hit_at_k = np.zeros(test_size, dtype=bool)
for i in range(test_size):
  hit_at_1[i], hit_at_k[i] = _adjusted_hit_flags(
      real_names[i], missing_items_all[i], top_k_candidates[i], top_k_names[i],
      params['n_seqs'], params['ngram_sim'])

for b in tqdm(range(B),position=0, leave=True):
  idx_boot = np.random.choice(test_size, size=test_size, replace=True)
  prec1_bootstraps[b] = (np.count_nonzero(hit_at_1[idx_boot]) / test_size)*100
  preck_bootstraps[b] = (np.count_nonzero(hit_at_k[idx_boot]) / test_size)*100

#calculate SEs
prec1_se = standardError(prec1_bootstraps)
preck_se = standardError(preck_bootstraps)
print('\n')
print(f'adj. prec@1 se (%): {prec1_se}')
print(f'adj. prec@6 se (%): {preck_se}')

100%|██████████| 200/200 [23:17<00:00,  6.99s/it]



adj. prec@1 se (%): 0.1781470404031198
adj. prec@6 se (%): 0.2611987367586265





Code for the histograms: (normal) precision@k vs. basket size

In [None]:
#-------------------------------------------------------------------------------
# ~~~ TEST ~~~
#-------------------------------------------------------------------------------
# Sorts every scored basket into one of three lists:
#   top_1_list -- held-out item is the best-ranked prediction,
#   top_k_list -- held-out item is ranked 2..n_seqs,
#   top_n_list -- held-out item is not among the first n_seqs predictions.
# Each entry is [input_ids, attention_mask, top_k_items, item_num].

loop_test = notebook.tqdm(loader_test, position=0, leave=True, colour='yellow')
correct_val = 0   # baskets whose held-out item is among the first n_seqs predictions
num_specialtok = 5
item_num = -1     # running index of the basket over all batches
n_scored = 0      # number of baskets scored so far
top_1_list,top_k_list,top_n_list,top_JOE_list = [],[],[],[]  # top_JOE_list is not filled in this cell
for k, batch_test in enumerate(loop_test):
  batch_size = len(batch_test[0])
  n_scored += batch_size
  non_predict = tf.fill((batch_size,num_specialtok),0.0)

  # batches are tuples, so the fields are addressed by position
  incomplete_ids_test = batch_test[0]
  attention_mask_test = batch_test[1]
  labels = batch_test[2]
  missing_items_test = batch_test[3][:, 0]
  incomplete_mask_test = batch_test[4]

  outputBERT_logits = BERT.call(incomplete_ids_test, attention_mask=attention_mask_test, labels=labels,training=False).logits
  # keep only the logits at each basket's masked position
  logits = [outputBERT_logits[m,mask_ind,:] for m, mask_ind in enumerate(incomplete_mask_test.numpy())]
  outputBERT = tf.convert_to_tensor(logits)[:,num_specialtok:]
  outputBERT_softmax = tf.concat((non_predict, tf.nn.softmax(outputBERT)), axis=-1)

  # No tf.squeeze here: it would drop the batch axis of a single-basket batch.
  top_k_items = tf.nn.top_k(outputBERT_softmax, k=30, sorted=True)[1]

  for i in range(0,batch_size):
    item_num += 1
    entry = [incomplete_ids_test[i],attention_mask_test[i],top_k_items[i],item_num]
    if (top_k_items[i][0] == missing_items_test[i]):
      top_1_list.append(entry)
      correct_val += 1
      continue
    for j in range(1, params['n_seqs']):
      if (top_k_items[i][j] == missing_items_test[i]):
        top_k_list.append(entry)
        correct_val += 1
        break
    else:
      # no match within the first n_seqs predictions
      top_n_list.append(entry)

  precK = correct_val/n_scored
  loop_test.set_postfix(PrecisionAtK =precK, Correct=correct_val,  data="validation")

  0%|          | 0/1041 [00:00<?, ?it/s]

In [None]:
#print examples

def print_predictions(batch,i):
  """Print one stored prediction entry in readable form.

  `batch` is indexed as [token ids, <unused here>, predicted token ids, ID];
  `i` is a running number that is shown next to the ID.
  """
  token_ids, predicted_ids, ID = batch[0], batch[2], batch[3]

  # All tokens but the last are shown as context, the last one as the "real" item.
  # NOTE(review): if the entry stores the masked input row, its last token is
  # presumably a special token rather than the held-out product -- confirm.
  context_names = getProductNames(token_ids[0:-1])
  real_names = getProductNames(token_ids[-1])
  top_k_names = getProductNames(predicted_ids)

  report = (
      f'- {ID} - {i}',
      f'context: {context_names}',
      f'real name missing: {real_names}',
      f'top_k names: {top_k_names}',
      '\n',
  )
  for line in report:
    print(line)

# print predictions, interchange the list with the one containing the desired predictions (top_1_list, top_k_list,top_n_list)
# At most 500 examples are printed; slicing keeps this safe for shorter lists.
for i, prediction in enumerate(top_1_list[:500]):
 print_predictions(prediction,i)