**Empirical Analysis**


In [None]:
# Mount Google Drive (Colab only) so that the /content/drive/... paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Install packages**

In [None]:
!pip install transformers
!pip install tokenizers
!pip install tensorflow
!pip install tensorflow_addons
!pip install numba

# **Import packages**

In [None]:
import tensorflow as tf
from tensorflow_addons.optimizers import AdamW
from tensorflow.python.client import device_lib
import matplotlib.pyplot as plt
import warnings
from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.trainers import BpeTrainer
from transformers import PreTrainedTokenizerFast, BertForPreTraining
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from numba import cuda 
from transformers import pipeline, set_seed, GPT2Model, GPT2Config, GPT2Tokenizer, TFGPT2LMHeadModel, TFBertForSequenceClassification, AdamW, BertConfig, TFBertForMaskedLM
from sklearn.model_selection import train_test_split, ShuffleSplit, KFold
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import logging 
logging.basicConfig(level=logging.ERROR)
import random
from math import *
from tqdm import tqdm, notebook
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

# **Import and view data**

Importing and Showing Data

In [None]:
# Locations of the files this notebook reads, keyed by a short name.
# NOTE(review): 'results_preck', 'results_hyperparams' and 'taxonomy' are not referenced in the
# cells visible here — confirm whether they are still needed.
path_names = { 'train_data': "/content/drive/MyDrive/Seminar QM/Data Cleaned/data_train_cln.txt", 
    'test_data': "/content/drive/MyDrive/Seminar QM/Data/data_test.txt",
    'tokenizer_file' : "/content/drive/MyDrive/Seminar QM/Models/TF/Tokenizer/hamsbertje_wordlevel_cln.json",
    'GPT_pretrained_model' : "/content/drive/MyDrive/Seminar QM/Models/TF/220306 GPT pre-trained AH (REDDING)/GPT pre-trained",
    'results_preck': "results_precK_BERTbase",
    'results_hyperparams': "results_best-params_BERTbase",
    'taxonomy': "/content/drive/MyDrive/Seminar QM/Data/product_taxonomy_paths.csv",
    'productID_toName': "/content/drive/MyDrive/Seminar QM/Data/productIDs_names.txt"
}

# Tunable settings shared by the cells below.
params = {
    # max_length passed to the tokenizer in BasketDataset (longer baskets are truncated)
    'max_tokenized_length' : 60,
    # number of baskets per batch in the tf.data loaders
    'batch_size' : 24,
    # k used for the top-k predictions in the histogram cell
    'n_seqs' : 6,
    # trigram Dice-similarity threshold used by the adjusted precision
    'ngram_sim' : 0.8
}

In [None]:
#import from drive
# Each line of the text files becomes one row of the single "prod_series" column (read as str).
# Only the first 10,000 training rows are loaded (nrows=10000); the test file is read in full.
df_org = pd.read_csv(path_names['train_data'], names=["prod_series"], dtype=str, nrows=10000)
baskets = df_org['prod_series'].to_list()

df_test = pd.read_csv(path_names['test_data'], names=["prod_series"], dtype=str)
baskets_test = df_test['prod_series'].to_list()

# Seed Python's and TensorFlow's random generators. NumPy's generator is not seeded here;
# the bootstrap cells call np.random.seed themselves.
RANDOM_STATE = 123
random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

Unnamed: 0,prod_series
0,167085 484999 59670
1,237907 163780 55068 222510 210737 139096 13909...
2,104818 140717 130862 41172 3856 124148


In [None]:
# Expand every test basket into all of its prefixes of length >= 2
# (e.g. "a b c" -> ["a", "b"], ["a", "b", "c"]) for sequential evaluation.
basket_trial = []
for basket in df_test['prod_series']:
  items = basket.split()
  for end in range(2, len(items) + 1):
    basket_trial.append(items[:end])

# Turn each prefix back into a string; every product is preceded by a single space,
# so the strings start with a leading space.
baskets_trial = ["".join(" " + str(item) for item in prefix) for prefix in basket_trial]

df_trial = pd.DataFrame(baskets_trial)

Importing Tokenizer and Creating BasketDataset Class

In [None]:
# Load the previously trained tokenizer from its saved JSON file and declare the special tokens.
# Padding is applied on the left (padding_side = "left").
# NOTE(review): `truncation = "left"` does not look like a recognised PreTrainedTokenizerFast
# argument (the truncation side is normally set with `truncation_side`). If it is ignored,
# over-long baskets are presumably truncated on the right, which would drop the last item —
# the one the evaluation cells hold out as the target. Confirm against the installed
# transformers version before changing it, since the pre-trained model may depend on the
# current behaviour.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=path_names['tokenizer_file'], # You can load from the tokenizer file, alternatively
    eos_token= "</s>",
    bos_token= "<s>",
    unk_token= "<unk>",
    pad_token= "<pad>",
    mask_token= "<mask>",
    padding_side = "left",
    truncation = "left",
)

# Show the vocabulary size as a quick sanity check that the tokenizer loaded.
print(tokenizer.vocab_size)

class BasketDataset:
    """Tokenizes a list of basket strings once and stores the result as tensors.

    All baskets are encoded in a single tokenizer call (padded to a common length,
    truncated to params['max_tokenized_length'], no special tokens added). The
    resulting id matrix and attention mask are each wrapped in a one-element list,
    so callers read them as `dataset.input_ids[0]` / `dataset.attn_mask[0]`.
    """

    def __init__(self, baskets, tokenizer):
        self.tokenizer = tokenizer
        self.baskets = baskets
        self.encodings = self.tokenizer(
            self.baskets,
            add_special_tokens=False,
            padding=True,
            truncation=True,
            max_length=params['max_tokenized_length'],
        )
        # One tensor per list: the whole encoded data set.
        self.input_ids = [tf.constant(self.encodings['input_ids'])]
        self.attn_mask = [tf.constant(self.encodings['attention_mask'])]

    def __getitem__(self, idx):
        # Indexes the list of id tensors (index 0 is the full id matrix).
        return self.input_ids[idx]

    def __len__(self):
        # Number of stored id tensors, not the number of baskets.
        return len(self.input_ids)

26722


# ***GPT BASE CASE***

Check for a GPU and select the compute device

In [None]:
# Pick the compute device: the first GPU if TensorFlow can see one, otherwise the CPU.
gpu_devices = tf.config.list_physical_devices('GPU')
if gpu_devices:
    # Tell TensorFlow to use the GPU. "/GPU:0" is a valid TensorFlow device string;
    # "cuda" is PyTorch naming and fails as soon as the device context is entered.
    device = tf.device("/GPU:0")
    # Look the GPU up by device type instead of assuming it sits at index 1 of the device list.
    local_gpu_names = [d.name for d in device_lib.list_local_devices() if d.device_type == 'GPU']
    device_name = local_gpu_names[0] if local_gpu_names else gpu_devices[0].name
    print('There are %d GPU(s) available.' % len(gpu_devices))
    print('We will use the GPU:', device_name)
else:
    print('No GPU available, using the CPU instead.')
    device = tf.device("/CPU:0")
    device_name = device_lib.list_local_devices()[0].name

There are 1 GPU(s) available.
We will use the GPU: /device:GPU:0


Initialize pre-trained models

In [None]:
# Load the pre-trained GPT-2 language-model head from Drive; the evaluation cells below use
# its last-position logits to rank candidate products.
netG = TFGPT2LMHeadModel.from_pretrained(path_names['GPT_pretrained_model'])

In [None]:
import json
# Read the JSON text file that maps product IDs (as strings) to product names.
with open(path_names['productID_toName']) as f:
    dict_textfile = f.read()     
# productIDs_names is looked up by decoded token text in getProductNames / getProductNames_Real.
productIDs_names = json.loads(dict_textfile)

# Product IDs without a known name are silently dropped from the result.
def getProductNames(A):
  """Translate token ids into product names.

  A with rank <= 1 is decoded with the tokenizer, split on spaces, and every
  piece found in productIDs_names is replaced by its name (unknown pieces are
  skipped, so the result can be shorter than the input). A with higher rank is
  handled row by row and yields a list of such lists.
  """
  if tf.rank(A) > 1:
    # tokenizer.decode only handles rank <= 1, so recurse over the first axis
    return [getProductNames(A[row]) for row in range(len(A))]
  product_ids = tokenizer.decode(A).split(" ")
  return [productIDs_names[pid] for pid in product_ids if pid in productIDs_names]

# returns "NOT AVAILABLE" when product name not found
def getProductNames_Real(A):
  """Translate token ids into product names, keeping one entry per decoded id.

  Unlike getProductNames, ids without a known name are not dropped: they are
  reported as the placeholder "NOT AVAILABLE", so the output stays aligned with
  the decoded ids.

  A with rank <= 1 is decoded with the tokenizer and split on spaces; A with
  higher rank is handled row by row and yields a list of lists.
  """
  if (tf.rank(A) <= 1): # check if rank 1, otherwise tokenizer.decode doesn't work
    B = tokenizer.decode(A).split(" ")
    names = [productIDs_names.get(b, "NOT AVAILABLE") for b in B]
  else:
    # Recurse into this function (not getProductNames) so the placeholder is kept for every row.
    names = [getProductNames_Real(A[i]) for i in range(len(A))]
  return names

In [None]:
def dice_coef(set1, set2):
  """Dice similarity between two collections of n-grams.

  Computes 2 * |matches| / (len(set1) + len(set2)), where a match is an element
  of set1 that also occurs in set2. Duplicates in set1 are each counted, so the
  measure is not symmetric for inputs with repeated elements.

  Returns 0.0 when both inputs are empty (e.g. names made only of words shorter
  than three characters) instead of raising ZeroDivisionError.
  """
  total = len(set1) + len(set2)
  if total == 0:
    # nothing to compare: treat as "not similar"
    return 0.0
  count = sum(1 for i in set1 if i in set2)
  return 2*count/total

def ngram(product):
    """Return the character trigrams of every whitespace-separated word in `product`.

    Trigrams are listed word by word, left to right; words shorter than three
    characters contribute nothing.
    """
    return [
        word[start:start + 3]
        for word in product.split()
        for start in range(len(word) - 2)
    ]

def StandardError(bootstraps):
  """Bootstrap standard error: the sample standard deviation (divisor B-1) of the replicates."""
  n_boot = len(bootstraps)
  deviations = bootstraps - np.mean(bootstraps)
  sum_of_squares = sum(np.square(deviations))
  return np.sqrt(1/(n_boot-1) * sum_of_squares)

TEST

In [None]:
# `global` has no effect at notebook top level; dataset_test is a module-level name either way.
global dataset_test
baskets_dataset = baskets_test # insert baskets_trial for sequential results
# Tokenize the chosen baskets once; the test cell below reads input_ids[0] / attn_mask[0].
dataset_test = BasketDataset(baskets_dataset, tokenizer)

In [None]:
#-------------------------------------------------------------------------------
# ~~~ TEST ~~~ w/ bootstrap for stats
#-------------------------------------------------------------------------------
# Evaluates netG on the test baskets. The last token of every basket is held out and the
# model ranks the vocabulary given the remaining tokens. Metrics collected:
#   * precision@1 / precision@6 on exact token matches,
#   * adjusted precision@1 / @6, where near-duplicate product names (trigram Dice
#     similarity >= params['ngram_sim']) are merged before the first 6 distinct
#     predictions are compared with the true product name,
#   * mean percentile rank (MPR) of the true item.
# The top-30 predictions and the held-out items of every basket are stored in
# top_k_all / missing_items_all for the bootstrap cells below.

test_size = len(baskets_dataset)
num_specialtok = 5   # the first 5 vocabulary ids are excluded from prediction (special tokens)
top_1_count,top_k_count = 0,0
top_1_count_adj, top_k_count_adj = 0,0
output = {}          # test-set row -> [real name, predicted names] for adjusted hits
total_batches = 0    # running number of test baskets processed (not the number of batches)
pr_ls_test = []      # percentile rank of the true item, one entry per basket

test_basket = {}
test_basket['input_ids'] = dataset_test.input_ids[0]
test_basket['attention_mask'] = dataset_test.attn_mask[0]
loader_test = tf.data.Dataset.from_tensor_slices(test_basket).batch(params['batch_size'])   
loop_test = notebook.tqdm(loader_test, position=0, leave=True, colour='yellow')

#create top_k matrix for predictions in test set
top_k_all = np.zeros((test_size,30),dtype=np.int32)
missing_items_all = np.zeros((test_size),dtype=np.int32)
top_1_list,top_k_list,top_n_list = [],[],[]

for k, batch_test in enumerate(loop_test):
  batch_size = len(batch_test['input_ids'])
  # Row of the first basket of this batch within the test set. It has to come from the
  # running total: the final batch can be smaller, so k*batch_size would address the wrong rows.
  batch_start = total_batches
  total_batches += batch_size
  non_predict = tf.fill((batch_size,num_specialtok),0.0)
  incomplete_ids_test = batch_test['input_ids'][:,0:-1]
  incomplete_mask_test = batch_test['attention_mask'][:,0:-1]
  missing_items_test = batch_test['input_ids'][:,-1]
  outputG = netG.call(input_ids=incomplete_ids_test,
                      attention_mask=incomplete_mask_test,
                      training = False).logits[:,-1,num_specialtok:]

  #model should not predict special tokens, therefore, set softmax value at zero
  outputG_softmax = tf.concat((non_predict, tf.nn.softmax(outputG)), axis=-1)
  # indices already have shape (batch_size, 30); no squeeze, so a batch of one keeps its batch axis
  top_k = tf.nn.top_k(outputG_softmax, k=30, sorted=True)[1]
  top_k_items = top_k
  top_k_np = top_k_items.numpy()
  missing_np = missing_items_test.numpy()
  top_k_all[batch_start:batch_start+batch_size,:] = top_k_np
  missing_items_all[batch_start:batch_start+batch_size] = missing_np

  # calculate (normal) prec@k
  for i in range(0,batch_size):
    top_n_list.append([batch_test['input_ids'][i],batch_test['attention_mask'][i],top_k_items[i]])
    top_1_count += int(top_k_np[i][0] == missing_np[i])
    if missing_np[i] in top_k_np[i][:6]:
      top_k_count += 1

  # calculate adj. prec@k
  top_k_names = getProductNames(top_k_items)
  real_names = getProductNames_Real(missing_items_test)
  for i in range(batch_size):
    real_name = real_names[i]
    top_k_item = top_k_names[i]
    if real_name == "NOT AVAILABLE":
      # no product name to compare: fall back to exact token matching
      if missing_np[i] == top_k_np[i][0]:
        top_1_count_adj +=1
      if missing_np[i] in top_k_np[i][:6]:
        top_k_count_adj += 1
    else:
      predict_names, ngrams = [], dict()
      ngram_correct = ngram(real_name)
      if len(top_k_item)<=6:
        # 6 or fewer named candidates: keep them all, no de-duplication
        predict_names = top_k_item
        ngrams = {w : ngram(item) for w, item in enumerate(predict_names)}
      else:
        # walk down the ranking and keep a candidate only if it is not a near-duplicate
        # of a candidate already kept; stop once 6 distinct names are collected
        for item in top_k_item:
          if len(predict_names)==6:
            break
          ngram_item = ngram(item)
          if all(dice_coef(ngram_item, value)<params['ngram_sim'] for value in ngrams.values()):
            ngrams[len(predict_names)] = ngram_item
            predict_names.append(item)

      # a hit is the first kept candidate whose name is similar enough to the real name;
      # key 0 is the top-ranked candidate
      for key, value in ngrams.items():
        ngram_sim = dice_coef(ngram_correct, value)
        if ngram_sim >= params['ngram_sim']:
          output_want = [real_name] + predict_names
          output[batch_start+i] = output_want
          top_k_count_adj+=1
          top_k_list.append([batch_test['input_ids'][i],batch_test['attention_mask'][i],top_k_items[i]])
          if key==0:
            top_1_count_adj+=1
            top_1_list.append([batch_test['input_ids'][i],batch_test['attention_mask'][i],top_k_items[i]])
          break
  
  # percentile rank: share of the vocabulary with a strictly lower probability than the true item
  for i in range(0,batch_size):
    pr = tf.math.reduce_sum(tf.where(tf.math.less(outputG_softmax[i,:], outputG_softmax[i,missing_np[i]]), 1.0, 0.0)).numpy()/tokenizer.vocab_size
    pr_ls_test.append(pr)
  precK_test = top_k_count/(total_batches)

  mpr_test = np.array(pr_ls_test).sum()/(total_batches) 

  loop_test.set_postfix(PrecisionAtK =round(precK_test,4), Correct=top_k_count,  data="Test", MPR = mpr_test)

#print metrics
mpr_test = np.array(pr_ls_test).sum()/test_size
prec1 = (top_1_count/test_size) * 100
prec6 = (top_k_count/test_size) * 100
prec1_adj = (top_1_count_adj/test_size) * 100
prec6_adj = ((top_k_count_adj)/test_size) * 100
print(f"   Mean Percentile Rank: {mpr_test}")
print(f"     precision at 1 (%): {prec1}")
print(f"     precision at 6 (%): {prec6}")
print(f"adj. precision at 1 (%): {prec1_adj}")
print(f"adj. precision at 6 (%): {prec6_adj}")

  0%|          | 0/1042 [00:00<?, ?it/s]

   Mean Percentile Rank: 0.9146133093284213
     precision at 1 (%): 4.708188327533101
     precision at 6 (%): 12.81251250050002
adj. precision at 1 (%): 10.78043121724869
adj. precision at 6 (%): 23.084923396935878


In [None]:
#bootstrap SEs for (normal) precision@
# Whether a basket is a hit depends only on its own stored predictions, so the hit flags are
# computed once and every bootstrap replicate just re-samples them. This gives the same
# replicates as recounting inside the loop, and it no longer overwrites the
# top_1_count / top_k_count totals of the test cell.

B = 200
np.random.seed(100)
prec1_bootstraps, preck_bootstraps = np.zeros(B),  np.zeros(B)

# per-basket hit flags: true item is the top prediction / is among the top 6 predictions
hit_at_1 = top_k_all[:,0] == missing_items_all
hit_at_k = (top_k_all[:,0:6] == missing_items_all[:,None]).any(axis=1)

population = np.arange(test_size)
for b in tqdm(range(B),position=0, leave=True):
  #create bootstrap: draw test_size basket indices with replacement
  idx_boot = np.random.choice(population, size=test_size, replace=True)

  # calculate bootstrap stats
  prec1_bootstraps[b] = (hit_at_1[idx_boot].sum() / test_size)*100
  preck_bootstraps[b] = (hit_at_k[idx_boot].sum() / test_size)*100
  
#calculate SEs
prec1_se = StandardError(prec1_bootstraps)
preck_se = StandardError(preck_bootstraps)
print('\n')
print(f'prec@1 se (%): {prec1_se}')
print(f'prec@6 se (%): {preck_se}')

100%|██████████| 200/200 [00:15<00:00, 12.74it/s]



prec@1 se (%): 0.11697136483030471
prec@6 se (%): 0.2176926111264603





In [None]:
#bootstrap SEs for adjusted precision@
# The adjusted hit of a basket depends only on that basket's stored predictions and held-out
# item. The names are therefore decoded and compared once for the whole test set, and each
# bootstrap replicate only re-samples the resulting hit flags (previously everything was
# recomputed for all 200 replicates). The `output` dict of the test cell is no longer modified.
# NOTE(review): only the top 20 stored candidates are used here, while the test cell
# de-duplicates over the top 30 — confirm which is intended.

B = 200
np.random.seed(100)
prec1_bootstraps, preck_bootstraps = np.zeros(B),  np.zeros(B)

def _adjusted_hit(real_name, real_token, candidate_names, candidate_tokens, k=6):
  """Return (hit@1, hit@k) for one basket under the adjusted-precision rule.

  real_name        -- product name of the held-out item, or "NOT AVAILABLE"
  real_token       -- token id of the held-out item
  candidate_names  -- names of the predicted products, best first (unnamed ones already dropped)
  candidate_tokens -- predicted token ids, best first
  k                -- number of distinct predictions that count towards hit@k
  """
  if real_name == "NOT AVAILABLE":
    # no product name to compare: fall back to exact token matching
    return bool(real_token == candidate_tokens[0]), bool(real_token in candidate_tokens[:k])

  threshold = params['ngram_sim']
  if len(candidate_names) <= k:
    # k or fewer named candidates: keep them all, no de-duplication
    kept = [ngram(name) for name in candidate_names]
  else:
    # keep a candidate only if it is not a near-duplicate of one already kept
    kept = []
    for name in candidate_names:
      if len(kept) == k:
        break
      name_ngrams = ngram(name)
      if all(dice_coef(name_ngrams, other) < threshold for other in kept):
        kept.append(name_ngrams)

  target = ngram(real_name)
  for rank, name_ngrams in enumerate(kept):
    if dice_coef(target, name_ngrams) >= threshold:
      # first sufficiently similar prediction decides; rank 0 is also a hit@1
      return rank == 0, True
  return False, False

# decode names once for the whole test set
top_k_rows = [top_k_all[i,0:20] for i in range(test_size)]
missing_items = [missing_items_all[i] for i in range(test_size)]
top_k_names = getProductNames(top_k_rows)
real_names = getProductNames_Real(missing_items)

hit_flags = [_adjusted_hit(real_names[i], missing_items[i], top_k_names[i], top_k_rows[i]) for i in range(test_size)]
hit_at_1 = np.array([flags[0] for flags in hit_flags], dtype=bool)
hit_at_k = np.array([flags[1] for flags in hit_flags], dtype=bool)

population = np.arange(test_size)
for b in tqdm(range(B),position=0, leave=True):
  # draw test_size basket indices with replacement and average their hit flags
  idx_boot = np.random.choice(population, size=test_size, replace=True)
  prec1_bootstraps[b] = (hit_at_1[idx_boot].sum() / test_size)*100
  preck_bootstraps[b] = (hit_at_k[idx_boot].sum() / test_size)*100
  
#calculate SEs
prec1_se = StandardError(prec1_bootstraps)
preck_se = StandardError(preck_bootstraps)
print('\n')
print(f'adj. prec@1 se (%): {prec1_se}')
print(f'adj. prec@6 se (%): {preck_se}')

100%|██████████| 200/200 [22:37<00:00,  6.79s/it]



adj. prec@1 se (%): 0.19158802570849098
adj. prec@6 se (%): 0.27646718458756797





Code for histograms: (normal) precision@k vs. basket size

In [None]:
#-------------------------------------------------------------------------------
# ~~~ TEST ~~~
#-------------------------------------------------------------------------------
# Runs the model over the test baskets again and sorts every basket into exactly one list:
#   top_1_list -- the held-out item is the top prediction
#   top_k_list -- the held-out item is among predictions 2..params['n_seqs']
#   top_n_list -- the held-out item is not among the top params['n_seqs'] predictions
# Each record is [input_ids, attention_mask, top-k token ids, running basket number].
# NOTE(review): unlike the first test cell, special tokens are not masked out of the softmax here.

dataset_test = BasketDataset(baskets_test, tokenizer)

test_basket = {}
test_basket['input_ids'] = dataset_test.input_ids[0]
test_basket['attention_mask'] = dataset_test.attn_mask[0]

top_1_list,top_k_list,top_n_list = [],[],[]

loader_test = tf.data.Dataset.from_tensor_slices(test_basket).batch(params['batch_size'])   
loop_test = notebook.tqdm(loader_test, position=0, leave=True, colour='yellow')
correct_val, item_num = 0,-1
seen = 0   # baskets processed so far; the final batch can be smaller than the others

for k, batch_test in enumerate(loop_test):
  batch_size = len(batch_test['input_ids'])
  seen += batch_size
  incomplete_ids_test = batch_test['input_ids'][:,0:-1]
  incomplete_mask_test = batch_test['attention_mask'][:,0:-1]
  missing_items_test = batch_test['input_ids'][:,-1]
  outputG = netG.call(input_ids=incomplete_ids_test,
                      attention_mask=incomplete_mask_test)
  outputG_softmax = tf.nn.softmax(outputG.logits[:,-1,:])
  # indices already have shape (batch_size, n_seqs); no squeeze, so a batch of one keeps its batch axis
  top_k = tf.nn.top_k(outputG_softmax, k=params['n_seqs'], sorted=True)[1]
  top_k_items = top_k
  top_k_np = top_k_items.numpy()
  missing_np = missing_items_test.numpy()

  for i in range(0,batch_size):
    item_num += 1
    record = [batch_test['input_ids'][i],batch_test['attention_mask'][i],top_k_items[i],item_num]
    if top_k_np[i][0] == missing_np[i]:
      top_1_list.append(record)
      correct_val += 1
    elif missing_np[i] in top_k_np[i][1:]:
      top_k_list.append(record)
      correct_val += 1
    else:
      top_n_list.append(record)

  # running precision@k over the baskets seen so far
  precK = correct_val/seen
  loop_test.set_postfix(PrecisionAtK =precK, Correct=correct_val,  data="validation")

  0%|          | 0/1042 [00:00<?, ?it/s]

In [None]:
# Investigate basket length vs out-of-sample performance
# Basket length = number of non-padding tokens (sum of the attention mask, record entry 1).
# top_1_list / top_k_list / top_n_list from the previous cell are mutually exclusive:
# hit at rank 1, hit at ranks 2..k, and no hit.

def _length_counts(records):
  """Map basket length (1..100) to the number of records with that length."""
  lengths = [int(tf.reduce_sum(record[1])) for record in records]
  return { i: lengths.count(i) for i in range(1,101)}

def _binned_share(hits, totals, i):
  """Hit share for length i, pooled with its bin partner (bins of width 2: 1-2, 3-4, ...)."""
  partner = i+1 if ((i+1) % 2) == 0 else i-1
  denom = totals[i] + totals[partner]
  # lengths that do not occur in the test set get a share of 0 instead of dividing by zero
  return (hits[i] + hits[partner]) / denom if denom else 0.0

#create dictionaries with basket length and frequency for prec@1, prec@6 and whole test set
dict_len_1 = _length_counts(top_1_list)
dict_len_k = _length_counts(top_k_list)
dict_len_n = _length_counts(top_n_list)

# normalise the results as for total occurrence in dataset (there are a lot more small baskets in the data)
# use binwidth h = 2 as to smooth results
# the whole test set is the union of all three lists, so top-1 hits are part of the total
dict_total = { i: dict_len_1[i] + dict_len_k[i] + dict_len_n[i] for i in range(1,61)}
print(dict_total)
dict_top1_relative = { i: _binned_share(dict_len_1, dict_total, i) for i in range(2,61)}
print(dict_top1_relative)
# precision@k counts every hit within the top k, i.e. rank-1 hits as well as ranks 2..k
dict_hits_k = { i: dict_len_1[i] + dict_len_k[i] for i in range(1,61)}
dict_topk_relative = { i: _binned_share(dict_hits_k, dict_total, i) for i in range(2,61)}

# plot results
plt.figure(0)
a = plt.bar(dict_top1_relative.keys(), dict_top1_relative.values(), width=0.5, color='tab:blue', label='prec @ 1')
plt.xlabel('basket length')
plt.ylabel('prec @ 1')
plt.figure(1)
plt.bar(dict_topk_relative.keys(), dict_topk_relative.values(), width=0.5, color='tab:orange', label='prec @ k')
plt.xlabel('basket length')
plt.ylabel('prec @ k')
plt.figure(2)
plt.bar(dict_total.keys(), dict_total.values(), width=0.5, color='tab:blue')
plt.xlabel('basket length')
plt.ylabel('number of test baskets')

In [None]:
#print examples

def print_predictions(batch,i):
  """Print one evaluated basket in readable form.

  batch -- a record as collected in the histogram cell: entry 0 holds the basket's token
           ids (last one is the held-out item), entry 2 the predicted token ids and
           entry 3 the basket's running number
  i     -- position of the record in the list being printed
  """
  basket_ids = batch[0]
  basket_no = batch[3]
  names_context = getProductNames(basket_ids[0:-1])
  names_real = getProductNames(basket_ids[-1])
  names_top_k = getProductNames(batch[2])
  print(f'- {basket_no} - {i}')
  print(f'context: {names_context}')
  print(f'real name missing: {names_real}')
  print(f'top_k names: {names_top_k}')
  print('\n')


# print predictions, interchange the list with the one containing the desired predictions (top_1_list, top_k_list,top_n_list)
# at most 500 examples are printed; fewer when the list is shorter
for i in range(0,min(500, len(top_1_list))):
 print_predictions(top_1_list[i],i)