In [None]:
import tensorflow as tf
from tensorflow.python.client import device_lib

import matplotlib.pyplot as plt
import warnings
from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.trainers import BpeTrainer
from transformers import PreTrainedTokenizerFast, BertForPreTraining
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from transformers import pipeline, set_seed, GPT2Model, GPT2Config, GPT2Tokenizer, TFGPT2LMHeadModel, TFBertForSequenceClassification, AdamW, BertConfig
import transformers


from sklearn.model_selection import train_test_split, ShuffleSplit, KFold
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import logging 
logging.basicConfig(level=logging.ERROR)
import random
from math import *
from tqdm import tqdm, notebook
import pandas as pd
import numpy as np

import os

from tensorflow.python.ops.numpy_ops import np_config
np_config.enable_numpy_behavior()

# **Import and view data**

In [None]:
# File-system locations used by the rest of the notebook. The values below are
# placeholder strings -- presumably to be replaced with real paths before running.
TRAIN_PATH = 'TRAIN_PATH'  # CSV of training baskets, read with pd.read_csv
TEST_PATH = 'TEST_PATH'  # CSV of test baskets, read with pd.read_csv
TOKENIZER_PATH = 'TOKENIZER_PATH'  # tokenizer file passed to PreTrainedTokenizerFast
MODEL_PATH = 'MODEL_PATH'  # checkpoint given to TFGPT2LMHeadModel.from_pretrained
FNL_MODEL_PATH = 'FNL_MODEL_PATH'  # destination for save_pretrained after the full training run

In [None]:
# Seed Python's and TensorFlow's random generators before anything stochastic runs.
RANDOM_STATE = 123
random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# Read at most 100000 rows of the training CSV into a single string column
# and keep the raw basket strings as a plain Python list.
df_org = pd.read_csv(
    TRAIN_PATH,
    names=["prod_series"],
    dtype=str,
    nrows=100000,
)
baskets = df_org["prod_series"].to_list()

# Shuffled 3-fold splitter reused by the hyper-parameter search objective.
kfold = KFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

In [None]:
# Test baskets: same single-column layout as the training file, no row limit.
df_test = pd.read_csv(
    TEST_PATH,
    names=["prod_series"],
    dtype=str,
)
baskets_test = df_test["prod_series"].to_list()

In [None]:
# Load the tokenizer from the saved tokenizer file and register the special tokens.
# Padding is applied on the left, so the last column of every encoded row is a real
# token (the notebook treats that last token as the item to predict).
# `truncation_side="left"` replaces the original `truncation='left'`, which is not an
# init option of the tokenizer and was silently ignored (truncation stayed on the right).
# NOTE(review): this changes which tokens survive for baskets longer than max_length --
# confirm it matches how MODEL_PATH was pre-trained.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=TOKENIZER_PATH, # You can load from the tokenizer file
    eos_token="</s>",
    bos_token="<s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
    padding_side="left",
    truncation_side="left",
)

class BasketDataset:
    """Tokenised view of a list of basket strings.

    All baskets are encoded in one tokenizer call (truncated to `max_length`,
    padded to a common length, no special tokens added). The resulting id and
    attention-mask matrices are stored as tensors wrapped in one-element lists,
    so callers read the full matrices via `input_ids[0]` and `attn_mask[0]`.

    Args:
        baskets: list of basket strings to encode.
        tokenizer: callable tokenizer returning 'input_ids' and 'attention_mask'.
        max_length: maximum number of tokens kept per basket (default 60, the
            value that was previously hard-coded).
    """

    def __init__(self, baskets, tokenizer, max_length=60):
        self.tokenizer = tokenizer
        self.baskets = baskets
        # Raw tokenizer output; other cells read encodings['input_ids'] / ['attention_mask'].
        self.encodings = self.tokenizer(
            self.baskets,
            truncation=True,
            max_length=max_length,
            add_special_tokens=False,
            padding=True,
        )
        # Complete basket encoding, kept as a single tensor inside a list.
        self.input_ids = [tf.constant(self.encodings['input_ids'])]
        self.attn_mask = [tf.constant(self.encodings['attention_mask'])]

    def __getitem__(self, idx):
        """Return element `idx` of the wrapper list (index 0 is the full id matrix)."""
        return self.input_ids[idx]

    def __len__(self):
        """Return the length of the wrapper list (1 after construction), not the basket count."""
        return len(self.input_ids)

In [None]:
# Pick the device TensorFlow should use: the first GPU when one is visible, else the CPU.
# Device names come from TensorFlow's logical device list rather than from a fixed
# position in device_lib.list_local_devices(), and `device` is built from a valid
# TensorFlow device name (the original "cuda"/"cpu" strings are PyTorch-style).
physical_gpus = tf.config.list_physical_devices('GPU')
if physical_gpus:
    device_name = tf.config.list_logical_devices('GPU')[0].name
    print('There are %d GPU(s) available.' % len(physical_gpus))
    print('We will use the GPU:', device_name)
else:
    print('No GPU available, using the CPU instead.')
    device_name = tf.config.list_logical_devices('CPU')[0].name

# Context manager for the chosen device; use as `with device: ...`.
device = tf.device(device_name)

In [None]:
#Hyperparams
# Search space handed to hyperopt: the generator learning rate `lrG` is drawn
# uniformly between 1e-6 and 1e-4.
bayes_parameters = {'lrG' : hp.uniform('lrG', 1e-6, 1e-4)}

# Fixed settings shared by the cross-validation search and the final training run.
params = {
    'max_tokenized_length' : 60,  # not referenced by the other visible cells
    'batch_size' : 12,  # batch size for every tf.data loader
    'n_seqs' : 6,  # k used for the top-k hit check (precision@k)
    'epochs' : 1  # number of passes over each training fold
}

In [None]:
# Tokenise the training baskets once; the cross-validation objective slices this
# dataset by fold. (A `global` statement at module level has no effect, so it was dropped.)
dataset = BasketDataset(baskets, tokenizer)

In [None]:
# Module-level objects shared by the training functions. The former `global`
# statements were removed: at module level they are no-ops, and two of them named
# variables (real_label, fake_label) that are never assigned in the visible notebook.

# Cross-entropy over probabilities (the training code applies softmax itself).
criterion = tf.keras.losses.CategoricalCrossentropy(from_logits=False, axis=-1)

# Ids of the eos / bos / pad tokens, one single-element list per token.
bad_words = [[tokenizer.eos_token_id], [tokenizer.bos_token_id], [tokenizer.pad_token_id]]

# Pre-trained model instance. NOTE(review): the cross-validation objective loads its
# own copy per fold, so this instance looks unused in the visible cells -- confirm
# before removing.
netG = TFGPT2LMHeadModel.from_pretrained(MODEL_PATH)

In [None]:
def train_step(batch, generator_optimizer, netG):
    """Run one optimisation step that teaches `netG` to predict each basket's last token.

    Args:
        batch: pair `(input_ids, attention_mask)`; the last column of `input_ids`
            is held out as the target and the remaining columns are the model input.
        generator_optimizer: optimizer whose `apply_gradients` updates `netG`.
        netG: language model returning an object with a `.logits` attribute.

    Returns:
        The scalar cross-entropy loss tensor for this batch.
    """
    input_ids, attention_mask = batch[0], batch[1]
    incomplete_ids = input_ids[:, :-1]
    incomplete_mask = attention_mask[:, :-1]
    missing_items = input_ids[:, -1]

    with tf.GradientTape() as gen_tape:
        # Trainable variables are watched by the tape automatically.
        outputG = netG.call(input_ids=incomplete_ids,
                            attention_mask=incomplete_mask,
                            training=True)
        last_logits = outputG.logits[:, -1, :]

        # The first three vocabulary ids are excluded from prediction (presumably the
        # special tokens): softmax runs over the remaining ids only and zero
        # probabilities are put back in front so column indices equal token ids.
        item_probs = tf.nn.softmax(last_logits[:, 3:])
        non_predict = tf.zeros_like(last_logits[:, :3])
        outputG_softmax = tf.concat((non_predict, item_probs), axis=-1)

        # The depth comes from the static vocabulary dimension, so this also works
        # for a batch with a single row.
        trueLabel_onehot = tf.one_hot(missing_items, outputG_softmax.shape[-1])

        # Cross-entropy between the one-hot target and the predicted distribution.
        errG = criterion(trueLabel_onehot, outputG_softmax)

    # Gradient computation and the update happen outside the tape context so the
    # tape does not record them.
    gradients_of_generator = gen_tape.gradient(errG, netG.trainable_variables)
    generator_optimizer.apply_gradients(zip(gradients_of_generator, netG.trainable_variables))

    return errG

In [None]:
def gptbase_finetuning(space):
    """Hyperopt objective: cross-validated precision@k of a fine-tuned model.

    For every fold of `kfold`, a fresh model is loaded from MODEL_PATH, trained for
    `params['epochs']` epochs with Adam at learning rate `space['lrG']`, and scored
    on the held-out fold by checking whether the true last token of each basket is
    among the `params['n_seqs']` most probable tokens.

    Args:
        space: dict with key 'lrG' holding the learning rate to evaluate.

    Returns:
        dict with 'loss' (negative mean precision@k over all folds and epochs, so
        that minimising it maximises precision) and 'status' (STATUS_OK).
    """
    lrG = space['lrG']
    num_epochs = params['epochs']
    batch_size = params['batch_size']
    top_n = params['n_seqs']
    precK_ls = []

    all_ids = dataset.input_ids[0]
    all_mask = dataset.attn_mask[0]

    for train_index, val_index in kfold.split(all_ids):
        losses = []
        train_rows = tf.constant(train_index)
        val_rows = tf.constant(val_index)
        train_basket = {
            'input_ids': tf.gather(all_ids, indices=train_rows),
            'attention_mask': tf.gather(all_mask, indices=train_rows),
        }
        val_basket = {
            'input_ids': tf.gather(all_ids, indices=val_rows),
            'attention_mask': tf.gather(all_mask, indices=val_rows),
        }

        print(f'Training data: {len(train_index)}')
        print(f'Validation data: {len(val_index)}')

        # Fresh model and optimizer for every fold.
        netG = TFGPT2LMHeadModel.from_pretrained(MODEL_PATH)
        generator_optimizer = tf.keras.optimizers.Adam(lrG)
        # Traced lazily on the first batch of the fold and reused for later epochs.
        train_step_fn = None

        for epoch in range(num_epochs):
            #-----------------------------------------------------------------------
            # ~~~ Training ~~~
            #-----------------------------------------------------------------------
            loader_train = tf.data.Dataset.from_tensor_slices(
                (train_basket['input_ids'], train_basket['attention_mask'])
            ).batch(batch_size, drop_remainder=True)
            loop_train = notebook.tqdm(loader_train, position=0, leave=True, colour='green')
            for batch in loop_train:
                if train_step_fn is None:
                    train_step_fn = tf.function(train_step).get_concrete_function(batch,
                                                                                  generator_optimizer,
                                                                                  netG)

                loss = train_step_fn(batch)
                losses.append(loss.numpy())
                # Running mean over every batch seen so far in this fold. The original
                # divided by the zero-based batch index (zero on the first batch).
                avg_loss = sum(losses) / len(losses)

                loop_train.set_postfix(loss_G=avg_loss, data="train", epochs=num_epochs, lrG=lrG)

            #-----------------------------------------------------------------------
            # ~~~ Validation ~~~
            #-----------------------------------------------------------------------
            loader_val = tf.data.Dataset.from_tensor_slices(
                (val_basket['input_ids'], val_basket['attention_mask'])
            ).batch(batch_size, drop_remainder=True)
            loop_val = notebook.tqdm(loader_val, position=0, leave=True, colour='yellow')

            correct_val = 0
            seen_val = 0
            # Stays 0.0 when the fold is too small to yield a full validation batch.
            precK_val = 0.0
            for batch_val in loop_val:
                # Hold out the last token of every basket as the item to predict.
                incomplete_ids_val = batch_val[0][:, :-1]
                incomplete_mask_val = batch_val[1][:, :-1]
                missing_items_val = batch_val[0][:, -1]

                outputG = netG.call(input_ids=incomplete_ids_val,
                                    attention_mask=incomplete_mask_val,
                                    training=False)
                last_logits = outputG.logits[:, -1, :]
                # Same masking as in training: the first three ids get probability zero.
                outputG_softmax = tf.concat(
                    (tf.zeros_like(last_logits[:, :3]), tf.nn.softmax(last_logits[:, 3:])),
                    axis=-1,
                )
                top_k = tf.nn.top_k(outputG_softmax, k=top_n, sorted=True)[1]

                # A basket is a hit when its true item appears anywhere in its top-k row.
                targets = tf.expand_dims(tf.cast(missing_items_val, top_k.dtype), axis=-1)
                hits = tf.reduce_any(tf.equal(top_k, targets), axis=1)
                correct_val += int(tf.reduce_sum(tf.cast(hits, tf.int32)).numpy())
                seen_val += int(hits.shape[0])

                precK_val = correct_val / seen_val

                loop_val.set_postfix(PrecisionAtK =round(precK_val,4), Correct=correct_val,  data="validation")

            precK_ls.append(precK_val)

    precK_avg = np.mean(np.array(precK_ls))

    return {'loss': -precK_avg, 'status': STATUS_OK}

In [None]:
# Only show errors from the transformers library while the search runs.
transformers.utils.logging.set_verbosity_error()

# Run Bayesian Optimization
trials = Trials()
best_params, lossG = {}, {}
# NOTE(review): newer hyperopt releases may expect a numpy Generator
# (np.random.default_rng) for `rstate` -- confirm against the installed version.
best_params = fmin(
    fn=gptbase_finetuning,
    space=bayes_parameters,
    algo=tpe.suggest,
    max_evals=10,
    trials=trials,
    rstate=np.random.RandomState(RANDOM_STATE),
    return_argmin=False,  # Return categorical vars as strings instead of indices
)

#---------------------------------------------------------------------------
# ~~ save best results ~~
#---------------------------------------------------------------------------
# The objective returns negative precision@k as its loss, so flip the sign back.
precisionk = [-trial['result']['loss'] for trial in trials.trials]
best_idx = np.argmax(precisionk)

In [None]:
import pickle

# Dump metrics to pickle files: the per-trial precision@k values and the best
# hyper-parameters found by the search.
for target_file, payload in (("results_precK", precisionk),
                             ("results_best-params", best_params)):
    with open(target_file, "wb") as handle:   #Pickling
        pickle.dump(payload, handle)

In [None]:
# Read back the two result files written by the search cell.
import pickle

with open("results_precK", "rb") as precision_file:   # Unpickling
    precK_load = pickle.load(precision_file)

with open("results_best-params", "rb") as params_file:   # Unpickling
    best_params_load = pickle.load(params_file)

In [None]:
# Display the hyper-parameters restored from the pickle file.
best_params_load

## **Train Full**

In [None]:
# Number of baskets read from the test CSV.
len(baskets_test)

In [None]:
# Tokenise the training and the test baskets with the shared tokenizer.
dataset_train, dataset_test = (
    BasketDataset(basket_list, tokenizer)
    for basket_list in (baskets, baskets_test)
)

In [None]:
# Settings for the final training run: the learning rate comes from the restored
# search result, the epoch count from the shared `params` dict.
lrG_full = best_params_load['lrG']
num_epochs = params['epochs']

In [None]:
# Fresh copy of the pre-trained model for the final training run.
netG_full = TFGPT2LMHeadModel.from_pretrained(MODEL_PATH)

# Adam optimizer at the selected learning rate, and a cross-entropy loss that
# expects probabilities rather than logits (from_logits=False).
generator_optimizer = tf.keras.optimizers.Adam(lrG_full)
criterion = tf.keras.losses.CategoricalCrossentropy(from_logits=False, axis=-1)

In [None]:
# Final training pass over all training baskets with the selected learning rate.
# Running metrics shown in the progress bar:
#   PrecisionAtK - share of baskets whose true last item is in the top-k predictions
#   Size         - mean number of unmasked input tokens among the baskets that were hit
#   MPR          - mean share of vocabulary entries scored below the true item
correct_train = 0
basket_size_train = 0
pr_ls_train = []

train_loader_full = tf.data.Dataset.from_tensor_slices(
    (dataset_train.encodings['input_ids'], dataset_train.encodings['attention_mask'])
).batch(params['batch_size'], drop_remainder=True)
# Built here and consumed by the test cell.
test_loader_full = tf.data.Dataset.from_tensor_slices(
    (dataset_test.encodings['input_ids'], dataset_test.encodings['attention_mask'])
).batch(params['batch_size'], drop_remainder=True)

train_loop = notebook.tqdm(train_loader_full, position=0, leave=True, colour='green')
for k, batch in enumerate(train_loop):
    b_size = batch[0].get_shape().as_list()[0]
    # Hold out the last token of every basket as the item to predict.
    incomplete_ids = batch[0][:, :-1]
    incomplete_mask = batch[1][:, :-1]
    missing_items = batch[0][:, -1:]
    # Zero probabilities for the first three vocabulary ids, which are never predicted.
    non_predict = tf.fill((b_size, 3), 0.0)

    with tf.GradientTape() as gen_tape:
        # training=True matches train_step used during the hyper-parameter search;
        # the original call left it at the default.
        outputG = netG_full.call(input_ids=incomplete_ids,
                                 attention_mask=incomplete_mask,
                                 training=True)
        outputG_softmax = tf.nn.softmax(outputG.logits[:, -1, 3:])
        outputG_softmax = tf.concat((non_predict, outputG_softmax), axis=-1)
        trueLabel_onehot = tf.squeeze(
            tf.one_hot(missing_items, outputG_softmax.shape[-1]), axis=1
        )

        # Cross-entropy between the one-hot target and the predicted distribution.
        errG = criterion(trueLabel_onehot, outputG_softmax)

    # Update the model outside the tape context.
    gradients_of_generator = gen_tape.gradient(errG, netG_full.trainable_variables)
    generator_optimizer.apply_gradients(zip(gradients_of_generator, netG_full.trainable_variables))

    # ---- running metrics, computed from the forward pass above ----
    top_k = tf.nn.top_k(outputG_softmax, k=params['n_seqs'], sorted=False)[1]
    targets = tf.cast(missing_items[:, 0], top_k.dtype)

    # A basket is a hit when its true item appears anywhere in its top-k row.
    hits = tf.reduce_any(tf.equal(top_k, tf.expand_dims(targets, axis=-1)), axis=1)
    correct_train += int(tf.reduce_sum(tf.cast(hits, tf.int32)).numpy())
    basket_lengths = tf.reduce_sum(incomplete_mask, axis=1)
    basket_size_train += int(tf.reduce_sum(tf.boolean_mask(basket_lengths, hits)).numpy())

    # Percentile rank of the true item, computed the same way as in the test cell.
    # The original never filled pr_ls_train, so MPR was always reported as 0.
    target_prob = tf.gather(outputG_softmax, targets, batch_dims=1)
    below_target = tf.math.less(outputG_softmax, tf.expand_dims(target_prob, axis=-1))
    pr_batch = tf.reduce_sum(tf.cast(below_target, tf.float32), axis=1) / tokenizer.vocab_size
    pr_ls_train.extend(pr_batch.numpy().tolist())

    seen_train = (k + 1) * b_size
    mpr_train = np.array(pr_ls_train).sum() / seen_train
    precK_train = correct_train / seen_train
    # Guard the division until the first hit occurs, as the test cell does.
    avg_basket_train = basket_size_train / correct_train if correct_train else 0

    train_loop.set_postfix(PrecisionAtK =round(precK_train,4), loss_G=errG.numpy(), Correct=correct_train,
                           data="Train (full)", epochs=num_epochs, lrG=lrG_full, Size=avg_basket_train,MPR=mpr_train)


In [None]:
# Persist the fully trained model to FNL_MODEL_PATH.
netG_full.save_pretrained(FNL_MODEL_PATH)

## **Test**

In [None]:
# Evaluate the fully trained model on the test baskets.
# Metrics: precision@k, mean input size of the baskets that were hit, and the mean
# percentile rank (MPR) of the true item.
basket_size_test = 0
pr_ls_test = []
correct_test = 0
# Defaults so the summary values exist even if the loader yields no batch.
mpr_test, precK_test, avg_basket_test = 0.0, 0.0, 0

test_loop = notebook.tqdm(test_loader_full, position=0, leave=True, colour='yellow')
for k, batch_test in enumerate(test_loop):
    # Hold out the last token of every basket as the item to predict.
    b_size_test = batch_test[0].get_shape().as_list()[0]
    incomplete_ids_test = batch_test[0][:, :-1]
    incomplete_mask_test = batch_test[1][:, :-1]
    missing_items_test = batch_test[0][:, -1:]

    outputG = netG_full.call(input_ids=incomplete_ids_test,
                             attention_mask=incomplete_mask_test)
    last_logits = outputG.logits[:, -1, :]
    # The zero block for the first three ids is built from this batch instead of
    # reusing `non_predict` left over from the training cell, so this cell no longer
    # depends on the training loop having run in the same kernel.
    outputG_softmax = tf.concat(
        (tf.zeros_like(last_logits[:, :3]), tf.nn.softmax(last_logits[:, 3:])),
        axis=-1,
    )

    top_k = tf.nn.top_k(outputG_softmax, k=params['n_seqs'], sorted=True)[1]
    targets = tf.cast(missing_items_test[:, 0], top_k.dtype)

    # Percentile rank: share of vocabulary entries whose probability is strictly
    # below the probability assigned to the true item.
    target_prob = tf.gather(outputG_softmax, targets, batch_dims=1)
    below_target = tf.math.less(outputG_softmax, tf.expand_dims(target_prob, axis=-1))
    pr_batch = tf.reduce_sum(tf.cast(below_target, tf.float32), axis=1) / tokenizer.vocab_size
    pr_ls_test.extend(pr_batch.numpy().tolist())

    # A basket is a hit when its true item appears anywhere in its top-k row.
    hits = tf.reduce_any(tf.equal(top_k, tf.expand_dims(targets, axis=-1)), axis=1)
    correct_test += int(tf.reduce_sum(tf.cast(hits, tf.int32)).numpy())
    basket_lengths = tf.reduce_sum(incomplete_mask_test, axis=1)
    basket_size_test += int(tf.reduce_sum(tf.boolean_mask(basket_lengths, hits)).numpy())

    seen_test = (k + 1) * b_size_test
    mpr_test = np.array(pr_ls_test).sum() / seen_test
    precK_test = correct_test / seen_test
    avg_basket_test = basket_size_test / correct_test if correct_test else 0

    test_loop.set_postfix(PrecisionAtK =round(precK_test,4), Correct=correct_test,  data="Test", Size = avg_basket_test, MPR = mpr_test)

# Final MPR over every evaluated basket (one entry per basket in pr_ls_test).
mpr_test = np.array(pr_ls_test).sum() / len(pr_ls_test) if pr_ls_test else 0.0