In [None]:
# Mount Google Drive at /content/drive so that files stored there can be read
# and written by later cells. Requires the google.colab package (Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# **Install packages**

In [None]:
# Install the third-party packages imported by the next cell.
# NOTE(review): versions are not pinned, so a fresh runtime may resolve newer,
# incompatible releases — consider pinning. hyperopt, scikit-learn, pandas and
# psutil are imported later but not installed here; presumably the runtime
# already ships them — TODO confirm.
!pip install transformers
!pip install tokenizers
!pip install tensorflow
!pip install tensorflow_addons
!pip install numba



# **Import packages**

In [None]:
import tensorflow as tf
#from tensorflow_addons.optimizers import AdamW
from tensorflow.python.client import device_lib
import transformers

import warnings
from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.trainers import BpeTrainer
from transformers import PreTrainedTokenizerFast, BertForPreTraining
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from transformers import pipeline, set_seed, GPT2Model, GPT2Config, GPT2Tokenizer, TFGPT2LMHeadModel, TFBertForSequenceClassification, AdamW, BertConfig, TFBertForNextSentencePrediction

from sklearn.model_selection import train_test_split, ShuffleSplit, KFold
from hyperopt import hp, fmin, tpe, STATUS_OK, STATUS_FAIL, Trials
import logging 
logging.basicConfig(level=logging.ERROR)
import random
from math import *
from tqdm import tqdm, notebook
import pandas as pd
import numpy as np

import os

# **Import and view data**

Paths

In [None]:
# File-system locations used throughout the notebook. All are left empty here
# and must be filled in before running the cells below.
path_data_train = ""      # CSV with the training baskets (read with pd.read_csv)
path_data_test = ""       # CSV with the test baskets (read with pd.read_csv)
path_tokenizer = ""       # serialized tokenizer file passed to PreTrainedTokenizerFast
path_gpt_pre = ""         # pre-trained GPT-2 generator loaded via from_pretrained
path_bert_pre = ""        # pre-trained BERT discriminator loaded via from_pretrained
save_path = ""            # prefix of the per-trial CSV written by gan_finetuning
path_final_metrics = ""   # not referenced in the cells visible here — presumably final metrics output
path_save_gpt = ""        # not referenced in the cells visible here — presumably fine-tuned GPT-2 output
path_save_bert = ""       # not referenced in the cells visible here — presumably fine-tuned BERT output

Importing and Showing Data

In [None]:
# Load the training baskets: a header-less, single-column CSV.
# dtype=str mirrors the test-set loader and guarantees every basket reaches the
# tokenizer as text, even if pandas would otherwise infer a numeric value.
df_org = pd.read_csv(path_data_train, names=["prod_series"], dtype=str)
baskets = df_org['prod_series'].to_list()

# Seed every random number generator the notebook relies on so runs are
# repeatable: Python's `random`, NumPy (np.random.choice samples the generator's
# items during GAN training) and TensorFlow.
RANDOM_STATE = 123
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

# Shuffled 2-fold splitter used to carve a validation set out of the training data.
kfold = KFold(n_splits=2, shuffle=True, random_state=RANDOM_STATE)


In [None]:
# Load the test baskets from a header-less, single-column CSV; dtype=str keeps
# every basket as text. `.info()` prints the row count and column dtype.
df_test = pd.read_csv(path_data_test, names=["prod_series"], dtype=str)
baskets_test = df_test['prod_series'].to_list()
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24999 entries, 0 to 24998
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   prod_series  24999 non-null  object
dtypes: object(1)
memory usage: 195.4+ KB


In [None]:
# Report the runtime's memory and the attached GPU (if any).
from psutil import virtual_memory
# Total physical memory in gigabytes (10^9 bytes). Note: this is the total
# amount, although the message below calls it "available".
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM".
if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

# IPython shell capture: the output lines of `nvidia-smi` come back as a list.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The check relies on nvidia-smi's output containing "failed" when no GPU is
# reachable — presumably its driver-communication error message; TODO confirm.
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!
Sun Mar 13 13:16:07 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---

Importing Tokenizer and Creating BasketDataset Class

In [None]:
# Rebuild the trained tokenizer from its serialized file, register the special
# tokens and pad sequences on the left.
special_tokens = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>",
}
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=path_tokenizer,
    padding_side="left",
    **special_tokens,
)

# Number of tokens reported by the tokenizer's __len__.
vocab_size = len(tokenizer)
print(vocab_size)

class BasketDataset:
    """Tokenizes a list of basket strings once and stores the encoded batch.

    The whole corpus is encoded in a single tokenizer call (truncated to
    `max_length`, padded to a common length). The resulting id matrix and
    attention-mask matrix are each wrapped in a one-element list, so callers
    access them as `dataset.input_ids[0]` / `dataset.attention_mask[0]`.

    Args:
        baskets: list of basket strings passed to the tokenizer.
        tokenizer: callable tokenizer returning a mapping with 'input_ids'
            and 'attention_mask'.
        max_length: truncation length forwarded to the tokenizer (default 60,
            the value that used to be hard-coded here).
    """
    def __init__(self, baskets, tokenizer, max_length=60):
        self.tokenizer = tokenizer
        self.baskets = baskets
        self.max_length = max_length
        self.input_ids = []
        self.attention_mask = []
        self.encodings = self.tokenizer(self.baskets, truncation=True, max_length=max_length, padding=True)
        # Complete basket encoding: one tensor holding every basket.
        self.input_ids.append(tf.constant(self.encodings['input_ids']))
        self.attention_mask.append(tf.constant(self.encodings['attention_mask']))

    def __getitem__(self, idx):
        """Return element `idx` of the `input_ids` list (index 0 is the full id tensor)."""
        return self.input_ids[idx]

    def __len__(self):
        """Return the length of the `input_ids` list (1 after construction), not the basket count."""
        return len(self.input_ids)

26722




# **GAN**

Check and append GPU

In [None]:
# Pick the compute device. TensorFlow device strings have the form "/GPU:0" or
# "/CPU:0" ("cuda" is a PyTorch name and is not a valid TensorFlow device).
gpus = tf.config.list_physical_devices('GPU')
local_devices = device_lib.list_local_devices()
if gpus:
    # Tell TensorFlow to use the first GPU.
    device = tf.device("/GPU:0")
    print('There are %d GPU(s) available.' % len(gpus))
    # Look the GPU up by type instead of assuming it sits at index 1 of the
    # device list (other device kinds may be listed in between).
    device_name = next((d.name for d in local_devices if d.device_type == 'GPU'), gpus[0].name)
    print('We will use the GPU:', device_name)
else:
    print('No GPU available, using the CPU instead.')
    device = tf.device("/CPU:0")
    device_name = next((d.name for d in local_devices if d.device_type == 'CPU'), local_devices[0].name)

There are 1 GPU(s) available.
We will use the GPU: /device:GPU:0


GAN Hyperparameters

In [None]:
# Search space explored by hyperopt; each trial receives one sample as `space`.
#   lrD / lrG       - Adam learning rates of the discriminator / generator
#   clipD / clipG   - gradient-norm clipping (clipnorm) of the two optimizers
#   emb_dropout     - embedding dropout (embd_pdrop) of the GPT-2 generator
#   update_ratio    - the generator is updated on batches where t % update_ratio == 0
bayes_parameters = {'lrD' : hp.uniform('lrD', 1e-7, 2.5e-5),
                    'lrG' : hp.uniform('lrG', 1e-7, 1e-5),
                    'clipD' : hp.uniform('clipD', 0.75, 1),
                    'clipG' : hp.uniform('clipG', 0.5, 1),
                    'emb_dropout' : hp.uniform('emb_dropout', 0.05, 0.25),
                    'update_ratio' : hp.choice('update_ratio', [2,4])}
          
# Fixed (non-tuned) settings.
#   max_tokenized_length - not read by the cells visible here; BasketDataset truncates to 60 as well
#   batch_size           - baskets per training / validation batch
#   n_seqs               - k of the top-k predictions used for precision@k
#   m                    - items sampled from the generator per basket for the generator loss
params = {
    'max_tokenized_length' : 60,
    'batch_size' : 4,
    'n_seqs' : 6,
    'm' : 6,
}

GAN Procedure

Dataloader

In [None]:
# Encode all training baskets once; the training code reads this module-level
# object directly (a `global` declaration is unnecessary at notebook top level).
dataset = BasketDataset(baskets, tokenizer)

Initialize pre-trained models

In [None]:
# Module-level constants shared by the GAN training code. Names assigned at the
# top level of a notebook cell are already global, so no `global` statements
# are needed.

# Discriminator targets and the hard-coded id used for the end-of-sequence token.
real_label = 1
fake_label = 0
eos_token = 2

# Single-id lists for the EOS, BOS and PAD tokens.
bad_words = [[tokenizer.eos_token_id],[tokenizer.bos_token_id], [tokenizer.pad_token_id]]

# Number of batches in the training part of one k-fold split.
batches = ceil(len(baskets)*(1-1/kfold.n_splits)/params['batch_size'])

# Binary cross-entropy on raw logits with label smoothing of 0.2.
criterion = tf.keras.losses.BinaryCrossentropy(from_logits=True, label_smoothing=0.2, axis=-1)

HAMSTER Architecture

In [None]:
import json
# Load the product lookup table from a JSON text file on the mounted Drive.
# getProductNames() indexes it with the whitespace-separated pieces of a
# decoded basket, so keys are presumably product-id strings and values
# product names — TODO confirm against the file.
# NOTE(review): the path is hard-coded; consider moving it to the paths cell.
with open('/content/drive/MyDrive/Seminar QM/Data/productIDs_names.txt') as f:
    dict_textfile = f.read()     
productIDs_names = json.loads(dict_textfile)

def getProductNames(A):
    """Translate token ids into product names.

    For input of rank 0 or 1 the ids are decoded with the module-level
    `tokenizer`, split on single spaces, and every piece found in
    `productIDs_names` is replaced by its mapped value (unknown pieces are
    dropped). For higher-rank input the function recurses over the leading
    axis and returns a nested list.
    """
    if tf.rank(A) > 1:
        # tokenizer.decode only handles a flat sequence, so go one axis deeper.
        return [getProductNames(A[row]) for row in range(len(A))]

    pieces = tokenizer.decode(A).split(" ")
    known_ids = productIDs_names.keys()
    return [productIDs_names[piece] for piece in pieces if piece in known_ids]

In [None]:
def maligan_rewards(fake_logits_D, b):
    """Compute normalised MaliGAN rewards from discriminator logits.

    The reward of a sample is D / (1 - D) with D = sigmoid(logit), divided by
    the sum of that ratio over all samples (axis 0), minus the baseline `b`.

    Because sigmoid(x) / (1 - sigmoid(x)) == exp(x), the normalised ratio is
    exactly a softmax over axis 0. Computing it that way avoids the division
    by zero (inf/NaN rewards) that the explicit ratio produces once the
    sigmoid saturates to 1.0 in float32.

    Args:
        fake_logits_D: discriminator logits, samples along axis 0.
        b: scalar baseline subtracted from every normalised reward.

    Returns:
        The transposed reward tensor.
    """
    rewards = tf.nn.softmax(fake_logits_D, axis=0) - b
    return tf.transpose(rewards)

def maligan_loss(fake_logits_G, fake_logits_D, item_pred, b):
    """MaliGAN generator loss for one predicted item per sample.

    loss = - sum_j reward_j * log p_G(item_j)

    where reward_j comes from `maligan_rewards` and p_G is the softmax of the
    generator logits (clipped to [1e-20, 1] before the log).

    The previous implementation summed the one-hot vectors of all samples
    before weighting, which reduced the loss to
    (sum_j log p_G(item_j)) * (sum_j reward_j) — the individual rewards no
    longer influenced which sample was reinforced. Each sample's
    log-probability is now weighted by its own reward.

    Args:
        fake_logits_G: generator logits over the vocabulary; either one row
            shared by all samples (shape [V]) or one row per sample ([n, V]).
        fake_logits_D: discriminator logits of the n samples.
        item_pred: ids of the n sampled items (any shape with n elements).
        b: scalar baseline forwarded to `maligan_rewards`.

    Returns:
        Scalar loss tensor.
    """
    rewards = tf.reshape(maligan_rewards(fake_logits_D, b), [-1])             # [n]

    # Take the vocabulary size from the logits themselves so the one-hot depth
    # and the probability rows can never disagree.
    n_vocab = tf.shape(fake_logits_G)[-1]
    log_probs = tf.math.log(
        tf.clip_by_value(
            tf.reshape(tf.nn.softmax(fake_logits_G), [-1, n_vocab]),
            1e-20, 1.0
        )
    )                                                                          # [1, V] or [n, V]

    sampled_one_hot = tf.one_hot(
        tf.cast(tf.reshape(item_pred, [-1]), dtype=tf.int32),
        n_vocab, 1.0, 0.0
    )                                                                          # [n, V]

    # Log-probability of each sampled item (a shared logits row broadcasts over n).
    sample_log_probs = tf.reduce_sum(sampled_one_hot * log_probs, axis=1)     # [n]

    loss_G = -1 * tf.reduce_sum(sample_log_probs * rewards)
    return loss_G

In [None]:
def delete_spec_tokens(basket):
    """Replace the special token ids 0 and 2 with id 1 in every row of a batch.

    Ids 0 and 2 are presumably the BOS and EOS tokens and 1 the padding token
    (the rest of the notebook treats 1 as padding and 2 as separator/EOS) —
    TODO confirm against the tokenizer.

    Args:
        basket: batch of id rows; each `basket[i]` must provide `.numpy()`.

    Returns:
        int32 tensor of shape [batch, length]. The batch axis is always kept,
        also for a single-row batch (which previously came back as rank 1).

    Raises:
        ValueError: if the batch is empty.
    """
    rows = [np.asarray(basket[i].numpy()) for i in range(len(basket))]
    if not rows:
        raise ValueError("delete_spec_tokens() received an empty batch")

    ids = np.stack(rows, axis=0)
    ids = np.where(np.isin(ids, (0, 2)), 1, ids)
    return tf.convert_to_tensor(ids, dtype=tf.int32)

In [None]:
def bert_tokens(basket, attention, realfake):
    """Rearrange left-padded baskets into the layout fed to the discriminator.

    For every row:
      * the last position is overwritten with id 2 and all padding ids (1)
        are removed;
      * an extra id 2 is inserted in front of the last two remaining tokens,
        which splits the row into a first segment and a two-token second
        segment;
      * padding is re-appended on the right (one fewer than was removed, to
        make room for the inserted separator); a row without padding drops
        its first token instead;
      * the attention mask and token-type ids (0 = first segment and padding,
        1 = the last two tokens) are rebuilt to match;
      * for `realfake == "fake"` an id 0 is prepended to ids and token types
        and the last element of all three rows is dropped.

    Args:
        basket: batch of id rows; each `basket[i]` must provide `.numpy()`.
        attention: batch of attention-mask rows aligned with `basket`.
        realfake: "fake" enables the prepend/trim step; any other value
            (the callers use "real") skips it.

    Returns:
        (input_ids, attention_mask, token_type_ids) as int32 tensors of shape
        [batch, length]. The batch axis is kept for single-row batches.

    Raises:
        ValueError: if the batch is empty.
    """
    if len(basket) == 0:
        raise ValueError("bert_tokens() received an empty batch")

    basket_rows, attn_rows, token_rows = [], [], []
    for i in range(len(basket)):
        attn_num = attention[i].numpy()
        # Private copy: the row is modified in place just below.
        basket_num = np.array(basket[i].numpy())

        attn_new = np.delete(attn_num, np.where(attn_num==0))
        length_padding = np.count_nonzero(basket_num==1)
        basket_num[-1] = 2
        basket_new = np.delete(basket_num, np.where(basket_num==1))
        if length_padding!=0:
            # One padding slot is given up for the inserted separator.
            padding_basket = np.full((length_padding-1,), 1)
            padding_attn = np.full((length_padding-1,), 0)
            basket_a = basket_new[:-2]
            basket_b = basket_new[-2:]
            basket_final = np.concatenate((basket_a, [2], basket_b, padding_basket))

            attn_final = np.concatenate((attn_new,[1], padding_attn))

            token_type_zero = np.full((len(attn_num)-len(padding_attn)-2,),0)
            token_type = np.concatenate((token_type_zero, [1,1], padding_attn))
        else:
            # No padding to give up: drop the first token to keep the length.
            attn_final = attn_num
            token_type_zero = np.full((len(attn_num)-2,),0)
            token_type = np.concatenate((token_type_zero,[1,1]))
            basket_a = basket_new[1:-2]
            basket_b = basket_new[-2:]
            basket_final = np.concatenate((basket_a, [2], basket_b))

        if realfake == "fake":
            basket_final = np.concatenate(([0],basket_final))
            basket_final = basket_final[:-1]
            token_type = np.concatenate(([0],token_type))
            token_type = token_type[:-1]
            attn_final = attn_final[:-1]

        basket_rows.append(basket_final)
        attn_rows.append(attn_final)
        token_rows.append(token_type)

    # Stack once after the loop; np.stack always yields a [batch, length] matrix.
    basket_batch_tens = tf.convert_to_tensor(np.stack(basket_rows), dtype=tf.int32)
    attn_batch_tens = tf.convert_to_tensor(np.stack(attn_rows), dtype=tf.int32)
    token_batch_tens = tf.convert_to_tensor(np.stack(token_rows), dtype=tf.int32)
    return basket_batch_tens, attn_batch_tens, token_batch_tens

In [None]:
def _count_topk_hits(top_k_items, targets):
    """Count the rows of `top_k_items` ([batch, k]) that contain that row's entry of `targets` ([batch])."""
    targets_col = tf.cast(tf.reshape(targets, [-1, 1]), top_k_items.dtype)
    row_hit = tf.reduce_any(tf.equal(top_k_items, targets_col), axis=1)
    return int(tf.reduce_sum(tf.cast(row_hit, tf.int32)))


def gan_finetuning(space):
    """Hyperopt objective: adversarially fine-tune generator and discriminator once.

    Loads the pre-trained GPT-2 generator and BERT discriminator, trains them
    for one epoch on the training part of the first k-fold split, then
    measures precision@k of the generator on the validation part. Training
    stops early (and the trial is reported as failed) when the running
    precision@k stays below fixed thresholds.

    Reads the module-level `params`, `kfold`, `dataset`, `batches`,
    `criterion`, `real_label`, `fake_label`, `path_gpt_pre`, `path_bert_pre`
    and `save_path`.

    Args:
        space: dict sampled by hyperopt with the keys 'lrD', 'lrG', 'clipD',
            'clipG', 'update_ratio' and 'emb_dropout'.

    Returns:
        dict with 'loss' (negative validation precision@k, minimised by
        hyperopt), 'lossG' / 'lossD' (mean training losses) and 'status'.

    Side effects:
        Writes the per-iteration log to
        f'{save_path}{prec_k_final}_CLIP_{clip_D}.csv'.
    """
    # Log columns: iter, correctD_real, correctD_fake, loss G, loss D,
    # precK running average, precK of the iteration. The first row only
    # numbers the columns.
    store_values = np.array([0,1,2,3,4,5,6])
    lr_D, lr_G = space['lrD'], space['lrG']
    clip_D, clip_G = space['clipD'], space['clipG']
    num_epochs = 1
    b_size = params['batch_size']
    m = params['m']
    k = params['n_seqs']
    update_ratio = space['update_ratio']
    emb_dropout = space['emb_dropout']
    attn_dropout = 0.0
    early_stop = False
    status_hyp = STATUS_OK

    fnl_prec_k, fnl_loss_G, fnl_loss_D = [], [], []

    for train_index, val_index in kfold.split(dataset.input_ids[0]):
        # initialize variables
        losses_G, losses_D = [], []
        avg_loss_G, avg_loss_D = 0, 0
        train_input, val_input = {}, {}

        train_input['input_ids'] = tf.gather(dataset.input_ids[0], indices=tf.constant(train_index))
        train_input['attention_mask'] = tf.gather(dataset.attention_mask[0], indices=tf.constant(train_index))

        val_input['input_ids'] = tf.gather(dataset.input_ids[0], indices=tf.constant(val_index))
        val_input['attention_mask'] = tf.gather(dataset.attention_mask[0], indices=tf.constant(val_index))

        # initialize models and optimizers
        net_G = TFGPT2LMHeadModel.from_pretrained(path_gpt_pre,
                                                  embd_pdrop=emb_dropout,   # embedding dropout
                                                  attn_pdrop=attn_dropout)  # attention dropout
        optim_G = tf.keras.optimizers.Adam(lr_G, clipnorm=clip_G)

        net_D = TFBertForSequenceClassification.from_pretrained(path_bert_pre, num_labels=1)
        optim_D = tf.keras.optimizers.Adam(lr_D, clipnorm=clip_D)

        for epoch in range(num_epochs):
            train_correct = 0
            real_correct_D = 0
            fake_correct_D = 0
            correct_G = 0

            # The step below builds several tensors with a fixed first
            # dimension of b_size, so a smaller trailing batch is dropped
            # instead of crashing the trial.
            train_loader = tf.data.Dataset.from_tensor_slices((train_input['input_ids'],
                                                               train_input['attention_mask']
                                                              )).batch(b_size, drop_remainder=True)

            train_loop = notebook.tqdm(train_loader, position=0, leave=True, colour='green')

            for t, batch in enumerate(train_loop):
                real_input_ids = batch[0]
                real_attention_mask = batch[1]
                # The item to predict sits in the second-to-last position.
                missing_item = real_input_ids[:,-2]

                real_labels = tf.fill([b_size,1],real_label) # real_label = 1

                # Generator input: special tokens replaced by padding and the
                # last two positions (item + end token) cut off. The mask is
                # rebuilt from the ids because the padding changed.
                inc_input_ids = delete_spec_tokens(real_input_ids)
                inc_input_ids = inc_input_ids[:,0:-2]
                inc_attention_mask = tf.cast(tf.not_equal(inc_input_ids, 1), tf.int32)

                with tf.GradientTape() as tape_G, tf.GradientTape() as tape_D:
                    tape_D.watch(net_D.trainable_variables)
                    tape_G.watch(net_G.trainable_variables)

                    # ---- discriminator on real data ----
                    real_input_ids_sw, real_attention_mask_sw, real_token_type_ids_sw = bert_tokens(real_input_ids, real_attention_mask, realfake="real")
                    real_logits_D = net_D.call(input_ids=real_input_ids_sw,
                                               attention_mask=real_attention_mask_sw,
                                               token_type_ids=real_token_type_ids_sw,
                                               training=True).logits

                    real_sigmoid_D = tf.math.sigmoid(real_logits_D)
                    real_loss_D = criterion(real_labels, real_logits_D) # BCE is configured to take logits

                    # ---- generator forward pass ----
                    fake_logits_G = net_G.call(input_ids=inc_input_ids,
                                               attention_mask=inc_attention_mask,
                                               training = True).logits[:,-1,5:]
                    # The first 5 token ids get a logit of -100000 so they are never predicted.
                    fake_logits_adj_G = tf.concat((tf.fill([b_size,5],-100000.0), fake_logits_G), axis=-1)
                    fake_softmax_G = tf.nn.softmax(fake_logits_adj_G)

                    # Sample m items per basket according to the softmax probabilities
                    # (re-normalised so every row sums to one for np.random.choice).
                    fake_probs_G = fake_softmax_G.numpy()
                    fake_probs_G = fake_probs_G / fake_probs_G.sum(axis=1, keepdims=True)
                    fake_samples_G = np.array([np.random.choice(fake_probs_G.shape[1],
                                                                m,
                                                                p=fake_probs_G[i,:]) for i in range(b_size)], dtype=np.int32)

                    # ---- discriminator on the generator's top-1 completion ----
                    fake_top_k = tf.nn.top_k(fake_softmax_G, k=k, sorted=True)[1]

                    fake_input_ids = tf.concat([inc_input_ids,
                                                tf.reshape(fake_top_k[:,0],
                                                           [b_size,1]),
                                                tf.fill((b_size,1), 2)], # separator token
                                               1)

                    fake_input_ids_sw, fake_attention_mask_sw, fake_token_type_ids_sw = bert_tokens(fake_input_ids, real_attention_mask, realfake="fake")

                    fake_logits_D = net_D.call(input_ids=fake_input_ids_sw,
                                               attention_mask=fake_attention_mask_sw,
                                               token_type_ids=fake_token_type_ids_sw,
                                               training=True).logits
                    fake_sigmoid_D = tf.math.sigmoid(fake_logits_D)

                    # Precision@k bookkeeping: is the true item among the top-k?
                    train_correct_ITER = _count_topk_hits(fake_top_k, missing_item)
                    train_correct += train_correct_ITER

                    # A generated basket whose predicted item equals the true
                    # item is not fake, so it is labelled real. The comparison
                    # is done per row (comparing the sums of the ids, as
                    # before, can coincide for different items).
                    top1_is_true = tf.equal(fake_input_ids[:,-2], real_input_ids[:,-2])
                    fake_labels = tf.reshape(tf.where(top1_is_true, real_label, fake_label), [-1,1])
                    correct_G += int(tf.reduce_sum(tf.cast(top1_is_true, tf.int32)))

                    fake_loss_D = criterion(fake_labels, fake_logits_D)

                    loss_D = real_loss_D + fake_loss_D

                    losses_D.append(loss_D.numpy())

                    # Discriminator accuracy: real baskets should score above
                    # 0.5; generated baskets should fall on the side of 0.5
                    # that matches their label.
                    correctD_real_ITER = int(tf.reduce_sum(tf.cast(real_sigmoid_D > 0.5, tf.int32)))
                    fake_hit = tf.logical_or(
                        tf.logical_and(fake_sigmoid_D > 0.5, fake_labels == 1),
                        tf.logical_and(fake_sigmoid_D < 0.5, fake_labels == 0))
                    correctD_fake_ITER = int(tf.reduce_sum(tf.cast(fake_hit, tf.int32)))
                    real_correct_D += correctD_real_ITER
                    fake_correct_D += correctD_fake_ITER

                    D_correct_ratio = (real_correct_D + fake_correct_D)  / (2*b_size*(t+1))

                    # ---- generator loss ----
                    # Reward baseline grows linearly with the training progress.
                    b = t/(batches)
                    loss_G = tf.constant(0.0)

                    eos_tokens = tf.fill((m,1), 2)

                    for i in range(b_size):
                        # Repeat basket i once per sampled item; slicing keeps
                        # the row's own length instead of assuming 58 / 60.
                        fake_sampled_ids = tf.repeat(inc_input_ids[i:i+1], m, axis=0)

                        fake_input_ids_m = tf.concat([fake_sampled_ids,tf.reshape(fake_samples_G[i,:],[m,1]), eos_tokens],1)
                        fake_attention_mask_m = tf.repeat(real_attention_mask[i:i+1], m, axis=0)

                        fake_input_ids_m, fake_attention_mask_m, fake_token_type_ids_m = bert_tokens(fake_input_ids_m,
                                                                                                   fake_attention_mask_m,
                                                                                                   realfake="fake")
                        fake_logits_D_m = net_D.call(input_ids=fake_input_ids_m,
                                                     attention_mask=fake_attention_mask_m,
                                                     token_type_ids=fake_token_type_ids_m,
                                                     training=True
                                                    ).logits
                        loss_G += maligan_loss(fake_logits_adj_G[i],
                                               fake_logits_D_m,
                                               tf.reshape(fake_samples_G[i,:],[m,1]), b)

                    losses_G.append(loss_G.numpy())
                    # The generator is only updated every `update_ratio`-th batch.
                    if t%update_ratio==0:
                        grad_G = tape_G.gradient(loss_G, net_G.trainable_variables)
                        optim_G.apply_gradients(zip(grad_G, net_G.trainable_variables))

                    grad_D = tape_D.gradient(loss_D, net_D.trainable_variables)
                    optim_D.apply_gradients(zip(grad_D, net_D.trainable_variables))

                    train_prec_k = train_correct/((t+1)*b_size)

                    # Running means over all batches seen so far.
                    avg_loss_G = sum(losses_G)/len(losses_G)
                    avg_loss_D = sum(losses_D)/len(losses_D)

                    # Early stopping: give up on hyperparameters whose running
                    # precision@k is still poor after 5% / 10% of the batches,
                    # and report the trial as failed.
                    if (((train_prec_k < 0.05) and t>(0.05* batches)) or
                            ((train_prec_k < 0.09) and t>(0.10* batches))):
                        print('Epoch stopped early...')
                        early_stop = True
                        status_hyp = STATUS_FAIL
                        break

                    # iter, correctD_real, correctD_fake, loss G, loss D, precK running average, precK iter
                    store_values = np.vstack((store_values,[t,correctD_real_ITER,correctD_fake_ITER, loss_G.numpy(), loss_D.numpy(),train_prec_k, train_correct_ITER/b_size ]))
                    train_loop.set_postfix(lossG=avg_loss_G, lossD=avg_loss_D, data="train", epochs=num_epochs,
                                           G_correct_at1=correct_G, G_total_correct=train_correct, precK=train_prec_k, lrD=lr_D,
                                           lrG=lr_G, clipD=clip_D, clipG=clip_G,
                                           real_correct_D =real_correct_D,
                                           fake_correct_D =fake_correct_D, D_correct_ratio = D_correct_ratio)

            #####################################################################################
            # start validation
            loader_val = tf.data.Dataset.from_tensor_slices((val_input['input_ids'], val_input['attention_mask'])).batch(params['batch_size'])
            correct_val = 0
            total_baskets_val = 0
            precK_val = 0
            loop_val = notebook.tqdm(loader_val, position=0, leave=True, colour='yellow')
            for batch_val in loop_val:
                # An early-stopped trial is not validated.
                if early_stop:
                    break
                batch_size = len(batch_val[0])
                total_baskets_val += batch_size
                non_predict = tf.fill((batch_size,5),0.0)
                # create incomplete basket; the reshape keeps the batch axis
                # even if the helper hands back a flat row for a one-basket batch
                incomplete_tok_val = tf.reshape(delete_spec_tokens(batch_val[0]), batch_val[0].shape)
                incomplete_val = incomplete_tok_val[:,0:-2]
                # NOTE(review): training rebuilds the attention mask from the
                # cleaned ids, validation reuses the original mask — confirm
                # this difference is intended.
                incomplete_mask_val = batch_val[1][:,0:-2]
                true_items = batch_val[0][:,-2]

                output_G = net_G.call(input_ids=incomplete_val,
                                      attention_mask=incomplete_mask_val, training = False)
                # take output layer
                outputG_softmax = tf.nn.softmax(output_G.logits[:,-1,5:])
                # the model should not predict special tokens, so their probability is set to zero
                outputG_softmax = tf.concat((non_predict, outputG_softmax), axis=-1)

                # [batch, n_seqs] item ids; no squeeze, so a one-basket batch keeps both axes
                top_k = tf.nn.top_k(outputG_softmax, k=params['n_seqs'], sorted=False)[1]
                correct_val += _count_topk_hits(top_k, true_items)

                precK_val = correct_val/(total_baskets_val)
                loop_val.set_postfix(PrecisionAtK =precK_val, Correct=correct_val,  data="validation")

        # Only the first fold is evaluated.
        break

    # save results
    fnl_prec_k.append(precK_val)
    fnl_loss_G.append(avg_loss_G)
    fnl_loss_D.append(avg_loss_D)

    # Average over the folds that were actually run (one), not over
    # kfold.get_n_splits(), which would halve the reported losses.
    folds = len(fnl_prec_k)
    prec_k_final = sum(fnl_prec_k)/folds
    np.savetxt(f'{save_path}{prec_k_final}_CLIP_{clip_D}.csv', store_values, delimiter=",")

    return {'loss': -prec_k_final, 'lossG': sum(fnl_loss_G)/folds, 'lossD': sum(fnl_loss_D)/folds, 'status': status_hyp}

In [None]:
# Run Bayesian optimisation (hyperopt, TPE) over `bayes_parameters`, using
# gan_finetuning as the objective; library log output is limited to errors.
transformers.utils.logging.set_verbosity_error()

# Placeholders; best_params is overwritten by fmin below, lossG / lossD are
# not used again in the cells visible here.
best_params, lossG, lossD  = dict(), dict(), dict()
trials = Trials()
best_params = fmin(fn=gan_finetuning,
          space=bayes_parameters,
          algo=tpe.suggest,
          trials=trials,
          return_argmin=False, # return the chosen parameter values rather than hp.choice indices
          max_evals=15,
          rstate= np.random.RandomState(RANDOM_STATE))
# Per-trial results as returned by gan_finetuning.
losses  = [x['result']['loss']  for x in trials.trials]
lossesG = [x['result']['lossG'] for x in trials.trials]
lossesD = [x['result']['lossD'] for x in trials.trials]

**TRAIN FULL**

In [None]:
# Set-up for the final training run on the complete training set, using the
# best hyperparameters found by the search.

# Log columns: iter, correctD_real, correctD_fake, loss G, loss D,
# precK running average, precK of the iteration.
store_values = np.array([0,1,2,3,4,5,6])
lr_D, lr_G = best_params['lrD'], best_params['lrG']
clip_D, clip_G = best_params['clipD'], best_params['clipG']
num_epochs = 1
b_size = params['batch_size']
m = params['m']
k= params['n_seqs']
update_ratio = best_params['update_ratio']
emb_dropout= best_params['emb_dropout']
# Same fixed value gan_finetuning uses. It has to be defined here as well: the
# variable inside gan_finetuning is local, so without this line the model
# construction below fails with a NameError on a fresh kernel.
attn_dropout = 0.0
early_stop = False


fnl_prec_k,fnl_loss_G,fnl_loss_D = [],[],[]

#initialize variables

losses_G, losses_D = [], []
avg_loss_G, avg_loss_D = 0,0
train_input, val_input = {}, {}

# No fold split here: every encoded basket is used for training.
train_input['input_ids'] = dataset.input_ids[0]
train_input['attention_mask'] = dataset.attention_mask[0]

#initialize models and optimizers
net_G = TFGPT2LMHeadModel.from_pretrained(path_gpt_pre,
                                        embd_pdrop=emb_dropout,       # embedding dropout
                                        attn_pdrop=attn_dropout)      # attention dropout
optim_G = tf.keras.optimizers.Adam(lr_G, clipnorm=clip_G)

net_D = TFBertForSequenceClassification.from_pretrained(path_bert_pre, num_labels=1)#, output_hidden_states=True, output_attentions=True)
optim_D = tf.keras.optimizers.Adam(lr_D, clipnorm=clip_D)

for epoch in range(num_epochs):
  train_correct = 0
  real_correct_D = 0
  fake_correct_D = 0
  correct_G = 0

  train_loader = tf.data.Dataset.from_tensor_slices((train_input['input_ids'], 
                                                    train_input['attention_mask']
                                                    )).batch(b_size) 
  
  train_loop = notebook.tqdm(train_loader, position=0, leave=True, colour='green')

  for t, batch in enumerate(train_loop):
      correctD_real_ITER, correctD_fake_ITER, train_correct_ITER = 0,0,0

      real_input_ids = batch[0]
      real_attention_mask = batch[1]
      missing_item = real_input_ids[:,-2]
      
      real_labels = tf.fill([b_size,1],real_label) # real_label = 1
      
      # create Generator input and new attention mask due to shifted padding and removal of special tokens
      inc_input_ids = delete_spec_tokens(real_input_ids)
      inc_input_ids = inc_input_ids[:,0:-2]
      inc_attention_mask = tf.where(~(inc_input_ids!=1),tf.fill(real_attention_mask[:,0:-2].shape,0),1)
      
      with tf.GradientTape() as tape_G, tf.GradientTape() as tape_D:
          #forward pass real data
          tape_D.watch(net_D.trainable_variables)
          tape_G.watch(net_G.trainable_variables)

          #switch padding
          real_input_ids_sw, real_attention_mask_sw, real_token_type_ids_sw = bert_tokens(real_input_ids, real_attention_mask, realfake="real")
          #output discriminator on real data
          real_logits_D = net_D.call(input_ids=real_input_ids_sw,
                                    attention_mask=real_attention_mask_sw,
                                    token_type_ids=real_token_type_ids_sw,
                                    training=True).logits
          
          real_sigmoid_D = tf.math.sigmoid(real_logits_D)
          real_loss_D = criterion(real_labels, real_logits_D) #BCE is specified to use logits
          
          #forward pass fake data through generator
          fake_logits_G = net_G.call(input_ids=inc_input_ids,
                                    attention_mask=inc_attention_mask,
                                    training = True).logits[:,-1,5:]
          fake_logits_adj_G = tf.concat((tf.fill([b_size,5],-100000.0), fake_logits_G), axis=-1) # set logits of first 5 tokens to -100 to rule out              
          fake_softmax_G = tf.nn.softmax(fake_logits_adj_G)
          
          # sample m items from G according to the softmax probalities
          fake_probs_G = np.array([fake_softmax_G[i,:]/np.sum(fake_softmax_G[i,:]) for i in range(b_size)])
          fake_samples_G = np.array([np.random.choice(range(0,tokenizer.vocab_size), 
                                                      m, 
                                                      p=fake_probs_G[i,:]) for i in range(b_size)], dtype=np.int32)


          #forward pass of generated samples to discriminator
          fake_top_k = tf.nn.top_k(fake_softmax_G, k=k, sorted=True)[1]
          
          fake_input_ids = tf.concat([inc_input_ids,
                                      tf.reshape(fake_top_k[:,0],
                                                [b_size,1]), 
                                      tf.fill((b_size,1), 2)], #seperator token
                                    1)

          fake_input_ids_sw, fake_attention_mask_sw, fake_token_type_ids_sw = bert_tokens(fake_input_ids, real_attention_mask, realfake="fake")
          
          fake_logits_D = net_D.call(input_ids=fake_input_ids_sw,
                                    attention_mask=fake_attention_mask_sw,
                                    token_type_ids=fake_token_type_ids_sw,
                                    training=True).logits
          fake_sigmoid_D = tf.math.sigmoid(fake_logits_D)
          
          for i in range(0,b_size):
              for j in range(0, k):
                  if (fake_top_k[i][j] == missing_item[i]):
                      train_correct+=1
                      train_correct_ITER +=1
                      break
          
          #check if fake generated basket is 'not fake'
          if tf.math.equal(tf.reduce_sum(fake_input_ids[:,-2]), tf.reduce_sum(real_input_ids[:,-2])) == False:
              for i in range(len(fake_input_ids)): # loop over batchsize
                  if i == 0:
                      if real_input_ids[i,-2] != fake_input_ids[i,-2]:
                          fake_labels = tf.fill([1,1],[fake_label])
                      elif real_input_ids[i,-2] == fake_input_ids[i,-2]:
                          fake_labels = tf.fill([1,1],[real_label])
                  else:
                      if real_input_ids[i,-2] != fake_input_ids[i,-2]:
                          step = tf.fill([1,1],[fake_label])
                          fake_labels = tf.concat((fake_labels, step), axis=0)
                      elif real_input_ids[i,-2] == fake_input_ids[i,-2]:
                          step = tf.fill([1,1],[real_label])
                          fake_labels = tf.concat((fake_labels, step), axis=0)
                          correct_G+=1
          else:
              fake_labels = tf.fill([b_size,1],[real_label])
      
          fake_loss_D = criterion(fake_labels, fake_logits_D)
          
          loss_D = real_loss_D + fake_loss_D
          
          losses_D.append(loss_D.numpy())
          
          # test discriminator
          for i in range(0,b_size):
              if real_sigmoid_D[i] > 0.5: # when real sigmoid is leaning to label = 0, which is correct
                  real_correct_D += 1
                  correctD_real_ITER +=1
              if (fake_sigmoid_D[i] > 0.5) and fake_labels[i]==1: # when fake sigmoid is leaning to correct label = 1
                  fake_correct_D += 1
                  correctD_fake_ITER +=1
              elif (fake_sigmoid_D[i] < 0.5) and fake_labels[i]==0: # when fake sigmoid is leaning to correct label = 0
                  fake_correct_D += 1
                  correctD_fake_ITER +=1
          
          D_correct_ratio = (real_correct_D + fake_correct_D)  / (2*b_size*(t+1))

          # Calculate generator loss
          b = t/batches
          loss_G = tf.constant(0.0)
          predicted_items = fake_input_ids_sw[:,-2]
          maligan_w, maligan_dx = [],[]
          
          eos_tokens = tf.fill((m,1), 2)
          
          for i in range(b_size):
              fake_sampled_ids = tf.repeat(tf.reshape(inc_input_ids[i], shape = (1,58)), m, axis=0)

              
              fake_input_ids_m = tf.concat([fake_sampled_ids,tf.reshape(fake_samples_G[i,:],[m,1]), eos_tokens],1)
              fake_attention_mask_m = tf.repeat(tf.reshape(real_attention_mask[i,:], shape = (1,60)), m, axis=0)
          
              fake_input_ids_m, fake_attention_mask_m, fake_token_type_ids_m = bert_tokens(fake_input_ids_m, 
                                                                                        fake_attention_mask_m, 
                                                                                        realfake="fake")
              fake_logits_D_m = net_D.call(input_ids=fake_input_ids_m,
                                          attention_mask=fake_attention_mask_m,
                                          token_type_ids=fake_token_type_ids_m,
                                          training=True
                                          ).logits
              fake_sigmoid_D_m = tf.math.sigmoid(fake_logits_D_m)
              maligan_dx.append(fake_sigmoid_D_m[:])                          
              maligan_w.append(maligan_rewards(fake_logits_D_m,b))
              loss_G += maligan_loss(fake_logits_adj_G[i], 
                                    fake_logits_D_m, 
                                    tf.reshape(fake_samples_G[i,:],[m,1]), b)
              
          losses_G.append(loss_G.numpy())
          if t%update_ratio==0:  
            grad_G = tape_G.gradient(loss_G, net_G.trainable_variables)
            optim_G.apply_gradients(zip(grad_G, net_G.trainable_variables))
          
          
          grad_D = tape_D.gradient(loss_D, net_D.trainable_variables)
          optim_D.apply_gradients(zip(grad_D, net_D.trainable_variables))

          train_prec_k = train_correct/((t+1)*b_size)

          if t>0:
            avg_loss_G = sum(losses_G)/t
            avg_loss_D = sum(losses_D)/t
          
          # early stopping condition
          
          if ((train_prec_k < 0.05) and t>(0.05* batches)):
              print('Epoch stopped early...')
              early_stop = True
              break
          if ((train_prec_k < 0.10) and t>(0.10* batches)):
              print('Epoch stopped early...')
              early_stop = True
              break
          
          # iter, correctD_real, correctD_fake, loss G, loss D, precK running average, prekK iter
          store_values = np.vstack((store_values,[t,correctD_real_ITER,correctD_real_ITER, loss_G.numpy(), loss_D.numpy(),train_prec_k, train_correct_ITER/b_size ]))
          train_loop.set_postfix(lossG=avg_loss_G, lossD=avg_loss_D, data="train", epochs=num_epochs, 
                                G_correct_at1=correct_G, G_total_correct=train_correct, precK=train_prec_k, lrD=lr_D, 
                                lrG=lr_G, clipD=clip_D, clipG=clip_G, 
                                real_correct_D =real_correct_D,
                                fake_correct_D =fake_correct_D, D_correct_ratio = D_correct_ratio)

  
  np.savetxt(path_final_metrics, store_values, delimiter=",")
  net_D.save_pretrained(path_save_bert)
  net_G.save_pretrained(path_save_gpt)

  #EVERYTHING BELOW SHOULD BE IMPLEMENTED AFTER VALIDATION
  fnl_prec_k.append(train_prec_k)
  fnl_loss_G.append(avg_loss_G)
  fnl_loss_D.append(avg_loss_D)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at /content/drive/MyDrive/Seminar QM/Models/TF/220306 GPT pre-trained AH (Redding).
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/Seminar QM/Models/TF/220306 BERT pre-trained AH (REDDING)/bert pre-trained and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/25000 [00:00<?, ?it/s]

In [None]:
#-------------------------------------------------------------------------------
# ~~~ TEST ~~~ w/ bootstrap only 1 call of netG
#-------------------------------------------------------------------------------
# Evaluates the generator on the test baskets: the last token of every basket is held
# out, the generator ranks all non-special tokens, and the cell reports
#   * precision@1 / precision@6 on exact token matches,
#   * "adjusted" precision@1 / @6, where a prediction also counts when its product name
#     is n-gram similar (dice_coef >= params['ngram_sim']) to the true product name,
#   * the mean percentile rank (MPR) of the true item.
# Names used here but not assigned in this cell (they must already exist in the kernel):
# netG, dataset_test, params, tokenizer, getProductNames, getProductNames_Real, ngram, dice_coef.
# NOTE(review): the training cell names the generator `net_G`; confirm `netG` is defined
# (e.g. re-loaded from the saved checkpoint) before running this cell.
# NOTE(review): the mask is read from `dataset_test.attn_mask`, while training reads
# `dataset.attention_mask` -- confirm both attribute names exist.

num_specialtok = 5
n_candidates = 30   # ranked candidates kept per basket
prec_k = 6          # cut-off of the reported precision@k
top_1_count,top_k_count = 0,0
top_1_count_adj, top_k_count_adj = 0,0
output = {}
total_batches = 0   # number of baskets processed so far (not the number of batches)
pr_ls_test = []

test_basket = {}
test_basket['input_ids'] = dataset_test.input_ids[0]
test_basket['attention_mask'] = dataset_test.attn_mask[0]
# derive the number of test baskets from the data instead of hard-coding it, so the
# buffers and the metric denominators below always match the data actually evaluated
test_size = len(test_basket['input_ids'])
loader_test = tf.data.Dataset.from_tensor_slices(test_basket).batch(params['batch_size'])   
loop_test = notebook.tqdm(loader_test, position=0, leave=True, colour='yellow')

#create top_k matrix for predictions in test set
top_k_all = np.zeros((test_size,n_candidates),dtype=np.int32)
missing_items_all = np.zeros((test_size),dtype=np.int32)
top_1_list,top_k_list,top_n_list = [],[],[]

for batch_test in loop_test:
  batch_size = len(batch_test['input_ids'])
  batch_start = total_batches   # row index of this batch's first basket; stays correct for a short last batch
  total_batches += batch_size
  non_predict = tf.fill((batch_size,num_specialtok),0.0)
  incomplete_ids_test = batch_test['input_ids'][:,0:-1]
  incomplete_mask_test = batch_test['attention_mask'][:,0:-1]
  missing_items_test = batch_test['input_ids'][:,-1]
  outputG = netG.call(input_ids=incomplete_ids_test,
                      attention_mask=incomplete_mask_test,
                      training = False).logits[:,-1,num_specialtok:]

  #model should not predict special tokens, therefore, set softmax value at zero
  outputG_softmax = tf.concat((non_predict, tf.nn.softmax(outputG)), axis=-1)
  # indices of the n_candidates most likely tokens, shape [batch_size, n_candidates];
  # no tf.squeeze here: it would drop the batch axis when a batch holds a single basket
  top_k_items = tf.nn.top_k(outputG_softmax, k=n_candidates, sorted=True)[1]
  top_k_all[batch_start:batch_start+batch_size,:] = top_k_items.numpy()
  missing_items_all[batch_start:batch_start+batch_size] = missing_items_test.numpy()

  # calculate (normal) prec@k
  for i in range(0,batch_size):
    top_n_list.append([batch_test['input_ids'][i],batch_test['attention_mask'][i],top_k_items[i]])
    top_1_count += int(top_k_items[i][0] == missing_items_test[i])
    for j in range(prec_k):
      if (top_k_items[i][j] == missing_items_test[i]):
        top_k_count += 1
        break

  # calculate adj. prec@k
  top_k_names = getProductNames(top_k_items)
  real_names = getProductNames_Real(missing_items_test)
  for i in range(batch_size):
    real_name = real_names[i]
    top_k_item = top_k_names[i]
    if real_name == "NOT AVAILABLE":
      # no product name for the true item: fall back to exact token matching
      real_token = missing_items_test[i]
      top_k_token = top_k_items[i]
      if real_token == top_k_token[0]:
        top_1_count_adj +=1
      for j in range(prec_k):
        if real_token == top_k_token[j]:
          top_k_count_adj += 1
          break
    else:
      # Build a list of up to prec_k predicted names that are mutually dissimilar
      # (every pairwise dice_coef below the threshold), walking the ranking top-down.
      predict_names, ngrams = [], dict()
      q, t = 0,0   # q: position in the ranking, t: number of names kept so far
      ngram_correct = ngram(real_name)
      short=False
      while len(predict_names)!=prec_k and q!=len(top_k_item) and short!=True:
        if len(top_k_item)<=prec_k:
          # not more than prec_k names available: keep all of them without filtering
          short=True
          predict_names = top_k_item
          for w, item in enumerate(predict_names):
            ngram_item = ngram(item)
            ngrams.update({w : ngram_item})
        else:
          item = top_k_item[q]   
          ngram_item = ngram(item)
          if q==0:
            predict_names.append(item)
            ngrams.update({t : ngram_item})
            t+=1
          else:
            sim = [dice_coef(ngram_item, value) for value in ngrams.values()]
            if all(z<params['ngram_sim'] for z in sim):
              ngrams.update({t : ngram_item})
              predict_names.append(item)
              t+=1

          q+=1
        
      # a hit is the first kept name that is similar enough to the true name;
      # key 0 is the highest-ranked name, so a hit there also counts for adj. prec@1
      for key, value in ngrams.items():
        ngram_sim = dice_coef(ngram_correct, value)
        if ngram_sim >= params['ngram_sim']:
          output_want = [real_name] + predict_names
          output[batch_start+i] = output_want
          top_k_count_adj+=1
          top_k_list.append([batch_test['input_ids'][i],batch_test['attention_mask'][i],top_k_items[i]])
          if key==0:
            top_1_count_adj+=1
            top_1_list.append([batch_test['input_ids'][i],batch_test['attention_mask'][i],top_k_items[i]])
          break
  
  # percentile rank: share of the vocabulary scored strictly below the true item
  for i in range(0,batch_size):
    pr = tf.math.reduce_sum(tf.where(tf.math.less(outputG_softmax[i,:], outputG_softmax[i,missing_items_test[i].numpy()]), 1.0, 0.0)).numpy()/tokenizer.vocab_size
    pr_ls_test.append(pr)
  precK_test = top_k_count/(total_batches)

  mpr_test = np.array(pr_ls_test).sum()/(total_batches) 

  loop_test.set_postfix(PrecisionAtK =round(precK_test,4), Correct=top_k_count,  data="Test", MPR = mpr_test)

#print metrics
mpr_test = np.array(pr_ls_test).sum()/test_size
prec1 = (top_1_count/test_size) * 100
prec6 = (top_k_count/test_size) * 100
prec1_adj = (top_1_count_adj/test_size) * 100
prec6_adj = ((top_k_count_adj)/test_size) * 100
print(f"   Mean Percentile Rank: {mpr_test}")
print(f"     precision at 1 (%): {prec1}")
print(f"     precision at 6 (%): {prec6}")
print(f"adj. precision at 1 (%): {prec1_adj}")
print(f"adj. precision at 6 (%): {prec6_adj}")