# **Import packages**

In [3]:
import tensorflow as tf

from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.models import BPE, WordLevel
from tokenizers import Tokenizer
from tokenizers.trainers import BpeTrainer, WordLevelTrainer
from transformers import PreTrainedTokenizerFast, BertForPreTraining, AdamWeightDecay
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, GPT2Model

import random
from math import *

import pandas as pd
import numpy as np

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [4]:
from tqdm import tqdm, notebook
tqdm.pandas()

# **Import data**

In [None]:
TRAIN_PATH = 'TRAIN_PATH' 
TOKENIZER_PATH = 'TOKENIZER_PATH'
CHECKPOINTS_PATH = 'CHECKPOINTS_PATH'

In [5]:
df= pd.read_csv(TRAIN_PATH, header=None, nrows=1250000)
df.columns = ["baskets"]

In [7]:
df_org = df

In [8]:
df_org.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250000 entries, 0 to 1249999
Data columns (total 1 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   baskets  1250000 non-null  object
dtypes: object(1)
memory usage: 9.5+ MB


In [9]:
df_org.head(10)

Unnamed: 0,baskets
0,94782 100232 495314 169797 111388 129242 20683...
1,164919 96398 415059 447741 189105
2,195653 215707 169115 4011 111740 132045 187588...
3,494121 168990
4,435755 217957 366767 365954 193449 446406 1231...
5,222368 480029 440742 3632 379359
6,472483 224876 195643 124189
7,502400 478351 451524
8,389243 119017 174017 143099 168925 206745 4367...
9,47354 475092 101170 165583 47366 101170 211023...


In [12]:
baskets = df_org['baskets'].to_list()

In [13]:
txt_file = open("/tmp/baskets.txt", "w")
for element in baskets:
    txt_file.write(element + "\n")
txt_file.close()

In [14]:
baskets_set = [basket.split() for basket in baskets]

In [15]:
basket_items = []

for basket in baskets_set:
    basket_items.extend(basket)

In [16]:
words = len(set(basket_items))
print(f'{words} unique items in baskets...')

26619 unique items in baskets...


## **Duplicates**

In [17]:
# #baskets = list(df_org['prod_series_cln'])
# product_cnt = {}

# for basket in tqdm(baskets):
#     items = basket.split()
#     for item in items:
#         if item not in product_cnt:
#             product_cnt[item] = 1
#         else:
#             product_cnt[item] += 1

In [18]:
# freq_cnt = 0
# del_items = []

# for k,v in product_cnt.items():
#     if v>10:
#         freq_cnt+=1
#     else:
#         del_items.append(k)
# freq_cnt

In [19]:
# baskets_cln = []

# for basket in tqdm(baskets):
#     items = basket.split()
#     #print(items)
#     if items != None:
#         for delete in del_items:
#             if delete in items:
#                 #print(delete)
#                 #print(items)
#                 items.remove(str(delete))
#     baskets_cln.append(' '.join(items))

In [20]:
#len(baskets_cln)

In [21]:
# baskets_fnl = []
# for basket in baskets_cln:
#     if len(basket.split()) > 1:
#         baskets_fnl.append(basket)

In [22]:
#len(baskets_fnl)

In [23]:
# baskets_pre_cln = baskets_fnl[:3747978]
# baskets_train_cln = baskets_fnl[3747978:]

In [24]:
# len(baskets_pre_cln)
#len(baskets_train_cln)

In [25]:
# txt_file = open("data/data_pretraining_cln.txt", "w")
# for basket in baskets_pre_cln:
#     txt_file.write(basket + "\n")
# txt_file.close()

# txt_file = open("data/data_train_cln.txt", "w")
# for basket in baskets_train_cln:
#     txt_file.write(basket + "\n")
# txt_file.close()

# **Universal Tokenizer**

Only run the code commented below, if you wish to make a new tokenizer.

In [27]:
# #NIET RUNNEN
# trainer = WordLevelTrainer(special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],vocab_size=words+5) # Amount of products plus 5 special tokens is 30779

# new_tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
# new_tokenizer.pre_tokenizer = WhitespaceSplit()

# files = ["/tmp/baskets.txt"]
# new_tokenizer.train(files, trainer)

# new_tokenizer.post_processor = TemplateProcessing(
#     single=f"<s>:0 $A:0 </s>:0",
#     pair=f"<s>:0 $A:0 </s>:0 $B:1 </s>:1",
#     special_tokens=[("<s>", new_tokenizer.token_to_id("<s>")), 
#                     ("</s>", new_tokenizer.token_to_id("</s>"))]
# )

# new_tokenizer.save(TOKENIZER_PATH)

In [28]:
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=TOKENIZER_PATH, # You can load from the tokenizer file
    eos_token= "</s>",
    bos_token= "<s>",
    unk_token= "<unk>",
    pad_token= "<pad>",
    mask_token= "<mask>",
    cls_token="<s>",
    sep_token="</s>",
    truncation_side='left'
)

In [29]:
tokenizer.vocab_size

26722

# **Pre-Train TF BERT**

In [30]:
baskets = df_org['baskets'].to_list()
len(baskets)

1250000

In [32]:
basket_a, basket_b = [], []

for basket in baskets:
    basket = basket.split(" ")
    basket_size = len(basket)
    basket_split = ceil(basket_size/2)
    basket_a.append(' '.join(basket[:basket_split]))
    basket_b.append(' '.join(basket[basket_split:]))

In [36]:
num_baskets = len(baskets)
basket_start, basket_end, labels = [], [], []


for i, start in enumerate(basket_a):
    end_idx = random.randint(0,num_baskets-1)

    # First case is correct remaining basket (labels = 0)
    if random.random() > 0.5:
        end = basket_b[i]
        basket_start.append(start)
        basket_end.append(end)
        labels.append(0)

      # Second case is random remaining basket (labels = 1)
    else:
    # If index is the same, choose new random integer
        while end_idx == i:
            end_idx = random.randint(0,num_baskets-1)

        end = basket_b[end_idx]
        basket_start.append(start)
        basket_end.append(end)
        labels.append(1)

In [None]:
train_basket_start, train_basket_end, train_labels = [], [], []
val_basket_start, val_basket_end, val_labels = [], [], []
basket_start_ls,basket_end_ls,labels_ls = [],[],[]

for i, _ in enumerate(tqdm(basket_start)):
   
    basket_start_ls.append(basket_start[i])
    basket_end_ls.append(basket_end[i])
    labels_ls.append(labels[i])

inputs = tokenizer(basket_start,basket_end, return_tensors='tf',
          max_length=60, truncation=True, padding='max_length')


100%|██████████| 1250000/1250000 [00:01<00:00, 1126133.73it/s]


In [None]:
inputs['next_sentence_label'] = tf.squeeze(tf.transpose(tf.convert_to_tensor(labels, dtype=tf.float32)))
inputs['labels'] = tf.fill(inputs['input_ids'].shape, -100)

In [None]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'next_sentence_label', 'labels'])

In [None]:
with tf.device('/cpu:0'):
    rand = tf.random.Generator.from_seed(1)
    rand = rand.uniform(shape=[len(inputs['input_ids']),60])
    
    mask_rand = tf.random.Generator.from_seed(2)
    mask_rand = mask_rand.uniform(shape=[len(inputs['input_ids']),60])
    
    rand_item = tf.random.Generator.from_seed(3)
    rand_item = rand_item.uniform(minval=5, maxval=tokenizer.vocab_size,
                                        shape=[len(inputs['input_ids']),60],
                                        dtype=tf.dtypes.int32
                                       )

In [None]:
# Mask 15% of tokens except for [CLS], [SEP], and [PAD] !!!!NOTE
mask_arr = (rand < 0.15) & (inputs['input_ids'] != 0) & (inputs['input_ids'] != 1) & (inputs['input_ids'] != 2)

In [None]:
# set 20% of the labels equal to the corresponsing input_ids
inputs['labels'] = tf.where(~mask_arr, inputs['labels'], inputs['input_ids'])
# mask the corresponding 20% of the input_ids
inputs['input_ids'] = tf.where(~mask_arr, inputs['input_ids'], 4)

In [None]:
# 10% of the masks is set back to the true token
true_arr = mask_arr & (mask_rand < 0.10)
inputs['input_ids'] = tf.where(~true_arr, inputs['input_ids'], inputs['labels'])

In [None]:
# 10% of the masks is set to a random token
rand_arr = mask_arr & (mask_rand > 0.90)
inputs['input_ids'] = tf.where(~rand_arr, inputs['input_ids'], rand_item)

In [None]:
BATCH_SIZE = 12
BUFFER_SIZE = 1000
LEARNING_RATE=2e-5
DISABLE_LR_SCHEDULE=False
NUM_EPOCHS = 3

In [None]:
class Checkpoint(tf.keras.callbacks.Callback):
    def __init__(self, dir):
        super(Checkpoint, self).__init__()

        self.dir = dir

    def on_epoch_end(self, epoch, logs=None):
        checkpoint_dir = os.path.join(self.dir, f'checkpoint-{epoch}')

        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        self.model.save_pretrained(checkpoint_dir)

    def on_train_end(self, logs=None):
        checkpoint_dir = os.path.join(self.dir, 'final_epoch')

        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        self.model.save_pretrained(checkpoint_dir)

In [None]:
checkpoint_callback = Checkpoint(CHECKPOINT_PATH)

In [None]:
tokenizer.vocab_size

26722

In [None]:
from transformers import BertConfig

config = BertConfig(
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=60,
    hidden_size=768,
    num_attention_heads=6,
    num_hidden_layers=6,
    cls_token_id = 0,
    sep_token_id = 2
)

In [None]:
from transformers import TFBertForPreTraining,TFBertForMaskedLM
#uncomment to load checkpoint
#bert_model = TFBertForPreTraining.from_pretrained(CHECKPOINT_PATH)

In [None]:
train_indices = list(range(round(len(inputs['input_ids'])*0.8)))
val_indices=list(range(round(len(inputs['input_ids'])*0.8),len(inputs['input_ids'])))

In [None]:
from typing import Union, List

def shape_list(tensor: Union[tf.Tensor, np.ndarray]) -> List[int]:
    """
    Deal with dynamic shape in tensorflow cleanly.
    Args:
        tensor (`tf.Tensor` or `np.ndarray`): The tensor we want the shape of.
    Returns:
        `List[int]`: The shape of the tensor as a list.
    """
    if isinstance(tensor, np.ndarray):
        return list(tensor.shape)

    dynamic = tf.shape(tensor)

    if tensor.shape == tf.TensorShape(None):
        return dynamic

    static = tensor.shape.as_list()

    return [dynamic[i] if s is None else s for i, s in enumerate(static)]

In [None]:
@tf.function
def compute_loss(labels, logits):
        loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=tf.keras.losses.Reduction.NONE
        )
        
        # make sure only labels that are not equal to -100
        # are taken into account as loss
        masked_lm_active_loss = tf.not_equal(tf.reshape(tensor=labels["labels"], shape=(-1,)), -100)
        masked_lm_reduced_logits = tf.boolean_mask(
            tensor=tf.reshape(tensor=logits[0], shape=(-1, shape_list(logits[0])[2])),
            mask=masked_lm_active_loss,
        )
        masked_lm_labels = tf.boolean_mask(
            tensor=tf.reshape(tensor=labels["labels"], shape=(-1,)), mask=masked_lm_active_loss
        )
        next_sentence_active_loss = tf.not_equal(tf.reshape(tensor=labels["next_sentence_label"], shape=(-1,)), -100)
        next_sentence_reduced_logits = tf.boolean_mask(
            tensor=tf.reshape(tensor=logits[1], shape=(-1, 2)), mask=next_sentence_active_loss
        )
        next_sentence_label = tf.boolean_mask(
            tensor=tf.reshape(tensor=labels["next_sentence_label"], shape=(-1,)), mask=next_sentence_active_loss
        )
        masked_lm_loss = loss_fn(y_true=masked_lm_labels, y_pred=masked_lm_reduced_logits)
        next_sentence_loss = loss_fn(y_true=next_sentence_label, y_pred=next_sentence_reduced_logits)
        masked_lm_loss = tf.reduce_mean(input_tensor=masked_lm_loss, axis=0)

        return masked_lm_loss + next_sentence_loss

In [None]:
@tf.function
def train_step(input_ids,token_type_ids,attention_mask,next_sentence_label,labels):

    with tf.GradientTape() as tape:
        tape.watch(bert_model.trainable_variables)
        
        output = bert_model(input_ids=input_ids,
                             token_type_ids=token_type_ids,
                             attention_mask=attention_mask,
                             training=True)
        
        d_labels = {"labels": labels}
        d_labels["next_sentence_label"] = next_sentence_label
        logits=(output['prediction_logits'], output['seq_relationship_logits'])
        
        loss = tf.math.reduce_mean(compute_loss(d_labels, logits))
        
    grads = tape.gradient(loss, bert_model.trainable_weights)
    optimizer.apply_gradients(zip(grads, bert_model.trainable_weights))

    
    return loss

@tf.function
def val_step(input_ids,token_type_ids,attention_mask,next_sentence_label,labels):
        
    output = bert_model(input_ids=input_ids,
                         token_type_ids=token_type_ids,
                         attention_mask=attention_mask,
                         training=True)

    d_labels = {"labels": labels}
    d_labels["next_sentence_label"] = next_sentence_label
    logits=(output['prediction_logits'], output['seq_relationship_logits'])

    val_loss = tf.math.reduce_mean(compute_loss(d_labels, logits))
    
    return val_loss

## Custom TF BERT Pre train

In [None]:
losses_fnl, val_losses_fnl = [],[] 
optimizer = AdamWeightDecay(learning_rate=LEARNING_RATE, weight_decay_rate=0.01)
train_inputs = {}
val_inputs = {}

train_inputs['input_ids'] = tf.gather(inputs['input_ids'], indices = train_indices)
train_inputs['token_type_ids'] = tf.gather(inputs['token_type_ids'], indices = train_indices)
train_inputs['attention_mask'] = tf.gather(inputs['attention_mask'], indices = train_indices)
train_inputs['next_sentence_label'] = tf.gather(inputs['next_sentence_label'], indices = train_indices)
train_inputs['labels'] = tf.gather(inputs['labels'], indices = train_indices)

val_inputs['input_ids'] = tf.gather(inputs['input_ids'], indices = val_indices)
val_inputs['token_type_ids'] = tf.gather(inputs['token_type_ids'], indices = val_indices)
val_inputs['attention_mask'] = tf.gather(inputs['attention_mask'], indices = val_indices)
val_inputs['next_sentence_label'] = tf.gather(inputs['next_sentence_label'], indices = val_indices)
val_inputs['labels'] = tf.gather(inputs['labels'], indices = val_indices)

for epoch in range(NUM_EPOCHS):
    
    losses, val_losses = [],[] 
    
    train_loader = tf.data.Dataset.from_tensor_slices((train_inputs['input_ids'],
                                                       train_inputs['token_type_ids'],
                                                       train_inputs['attention_mask'],
                                                       train_inputs['next_sentence_label'],
                                                       train_inputs['labels']
                                                      )).batch(BATCH_SIZE,drop_remainder=True)
    
    val_loader = tf.data.Dataset.from_tensor_slices((val_inputs['input_ids'],
                                                     val_inputs['token_type_ids'],
                                                     val_inputs['attention_mask'],
                                                     val_inputs['next_sentence_label'],
                                                     val_inputs['labels']
                                                    )).batch(BATCH_SIZE,drop_remainder=True)
    
    train_loop = notebook.tqdm(train_loader, position=0, leave=True, colour='green')
    
    for k, batch in enumerate(train_loop):
        
        input_ids = batch[0]
        token_type_ids = batch[1]
        attention_mask = batch[2]
        next_sentence_label = batch[3]
        labels = batch[4]
        
        loss = train_step(input_ids,
                          token_type_ids,
                          attention_mask,
                          next_sentence_label,
                          labels
                         )
        
        losses.append(loss.numpy())
        avg_loss = sum(losses)/k
        
        train_loop.set_postfix(Epoch= epoch, loss=avg_loss, data="train", lr=LEARNING_RATE)
    
    # ZORG DAT JE HET MAPJE 'models' HEBT MET DAARIN 'bert'
    bert_model.save_pretrained(f'models/bert/bert_wordlevel-{epoch}')
    
    val_loop = notebook.tqdm(val_loader, position=0, leave=True, colour='yellow')
    
    for k, batch in enumerate(val_loop):
        
        input_ids = batch[0]
        token_type_ids = batch[1]
        attention_mask = batch[2]
        next_sentence_label = batch[3]
        labels = batch[4]
        
        val_loss = val_step(input_ids,
                            token_type_ids,
                            attention_mask,
                            next_sentence_label,
                            labels
                           )

        val_losses.append(val_loss.numpy())
        avg_val_loss = sum(val_losses)/k
        
        val_loop.set_postfix(Epoch= epoch, val_loss=avg_val_loss, data="validation", lr=LEARNING_RATE)
        
    losses_fnl.append(avg_loss)
    val_losses_fnl.append(avg_val_loss)
    

  0%|          | 0/833 [00:00<?, ?it/s]

2022-03-04 21:28:25.515520: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-03-04 21:28:25.516295: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
  avg_loss = sum(losses)/k


  0%|          | 0/208 [00:00<?, ?it/s]

2022-03-04 21:31:22.985117: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
  avg_val_loss = sum(val_losses)/k


  0%|          | 0/833 [00:00<?, ?it/s]

  0%|          | 0/208 [00:00<?, ?it/s]

  0%|          | 0/833 [00:00<?, ?it/s]

  0%|          | 0/208 [00:00<?, ?it/s]