# **Import packages**

In [None]:
from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.models import BPE, WordLevel
from tokenizers import Tokenizer
from tokenizers.trainers import BpeTrainer, WordLevelTrainer
from transformers import PreTrainedTokenizerFast, BertForPreTraining
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, GPT2Model

import random
from math import *

import pandas as pd
import numpy as np


import os

In [None]:
from tqdm import tqdm
tqdm.pandas()

# **Import data**

In [None]:
# Filesystem locations used by the notebook. Each value is a literal
# placeholder equal to its own name.
TRAIN_PATH = 'TRAIN_PATH'  # CSV passed to pd.read_csv
TOKENIZER_PATH = 'TOKENIZER_PATH'  # tokenizer file passed to PreTrainedTokenizerFast
CHECKPOINTS_PATH = 'CHECKPOINTS_PATH'  # NOTE(review): not referenced by any active code in view

In [None]:
# Read at most 1,250,000 rows of the headerless CSV and name its single
# column "baskets".
df= pd.read_csv(TRAIN_PATH, header=None, nrows=1250000)  # alternative cap: nrows=500000
df.columns = ["baskets"]

In [None]:
# Second name bound to the same DataFrame object (plain assignment; no copy is made).
df_org = df

In [None]:
# Print the DataFrame summary produced by pandas' DataFrame.info().
df_org.info()

In [None]:
# Pull the "baskets" column out as a plain Python list (one entry per row).
baskets = df['baskets'].to_list()

In [None]:
# Write one basket per line to /tmp/baskets.txt (the file listed in the
# commented-out tokenizer-training code). The context manager closes the
# handle even if a write raises, which the manual open()/close() pair did not.
with open("/tmp/baskets.txt", "w") as txt_file:
    txt_file.writelines(element + "\n" for element in baskets)

In [None]:
# Split each basket string on whitespace into its list of tokens.
# Despite the name, this is a list of lists, not a set.
baskets_set = [basket.split() for basket in baskets]

In [None]:
# Flatten the per-basket token lists into one list, keeping order and duplicates.
basket_items = [item for items in baskets_set for item in items]

In [None]:
# Number of distinct tokens across all baskets; the commented-out trainer
# below uses `words + 5` as its vocab_size.
words = len(set(basket_items))
print(f'{words} unique items in baskets...')

# **Universal Tokenizer**

Run the commented-out code below only if you want to train a new tokenizer.

In [None]:
# trainer = WordLevelTrainer(special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],vocab_size=words+5) # Amount of products plus 5 special tokens is 30779

# new_tokenizer = Tokenizer(WordLevel(unk_token="<unk>"))
# new_tokenizer.pre_tokenizer = WhitespaceSplit()

# files = ["/tmp/baskets.txt"]
# new_tokenizer.train(files, trainer)

# new_tokenizer.post_processor = TemplateProcessing(
#     single=f"<s>:0 $A:0 </s>:0",
#     pair=f"<s>:0 $A:0 </s>:0 $B:1 </s>:1",
#     special_tokens=[("<s>", new_tokenizer.token_to_id("<s>")), 
#                     ("</s>", new_tokenizer.token_to_id("</s>"))]
# )

# new_tokenizer.save(TOKENIZER_PATH)

## *TF GPT-2* 

In [None]:
import tensorflow as tf
from transformers import GPT2Config, TFGPT2LMHeadModel, GPT2Tokenizer
from tensorflow import keras
from tensorflow.keras import backend as K

from tokenizers.processors import TemplateProcessing
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.trainers import BpeTrainer
from transformers import PreTrainedTokenizerFast, BertForPreTraining, AdamWeightDecay
from tokenizers.decoders import ByteLevel as ByteLevelDecoder

from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer, GPT2Model

import random
import math

import pandas as pd
import numpy as np

import os

import logging
logging.basicConfig(level="ERROR")

In [None]:
# Ask TensorFlow which physical devices it can see.
tf.config.list_physical_devices()

In [None]:
# Load the tokenizer from the saved tokenizer file and declare which of its
# tokens play the special roles.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=TOKENIZER_PATH,
    eos_token="</s>",
    bos_token="<s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>",
    cls_token="<s>",
    sep_token="</s>",
)

# Model configuration: vocabulary size and BOS/EOS ids come from the
# tokenizer; 6 layers, 6 attention heads, key/value caching disabled.
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    n_layer=6,
    n_head=6,
    use_cache=False,
)

# Instantiate a freshly initialised model from the configuration. The compile
# and fit cells below require `gpt_model` to exist; without this assignment
# they fail with NameError.
gpt_model = TFGPT2LMHeadModel(config)

# To resume from saved weights instead, load a directory written by
# `save_pretrained`, e.g.:
# gpt_model = TFGPT2LMHeadModel.from_pretrained(CHECKPOINTS_PATH)

In [None]:
# Inspect the tokenizer's vocabulary size (the value given to GPT2Config as vocab_size).
tokenizer.vocab_size

In [None]:
# Split point between training and validation baskets. Index i belongs to
# the training part when i < 0.8 * len(baskets), which is exactly the first
# ceil(0.8 * len(baskets)) entries; everything after that is validation.
split_index = math.ceil(0.8 * len(baskets))

# Every basket is followed by " <eos> " so basket boundaries survive in the
# single flat string that gets tokenized.
basket_suffix = ' ' + tokenizer.eos_token + ' '

# str.join assembles each corpus in one pass instead of growing a string with
# repeated `+=` inside a loop over every basket.
train_single_sentence = ''.join(
    basket + basket_suffix for basket in baskets[:split_index])
val_single_sentence = ''.join(
    basket + basket_suffix for basket in baskets[split_index:])

# Encode each corpus as one sequence (no automatic special tokens) and drop
# the size-1 dimensions of the returned tensor.
train_string_tokenized = tf.squeeze(tokenizer.encode(train_single_sentence, add_special_tokens=False, return_tensors='tf'))
val_string_tokenized = tf.squeeze(tokenizer.encode(val_single_sentence, add_special_tokens=False, return_tensors='tf'))

In [None]:
# Hyper-parameters shared by the data pipeline and the training cells.
BLOCK_SIZE = 60  # Block size indicates the sequence length used by the model
BATCH_SIZE = 12
BUFFER_SIZE = 1000
LEARNING_RATE = 2e-5
DISABLE_LR_SCHEDULE = True
NUM_EPOCHS = 20

# Cut each token stream into consecutive, non-overlapping windows of
# BLOCK_SIZE tokens; a trailing remainder shorter than BLOCK_SIZE is dropped.
train_examples = [
    train_string_tokenized[start:start + BLOCK_SIZE]
    for start in range(0, len(train_string_tokenized) - BLOCK_SIZE + 1, BLOCK_SIZE)
]
val_examples = [
    val_string_tokenized[start:start + BLOCK_SIZE]
    for start in range(0, len(val_string_tokenized) - BLOCK_SIZE + 1, BLOCK_SIZE)
]

In [None]:
# Next-token setup: the input is a window without its last token and the
# label is the same window without its first token.
train_inputs = [example[:-1] for example in train_examples]
train_labels = [example[1:] for example in train_examples]

val_inputs = [example[:-1] for example in val_examples]
val_labels = [example[1:] for example in val_examples]

# Wrap the (input, label) pairs in tf.data pipelines with fixed-size batches;
# an incomplete final batch is discarded.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_inputs, train_labels))
    .batch(BATCH_SIZE, drop_remainder=True)
)

val_dataset = (
    tf.data.Dataset
    .from_tensor_slices((val_inputs, val_labels))
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [None]:
class Checkpoint(tf.keras.callbacks.Callback):
    """Keras callback that saves the model with ``save_pretrained``.

    After every epoch the model is written to ``<dir>/checkpoint-<epoch>``,
    and once training ends it is written to ``<dir>/final_epoch``.

    Args:
        dir: Base directory under which the checkpoint folders are created.
            (The name shadows the ``dir`` builtin; it is kept so existing
            positional and keyword callers continue to work.)
    """

    def __init__(self, dir):
        super().__init__()

        self.dir = dir

    def _save_to(self, subdir):
        """Create ``<dir>/<subdir>`` if needed and save the model into it."""
        checkpoint_dir = os.path.join(self.dir, subdir)

        # exist_ok=True replaces the separate exists()/makedirs() pair, so
        # there is no window between the check and the creation.
        os.makedirs(checkpoint_dir, exist_ok=True)

        self.model.save_pretrained(checkpoint_dir)

    def on_epoch_end(self, epoch, logs=None):
        """Save a checkpoint named after the epoch index Keras passes in."""
        self._save_to(f'checkpoint-{epoch}')

    def on_train_end(self, logs=None):
        """Save the final model once training has finished."""
        self._save_to('final_epoch')

In [None]:
class PPL(object):
    """Callable perplexity metric.

    Calling an instance returns ``e`` raised to the mean sparse categorical
    cross-entropy between ``y_true`` and ``y_pred``, where ``y_pred`` is
    treated as logits.
    """

    def __init__(self, name):
        # Stored as __name__; presumably this is what Keras uses to label the
        # metric -- confirm against the Keras version in use.
        self.__name__ = name

    def __call__(self, y_true, y_pred):
        token_losses = K.sparse_categorical_crossentropy(
            y_true, y_pred, from_logits=True)
        return math.e ** K.mean(token_losses)

In [None]:
class WarmUp(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule that puts a polynomial warm-up in front of another schedule.

    While ``step < warmup_steps`` the rate is
    ``initial_learning_rate * (step / warmup_steps) ** power``; from then on
    the value of ``decay_schedule_fn(step)`` is returned.
    """

    def __init__(self, initial_learning_rate, decay_schedule_fn, warmup_steps, power=1.0, name=None):
        super().__init__()
        self.name = name
        self.power = power
        self.warmup_steps = warmup_steps
        self.decay_schedule_fn = decay_schedule_fn
        self.initial_learning_rate = initial_learning_rate

    def __call__(self, step):
        with tf.name_scope(self.name or "WarmUp") as scope:
            # Do the ratio in float32 so step / warmup_steps is a true fraction.
            step_f = tf.cast(step, tf.float32)
            warmup_f = tf.cast(self.warmup_steps, tf.float32)
            fraction_done = step_f / warmup_f
            ramped_rate = self.initial_learning_rate * tf.math.pow(
                fraction_done, self.power)

            def _warmup_branch():
                return ramped_rate

            def _decay_branch():
                # The wrapped schedule receives the original, un-cast step.
                return self.decay_schedule_fn(step)

            return tf.cond(
                step_f < warmup_f,
                _warmup_branch,
                _decay_branch,
                name=scope,
            )

    def get_config(self):
        """Return the constructor arguments as a dictionary."""
        return dict(
            initial_learning_rate=self.initial_learning_rate,
            decay_schedule_fn=self.decay_schedule_fn,
            warmup_steps=self.warmup_steps,
            power=self.power,
            name=self.name,
        )

In [None]:
class WarmUpLinearDecayScheduler(keras.callbacks.Callback):
    """Per-batch learning-rate callback: warm-up followed by polynomial decay.

    Before each batch the optimizer's learning rate is set to the value of a
    ``WarmUp`` schedule wrapping a ``PolynomialDecay`` (default power, i.e.
    linear) from ``learning_rate_base`` down to 0.0 over ``total_steps``.
    The rate read back after each batch is appended to ``learning_rates``.
    """

    def __init__(self,
                 learning_rate_base,
                 total_steps,
                 global_step_init=0,
                 warmup_learning_rate=0.0,
                 warmup_steps=0,
                 hold_base_rate_steps=0,
                 verbose=0):
        """Build the warm-up + decay schedule and initialise the step counter.

        Arguments:
            learning_rate_base {float} -- base learning rate.
            total_steps {int} -- total number of training steps; used as the
                                 decay length of the polynomial schedule.
        Keyword Arguments:
            global_step_init {int} -- initial global step, e.g. from previous checkpoint.
            warmup_learning_rate {float} -- stored on the instance but not used
                                            by the schedule built here. (default: {0.0})
            warmup_steps {int} -- number of warmup steps. (default: {0})
            hold_base_rate_steps {int} -- stored on the instance but not used
                                          by the schedule built here. (default: {0})
            verbose {int} -- 0: quiet, 1: update messages. (default: {0})
        """

        super().__init__()
        self.learning_rate_base = learning_rate_base
        self.total_steps = total_steps
        self.global_step = global_step_init
        self.warmup_learning_rate = warmup_learning_rate
        self.warmup_steps = warmup_steps
        self.hold_base_rate_steps = hold_base_rate_steps
        self.verbose = verbose
        self.learning_rates = []

        # Decay from the base rate to 0.0 across total_steps.
        learning_rate_fn = tf.keras.optimizers.schedules.PolynomialDecay(
            initial_learning_rate=learning_rate_base, decay_steps=total_steps, end_learning_rate=0.0
        )

        # Warm-up wrapper: ramps up for warmup_steps, then defers to the decay.
        self.sched = WarmUp(learning_rate_base,
                            learning_rate_fn, warmup_steps=warmup_steps)

    def on_batch_end(self, batch, logs=None):
        """Advance the global step and record the optimizer's current rate."""
        self.global_step = self.global_step + 1
        lr = K.get_value(self.model.optimizer.lr)
        self.learning_rates.append(lr)

    def on_batch_begin(self, batch, logs=None):
        """Set the optimizer's learning rate for the upcoming batch."""
        lr = self.sched(self.global_step)

        K.set_value(self.model.optimizer.lr, lr)
        if self.verbose > 0:
            print('\nBatch %05d: setting learning '
                  'rate to %s.' % (self.global_step + 1, lr))

In [None]:
# Total optimizer steps = batches per epoch * number of epochs. The batch
# count is read from the dataset's cardinality rather than by materialising
# every batch with list(...) just to count them.
n_train_steps = int(train_dataset.cardinality().numpy()) * NUM_EPOCHS

In [None]:
# Perplexity metric handed to compile() alongside the accuracy metrics.
val_ppl = PPL('ppl')

# Stop once val_loss has failed to improve for 3 consecutive epochs.
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

# Warm-up over the first 10% of all training steps, then decay.
lr_callback = WarmUpLinearDecayScheduler(
    learning_rate_base=LEARNING_RATE,
    total_steps=n_train_steps,
    warmup_steps=int(0.1 * n_train_steps),
)

# Per-epoch and final model snapshots go under this directory.
checkpoint_callback = Checkpoint(
    '/content/drive/MyDrive/Seminar QM/Models/TF/AH GPT 625k Pretraining'
)

In [None]:
# Optimizer: AdamWeightDecay (imported from transformers) at the base
# learning rate with a weight-decay rate of 0.01.
optimizer = AdamWeightDecay(learning_rate=LEARNING_RATE, weight_decay_rate=0.01)
# Loss: sparse categorical cross-entropy computed on raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Metrics to observe: top-1 accuracy and top-6 accuracy (perplexity is added below).
val_acc = tf.keras.metrics.SparseCategoricalAccuracy('acc')
val_acc_k = tf.keras.metrics.SparseTopKCategoricalAccuracy(name='acc_k',k=6)
# Compile the model. The loss list is the loss followed by one None per
# transformer layer -- presumably so only the first model output is
# trained on; confirm against the transformers version in use.
gpt_model.compile(optimizer=optimizer,
             # run_eagerly=True is left disabled
              loss=[loss, *[None] * gpt_model.config.n_layer], 
              metrics=[val_acc,val_acc_k, val_ppl])
# NOTE(review): `trainable` is set after compile(); confirm this ordering is intended.
gpt_model.trainable = True

In [None]:
# Checkpointing and early stopping are always active; the warm-up / decay
# learning-rate callback is added only when the schedule is enabled. Building
# the list first leaves a single fit() call instead of one per branch.
fit_callbacks = [checkpoint_callback, es_callback]

if not DISABLE_LR_SCHEDULE:
    lr_callback = WarmUpLinearDecayScheduler(learning_rate_base=LEARNING_RATE,
                                             total_steps=n_train_steps,
                                             warmup_steps=int(0.1 * n_train_steps))
    # Keep the callback order: checkpoint, LR schedule, early stopping.
    fit_callbacks.insert(1, lr_callback)

history = gpt_model.fit(train_dataset,
                        validation_data=val_dataset,
                        epochs=NUM_EPOCHS,
                        callbacks=fit_callbacks)