_This code was written by Sergi Abashidze, Paula García, Reem Hageali and Sidhant Singhal._

Inspired by: https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb

__Import necessary libraries__

In [None]:
# install a pinned version of sentencepiece (presumably required by the T5Tokenizer used below — TODO confirm)
!pip install sentencepiece==0.1.94

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

import time
from tqdm import tqdm

from collections import Counter
import torch
from torchtext.vocab import Vocab

import transformers
from transformers import T5Tokenizer,T5ForConditionalGeneration,Adafactor

from rouge import Rouge

In [None]:
# fix the random seeds of PyTorch and NumPy so that repeated runs draw the same numbers
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

__Load training data__

In [None]:
# read the training data; the cells below use its 'name' and 'description' columns
traindata = pd.read_csv('train.csv')

In [None]:
# show the first rows as a quick sanity check
traindata.head()

__Clean and treat training data__

In [None]:
# Normalise both text columns in the same way:
#   1. convert the text to lower case
#   2. remove leading and trailing whitespace
#   3. remove every "." from the text
# regex=False makes the "." a literal dot on every pandas version; as a regular
# expression "." would match any character.
for column in ("name", "description"):
    traindata[column] = (
        traindata[column]
        .str.lower()
        .str.strip()
        .str.replace(".", "", regex=False)
    )

In [None]:
# length, in space-separated tokens, of the longest description
max_token_desc = max(len(text.split(" ")) for text in traindata.description)

In [None]:
# report the longest description length (counted in space-separated tokens)
print('The maximum number of tokens in the descriptions are', max_token_desc)

__Create Vocabulary from the tokens in training data__

In [None]:
# count how often every space-separated token occurs, first over the product
# names and then over the product descriptions
fullvocab = Counter()
for text_column in (traindata.name, traindata.description):
    for line in text_column:
        fullvocab.update(line.split(" "))

In [None]:
# replace the token counts by a torchtext Vocab built from them
# (min_freq = 336: presumably tokens seen fewer than 336 times are left out — TODO confirm)
fullvocab = Vocab(fullvocab, min_freq = 336)

__Create the tokenizer object__

In [None]:
# load pre-trained model tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-base")
# remember the tokenizer size before any training-data tokens are added
original_tokens = len(tokenizer)

In [None]:
# candidate tokens from the training-data vocabulary; the first two entries of
# the vocabulary are skipped (presumably its special tokens — TODO confirm)
words_to_add = list(fullvocab.stoi.keys())[2:]

# fetch the tokenizer vocabulary once instead of once per candidate word:
# calling get_vocab() inside the loop repeats the same work for every word
known_tokens = tokenizer.get_vocab()

# keep only the words the tokenizer does not know yet, and add them
addwords = [word for word in words_to_add if word not in known_tokens]
tokenizer.add_tokens(addwords)

# tokenizer size after the extension
extended_tokens = len(tokenizer)

In [None]:
# compare the tokenizer size before and after adding the training-data tokens
print(f'There were {original_tokens} tokens originally. By adding the training data tokens, we now have {extended_tokens} tokens.')

__Create our own custom dataset__

In [None]:
# tokenize the descriptions (model input): every sequence is truncated or
# padded to exactly 512 ids and returned as PyTorch tensors
encodedesc = tokenizer.batch_encode_plus(
    traindata.description,
    max_length=512,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

# keep the ids and the attention masks, then drop the encoding object,
# which is no longer needed
input_ids, input_masks = encodedesc["input_ids"], encodedesc["attention_mask"]
del encodedesc

In [None]:
# sanity check: ids and masks of the inputs should have the same shape
print('The shape of the input ids is',input_ids.shape)
print('The shape of the input masks is',input_masks.shape)

In [None]:
# tokenize the product names (model target): every sequence is truncated or
# padded to exactly 45 ids and returned as PyTorch tensors
encodednames = tokenizer.batch_encode_plus(
    traindata.name,
    max_length=45,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

# keep the ids and the attention masks, then drop the encoding object,
# which is no longer needed
targ_ids, targ_masks = encodednames["input_ids"], encodednames["attention_mask"]
del encodednames

In [None]:
# sanity check: ids and masks of the targets should have the same shape
print('The shape of the target ids is',targ_ids.shape)
print('The shape of the target masks is',targ_masks.shape)

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenized descriptions (source) with tokenized names (target).

    Each of the four constructor arguments is indexed with the example index;
    the dataset length is the length of `descid`.
    """

    def __init__(self, descid, descmask, targid, targmask):
        self.descid, self.descmask = descid, descmask
        self.targid, self.targmask = targid, targmask

    def __len__(self):
        return len(self.descid)

    def __getitem__(self, index):
        """Return the example at `index` as a dict of tensors cast to torch.long."""
        fields = (
            ('source_ids', self.descid),
            ('source_mask', self.descmask),
            ('target_ids', self.targid),
            ('target_mask', self.targmask),
        )
        return {key: values[index].to(dtype=torch.long) for key, values in fields}

__Create the model__

In [None]:
# start from the pre-trained "t5-base" checkpoint and put it in training mode
model = T5ForConditionalGeneration.from_pretrained("t5-base").train()
# resize the token embeddings to the size of the extended tokenizer
model.resize_token_embeddings(len(tokenizer))

__Train-Test Split__

In [None]:
# 80-20 split: `seperator` is the number of training examples and
# `allindexes` a random ordering of all row positions
n_examples = len(traindata)
seperator = int(n_examples*0.8)
allindexes = np.random.permutation(n_examples)

In [None]:
# training set: the rows at the first `seperator` shuffled positions
traindexes = allindexes[:seperator]
training_set = CustomDataset(
    *(tensor[traindexes, :] for tensor in (input_ids, input_masks, targ_ids, targ_masks))
)

In [None]:
# validation set: the rows at the remaining shuffled positions
valdexes = allindexes[seperator:]
val_set = CustomDataset(
    *(tensor[valdexes, :] for tensor in (input_ids, input_masks, targ_ids, targ_masks))
)

__Data Loading__

In [None]:
# training batches: 4 examples each, shuffling enabled
train_params = dict(batch_size=4, shuffle=True, num_workers=0)
training_loader = torch.utils.data.DataLoader(dataset=training_set, **train_params)

In [None]:
# validation batches: 4 examples each, shuffling disabled
val_params = dict(batch_size=4, shuffle=False, num_workers=0)
val_loader = torch.utils.data.DataLoader(dataset=val_set, **val_params)

__Train the model on our data__

In [None]:
def train(epoch, tokenizer, model, loader, optimizer):
    """Run one training pass of `model` over all batches of `loader`.

    Args:
        epoch: index of the current epoch; accepted for the caller's
            convenience, it is not used inside the function.
        tokenizer: tokenizer whose `pad_token_id` identifies the padding
            positions of the targets.
        model: model called with `input_ids`, `attention_mask` and `labels`;
            the first element of its output must be the loss.
        loader: iterable of batches, each a dict with the keys 'source_ids',
            'source_mask' and 'target_ids'.
        optimizer: optimizer that updates the parameters of `model`.
    """
    model.train()
    for step, data in enumerate(loader):
        ids = data['source_ids']
        mask = data['source_mask']

        # Use the complete target sequence as labels. The targets carry no
        # start token, so dropping the first position by hand would leave the
        # first token of every name untrained and would feed the decoder
        # inputs that never begin with the start token used at generation
        # time. Given only `labels`, the model derives the decoder inputs
        # itself by shifting the labels to the right.
        # The clone keeps the batch tensor itself unmodified.
        lm_labels = data['target_ids'].clone()
        # padding positions must not contribute to the loss
        lm_labels[lm_labels == tokenizer.pad_token_id] = -100

        outputs = model(input_ids=ids, attention_mask=mask, labels=lm_labels)
        loss = outputs[0]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # progress report every 100 batches (the very first batch is skipped)
        if step % 100 == 0 and step > 0:
            print(time.time())
            print(loss)

In [None]:
# optimizer: stochastic gradient descent with momentum over all model parameters
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

In [None]:
# two passes over the training data; the weights are written to disk after each pass
for epoch in tqdm(range(2)):
    train(epoch, tokenizer, model, training_loader, optimizer)
    checkpoint_path = f"T5_Epochs{epoch}.pt"
    torch.save(model.state_dict(), checkpoint_path)

In [None]:
# load the state dictionary we just created
# NOTE(review): this restores "T5_Epochs0.pt", the checkpoint written after the
# first of the two training epochs, and so replaces the weights obtained after
# the second epoch — confirm that this is intended
model.load_state_dict(torch.load("T5_Epochs0.pt"))

__Validate the model__

In [None]:
def validate(epoch, tokenizer, model, loader):
    """Generate a text for every example of `loader` and decode the targets.

    The model is switched to evaluation mode for the duration of the call, so
    the result does not depend on the caller having done so. The mode the
    model was in before the call is restored afterwards.

    Args:
        epoch: accepted for the caller's convenience; not used inside the
            function.
        tokenizer: tokenizer used to decode generated and target ids.
        model: model providing `generate`.
        loader: iterable of batches, each a dict with the keys 'source_ids',
            'source_mask' and 'target_ids'.

    Returns:
        A tuple `(predictions, actuals)` of two lists of strings, in the order
        in which the examples were produced by `loader`.
    """
    def decode(token_ids):
        return tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

    predictions = []
    actuals = []
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for step, data in enumerate(loader):
                y = data['target_ids']
                ids = data['source_ids']
                mask = data['source_mask']

                generated_ids = model.generate(
                    input_ids=ids, attention_mask=mask, max_length=50,
                    num_beams=2,
                    repetition_penalty=2.5, length_penalty=3, early_stopping=False)

                # progress report every 100 batches (the very first batch is skipped)
                if step % 100 == 0 and step > 0:
                    print(f'Completed {step}')

                predictions.extend(decode(g) for g in generated_ids)
                actuals.extend(decode(t) for t in y)
    finally:
        # hand the model back in the mode it was received in
        model.train(was_training)
    return predictions, actuals

In [None]:
# set the model in evaluation mode before generating predictions
model.eval()

In [None]:
# generate a name for every validation example (the first argument is the
# `epoch` parameter, which validate does not use)
predictions, actuals = validate(1, tokenizer, model, val_loader)
# put generated and reference texts side by side for inspection
final_df = pd.DataFrame({'Generated Text':predictions,'Actual Text':actuals})

__Compute the ROUGE score of our model__

In [None]:
# scorer object used for all ROUGE computations below
rouge = Rouge()

In [None]:
# identify non-problematic records: the positions whose prediction/reference
# pair can be scored on its own without an error
nonproblems = []
for i, (prediction, actual) in enumerate(zip(predictions, actuals)):
    try:
        rouge.get_scores(prediction, actual, avg = True, ignore_empty = True)
    except Exception:
        # deliberate best-effort filter: a pair that cannot be scored is left
        # out. Unlike a bare `except`, this does not swallow
        # KeyboardInterrupt or SystemExit, so the loop can still be interrupted.
        continue
    nonproblems.append(i)

In [None]:
# keep the predictions and references of the non-problematic records only
fullpreds = [predictions[i] for i in nonproblems]
fullactuals = [actuals[i] for i in nonproblems]

In [None]:
# get the rouge score for those non-problematic records
# (avg = True requests scores averaged over all records)
rouge.get_scores(fullpreds, fullactuals, avg = True)