## Fine-tuning T5 Text To Text Transformer for Translation

In [None]:
# Install sentencepiece, then upgrade transformers (-U) with a forced reinstall (--force).
# NOTE(review): sentencepiece is presumably required by T5Tokenizer — confirm.
!pip install sentencepiece
!pip install transformers -U --force

# Imports and Device Setup

In [None]:
import json
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from torch import cuda
from torch.utils.data import (DataLoader, Dataset, RandomSampler,
                              SequentialSampler)
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Run on the GPU when CUDA reports one as available, otherwise on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# En-De Translation Data

In [None]:
!mkdir data
!curl https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.en --output data/train.en
!curl https://nlp.stanford.edu/projects/nmt/data/wmt14.en-de/train.de --output data/train.de

In [None]:
def restructure_data(
    train_en: Path,
    train_de: Path,
    train_data_output: Path,
    validation_data_output: Path,
    test_size=0.33,
    random_state=42,
    max_samples=5000,
) -> tuple:
    """
    Converts the parallel text data into lists of dictionaries which map a
    whitespace-tokenized English sentence ("src") to the corresponding
    whitespace-tokenized German sentence ("trg"), splits them into training
    and validation sets and writes both splits to disk as JSON.

    args :
      train_en : [ Path ] : Training data path for English Sentences
      train_de : [ Path ] : Training data path for German Sentences
      train_data_output : [ Path ] : Training data output path
      validation_data_output : [ Path ] : Validation data output path
      test_size : [ float ] : size of test split (OPTIONAL) DEFAULT=0.33
      random_state : [ int ] : random state of train-test-split (OPTIONAL) DEFAULT=42
      max_samples : [ int | None ] : only the first `max_samples` sentence
        pairs are used; None uses every pair (OPTIONAL) DEFAULT=5000

    Returns :
        [list]: Training Data (list of {"src": [...], "trg": [...]} dicts)
        [list]: Validation data (same structure)
    """
    from itertools import islice

    # Stream both files in lockstep and stop after `max_samples` pairs, so the
    # full corpora are never loaded into memory just to keep a small prefix.
    # The context manager guarantees both handles are closed.
    with train_en.open(encoding="utf-8") as en_file, \
            train_de.open(encoding="utf-8") as de_file:
        pairs = zip(en_file, de_file)
        if max_samples is not None:
            pairs = islice(pairs, max_samples)
        data = [{"src": en.split(), "trg": de.split()} for en, de in pairs]

    train_data, dev_data = train_test_split(
        data, test_size=test_size, random_state=random_state
    )

    # Persist both splits; `with` closes (and flushes) the output files.
    with train_data_output.open("w", encoding="utf-8") as train_file:
        json.dump(train_data, train_file, indent=2)
    with validation_data_output.open("w", encoding="utf-8") as dev_file:
        json.dump(dev_data, dev_file, indent=2)
    return train_data, dev_data

In [None]:
# Locations of the raw corpus files and of the JSON files for the two splits,
# all inside the local "data" directory.
data_path = Path("data")
train_en = data_path.joinpath("train.en")
train_de = data_path.joinpath("train.de")
train_data_output = data_path.joinpath("train.json")
validation_data_output = data_path.joinpath("dev.json")

In [None]:
# Build the train/validation splits from the raw corpus and write them as JSON.
train_data, dev_data = restructure_data(
    train_en=train_en,
    train_de=train_de,
    train_data_output=train_data_output,
    validation_data_output=validation_data_output,
)

# Dataset Class for En-De dataset

In [None]:
class ENDEDataset(Dataset):
    """
    Torch dataset that tokenizes one sentence pair per item.

    args :
      dataframe : pandas DataFrame with a `src` and a `trg` column; each cell
        is either a sentence string or a list of tokens
      tokenizer : tokenizer used to encode both sentences
      source_len : [ int ] : length the encoder input is padded/truncated to
      summ_len : [ int ] : length the decoder target is padded/truncated to

    NOTE(review): the `trg` column is fed to the encoder and the `src` column
    is used as the decoder target. With the data produced by
    `restructure_data` (src = English, trg = German) this trains German ->
    English — confirm that this direction is intended.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.text = self.data.src
        self.ctext = self.data.trg

    def __len__(self):
        return len(self.text)

    @staticmethod
    def _as_text(value):
        """Return `value` as a single-space separated sentence string.

        Token lists are joined with spaces. Calling `str()` on a list would
        produce its Python repr (brackets, quotes and commas), which would
        then be tokenized as if it were part of the sentence.
        """
        if isinstance(value, (list, tuple)):
            value = ' '.join(str(token) for token in value)
        return ' '.join(str(value).split())

    def _encode(self, sentence, max_length):
        """Tokenize one sentence, padded and truncated to exactly `max_length`."""
        # `padding`/`truncation` replace the deprecated `pad_to_max_length`.
        return self.tokenizer(
            [sentence],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        # Positional access, so a DataFrame with a non-default index still works.
        ctext = self._as_text(self.ctext.iloc[index])
        text = self._as_text(self.text.iloc[index])

        source = self._encode(ctext, self.source_len)
        target = self._encode(text, self.summ_len)

        # Drop only the leading batch dimension added by encoding a 1-item list.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

# Training and Validation Functions

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    """
    Run one training pass over `loader`.

    args :
      epoch : epoch number, used only in the progress message
      tokenizer : tokenizer whose `pad_token_id` marks padding in the targets
      model : sequence-to-sequence model called with `input_ids`,
        `attention_mask` and `labels`; the first element of its output is
        used as the loss
      device : device the batch tensors are moved to
      loader : iterable of batches, each a dict with the keys `source_ids`,
        `source_mask` and `target_ids`
      optimizer : optimizer stepped once per batch

    Returns :
        None
    """
    model.train()
    for step, batch in enumerate(loader):
        ids = batch['source_ids'].to(device, dtype=torch.long)
        mask = batch['source_mask'].to(device, dtype=torch.long)

        # Clone before masking: `.to()` may return the batch tensor itself.
        labels = batch['target_ids'].to(device, dtype=torch.long).clone()
        # -100 is ignored by the loss, so padding does not contribute.
        labels[labels == tokenizer.pad_token_id] = -100

        # Pass the full target as `labels` and let the model derive the
        # decoder inputs by shifting them right behind its start token.
        # Manually feeding y[:, :-1] / y[:, 1:] never trained the first target
        # token and never showed the decoder the start token that
        # `generate()` begins with.
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]

        if step % 10 == 0:
            print({"Training Loss": loss.item()})

        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
def validate(epoch, tokenizer, model, device, loader):
    """
    Generate output for every batch in `loader` and decode it to text.

    args :
      epoch : accepted for symmetry with `train`; not used
      tokenizer : tokenizer used to decode generated and reference token ids
      model : model providing `eval()` and `generate()`
      device : device the batch tensors are moved to
      loader : iterable of batches, each a dict with the keys `source_ids`,
        `source_mask` and `target_ids`

    Returns :
        [list]: decoded generated texts
        [list]: decoded reference texts, in the same order
    """

    def decode_all(sequences):
        # Decode each sequence of token ids, dropping special tokens.
        return [
            tokenizer.decode(
                sequence,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
            for sequence in sequences
        ]

    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            reference_ids = batch['target_ids'].to(device, dtype=torch.long)
            input_ids = batch['source_ids'].to(device, dtype=torch.long)
            attention_mask = batch['source_mask'].to(device, dtype=torch.long)

            generation_kwargs = dict(
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )
            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                **generation_kwargs,
            )

            batch_predictions = decode_all(generated_ids)
            batch_references = decode_all(reference_ids)

            # Progress message every 100 batches.
            if batch_idx % 100 == 0:
                print(f'Completed {batch_idx}')

            predictions += batch_predictions
            actuals += batch_references
    return predictions, actuals

# Training Parameters

In [None]:
# NOTE(review): train_size is not referenced anywhere in the visible code.
train_size = 0.8

# Wrap both splits in DataFrames (columns come from the dict keys) and report their shapes.
train_dataset = pd.DataFrame(train_data)
val_dataset = pd.DataFrame(dev_data)
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {val_dataset.shape}")

In [None]:
MAX_LEN = 512            # padded/truncated token length of the encoder input
SUMMARY_LEN = 150        # padded/truncated token length of the decoder target
SEED = 42                # seed for PyTorch's random number generator
LEARNING_RATE = 1e-4     # optimizer learning rate
VAL_EPOCHS = 1           # number of validation passes
TRAIN_EPOCHS = 2         # number of training passes
VALID_BATCH_SIZE = 8     # batch size of the validation loader
TRAIN_BATCH_SIZE = 8     # batch size of the training loader

# SEED used to be declared but never applied. Seeding PyTorch here makes
# later random operations (such as shuffling in the training loader)
# reproducible between runs.
torch.manual_seed(SEED)

In [None]:
# Keyword arguments for the training loader: shuffled, loaded in the main process.
train_params = dict(
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

# Keyword arguments for the validation loader: original order is kept.
val_params = dict(
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

In [None]:
# Load the pretrained "t5-base" tokenizer shared by both datasets.
tokenizer = T5Tokenizer.from_pretrained("t5-base")

# Wrap each split in an ENDEDataset (training split first, then validation).
training_set, val_set = (
    ENDEDataset(frame, tokenizer, MAX_LEN, SUMMARY_LEN)
    for frame in (train_dataset, val_dataset)
)

# Batch the datasets with the loader settings defined for each split.
training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

In [None]:
# Load the pretrained "t5-base" model and move it to the selected device.
pretrained_model = T5ForConditionalGeneration.from_pretrained("t5-base", return_dict=True)
model = pretrained_model.to(device)

# Adam over all model parameters with the configured learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Fine-tune the model for TRAIN_EPOCHS passes over the training loader.
for epoch in range(TRAIN_EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)

# The message used to say "summaries"; this notebook produces translations.
print('Now generating translations on our fine tuned model for the validation dataset and saving it in a dataframe')
for epoch in range(VAL_EPOCHS):
    predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
    # Pair each generated text with its reference; every pass rewrites predictions.csv.
    final_df = pd.DataFrame({'Generated Text': predictions, 'Actual Text': actuals})
    final_df.to_csv('predictions.csv')
    print('Output Files generated for review')