<a href="https://colab.research.google.com/github/lukassanting/Idiom-Translation/blob/main/End_To_End_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Training T5 - Prefix+Idiom



# 0. Installs, Imports, Setup


## 0.1 Import Libraries & Packages

In [1]:
!pip install sentencepiece -q
!pip install transformers -q
!pip install torch -q
!pip install rich[jupyter] -q

[K     |████████████████████████████████| 1.2 MB 35.7 MB/s 
[K     |████████████████████████████████| 4.2 MB 6.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 54.8 MB/s 
[K     |████████████████████████████████| 84 kB 2.2 MB/s 
[K     |████████████████████████████████| 596 kB 55.9 MB/s 
[K     |████████████████████████████████| 231 kB 24.4 MB/s 
[K     |████████████████████████████████| 51 kB 8.0 MB/s 
[?25h

In [2]:
# Drive
from google.colab import drive

# Plots
import IPython
import IPython.display
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
# Set the default figure size for later plots. The previous plt.figure(...) call
# only created (and displayed) an empty figure without changing any default.
plt.rcParams["figure.figsize"] = (12, 6)

# Util
import os
import pandas as pd
import numpy as np
import re
pd.set_option('display.max_rows', None)

# ML
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

# Importing the T5 modules from huggingface/transformers
from transformers import T5Tokenizer, T5ForConditionalGeneration

# rich: for a better display on terminal
from rich.table import Column, Table
from rich.text import Text
from rich import box
from rich.console import Console

# define a rich console logger
console = Console(record=True)

# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

<Figure size 864x432 with 0 Axes>

## 0.2 Import Data

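The next cell clones the IdiomTranslationDS dataset from GitHub into the local `data` directory. Alternatively, upload the data to the shared Google Drive and download it from there, or upload it directly yourself (whichever is easier).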

In [3]:
!git clone https://github.com/marziehf/IdiomTranslationDS data

Cloning into 'data'...
remote: Enumerating objects: 83, done.[K
remote: Total 83 (delta 0), reused 0 (delta 0), pack-reused 83[K
Unpacking objects: 100% (83/83), done.


## 0.3 Setup functions & classes

### 0.3.1 FUNC: display_df

In [4]:
# to display dataframe in ASCII format
def display_df(df):
    """Print the first two columns of ``df`` as an ASCII table.

    The table is titled "Sample Data" and its two columns are always labelled
    ``source_text`` and ``target_text``, whatever the dataframe's own column
    names are. Output goes to a freshly created rich console.
    """
    ascii_table = Table(
        Column("source_text", justify="center"),
        Column("target_text", justify="center"),
        title="Sample Data",
        pad_edge=False,
        box=box.ASCII,
    )

    # Only the first two cells of each row are rendered.
    for cells in df.values.tolist():
        ascii_table.add_row(cells[0], cells[1])

    Console().print(ascii_table)

# Module-level rich table that accumulates training progress. train() appends
# one row (epoch, step, loss) every 10 steps and prints the table after each epoch.
training_logger = Table(
    Column("Epoch", justify="center"),
    Column("Steps", justify="center"),
    Column("Loss", justify="center"),
    title="Training Status",
    pad_edge=False,
    box=box.ASCII,
)

### 0.3.2 CLASS: DataSet

A Dataset class that reads the dataframe and tokenizes each source/target pair, so that a DataLoader can feed batches to the neural network for fine-tuning.

In [5]:
class DataSet(Dataset):
    """
    Torch dataset that tokenizes one (source, target) text pair per item.

    Each item is looked up by position in the dataframe, whitespace-normalised,
    tokenized, and padded/truncated to a fixed length, so every item of a
    given field has the same shape.
    """

    def __init__(
        self, dataframe, tokenizer, source_len, target_len, source_text, target_text
    ):
        """
        Initializes a Dataset class

        Args:
            dataframe (pandas.DataFrame): Input dataframe
            tokenizer (transformers.tokenizer): Transformers tokenizer
            source_len (int): Max length of source text
            target_len (int): Max length of target text
            source_text (str): column name of source text
            target_text (str): column name of target text
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        # Historical attribute name for the target length; kept so existing
        # code that reads ``summ_len`` continues to work.
        self.summ_len = target_len
        self.target_text = self.data[target_text]
        self.source_text = self.data[source_text]

    def __len__(self):
        """returns the number of rows (text pairs) in the dataframe"""

        return len(self.target_text)

    def _encode(self, text, max_length):
        """Tokenize one string, padded/truncated to exactly ``max_length`` ids."""
        # ``padding="max_length"`` replaces the deprecated ``pad_to_max_length``
        # flag, which was previously passed alongside it.
        return self.tokenizer(
            [text],
            max_length=max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, index):
        """return the input ids, attention mask and target ids of item ``index``

        Returns a dict with the keys ``source_ids``, ``source_mask``,
        ``target_ids`` and ``target_ids_y`` (the last two hold the same ids),
        all as ``torch.long`` tensors.
        """

        # Positional lookup: a sampler hands out positions 0..len-1, which only
        # coincide with index labels when the dataframe index was reset.
        source_text = str(self.source_text.iloc[index])
        target_text = str(self.target_text.iloc[index])

        # collapse runs of whitespace (including newlines) into single spaces
        source_text = " ".join(source_text.split())
        target_text = " ".join(target_text.split())

        source = self._encode(source_text, self.source_len)
        target = self._encode(target_text, self.summ_len)

        # Drop only the batch dimension added by the tokenizer; a bare
        # squeeze() would also collapse a sequence of length 1.
        source_ids = source["input_ids"].squeeze(0)
        source_mask = source["attention_mask"].squeeze(0)
        target_ids = target["input_ids"].squeeze(0)

        return {
            "source_ids": source_ids.to(dtype=torch.long),
            "source_mask": source_mask.to(dtype=torch.long),
            "target_ids": target_ids.to(dtype=torch.long),
            "target_ids_y": target_ids.to(dtype=torch.long),
        }

### 0.3.3 FUNC: train

The train function puts the model in training mode, runs it on each batch to compute the loss, and updates the weights.

In [6]:
def train(epoch, tokenizer, model, device, loader, optimizer):

    """
    Run one training epoch over ``loader``.

    Takes 6 arguments as input:

        epoch: epoch number (used for logging only)
        tokenizer: T5 tokenizer; its ``pad_token_id`` identifies padding in the targets
        model: T5 model
        device: device the batch tensors are moved to
        loader: Train Dataloader yielding dicts with "source_ids",
            "source_mask" and "target_ids"
        optimizer: Optimizer

    Every 10th step the loss is added to the module-level ``training_logger``
    table and printed; the whole table is printed once the epoch is finished.
    """

    model.train()
    for step, batch in enumerate(loader):
        input_ids = batch["source_ids"].to(device, dtype=torch.long)
        attention_mask = batch["source_mask"].to(device, dtype=torch.long)

        # Clone before masking so the tensor held by the batch is not modified.
        labels = batch["target_ids"].to(device, dtype=torch.long).clone()
        # -100 is ignored by the loss, so padding does not contribute to it.
        labels[labels == tokenizer.pad_token_id] = -100

        # Only ``labels`` is passed: T5 derives the decoder inputs itself by
        # shifting the labels right and prepending the decoder start token.
        # Previously the targets minus their last token were fed as decoder
        # inputs, so the decoder never saw the start token and the first
        # target token was never predicted during training.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs[0]

        if step % 10 == 0:
            loss_text = str(loss.item())
            training_logger.add_row(str(epoch), str(step), loss_text)
            console.print("Epoch: ", str(epoch), "Steps: ", str(step), "Loss: ", loss_text)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    console.print(training_logger)

### 0.3.4 FUNC: validate

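The validate function puts the model in evaluation mode and, instead of computing a loss, generates text for each batch of the validation data. It returns the decoded predictions together with the decoded reference targets.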



In [7]:
def validate(epoch, tokenizer, model, device, loader):
    """
    Generate predictions for every batch in ``loader``.

    The model is switched to eval mode and run under ``torch.no_grad()``.
    Returns two parallel lists of decoded strings: the generated texts and the
    reference texts decoded from each batch's ``target_ids``. ``epoch`` is
    accepted but not used. Progress is printed every 10 batches.
    """

    def decode_all(sequences):
        # Decode each id sequence, dropping special tokens.
        return [
            tokenizer.decode(
                seq, skip_special_tokens=True, clean_up_tokenization_spaces=True
            )
            for seq in sequences
        ]

    generation_options = dict(
        max_length=150,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True,
    )

    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            reference_ids = batch['target_ids'].to(device, dtype=torch.long)
            input_ids = batch['source_ids'].to(device, dtype=torch.long)
            attention = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention,
                **generation_options,
            )
            batch_predictions = decode_all(generated_ids)
            batch_references = decode_all(reference_ids)

            if batch_idx % 10 == 0:
                console.print(f'Completed {batch_idx}')

            predictions.extend(batch_predictions)
            actuals.extend(batch_references)
    return predictions, actuals

### 0.3.5 FUNC: T5Trainer

T5Trainer is our main function. It accepts the training and validation data, the names of the source and target columns, and the model parameters used to fine-tune the model. Under the hood, it uses our DataSet class for data handling, the train function to fine-tune the model, and the validate function to evaluate it.

In [8]:
# CHANGED THIS to get it to work with my fixed train and validation data
def T5Trainer(
    train_data, val_data, source_text, target_text, model_params, output_dir="./outputs/"
):

    """
    Fine-tune a T5 model on ``train_data`` and generate predictions for ``val_data``.

    T5 trainer has 6 arguments:

      train_data: Input dataframe of training data
      val_data: Input dataframe of validation data
      source_text: Column name of the input text i.e. idiomatic sentence
      target_text: Column name of the target text i.e. literal sentence
      model_params: T5 model parameters; the keys read here are "SEED", "MODEL",
        "MAX_SOURCE_TEXT_LENGTH", "MAX_TARGET_TEXT_LENGTH", "TRAIN_BATCH_SIZE",
        "VALID_BATCH_SIZE", "LEARNING_RATE", "TRAIN_EPOCHS" and "VAL_EPOCHS"
      output_dir: Output directory to save fine tuned T5 model.

    Writes three things below ``output_dir``: the model and tokenizer in
    "model_files", the generated text in "predictions.csv", and the recorded
    console output in "logs.txt". Nothing is returned.

    Relies on the module-level ``console`` and ``device`` objects.

    """

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"])  # pytorch random seed
    np.random.seed(model_params["SEED"])  # numpy random seed
    torch.backends.cudnn.deterministic = True

    # logging
    console.log(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # tokenizer for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

    # Load the pretrained checkpoint named by model_params["MODEL"] and move it
    # to the module-level `device` ('cuda' when available, otherwise 'cpu').
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)

    # logging
    console.log(f"[Data]: Reading data...\n")

    # Keep only the source/target columns and renumber the rows from 0
    train_data = train_data[[source_text, target_text]]
    train_data = train_data.reset_index(drop=True)
    val_data = val_data[[source_text, target_text]]
    val_data = val_data.reset_index(drop=True)

    display_df(train_data.head(2))

    # Report the dataset shapes.
    # NOTE: the line labelled "TEST Dataset" prints the shape of the validation data.
    console.print(f"TRAIN Dataset: {train_data.shape}")
    console.print(f"TEST Dataset: {val_data.shape}\n")

    # Creating the Training and Validation dataset for further creation of Dataloader
    training_set = DataSet(
        train_data,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )
    val_set = DataSet(
        val_data,
        tokenizer,
        model_params["MAX_SOURCE_TEXT_LENGTH"],
        model_params["MAX_TARGET_TEXT_LENGTH"],
        source_text,
        target_text,
    )

    # Defining the parameters for creation of dataloaders:
    # training batches are shuffled, validation batches keep the dataframe order
    train_params = {
        "batch_size": model_params["TRAIN_BATCH_SIZE"],
        "shuffle": True,
        "num_workers": 0,
    }

    val_params = {
        "batch_size": model_params["VALID_BATCH_SIZE"],
        "shuffle": False,
        "num_workers": 0,
    }

    # Creation of Dataloaders for training and validation. These are used below in the training and validation stages.
    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    # Defining the optimizer that will be used to tune the weights of the network in the training session.
    optimizer = torch.optim.Adam(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    # Training loop: one call to train() per epoch
    console.log(f"[Initiating Fine Tuning]...\n")

    for epoch in range(model_params["TRAIN_EPOCHS"]):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    console.log(f"[Saving Model]...\n")
    # Saving the model and tokenizer after training (only the final weights are saved)
    path = os.path.join(output_dir, "model_files")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)

    # evaluating validation dataset; predictions.csv is rewritten on every
    # iteration, so only the last validation epoch's output remains on disk
    console.log(f"[Initiating Validation]...\n")
    for epoch in range(model_params["VAL_EPOCHS"]):
        predictions, actuals = validate(epoch, tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({"Input": val_data[source_text], "Generated Text": predictions, "Actual Text": actuals})
        final_df.to_csv(os.path.join(output_dir, "predictions.csv"))

    # NOTE(review): the log file is written before the messages below are
    # emitted, so they are presumably not part of logs.txt - confirm.
    console.save_text(os.path.join(output_dir, "logs.txt"))

    console.log(f"[Validation Completed.]\n")
    console.print(
        f"""[Model] Model saved @ {os.path.join(output_dir, "model_files")}\n"""
    )
    console.print(
        f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n"""
    )
    console.print(f"""[Logs] Logs saved @ {os.path.join(output_dir,'logs.txt')}\n""")

# 1. Processing Data

In [9]:
# Escaped entities handled by clean_dataset, mapped to their literal characters.
# NOTE(review): "&amp;" is not in this table and is therefore left as-is; add it
# here if the corpus turns out to contain it.
_ENTITY_TO_CHAR = {
    "&apos;": "'",
    "&quot;": '"',
    "&#124;": "|",
    "&#93;": "]",
    "&#91;": "[",
    "&gt;": ">",
    "&lt;": "<",
}
_ENTITY_PATTERN = re.compile("|".join(re.escape(entity) for entity in _ENTITY_TO_CHAR))


def _decode_entities(value):
    """Replace the known entities in a string; return non-string values unchanged."""
    if not isinstance(value, str):
        return value
    return _ENTITY_PATTERN.sub(lambda match: _ENTITY_TO_CHAR[match.group(0)], value)


def clean_dataset(data):
    """Return a copy of ``data`` with escaped entities replaced by their characters.

    Every cell of every column is processed. The entities handled are
    ``&apos;``, ``&quot;``, ``&#124;``, ``&#93;``, ``&#91;``, ``&gt;`` and
    ``&lt;``. Cells that are not strings (e.g. missing values) are passed
    through untouched instead of raising. The input dataframe is not modified.

    Args:
        data (pandas.DataFrame): frame whose cells should be cleaned

    Returns:
        pandas.DataFrame: cleaned deep copy with the same index and columns
    """
    data_clean = data.copy(deep=True)
    # Columns are addressed by position so duplicate column names are handled too.
    for position in range(data_clean.shape[1]):
        data_clean.iloc[:, position] = data_clean.iloc[:, position].map(_decode_entities)
    return data_clean

In [10]:
from sklearn.model_selection import train_test_split

base_path = "data/en-de/"
TASK_PREFIX = 'translate to german: '


def read_lines(path):
    """Return the lines of a UTF-8 text file, without their trailing newline."""
    with open(path, encoding="utf-8") as handle:
        return [line.rstrip("\n") for line in handle]


def load_parallel(stem):
    """Load ``<stem>.en`` / ``<stem>.de`` into a frame with 'input' and 'target' columns.

    The files are read line by line rather than through ``pd.read_csv``:
    a newline separator is rejected by recent pandas versions, and CSV
    parsing (quote handling, skipped blank lines) can shift rows so that
    sentence pairs no longer line up.
    """
    source_lines = read_lines(base_path + stem + ".en")
    target_lines = read_lines(base_path + stem + ".de")
    if len(source_lines) != len(target_lines):
        raise ValueError(
            f"{stem}: {len(source_lines)} source lines but {len(target_lines)} target lines"
        )
    return pd.DataFrame({"input": source_lines, "target": target_lines})


# The .info file (spread, idioms, frequency) was previously loaded and then
# dropped again in full, so it is no longer read here.
train_data = clean_dataset(load_parallel("idiom_trainplus"))
test_data = clean_dataset(load_parallel("idiom_test"))

# Prepend the task prefix to every source sentence.
train_data['input'] = TASK_PREFIX + train_data['input']
test_data['input'] = TASK_PREFIX + test_data['input']

# Hold out 15% of the training pairs for validation (fixed seed).
train_data, val_data = train_test_split(train_data, test_size=0.15, random_state=42)

In [11]:
train_data.head()

Unnamed: 0,input,target
1080,translate to german: EU integration is like a ...,Die Integration in die EU ist wie Tangotanzen ...
203,translate to german: Whenever we go to a West ...,"Jedes Mal , wenn wir in westafrikanische Lände..."
174,translate to german: I hope that the Irish pre...,"Ich möchte , daß die irische Präsidentschaft i..."
772,translate to german: In a nutshell it would in...,Kurz gesagt würde es die Belastung für den bri...
120,"translate to german: Very well located , easy ...",Die Mitarbeiter waren freundlich und das Hotel...


In [12]:
test_data.head()

Unnamed: 0,input,target
0,"translate to german: In this day and age , whe...","In einer Zeit wie dieser , in der viele offens..."
1,"translate to german: In Romania , the election...",In Rumänien wird der Wahlkampf die Chance biet...
2,translate to german: Let us work to close the ...,Gemeinsam sollten wir wirksam die Schlupflöche...
3,translate to german: The region is easily reac...,"Das Gebiet ist mit der Eisenbahn erreichbar , ..."
4,translate to german: I cannot enter into debat...,"Ich kann mich auf keine Diskussion einlassen ,..."


In [13]:
# Check max length
# NOTE: the counts below are whitespace-separated words, not tokenizer tokens.
def max_word_count(column, *frames):
    """Return the largest number of space-separated words in ``column`` across ``frames``."""
    return max(frame[column].str.split(" ").str.len().max() for frame in frames)


# The validation split is included as well; it was previously split but left out of the max.
print("Max number of tokens input = ", max_word_count("input", train_data, val_data, test_data))
print("Max number of tokens target = ", max_word_count("target", train_data, val_data, test_data))

Max number of tokens input =  168
Max number of tokens target =  144


# 2. Training Model

In [14]:
# let's define model parameters specific to T5
# NOTE(review): MAX_SOURCE_TEXT_LENGTH (156) is lower than the longest input
# reported by the length check (168 whitespace-separated words), and DataSet
# tokenizes with truncation enabled, so the longest sources will be cut off.
# Confirm this is intended.
model_params = {
    "MODEL": "t5-small",  # model_type: t5-base/t5-small/t5-large
    "TRAIN_BATCH_SIZE": 4,  # training batch size
    "VALID_BATCH_SIZE": 4,  # validation batch size
    "TRAIN_EPOCHS": 50,  # number of training epochs
    "VAL_EPOCHS": 1,  # number of validation epochs
    "LEARNING_RATE": 1e-4,  # learning rate
    "MAX_SOURCE_TEXT_LENGTH": 156,  # max length of source text
    "MAX_TARGET_TEXT_LENGTH": 145,  # max length of target text
    "SEED": 42,  # set seed for reproducibility
}


In [15]:
#df_subset = train_data.head(100)
#df_subset


In [16]:
# Create the output directory; unlike `%mkdir`, this does not complain when
# the cell is re-run and the directory already exists.
os.makedirs("outputs", exist_ok=True)

In [None]:
# Fine-tune on the training split, then generate predictions for the validation split.
T5Trainer(
    train_data=train_data,
    val_data=val_data,
    source_text="input",
    target_text="target",
    model_params=model_params,
    output_dir="outputs",
)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [None]:
# Save the logged training loss to csv
OUTPUT_DIR = "outputs"  # same directory that was passed to T5Trainer


def rich_table_to_frame(table):
    """Convert a rich Table into a DataFrame with one column per table header."""
    return pd.DataFrame(
        {
            column.header: [Text.from_markup(cell).plain for cell in column.cells]
            for column in table.columns
        }
    )


# The path is relative to the working directory; the `file_path` prefix used
# before was never defined in this notebook.
training_logger_df = rich_table_to_frame(training_logger)
training_logger_df.to_csv(os.path.join(OUTPUT_DIR, "train_loss.csv"), sep="=")

# No validation logger is created anywhere in this notebook, so it is only
# exported when one has been defined.
if "validation_logger" in globals():
    validation_logger_df = rich_table_to_frame(validation_logger)
    validation_logger_df.to_csv(os.path.join(OUTPUT_DIR, "val_loss.csv"), sep="=")

In [None]:
# # Plot the loss
# train_loss = pd.read_csv(file_path+"outputs/"+"train_loss.csv", sep="=")
# train_loss.plot(0,2)
# val_loss = pd.read_csv(file_path+"outputs/"+"val_loss.csv", sep="=")
# val_loss.plot(0,2)

In [None]:
# Plot the training loss.
# Reads the training-loss CSV ("=" separated, first column is the row index);
# no cell in this notebook writes a file called loss.csv.
loss = pd.read_csv("outputs/" + "train_loss.csv", sep="=", index_col=0)
ax = loss["Loss"].plot(title="Training loss")
ax.set(xlabel="Logged step", ylabel="Loss");

In [None]:
!zip -r outputs.zip outputs