In [3]:
cd /content/drive/MyDrive/projects/fra-eng-translation-with-t5-from-scratch

/content/drive/MyDrive/projects/fra-eng-translation-with-t5-from-scratch


In [4]:
!pip install transformers sentencepiece



In [7]:
import os
import re
import json
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import random

In [8]:
# Reproducibility: Python, NumPy and PyTorch (CPU and CUDA) each keep their own
# random state, so the same seed is pushed into every one of them.
SEED = 1234
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)

In [10]:
# Preprocess Data
# 'fra.txt' is tab-separated with three unnamed columns; only the sentence pair
# (SRC, TRG) is kept and the third column ('lic') is discarded.
# A fixed-size random subset keeps the demonstration small.
train_df = (
    pd.read_csv('fra.txt', sep='\t', names=['SRC', 'TRG', 'lic'])
    .drop(columns='lic')
    .sample(n=50000, random_state=SEED)
)

In [11]:
# Split Data: hold out 10% of the sampled pairs for validation.
# An explicit random_state pins the partition to SEED instead of to whatever
# state NumPy's global RNG happens to be in when this cell runs.
# NOTE: this rebinds train_df, so re-run the loading cell before re-running this one.
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=SEED)

In [12]:
# Display the training split produced by the cell above.
train_df

Unnamed: 0,SRC,TRG
212441,Everything is costing more than it did last year.,Tout coûte plus cher que l'année dernière.
138684,Swimming is a form of exercise.,Nager est une sorte d'exercice.
77275,They enjoyed themselves.,Ils se sont amusés.
173141,Tom told me that Mary was on a diet.,Tom m'a dit que Mary était au régime.
71354,Acrobats are very agile.,Les acrobates sont très agiles.
...,...,...
44129,This isn't my money.,Ce n'est pas mon argent.
203722,Tom made a list of places he wants to visit.,Tom a fait une liste des endroits qu'il veut v...
228386,Greater demand for high-quality coffee has hel...,Une plus grande demande pour du café de haute ...
187856,Who told you that we wanted to do that?,Qui vous a dit que nous voulions faire ça ?


In [13]:
# Load Pretrained T5 Tokenizer
# The same tokenizer instance is handed to the datasets and to translate() below.
tokenizer = T5Tokenizer.from_pretrained('t5-small')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [14]:
class TranslationDataset(Dataset):
    """Sentence-pair dataset that tokenises each row on access.

    Args:
        dataframe: pandas DataFrame with string columns ``'SRC'`` (source
            sentence) and ``'TRG'`` (target sentence).
        tokenizer: callable tokenizer returning ``'input_ids'`` and
            ``'attention_mask'`` tensors, and exposing ``pad_token_id``.
        source_max_token_len: length source sequences are padded/truncated to.
        target_max_token_len: length target sequences are padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, source_max_token_len=128, target_max_token_len=128):
        self.tokenizer = tokenizer
        self.dataframe = dataframe
        self.source_max_token_len = source_max_token_len
        self.target_max_token_len = target_max_token_len

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.dataframe)

    def _encode(self, text, max_length):
        """Tokenise one sentence to a fixed length using the dataset's own tokenizer."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

    def __getitem__(self, index):
        """Return the raw texts and fixed-length token tensors for row ``index``.

        Returns:
            dict with keys ``source_text``, ``target_text``, ``source_input_ids``,
            ``source_attention_mask`` and ``target_input_ids``. Padding positions
            in ``target_input_ids`` are replaced by -100.
        """
        data_row = self.dataframe.iloc[index]

        source_text = data_row['SRC']
        target_text = data_row['TRG']

        # Use the tokenizer given to the constructor, not a notebook-level global.
        source_encoding = self._encode(source_text, self.source_max_token_len)
        target_encoding = self._encode(target_text, self.target_max_token_len)

        # Mask padding with -100, looking the pad id up on the tokenizer rather
        # than assuming it is 0.
        labels = target_encoding['input_ids'].flatten()
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        return dict(
            source_text=source_text,
            target_text=target_text,
            source_input_ids=source_encoding['input_ids'].flatten(),
            # Exposed so the model can be told which source positions are padding.
            source_attention_mask=source_encoding['attention_mask'].flatten(),
            target_input_ids=labels
        )


In [15]:
# Create Data Loaders
# Shared token-length cap for source and target, and the mini-batch size.
max_len = 41
BATCH_SIZE = 8

train_dataset = TranslationDataset(
    train_df,
    tokenizer,
    source_max_token_len=max_len,
    target_max_token_len=max_len,
)
# Training batches are drawn in shuffled order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [16]:
# Validation data uses the same length cap and batch size, in fixed order.
val_dataset = TranslationDataset(
    val_df,
    tokenizer,
    source_max_token_len=max_len,
    target_max_token_len=max_len,
)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Sanity check: pull a single batch from the training loader and print it.
for batch in train_loader:
    print(batch)
    break  # one batch is enough; do not iterate the rest of the loader

In [30]:
class TranslationDataset(Dataset):
    """Sentence-pair dataset that tokenises every sentence once, up front.

    All source sentences followed by all target sentences are encoded in a
    single tokenizer call, so rows ``[0, N)`` of ``batch_encoded`` are sources
    and rows ``[N, 2N)`` are the matching targets.

    Args:
        dataframe: pandas DataFrame with string columns ``'SRC'`` and ``'TRG'``.
        tokenizer: callable tokenizer accepting a list of strings, returning
            ``'input_ids'`` and ``'attention_mask'`` tensors, and exposing
            ``pad_token_id``.
        max_token_len: length every sequence is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_token_len=128):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.max_token_len = max_token_len

        self.src_texts = dataframe['SRC'].tolist()
        self.trg_texts = dataframe['TRG'].tolist()

        # Calling the tokenizer directly is the supported batch-encoding entry point.
        self.batch_encoded = self.tokenizer(
            self.src_texts + self.trg_texts,
            max_length=self.max_token_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.data)

    def __getitem__(self, index):
        """Return model inputs and raw texts for pair ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` for the source
            sentence, ``labels`` (target ids with padding replaced by -100),
            and the raw ``source_text`` / ``target_text`` strings.

        Raises:
            IndexError: if ``index`` is outside ``[-len(self), len(self))``.
        """
        n_pairs = len(self.src_texts)
        # Without this check an out-of-range or negative index would silently
        # read a *target* row as if it were a source sentence.
        if not -n_pairs <= index < n_pairs:
            raise IndexError(f"index {index} out of range for dataset of size {n_pairs}")
        if index < 0:
            index += n_pairs

        src_input_ids = self.batch_encoded['input_ids'][index]
        src_attn_mask = self.batch_encoded['attention_mask'][index]

        # Targets live in the second half of the jointly encoded batch.
        # Clone so masking does not overwrite the cached encoding.
        labels = self.batch_encoded['input_ids'][n_pairs + index].clone()
        labels[labels == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": src_input_ids,
            "attention_mask": src_attn_mask,
            "labels": labels,
            # Raw strings, for code that wants to show an example without decoding.
            "source_text": self.src_texts[index],
            "target_text": self.trg_texts[index],
        }


In [37]:
# Create Data Loaders
# Mini-batch size and the token-length cap applied to every sentence.
BATCH_SIZE = 64
max_len = 41

# Training split: batches are drawn in shuffled order.
train_dataset = TranslationDataset(train_df, tokenizer, max_token_len=max_len)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Validation split: DataLoader's default (unshuffled) order.
val_dataset = TranslationDataset(val_df, tokenizer, max_token_len=max_len)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE)

In [38]:
# Number of mini-batches the training loader yields per epoch.
len(train_loader)

5625

In [20]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [21]:
# Load Pretrained T5 Model
# Fetch the 't5-small' checkpoint, then move its weights onto the selected device.
model = T5ForConditionalGeneration.from_pretrained('t5-small')
model = model.to(device)

In [22]:
# Training Setup
EPOCHS = 3
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW. eps and weight_decay are set explicitly to the
# values the transformers implementation used by default (to my knowledge:
# eps=1e-6, weight_decay=0.0) so the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



In [23]:
# Fine-tune the model for EPOCHS passes over the training loader and report the
# mean training loss of each epoch.
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}/{EPOCHS}"):
        optimizer.zero_grad()
        # Keys match what the TranslationDataset feeding train_loader returns
        # ('input_ids' / 'attention_mask' / 'labels').
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # Pass the attention mask so the encoder ignores padding positions.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch: {epoch + 1}, Loss: {avg_train_loss:.3f}")

  0%|          | 0/5625 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [27]:
def translate(model, tokenizer, source_sentence, max_length=128):
    """Translate one sentence with beam search and return the decoded text.

    Args:
        model: generation-capable model exposing ``eval()``, ``device`` and
            ``generate()``.
        tokenizer: tokenizer used both to encode ``source_sentence`` and to
            decode the generated ids.
        source_sentence: text to translate.
        max_length: cap on both the encoded input length (via truncation) and
            the generated output length.

    Returns:
        The decoded text of the first generated sequence, special tokens removed.

    Note:
        The model is switched to evaluation mode and left in that mode.
        NOTE(review): no task prefix is prepended to ``source_sentence``;
        confirm this matches how the model was trained.
    """
    model.eval()  # Put the model in evaluation mode

    # A single sentence needs no padding; truncation still enforces max_length.
    # The encoding is moved to the device of the model that was passed in, so
    # the function does not depend on a notebook-level `device` variable.
    source_encoding = tokenizer(
        source_sentence,
        max_length=max_length,
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    ).to(model.device)

    # Generate the translation using the model; gradients are not needed.
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=source_encoding['input_ids'],
            attention_mask=source_encoding['attention_mask'],
            max_length=max_length,
            num_beams=5,
            early_stopping=True
        )

    # Decode the generated ids to get the translated text
    translated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    return translated_text

# Example usage: translate one short English phrase and print the result.
source_sentence = "Good morning"
translated_text = translate(model=model, tokenizer=tokenizer, source_sentence=source_sentence)
print(f"Translated Text: {translated_text}")


Translated Text: Good morning


In [28]:
import random

def display_translation_example(model, tokenizer, dataset):
    """Print one random dataset example next to the model's translation.

    Args:
        model: model forwarded to ``translate``.
        tokenizer: tokenizer forwarded to ``translate``; also used to decode
            token ids when a sample carries no raw text.
        dataset: sized, indexable collection of dict samples. A sample either
            provides ``'source_text'`` / ``'target_text'`` strings, or
            ``'input_ids'`` / ``'labels'`` token-id tensors (with -100 marking
            ignored label positions).
    """
    model.eval()  # Ensure the model is in evaluation mode

    # Select a random sample from the dataset
    sample = random.choice(dataset)
    if 'source_text' in sample:
        source_sentence = sample['source_text']
        ground_truth = sample['target_text']
    else:
        # Samples that only hold token ids: recover the text by decoding.
        source_sentence = tokenizer.decode(sample['input_ids'], skip_special_tokens=True)
        # -100 is not a real token id; map it back to padding before decoding.
        label_ids = sample['labels'].clone()
        label_ids[label_ids == -100] = tokenizer.pad_token_id
        ground_truth = tokenizer.decode(label_ids, skip_special_tokens=True)

    # Perform the translation
    predicted_translation = translate(model, tokenizer, source_sentence)

    # Display results
    print(f"Input (English): {source_sentence}")
    print(f"Prediction (French): {predicted_translation}")
    print(f"Ground Truth (French): {ground_truth}")

# Example usage: show a random validation pair alongside the model's output.
display_translation_example(model=model, tokenizer=tokenizer, dataset=val_dataset)


Input (English): We rested on some stones.
Prediction (French): Wir blieben auf einigen Steinen.
Ground Truth (French): Nous nous reposâmes sur quelques pierres.
