In [1]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import T5Tokenizer, T5ForConditionalGeneration
from tqdm import tqdm

In [2]:
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} for training and evaluation.")

Using cuda for training and evaluation.


In [3]:
# Download dataset using kagglehub as in the eng_to_french.ipynb notebook
import kagglehub

# Download latest version
path = kagglehub.dataset_download("devicharith/language-translation-englishfrench")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/devicharith/language-translation-englishfrench?dataset_version_number=2...


100%|██████████| 3.51M/3.51M [00:00<00:00, 4.51MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/devicharith/language-translation-englishfrench/versions/2


In [4]:
# Step 1: Load and Explore the Dataset - using the English-French dataset
data = pd.read_csv(f"{path}/eng_-french.csv", names=["English", "French"])
print("Dataset loaded. Shape:", data.shape)
print(data.head())

Dataset loaded. Shape: (175622, 2)
                   English                  French
0  English words/sentences  French words/sentences
1                      Hi.                  Salut!
2                     Run!                 Cours !
3                     Run!                Courez !
4                     Who?                   Qui ?


In [5]:
# Step 2: Data Cleaning
data = data.dropna()  # Remove rows with missing values
data['English'] = data['English'].str.strip()
data['French'] = data['French'].str.strip()
print("Data cleaned. Remaining rows:", len(data))

Data cleaned. Remaining rows: 175622


In [6]:
# Remove the first header row if it exists
if data.iloc[0]['English'] == 'English words/sentences' and data.iloc[0]['French'] == 'French words/sentences':
    data = data.iloc[1:].reset_index(drop=True)
    print("Header row removed. New shape:", data.shape)

Header row removed. New shape: (175621, 2)


In [7]:
# Step 3: Prepare Data for Translation
data['input_text'] = "translate English to French: " + data['English']
data['target_text'] = data['French']
print("Sample prepared data:")
print(data[['input_text', 'target_text']].head())

Sample prepared data:
                          input_text target_text
0   translate English to French: Hi.      Salut!
1  translate English to French: Run!     Cours !
2  translate English to French: Run!    Courez !
3  translate English to French: Who?       Qui ?
4  translate English to French: Wow!  Ça alors !


In [8]:
# Step 4: Dataset Splitting
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data['input_text'], data['target_text'], test_size=0.2, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    test_texts, test_labels, test_size=0.5, random_state=42
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")
print(f"Testing samples: {len(test_texts)}")

Training samples: 140496
Validation samples: 17562
Testing samples: 17563


In [9]:
# Step 5: Tokenization
model_name = "t5-small"  # Use t5-base or t5-large for larger models
tokenizer = T5Tokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
# Tokenizing the datasets
def tokenize_function(inputs, targets, tokenizer, max_length=128):
    model_inputs = tokenizer(list(inputs), max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
    labels = tokenizer(list(targets), max_length=max_length, truncation=True, padding="max_length", return_tensors="pt").input_ids
    labels[labels == tokenizer.pad_token_id] = -100  # Mask padding tokens in labels
    return model_inputs, labels

train_encodings, train_labels_enc = tokenize_function(train_texts, train_labels, tokenizer)
val_encodings, val_labels_enc = tokenize_function(val_texts, val_labels, tokenizer)

In [11]:
# Move data to GPU if available
train_encodings = {key: val.to(device) for key, val in train_encodings.items()}
train_labels_enc = train_labels_enc.to(device)
val_encodings = {key: val.to(device) for key, val in val_encodings.items()}
val_labels_enc = val_labels_enc.to(device)

In [12]:
# Step 6: Model Training
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [13]:
def compute_accuracy(predictions, labels):
    predictions = predictions.argmax(dim=-1)
    non_pad_elements = (labels != -100).sum()
    correct = (predictions == labels) & (labels != -100)
    return correct.sum().item() / non_pad_elements.item() if non_pad_elements.item() > 0 else 0

In [14]:
# Training loop
def train_model(model, train_encodings, train_labels_enc, val_encodings, val_labels_enc, epochs=5, batch_size=16):
    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        train_loss = 0
        train_accuracy = 0
        for i in tqdm(range(0, len(train_encodings['input_ids']), batch_size), desc="Training Batches"):
            input_ids = train_encodings['input_ids'][i:i + batch_size]
            attention_mask = train_encodings['attention_mask'][i:i + batch_size]
            labels = train_labels_enc[i:i + batch_size]

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_accuracy += compute_accuracy(outputs.logits, labels)

        train_loss /= len(train_encodings['input_ids']) // batch_size
        train_accuracy /= len(train_encodings['input_ids']) // batch_size
        print(f"  Training Loss: {train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")

        # Validation loss and accuracy
        model.eval()
        val_loss = 0
        val_accuracy = 0
        with torch.no_grad():
            for i in tqdm(range(0, len(val_encodings['input_ids']), batch_size), desc="Validation Batches"):
                input_ids = val_encodings['input_ids'][i:i + batch_size]
                attention_mask = val_encodings['attention_mask'][i:i + batch_size]
                labels = val_labels_enc[i:i + batch_size]

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()
                val_accuracy += compute_accuracy(outputs.logits, labels)

        val_loss /= len(val_encodings['input_ids']) // batch_size
        val_accuracy /= len(val_encodings['input_ids']) // batch_size
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

        # Save the model checkpoint
        checkpoint_path = f"artifacts/t5/checkpoint_epoch_{epoch+1}.pt"
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': train_loss,
        }, checkpoint_path)
        print(f"Checkpoint saved to {checkpoint_path}")

In [16]:
# Train the model with English-French dataset
train_model(model, train_encodings, train_labels_enc, val_encodings, val_labels_enc)

Epoch 1/5


Training Batches: 100%|██████████| 8781/8781 [38:24<00:00,  3.81it/s]


  Training Loss: 0.7338, Training Accuracy: 0.8154


Validation Batches: 100%|██████████| 1098/1098 [01:34<00:00, 11.63it/s]


Validation Loss: 0.5512, Validation Accuracy: 0.8578
Checkpoint saved to artifacts/t5/checkpoint_epoch_1.pt
Epoch 2/5


Training Batches: 100%|██████████| 8781/8781 [36:53<00:00,  3.97it/s]


  Training Loss: 0.4813, Training Accuracy: 0.8699


Validation Batches: 100%|██████████| 1098/1098 [01:34<00:00, 11.65it/s]


Validation Loss: 0.4658, Validation Accuracy: 0.8739
Checkpoint saved to artifacts/t5/checkpoint_epoch_2.pt
Epoch 3/5


Training Batches: 100%|██████████| 8781/8781 [36:54<00:00,  3.96it/s]


  Training Loss: 0.3954, Training Accuracy: 0.8889


Validation Batches: 100%|██████████| 1098/1098 [01:34<00:00, 11.63it/s]


Validation Loss: 0.4403, Validation Accuracy: 0.8791
Checkpoint saved to artifacts/t5/checkpoint_epoch_3.pt
Epoch 4/5


Training Batches: 100%|██████████| 8781/8781 [36:54<00:00,  3.97it/s]


  Training Loss: 0.3344, Training Accuracy: 0.9033


Validation Batches: 100%|██████████| 1098/1098 [01:34<00:00, 11.64it/s]


Validation Loss: 0.4336, Validation Accuracy: 0.8815
Checkpoint saved to artifacts/t5/checkpoint_epoch_4.pt
Epoch 5/5


Training Batches: 100%|██████████| 8781/8781 [36:55<00:00,  3.96it/s]


  Training Loss: 0.2854, Training Accuracy: 0.9156


Validation Batches: 100%|██████████| 1098/1098 [01:34<00:00, 11.64it/s]


Validation Loss: 0.4378, Validation Accuracy: 0.8824
Checkpoint saved to artifacts/t5/checkpoint_epoch_5.pt


In [17]:
def comp_accuracy(predictions, labels):
    # Ensure predictions and labels have the same length
    min_length = min(predictions.size(1), labels.size(1))
    predictions = predictions[:, :min_length]
    labels = labels[:, :min_length]

    # Exclude padding tokens (-100)
    non_pad_elements = (labels != -100).sum()
    correct = ((predictions == labels) & (labels != -100)).sum()

    # Calculate accuracy
    return correct.item() / non_pad_elements.item() if non_pad_elements.item() > 0 else 0

In [18]:
def evaluate_model(model, tokenizer, texts, labels, batch_size=16, max_length=128):
    model.eval()
    total_accuracy = 0
    num_batches = len(texts) // batch_size + int(len(texts) % batch_size != 0)

    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Testing Batches"):
            batch_texts = texts[i:i + batch_size].tolist()
            batch_labels = labels[i:i + batch_size].tolist()

            # Tokenize inputs and labels
            inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=max_length).to(device)
            label_inputs = tokenizer(batch_labels, return_tensors="pt", truncation=True, padding=True, max_length=max_length).to(device)

            # Generate predictions
            outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=max_length)

            # Convert predictions and labels to token IDs for comparison
            token_predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            token_labels = tokenizer.batch_decode(label_inputs['input_ids'], skip_special_tokens=True)

            # Re-tokenize for accuracy computation
            token_predictions = tokenizer(token_predictions, return_tensors="pt", padding=True).input_ids.to(device)
            token_labels = tokenizer(token_labels, return_tensors="pt", padding=True).input_ids.to(device)

            # Compute accuracy for the batch
            batch_accuracy = comp_accuracy(token_predictions, token_labels)
            total_accuracy += batch_accuracy

    total_accuracy /= num_batches
    print(f"Test Accuracy: {total_accuracy:.4f}")
    return total_accuracy

In [19]:
# Evaluate on test set
accuracy = evaluate_model(model, tokenizer, test_texts, test_labels, batch_size=16, max_length=128)

Testing Batches: 100%|██████████| 1098/1098 [06:05<00:00,  3.01it/s]

Test Accuracy: 0.6674





In [20]:
# Example translations
def translate_with_t5(text, model, tokenizer, max_length=128):
    model.eval()
    input_text = "translate English to French: " + text
    input_ids = tokenizer(input_text, return_tensors="pt", padding=True).input_ids.to(device)

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, max_length=max_length)

    translation = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translation

In [21]:
# Test with sample sentences
test_sentences = ["Hello, how are you?", "I love programming.", "The weather is nice today.", "Can you help me?"]

for sentence in test_sentences:
    translation = translate_with_t5(sentence, model, tokenizer)
    print(f"English: {sentence}")
    print(f"French: {translation}")
    print()

English: Hello, how are you?
French: Bonjour, comment êtes-vous?

English: I love programming.
French: J'adore programmer.

English: The weather is nice today.
French: Aujourd'hui, il fait beau.

English: Can you help me?
French: Pouvez-vous m'aider?



In [22]:
# Create directories if they don't exist
import os
os.makedirs("artifacts/t5/t5_english_french_model", exist_ok=True)

In [24]:
# Save the model and tokenizer
model_save_path = "artifacts/t5/t5_english_french_model"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model and tokenizer saved to {model_save_path}")

Model and tokenizer saved to artifacts/t5/t5_english_french_model
