In [107]:
#!pip install -r requirements.txt

# !pip install transformers torch datasets sentencepiece accelerate
!pip install torch transformers datasets sentencepiece accelerate

from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python3.exe -m pip install --upgrade pip


In [108]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [109]:
# Load the transliteration corpus and print the first training record
# as a quick sanity check of its contents.
dataset = load_dataset(
    "SKNahin/bengali-transliteration-data",
)
print(dataset["train"][0])

{'bn': 'স্ক্রোল করে ২০/৩০ সেকেন্ড এর ভিডিও পান নাই???', 'rm': 'scroll kore 20/30 second er video pann nai???'}


In [110]:
# Show the column schema of the training split.
print(dataset["train"].features)

{'bn': Value(dtype='string', id=None), 'rm': Value(dtype='string', id=None)}


In [111]:
# Load the pretrained mT5-small tokenizer and seq2seq model, then place the
# model on the device selected earlier.
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
model = model.to(device)


# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize romanized inputs and Bengali targets for seq2seq training.

    Args:
        examples: A mapping with an ``'rm'`` entry (source text) and a
            ``'bn'`` entry (target text). Each entry is either a single
            string or a list of strings (the ``batched=True`` case).

    Returns:
        The tokenizer output for the ``'rm'`` text with an added ``'labels'``
        entry holding the token ids of the ``'bn'`` text. Both sides are
        padded/truncated to 128 tokens. Padding positions in the labels are
        replaced with -100 so they do not contribute to the training loss.
    """
    inputs = tokenizer(examples['rm'], padding="max_length", truncation=True, max_length=128)
    targets = tokenizer(examples['bn'], padding="max_length", truncation=True, max_length=128)

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the label value the model's loss ignores; without this the
        # model is also trained to predict the padding that fills each target.
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    target_ids = targets['input_ids']
    if isinstance(examples['bn'], str):
        # Un-batched call: a single flat sequence of ids.
        inputs['labels'] = _mask_padding(target_ids)
    else:
        inputs['labels'] = [_mask_padding(token_ids) for token_ids in target_ids]
    return inputs

# Apply the tokenizer to every split, processing examples in batches.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
)


In [113]:
# Split the tokenized training data 80/20 into train and validation sets.
# The seed is fixed so that the same rows land in each set on every run;
# otherwise validation losses are not comparable between runs.
train_test_split = tokenized_datasets['train'].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
val_dataset = train_test_split['test']

In [114]:
def collate_fn(batch):
    """Stack per-example fields of ``batch`` into ``torch.long`` tensors.

    Args:
        batch: A sequence of mappings, each providing ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``. Any other keys are ignored.

    Returns:
        A dict with the same three keys, each mapped to a tensor built from
        that field of every example in ``batch``.
    """
    fields = ('input_ids', 'attention_mask', 'labels')
    return {
        field: torch.tensor([example[field] for example in batch], dtype=torch.long)
        for field in fields
    }


In [115]:
# Batch both splits with the custom collate function. Only the training
# loader shuffles its examples.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_fn,
)
val_dataloader = DataLoader(
    val_dataset,
    batch_size=16,
    collate_fn=collate_fn,
)



In [116]:
# AdamW over all model parameters with a learning rate of 2e-5.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
)

In [None]:
num_epochs = 3


def _batch_loss(batch):
    """Run one forward pass on ``batch`` and return the model's loss tensor.

    The ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors of the
    batch are moved to ``device`` before being handed to the model.
    """
    outputs = model(
        input_ids=batch['input_ids'].to(device),
        attention_mask=batch['attention_mask'].to(device),
        labels=batch['labels'].to(device),
    )
    return outputs.loss


for epoch in range(num_epochs):
    # ---- Training pass: one optimizer step per batch ----
    model.train()
    train_loss_sum = 0.0
    for batch in train_dataloader:
        loss = _batch_loss(batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss_sum += loss.item()

    print(f"Epoch {epoch + 1}/{num_epochs} completed.")

    # Report the mean per-batch training loss so it can be compared with the
    # validation loss below.
    num_train_batches = len(train_dataloader)
    if num_train_batches:
        print(f"Training Loss: {train_loss_sum / num_train_batches}")

    # ---- Validation pass: no gradients, model in eval mode ----
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in val_dataloader:
            total_loss += _batch_loss(batch).item()

    # Mean of the per-batch losses; NaN (rather than a ZeroDivisionError)
    # when the validation loader yields no batches.
    num_val_batches = len(val_dataloader)
    avg_val_loss = total_loss / num_val_batches if num_val_batches else float("nan")
    print(f"Validation Loss: {avg_val_loss}")

print("Training completed.")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
