In [1]:
!pip install --upgrade datasets
!pip install tensorboard
!pip install evaluate
!pip install sacrebleu
!pip install sentencepiece
!pip install accelerate

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.0.0
    Uninstalling datasets-3.0.0:
      Successfully uninstalled datasets-3.0.0
Successfully installed datasets-3.0.1
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import evaluate
from datasets import load_dataset
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Trainer, TrainingArguments


dataset = load_dataset("persiannlp/parsinlu_translation_en_fa", cache_dir="./cache")
model_name = "google/mt5-small"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name, low_cpu_mem_usage=True)
model = model.to('cuda')

train_size = len(dataset['train'])
valid_size = len(dataset['validation'])

train_subset = dataset['train'].shuffle(seed=42).select(range(min(100000, train_size)))

valid_subset = dataset['validation'].shuffle(seed=42).select(range(min(100000, valid_size)))

0000.parquet:   0%|          | 0.00/135M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/12.2M [00:00<?, ?B/s]

parsinlu-repo/validation/0000.parquet:   0%|          | 0.00/242k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1621665 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/48359 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2137 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [3]:
print(train_subset.shape)
print(valid_subset.shape)

(100000, 3)
(2137, 3)


In [4]:
def preprocess_function(examples):
    inputs = [f"translate English to Persian: {source}" for source in examples['source']]

    targets = [target[0] if isinstance(target, list) else target for target in examples['targets']]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length").input_ids


    labels = [[(label if label != tokenizer.pad_token_id else -100) for label in label_set] for label_set in labels]


    model_inputs["labels"] = labels

    return model_inputs

In [5]:
tokenized_train = train_subset.map(preprocess_function, batched=True, remove_columns=["category"])
tokenized_valid = valid_subset.map(preprocess_function, batched=True, remove_columns=["category"])

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]



Map:   0%|          | 0/2137 [00:00<?, ? examples/s]

In [6]:
sample_sentence = "This is a test sentence for tokenization."

tokenized_output = tokenizer(sample_sentence)

print("Tokenized Output:", tokenized_output)
print("Token IDs:", tokenized_output['input_ids'])
print("Attention Mask:", tokenized_output['attention_mask'])

tokens = tokenizer.convert_ids_to_tokens(tokenized_output['input_ids'])
print("Tokens:", tokens)


Tokenized Output: {'input_ids': [1494, 339, 259, 262, 2978, 259, 98923, 332, 259, 67185, 14534, 260, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
Token IDs: [1494, 339, 259, 262, 2978, 259, 98923, 332, 259, 67185, 14534, 260, 1]
Attention Mask: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Tokens: ['▁This', '▁is', '▁', 'a', '▁test', '▁', 'sentence', '▁for', '▁', 'token', 'ization', '.', '</s>']


In [7]:
for param in model.encoder.parameters():
    param.requires_grad = False

for param in model.encoder.block[-2:].parameters():
    param.requires_grad = True

In [8]:
import logging
import torch
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

writer = SummaryWriter(log_dir='./logs')

logging.basicConfig(filename='./logs/training.log',
                    filemode='a',
                    format='%(asctime)s - %(message)s', 
                    level=logging.INFO)

logger = logging.getLogger()

tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_valid.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

train_dataloader = torch.utils.data.DataLoader(tokenized_train, batch_size=16, shuffle=True)
valid_dataloader = torch.utils.data.DataLoader(tokenized_valid, batch_size=16)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

checkpoint_dir = '/kaggle/working/'

for epoch in range(num_epochs):
    logger.info(f"Epoch {epoch + 1}/{num_epochs} started")

    model.train()
    total_train_loss = 0

    for step, batch in enumerate(tqdm(train_dataloader), start=1):
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

        if step % 100 == 0:
            logger.info(f"Step {step}: Training loss = {loss.item()}")
            writer.add_scalar('Training Loss', loss.item(), global_step=step + (epoch * len(train_dataloader)))

    avg_train_loss = total_train_loss / len(train_dataloader)
    logger.info(f"Training loss after epoch {epoch + 1}: {avg_train_loss}")
    writer.add_scalar('Average Training Loss', avg_train_loss, global_step=epoch)

    model.eval()
    total_eval_loss = 0
    for batch in valid_dataloader:
        with torch.no_grad():
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_eval_loss += loss.item()

    avg_eval_loss = total_eval_loss / len(valid_dataloader)
    logger.info(f"Validation loss after epoch {epoch + 1}: {avg_eval_loss}")
    writer.add_scalar('Validation Loss', avg_eval_loss, global_step=epoch)

    checkpoint = {
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'train_loss': avg_train_loss,
        'valid_loss': avg_eval_loss,
    }
    torch.save(checkpoint, checkpoint_dir + f'model_epoch_{epoch + 1}.pth')
    logger.info(f"Checkpoint saved for epoch {epoch + 1}")

    
final_model_path = checkpoint_dir + 'final_model.pth'
torch.save(model.state_dict(), final_model_path)
logger.info("Final model saved for inference.")
writer.close()

100%|██████████| 6250/6250 [37:14<00:00,  2.80it/s]
100%|██████████| 6250/6250 [37:13<00:00,  2.80it/s]
100%|██████████| 6250/6250 [37:14<00:00,  2.80it/s]


In [9]:
import torch
from transformers import MT5Tokenizer, MT5ForConditionalGeneration

model_name = '/kaggle/working/final_model.pth' 
tokenizer = MT5Tokenizer.from_pretrained('google/mt5-small')
model = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')
model.load_state_dict(torch.load(model_name))
model = model.to('cuda')

input_text = "He thanked all fellow bloggers and organizations that showed support."

inputs = tokenizer(input_text, return_tensors="pt", padding=True).to('cuda')

with torch.no_grad():
    generated_tokens = model.generate(
        inputs['input_ids'], 
        attention_mask=inputs['attention_mask'], 
        max_length=50,
        num_beams=4,
        early_stopping=True
    )

predicted_translation = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)

print(f"Generated Translation: {predicted_translation}")


  model.load_state_dict(torch.load(model_name))  # Load the trained model state


Generated Translation: همه افراد و افراد و افرادی که در این رابطه بودند، همه افراد و گروه های مختلفی بودند.
