## Import libraries

In [1]:
!pip install evaluate
!pip install sacrebleu
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate
import torch

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.0/104.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading portalocker-3.0.0-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-3.0.0 sacrebleu-2.4.

## Task 1: Load the Dataset
- Spliting the dataset into training and validation subsets (90/10) to train the model on a majority of the data while validating on unseen data.

In [2]:
dataset = load_dataset("SKNahin/bengali-transliteration-data")

dataset = dataset['train'].train_test_split(test_size=0.1)
train_data = dataset['train']
val_data = dataset['test']

README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]


## Task 2: Data Preprocessing
- Tokenization is performed on both Banglish and Bangla text to convert them into numerical representations that the model can understand.
- Padding and truncation ensure uniform input length, allowing the model to process the data in batches efficiently.

## Task 3: Choose a Model
- T5 (`t5-small`) is selected for its efficiency in sequence-to-sequence tasks, multilingual capabilities, and light computational footprint, ideal for low-resource settings.

In [3]:
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def preprocess(samples):
    inputs = samples['rm']
    targets = samples['bn']
    
    model_inputs = tokenizer(inputs, max_length=128, padding='max_length', truncation=True)
    labels = tokenizer(targets, max_length=128, padding='max_length', truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

tokenized_train = train_data.map(preprocess, batched=True)
tokenized_val = val_data.map(preprocess, batched=True)

tokenized_train = tokenized_train.remove_columns(["rm", "bn"])
tokenized_val = tokenized_val.remove_columns(["rm", "bn"])

tokenized_train.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_val.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

metric = evaluate.load("sacrebleu")

def compute_metrics(p):
    predictions, labels = p
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    return metric.compute(predictions=decoded_preds, references=[[label] for label in decoded_labels])

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Map:   0%|          | 0/4505 [00:00<?, ? examples/s]

Map:   0%|          | 0/501 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

## Task 4: Train the Model
- The model is fine-tuned with appropriate hyperparameters like learning rate, batch size, and epochs, ensuring efficient training and model convergence.
- Mixed precision (`fp16=True`) is enabled to speed up training and reduce memory usage during fine-tuning.

In [5]:
training_args = TrainingArguments(
    output_dir="./results",          
    eval_strategy="no",  
    learning_rate=5e-5,           
    per_device_train_batch_size=2,  
    num_train_epochs=2,             
    weight_decay=0.01,              
    logging_dir="./logs",           
    logging_steps=100,               
    save_strategy="epoch",           
    report_to="none",                
    fp16=True,                       
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [6]:
trainer.train()
torch.cuda.empty_cache()



Step,Training Loss
100,2.8926
200,0.1373
300,0.0608
400,0.0445
500,0.0341
600,0.0315
700,0.0311
800,0.0297
900,0.0305
1000,0.0269




In [6]:
import os

os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
torch.cuda.empty_cache()

checkpoint_dir = './results/checkpoint-2254'
checkpoint = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_dir)

training_args = TrainingArguments(
    output_dir="./results",       
    per_device_eval_batch_size=1,
    no_cuda=False, 
    fp16=True,  
)

trainer = Trainer(
    model=checkpoint,
    args=training_args,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

eval_results = trainer.evaluate()
print("Evaluation results:", eval_results)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.34 GiB. GPU 0 has a total capacity of 14.74 GiB of which 3.32 GiB is free. Process 2431 has 11.42 GiB memory in use. Of the allocated memory 10.70 GiB is allocated by PyTorch, and 541.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)