#  Install the Required Libraries


In [1]:
!pip install transformers datasets accelerate evaluate torch

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8

# Load the Dataset

In [2]:
from datasets import load_dataset

# Load the dataset
dataset = load_dataset("SKNahin/bengali-transliteration-data")

# Split the dataset (80% train, 20% validation)
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Separate train and validation sets
train_data = dataset["train"]
val_data = dataset["test"]

# Use only subsets of train and validation data
train_data = train_data.select(range(1500))  # Use first 2000 samples
val_data = val_data.select(range(300))      # Use first 400 samples

# Display a sample
print("Sample data:")
print(train_data[0])
print(f"Training data size: {len(train_data)}")
print(f"Validation data size: {len(val_data)}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]

Sample data:
{'bn': 'এটা কোনো পোস্ট হলো মিয়া আবাল', 'rm': 'eta kono post holo mia abal'}
Training data size: 1500
Validation data size: 300


In [3]:
train_data.shape

(1500, 2)

In [4]:
val_data.shape

(300, 2)

# Preprocessing with Tokenization

In [5]:
from transformers import AutoTokenizer

# Select a pre-trained model (you can experiment with these)
model_checkpoints = {
    "mT5": "google/mt5-small",
}

# Initialize tokenizer for mT5
selected_model = "mT5"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoints[selected_model])

# Tokenization function
def preprocess_function(batch):
    inputs = tokenizer(batch["bn"], truncation=True, max_length=64, padding="max_length")
    targets = tokenizer(batch["rm"], truncation=True, max_length=64, padding="max_length")
    inputs["labels"] = targets["input_ids"]
    return inputs

# Preprocess datasets
train_data = train_data.map(preprocess_function, batched=True, num_proc=4)
val_data = val_data.map(preprocess_function, batched=True, num_proc=4)


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Map (num_proc=4):   0%|          | 0/1500 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/300 [00:00<?, ? examples/s]

# Data Formatting for PyTorch

In [6]:
import torch
from datasets import DatasetDict

# Define data collator to dynamically pad inputs and labels during training
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model_checkpoints[selected_model])

# Convert datasets to PyTorch format
train_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_data.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


# Load Pre-trained Model

In [7]:
from transformers import AutoModelForSeq2SeqLM

# Load the pre-trained model
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoints[selected_model])


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

#  Define Training Pipeline

In [8]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="/results",
    evaluation_strategy="steps",
    eval_steps=1000,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,  # Reduce epochs
    save_steps=1000,
    save_total_limit=2,
    logging_steps=500,
    predict_with_generate=False,  # Disable beam search during training
    fp16=torch.cuda.is_available(),  # Mixed precision
    logging_dir='./logs',
    gradient_accumulation_steps=2  # Simulate larger batch size
)


# Define the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer
)



  trainer = Seq2SeqTrainer(


# Model training

In [9]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 37


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 37


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss


TrainOutput(global_step=188, training_loss=0.0, metrics={'train_runtime': 385.8579, 'train_samples_per_second': 7.775, 'train_steps_per_second': 0.487, 'total_flos': 198281134080000.0, 'train_loss': 0.0, 'epoch': 2.0})

# Save the Model

In [10]:
model.save_pretrained("/banglish-to-bangla")
tokenizer.save_pretrained("/banglish-to-bangla")


('/banglish-to-bangla/tokenizer_config.json',
 '/banglish-to-bangla/special_tokens_map.json',
 '/banglish-to-bangla/spiece.model',
 '/banglish-to-bangla/added_tokens.json',
 '/banglish-to-bangla/tokenizer.json')

#  Predict with Beam Search

In [11]:
def predict(text, num_beams=4):
    inputs = tokenizer(text, return_tensors="pt", max_length=64, truncation=True).to("cuda")
    outputs = model.generate(
        inputs.input_ids,
        max_length=64,
        num_beams=num_beams,
        early_stopping=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

text = "ami test korei code disi..."
translated = predict(text)
print(f"Input: {text}")
print(f"Output: {translated}")


Input: ami test korei code disi...
Output: <0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x02>


# Prediction 2

In [15]:
# Test a sample Banglish input
text = "ami test korei code disi..."
translated = predict(text)
print(f"Input (Banglish): {text}")
print(f"Output (Bangla): {translated}")


Input (Banglish): ami test korei code disi...
Output (Bangla): <0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x02>


In [16]:
print(f"Output (Bangla): {translated.encode('utf-8').decode('utf-8')}")


Output (Bangla): <0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x04><0x02>
