In [6]:
# Install required libraries
!pip install datasets transformers torch scikit-learn

# Import necessary libraries
import pandas as pd
import numpy as np
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from sklearn.model_selection import train_test_split

# Step 1: Load the Dataset
# Load the Hugging Face dataset
dataset = load_dataset("SKNahin/bengali-transliteration-data")



In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 5006
    })
})

In [32]:
# Check the column names in the dataset
print(dataset_dict["train"].column_names)


['bn', 'rm']


In [33]:
# Split the dataset into training and validation subsets (80/20 split)
dataset_dict = dataset['train'].train_test_split(test_size=0.2, seed=42)

In [34]:
# Step 2: Data Preprocessing
# Load a tokenizer suitable for sequence-to-sequence tasks
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")

In [35]:
# Inspect dataset rows for None or empty values
invalid_rows = dataset_dict.filter(lambda example: not example['rm'] or not example['bn'])
print(invalid_rows)


Filter:   0%|          | 0/4004 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1002 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 0
    })
    test: Dataset({
        features: ['bn', 'rm'],
        num_rows: 0
    })
})


In [36]:
# Validate dataset structure
print(dataset_dict)

# Check for issues in the first few examples
print(dataset_dict["train"][0:5])  # Inspect the first 5 rows


DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 4004
    })
    test: Dataset({
        features: ['bn', 'rm'],
        num_rows: 1002
    })
})
{'bn': ['‡¶è‡¶ü‡¶æ ‡¶ï‡ßã‡¶®‡ßã ‡¶™‡ßã‡¶∏‡ßç‡¶ü ‡¶π‡¶≤‡ßã ‡¶Æ‡¶ø‡ßü‡¶æ ‡¶Ü‡¶¨‡¶æ‡¶≤', '‡¶°‡¶ø‡¶≤‡¶ø‡¶ü ‡¶ï‡¶∞‡ßá‡¶® ‡¶™‡ßã‡¶∏‡ßç‡¶ü‚Ä¶', '‡¶ú‡¶ø ‡¶≠‡¶æ‡¶á ‡¶Ö‡¶∏‡¶æ‡¶ß‡¶æ‡¶∞‡¶£ ‡¶π‡¶á‡¶õ‡ßá ‡¶¨‡¶æ‡¶ü ‡¶´‡ßÅ‡¶≤ ‡¶°‡¶ø‡¶ü‡ßá‡¶á‡¶≤‡¶∏ ‡¶ú‡¶æ‡¶®‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡¶≤‡¶æ‡¶Æ ‡¶®‡¶æ, ‡¶Ü‡¶∞ ‡¶è‡¶ü‡¶æ ‡¶ï‡¶ø ‡¶¨‡¶ø‡¶°‡¶ø ‡¶§‡ßá ‡¶∏‡¶¨ ‡¶ú‡¶æ‡ßü‡¶ó‡¶æ‡¶§‡ßá‡¶á ‡¶™‡¶æ‡¶ì‡ßü‡¶æ ‡¶Ø‡¶æ‡¶¨‡ßá?', '‡¶≠‡¶æ‡¶á...‡¶Ü‡¶∞ ‡ß®‡ß™ ‡¶Ü‡¶ì‡ßü‡¶æ‡¶∞ ‡¶ì‡ßü‡ßá‡¶ü ‡¶ï‡¶∞‡ßá‡¶®..‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶è‡¶´‡¶¨‡¶ø ‡¶Ü‡¶á‡¶°‡¶ø ‡¶¨‡ßç‡¶Ø‡¶æ‡¶ï ‡¶Ü‡¶∏‡¶¨‡ßá ‡¶Ü‡¶Æ‡¶ø ‡¶∞‡¶ø‡¶ï‡ßÅ‡ßü‡ßá‡¶∏‡ßç‡¶ü ‡¶™‡¶æ‡¶†‡¶æ‡ßü‡¶æ ‡¶¶‡¶ø‡¶õ‡¶ø...', '‡¶è‡¶ï ‡¶∏‡¶æ‡¶•‡ßá ‡¶ï‡ßü‡¶ü‡¶æ ‡¶è‡¶ï‡¶æ‡¶â‡¶®‡ßç‡¶ü ‡¶≤‡¶ó‡¶á‡¶® ‡¶ï‡¶∞‡ßá ‡¶∞‡¶æ‡¶ñ‡¶æ ‡¶Ø‡¶æ‡ßü ‡¶¨‡ßç‡¶∞‡ßã?'], 'rm': ['eta kono post holo mia abal', 'Delete koren post‚Ä¶', 'ji bai osadaron hoice but full detailes 

In [39]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load dataset
dataset_dict = load_dataset("SKNahin/bengali-transliteration-data")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")

# Filter invalid rows (None or empty)
dataset_dict = dataset_dict.filter(lambda example: example['rm'] is not None and example['bn'] is not None)
dataset_dict = dataset_dict.filter(lambda example: len(example['rm'].strip()) > 0 and len(example['bn'].strip()) > 0)

Filter:   0%|          | 0/5006 [00:00<?, ? examples/s]

In [40]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['bn', 'rm'],
        num_rows: 5006
    })
})

In [41]:
def preprocess_function(examples):
    inputs = examples['rm']  # Banglish text
    targets = examples['bn']  # Bengali text

    # Tokenize inputs
    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    # Tokenize targets
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")

    # Add labels to the model inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [42]:
# Apply preprocessing
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/5006 [00:00<?, ? examples/s]

In [43]:
# Inspect the tokenized dataset
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['bn', 'rm', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 5006
    })
})


In [44]:
print(dataset_dict['train'][0:5])


{'bn': ['‡¶∏‡ßç‡¶ï‡ßç‡¶∞‡ßã‡¶≤ ‡¶ï‡¶∞‡ßá ‡ß®‡ß¶/‡ß©‡ß¶ ‡¶∏‡ßá‡¶ï‡ßá‡¶®‡ßç‡¶° ‡¶è‡¶∞ ‡¶≠‡¶ø‡¶°‡¶ø‡¶ì ‡¶™‡¶æ‡¶® ‡¶®‡¶æ‡¶á???', '‡¶ì ‡¶ó‡ßÅ‡¶≤‡¶æ ‡¶ü‡¶∞‡ßá‡¶®‡ßç‡¶ü ‡¶∏‡¶æ‡¶á‡¶ü ‡¶è ‡¶™‡¶æ‡¶¨‡ßá‡¶®', '‡¶≠‡¶ï‡ßç‡¶ï‡¶∞ ‡¶ö‡¶ï‡ßç‡¶ï‡¶∞ ‡¶™‡ßã‡¶∏‡ßç‡¶ü ‡¶è‡¶ï‡¶ü‡¶æ ‡¶ï‡¶∞‡¶≤‡ßá‡¶á ‡¶è‡¶™‡ßç‡¶∞‡ßÅ‡¶≠‡¶°.‚Ä¶ ‡¶®‡¶ø‡¶∂‡ßç‡¶ö‡¶á  ‡¶ò‡¶æ‡¶¨‡¶≤‡¶æ ‡¶Ü‡¶õ‡ßá', '‡¶Ü‡¶Æ‡¶ø ‡¶ü‡ßá‡¶∏‡ßç‡¶ü ‡¶ï‡¶∞‡ßá‡¶á ‡¶ï‡ßã‡¶° ‡¶¶‡¶ø‡¶õ‡¶ø‚Ä¶', '‡¶è‡¶§‡ßã ‡¶ï‡¶∑‡ßç‡¶ü‡ßá‡¶∞ ‡¶ï‡¶ø ‡¶Ü‡¶õ‡ßá ‡¶∏‡¶æ‡¶ï‡¶ø‡¶¨‡¶ì‡ßü‡¶æ‡¶™.‡¶ü‡¶ï,‡¶∏‡¶æ‡¶ï‡¶ø‡¶¨‡¶ì‡ßü‡¶æ‡¶™.‡¶Æ‡¶≤&‡¶è‡¶Ü‡¶á‡¶ì‡¶≠‡¶ø‡¶°‡¶ø‡¶ì‡¶°‡ßç‡¶≤.‡¶Æ‡¶≤ ‡¶•‡ßá‡¶ï‡ßá ‡¶∏‡¶π‡¶ú‡ßá‡¶á ‡¶°‡¶æ‡¶â‡¶®‡¶≤‡ßã‡¶° ‡¶ï‡¶∞‡¶æ ‡¶Ø‡¶æ‡ßü'], 'rm': ['scroll kore 20/30 second er video pann nai???', 'o gula Torrent site e paben', 'vokkor chokkor post akta korlei approved‚Ä¶. nishchoi ghabla ache', 'ami test koreii code disi‚Ä¶', 'eto koster ki ache shakibwap.tk,shakibwap.ml&aiovideodl.ml theke shohojei downlod kora jay']}


In [45]:
print(tokenized_dataset["train"][0:5])


{'bn': ['‡¶∏‡ßç‡¶ï‡ßç‡¶∞‡ßã‡¶≤ ‡¶ï‡¶∞‡ßá ‡ß®‡ß¶/‡ß©‡ß¶ ‡¶∏‡ßá‡¶ï‡ßá‡¶®‡ßç‡¶° ‡¶è‡¶∞ ‡¶≠‡¶ø‡¶°‡¶ø‡¶ì ‡¶™‡¶æ‡¶® ‡¶®‡¶æ‡¶á???', '‡¶ì ‡¶ó‡ßÅ‡¶≤‡¶æ ‡¶ü‡¶∞‡ßá‡¶®‡ßç‡¶ü ‡¶∏‡¶æ‡¶á‡¶ü ‡¶è ‡¶™‡¶æ‡¶¨‡ßá‡¶®', '‡¶≠‡¶ï‡ßç‡¶ï‡¶∞ ‡¶ö‡¶ï‡ßç‡¶ï‡¶∞ ‡¶™‡ßã‡¶∏‡ßç‡¶ü ‡¶è‡¶ï‡¶ü‡¶æ ‡¶ï‡¶∞‡¶≤‡ßá‡¶á ‡¶è‡¶™‡ßç‡¶∞‡ßÅ‡¶≠‡¶°.‚Ä¶ ‡¶®‡¶ø‡¶∂‡ßç‡¶ö‡¶á  ‡¶ò‡¶æ‡¶¨‡¶≤‡¶æ ‡¶Ü‡¶õ‡ßá', '‡¶Ü‡¶Æ‡¶ø ‡¶ü‡ßá‡¶∏‡ßç‡¶ü ‡¶ï‡¶∞‡ßá‡¶á ‡¶ï‡ßã‡¶° ‡¶¶‡¶ø‡¶õ‡¶ø‚Ä¶', '‡¶è‡¶§‡ßã ‡¶ï‡¶∑‡ßç‡¶ü‡ßá‡¶∞ ‡¶ï‡¶ø ‡¶Ü‡¶õ‡ßá ‡¶∏‡¶æ‡¶ï‡¶ø‡¶¨‡¶ì‡ßü‡¶æ‡¶™.‡¶ü‡¶ï,‡¶∏‡¶æ‡¶ï‡¶ø‡¶¨‡¶ì‡ßü‡¶æ‡¶™.‡¶Æ‡¶≤&‡¶è‡¶Ü‡¶á‡¶ì‡¶≠‡¶ø‡¶°‡¶ø‡¶ì‡¶°‡ßç‡¶≤.‡¶Æ‡¶≤ ‡¶•‡ßá‡¶ï‡ßá ‡¶∏‡¶π‡¶ú‡ßá‡¶á ‡¶°‡¶æ‡¶â‡¶®‡¶≤‡ßã‡¶° ‡¶ï‡¶∞‡¶æ ‡¶Ø‡¶æ‡ßü'], 'rm': ['scroll kore 20/30 second er video pann nai???', 'o gula Torrent site e paben', 'vokkor chokkor post akta korlei approved‚Ä¶. nishchoi ghabla ache', 'ami test koreii code disi‚Ä¶', 'eto koster ki ache shakibwap.tk,shakibwap.ml&aiovideodl.ml theke shohojei downlod kora jay'], 'input_ids': [[250004, 192046, 20867, 387, 108355, 17932, 72, 120

In [46]:
from transformers import MBartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments


In [47]:
# Load the model
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")


pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

In [48]:
# Set up the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",                # Output directory for model checkpoints
    evaluation_strategy="epoch",          # Evaluate at the end of every epoch
    learning_rate=5e-5,                   # Learning rate
    per_device_train_batch_size=8,        # Batch size for training
    per_device_eval_batch_size=8,         # Batch size for evaluation
    weight_decay=0.01,                    # Weight decay for optimization
    save_total_limit=2,                   # Limit the number of saved checkpoints
    num_train_epochs=3,                   # Number of training epochs
    predict_with_generate=True,           # Use `generate` for evaluation
    logging_dir="./logs",                 # Directory for logging
    logging_steps=200,                    # Log every 200 steps
    save_strategy="epoch",                # Save model checkpoint at the end of each epoch
    push_to_hub=False                     # Set to True if you want to push to Hugging Face Hub
)



In [49]:
# Define the data collator
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [51]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m84.0/84.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [53]:
pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.3-py3-none-any.whl.metadata (51 kB)
[?25l     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m51.8/51.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.0.0-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.3-py3-none-any.whl (103 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m104.0/104.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.

In [54]:
import evaluate

# Define metrics for evaluation
metric = evaluate.load("sacrebleu")


In [55]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Remove padding
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    # Compute BLEU score
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [59]:
# Split the dataset into training (80%) and validation (20%)
train_test_split = dataset_dict["train"].train_test_split(test_size=0.2, seed=42)

In [66]:
from datasets import DatasetDict
# Create a new DatasetDict with train and validation splits
tokenized_dataset = DatasetDict({
    "train": train_test_split["train"],
    "validation": train_test_split["test"]
})

KeyError: 'bn'

In [61]:
print("Train dataset size:", len(tokenized_dataset["train"]))
print("Validation dataset size:", len(tokenized_dataset["validation"]))


Train dataset size: 4004
Validation dataset size: 1002


In [62]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Seq2SeqTrainer(


In [64]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    remove_unused_columns=False,  # Add this line
    logging_dir="./logs",
    logging_steps=100
)




In [67]:
# Remove unused columns (only keep the ones the model needs)
# train_dataset = train.remove_columns(["bn", "rm"])
train_dataset = tokenized_dataset["train"].remove_columns(["bn", "rm"])
# eval_dataset = validation.remove_columns(["bn", "rm"])
eval_dataset = tokenized_dataset["validation"].remove_columns(["bn", "rm"])


In [68]:
# Initialize the trainer with the updated datasets
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [None]:
# Train the model
trainer.train()

In [None]:
# Example inputs for inference
example_inputs = ["ami bangladesh theke bolchi", "valo lage bangla bolte"]

# Tokenize the inputs
inputs = tokenizer(example_inputs, return_tensors="pt", max_length=128, truncation=True, padding="max_length")

# Generate predictions
outputs = model.generate(inputs["input_ids"], max_length=128, num_beams=4)

# Decode the predictions
decoded_predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print("Translated Outputs:", decoded_predictions)
