In [1]:
!pip install datasets
import re
!pip install evaluate
!pip install sacrebleu

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [2]:
import os
import re
import gc
import torch
import pandas as pd
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed
)


def configure_device():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    torch.cuda.empty_cache()
    if torch.cuda.is_available():
        torch.cuda.set_per_process_memory_fraction(0.9)
        torch.backends.cudnn.benchmark = True
    return device

device = configure_device()
set_seed(42)

def load_and_preprocess_dataset(dataset_name, train_ratio=0.8):

    dataset = load_dataset(dataset_name)
    df = dataset['train'].to_pandas()
    df.rename(columns={'rm': 'Banglish', 'bn': 'Bangla'}, inplace=True)


    train_df, val_df = train_test_split(df, test_size=1 - train_ratio, random_state=42)
    return train_df, val_df

def preprocess_text(text):
    if pd.isna(text): return ""
    text = ' '.join(text.split())
    text = re.sub(r'[!@#$%^&*()_+={}\[\]:;<>?,./]', '', text)
    replacements = {
        'aa': 'a', 'ee': 'i', 'oo': 'u',
        'kh': 'k', 'sh': 's', 'th': 't',
        'dh': 'd', 'ph': 'f', 'gh': 'g',
        'ch': 'c', 'nn': 'n', 'mm': 'm'
    }
    for old, new in replacements.items():
        text = text.replace(old, new)
    text = re.sub(r'\d+', '<NUM>', text)
    return text.lower()

def preprocess_dataset(df):
    df = df.dropna()
    df['Banglish'] = df['Banglish'].apply(preprocess_text)
    df = df.drop_duplicates(subset=['Banglish']).reset_index(drop=True)

    def validate_lengths(example):
        try:
            banglish_len = len(str(example['Banglish']))
            bangla_len = len(str(example['Bangla']))
            ratio = banglish_len / bangla_len if bangla_len > 0 else float('inf')
            return 0.5 <= ratio <= 2.5
        except:
            return False

    dataset = Dataset.from_pandas(df)
    return dataset.filter(validate_lengths)

train_data, val_data = load_and_preprocess_dataset("SKNahin/bengali-transliteration-data")
train_dataset = preprocess_dataset(train_data)
val_dataset = preprocess_dataset(val_data)


tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")
tokenizer.src_lang = "en_XX"
tokenizer.tgt_lang = "bn_IN"

def tokenize_function(examples):
    inputs = [str(text) for text in examples['Banglish']]
    targets = [str(text) for text in examples['Bangla']]
    model_inputs = tokenizer(
        inputs,
        max_length=128,
        padding="max_length",
        truncation=True
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=128,
            padding="max_length",
            truncation=True
        )
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=train_dataset.column_names)
val_dataset = val_dataset.map(tokenize_function, batched=True, remove_columns=val_dataset.column_names)


model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50").to(device)

training_args = Seq2SeqTrainingArguments(
    output_dir="./banglish_translator",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    num_train_epochs=8,
    save_total_limit=2,
    predict_with_generate=True,
    logging_dir="./logs",
    logging_steps=100,
    save_strategy="epoch",
    weight_decay=0.01,
    warmup_steps=300,
    fp16=True,
    report_to=[],
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


trainer.train()
trainer.save_model("./final_model")
tokenizer.save_pretrained("./final_model")


def translate_banglish_to_bengali(text, model, tokenizer, device):
    text = preprocess_text(text)
    inputs = tokenizer(text, return_tensors="pt", max_length=128, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id["bn_IN"],
            max_length=128,
            num_beams=5,
            length_penalty=1.0,
            early_stopping=True
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


model.eval()
example_translation = translate_banglish_to_bengali("ami tomake valobashi", model, tokenizer, device)
print(example_translation)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/300 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/333k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5006 [00:00<?, ? examples/s]

Filter:   0%|          | 0/3748 [00:00<?, ? examples/s]

Filter:   0%|          | 0/989 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/531 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/649 [00:00<?, ?B/s]

Map:   0%|          | 0/3747 [00:00<?, ? examples/s]



Map:   0%|          | 0/988 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/2.44G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/261 [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,9.5394,0.579122
2,0.4567,0.132232
3,0.1863,0.102287
4,0.1148,0.094896
5,0.0678,0.093307
6,0.0426,0.094918
7,0.0218,0.09662


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


আমি তোমাকে ভালোবাসি
