In [1]:
import os

os.chdir("..")

In [2]:
import transformers
from transformers.utils import send_example_telemetry

send_example_telemetry("translation_notebook", framework="pytorch")

In [3]:
model_checkpoint = "csebuetnlp/banglat5_small"
# model_checkpoint = "t5-small"

In [4]:
split_config = {
    "train": "./datasets/correction_train.jsonl",
    "test": "./datasets/correction_test.jsonl",
    "val": "./datasets/correction_val.jsonl",
}

In [5]:
from datasets import load_dataset
import evaluate

raw_datasets = load_dataset("json", data_files=split_config)
metric = evaluate.load("sacrebleu")

The `dataset` object itself is [`DatasetDict`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasetdict), which contains one key for the training, validation and test set:

In [6]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['from', 'to'],
        num_rows: 553064
    })
    test: Dataset({
        features: ['from', 'to'],
        num_rows: 69134
    })
    val: Dataset({
        features: ['from', 'to'],
        num_rows: 69133
    })
})

To access an actual element, you need to select a split first, then give an index:

In [7]:
raw_datasets["train"][0]

{'from': 'কিন্তু ফের অসুস্থ বোধ করায় তাকে আবারও হাসপাতালে  ভর্তি  করা  হয় ।',
 'to': 'কিন্তু ফের অসুস্থ বোধ করায় তাকে আবারও হাসপাতালে ভর্তি করা হয়।'}

In [8]:
fake_preds = ["hello there", "general kenobi"]
fake_labels = [["hello there"], ["general kenobi"]]
metric.compute(predictions=fake_preds, references=fake_labels)

{'score': 0.0,
 'counts': [4, 2, 0, 0],
 'totals': [4, 2, 0, 0],
 'precisions': [100.0, 100.0, 0.0, 0.0],
 'bp': 1.0,
 'sys_len': 4,
 'ref_len': 4}

In [9]:
import sentencepiece

In [10]:
from transformers import AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, legacy=True)
tokenizer = AutoTokenizer.from_pretrained(
    "./model_artifacts/pretrained_tokenizer_generic/"
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
tokenizer("Hello, this one sentence!")

{'input_ids': [15525, 20652, 11, 5785, 6867, 15512, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [12]:
tokenizer(["Hello, this one sentence!", "This is another sentence."])

{'input_ids': [[15525, 20652, 11, 5785, 6867, 15512, 0], [27764, 1532, 17169, 15512, 13]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}

In [13]:
tokenizer(["good morning ."])

{'input_ids': [[70, 3404, 19771, 876]], 'attention_mask': [[1, 1, 1, 1]]}

In [14]:
tokenizer(text_target="fa")

{'input_ids': [69, 64], 'attention_mask': [1, 1]}

In [15]:
print(tokenizer(text_target=["Hello, this one sentence!", "This is another sentence."]))

{'input_ids': [[15525, 20652, 11, 5785, 6867, 15512, 0], [27764, 1532, 17169, 15512, 13]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}


In [16]:
max_input_length = 128
max_target_length = 128
source_lang = "from"
target_lang = "to"


def preprocess_function(examples):
    inputs = examples["from"]
    targets = examples["to"]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)
    labels = tokenizer(
        text_target=targets, max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [17]:
preprocess_function(raw_datasets["train"][:2])

{'input_ids': [[265, 263, 264, 262, 266, 273, 329, 260, 261, 635, 273, 272, 262, 296, 275, 278, 298, 306, 259, 270, 307, 259, 265, 260, 480, 259, 625, 292, 259, 647, 259, 266, 259, 267, 260, 220, 737, 262, 266, 263, 220, 306, 259, 220, 339, 486], [266, 403, 261, 628, 556, 260, 313, 11, 299, 274, 260, 264, 3365, 319, 411, 468, 259, 261, 282, 262, 344, 286, 265, 292, 263, 272, 260, 1267, 321, 260, 289, 260, 270, 260, 277, 260, 9992, 260, 288, 16777, 260, 301, 259, 548, 278, 399, 838]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[265, 263, 264, 262, 266, 273, 329, 260, 261, 635, 273, 272, 262, 296, 275, 278, 298, 306, 259, 270, 307, 259, 265, 260, 480, 259, 625, 292, 259, 647, 259, 266, 259, 267, 260, 737, 262, 266, 263, 3

In [18]:
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/553064 [00:00<?, ? examples/s]

Map:   0%|          | 0/69134 [00:00<?, ? examples/s]

Map:   0%|          | 0/69133 [00:00<?, ? examples/s]

In [19]:
from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

# model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, vocab_size=30_000, ignore_mismatched_sizes=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [20]:
batch_size = 32
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=100,
    predict_with_generate=True,
    fp16=False,
    push_to_hub=False,
)

In [21]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [22]:
import numpy as np


def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [23]:
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"].select(range(batch_size * 10)),
    eval_dataset=tokenized_datasets["train"].select(range(batch_size * 3)),
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [24]:
tokenized_datasets["train"]

Dataset({
    features: ['from', 'to', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 553064
})

In [25]:
try:
    trainer.train()
except KeyboardInterrupt:
    print("wtf!")

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,No log,3.64461,0.4568,20.0
2,No log,3.401154,1.3521,20.0
3,No log,3.198591,1.678,20.0
4,No log,3.038283,1.9902,20.0
5,No log,2.913416,1.8845,20.0
6,No log,2.730365,2.1441,20.0
7,No log,2.637524,2.1985,20.0
8,No log,2.545238,2.3355,20.0
9,No log,2.446885,2.0652,20.0
10,No log,2.356518,2.1707,20.0


wtf!


# Test

In [26]:
import torch

In [27]:
torch.Tensor(tokenizer("আজ শুক্রবার").input_ids).int()

tensor([  92, 1251,    1], dtype=torch.int32)

In [32]:
input_ids = tokenizer(
    "দুর্ভাগ্যজননকভাববেে এমন একটি দৃশ্যপটের সঙ্গে  এইই গানটটি জুড়ে দেওয়া হয়েছে,, যা কেববল  জঘন্্যই নয় বরং অসহনীয়।",
    return_tensors="pt",
).input_ids.cuda()

In [29]:
tokenized_datasets["train"].select(range(batch_size * 10))[:100]

{'from': ['কিন্তু ফের অসুস্থ বোধ করায় তাকে আবারও হাসপাতালে  ভর্তি  করা  হয় ।',
  'তাঁর ড়মতেও,ভ কেন সবসময় যৌনতার প্রতীক হিসেউবঠে মেয়েদেরকযেই রদকেখাসনো হবে?',
  '‘মনের মতোভ মানুষ প়াএইলাম না’ চলচ্চিত্রের জন্য যৌথভামবে শ্রেষ্ঢ়ঠ শিল্প নির্দেশক হয়েছেন রহমত উঞল্লাহ বাসুঠ ও ফযরিদ আহমেদ।',
  'শাশশ্বত সচদেব এবং  জাসললিন রয়াল এর সসঙ্গগীত  পরিচালনা কেরছেন।',
  'সাজিদ খ ানভের বিরুদ্ধে বলউিউডর অখ ভিনমেড়ত্রী রন\u200c্যাচওেল হোয়াইটের যৌন হয়রানি র অভিযোগ শইুনে ক্ষেপে যান অক্ষয় কু ম ার।',
  'লফে কোন চ্যানেলে কোন নাঠক প্রচারিত হচ্ছে তাও বুঝতে পারছে না দর্শক।',
  'এই  গ াটির ও সঙ্গ ী থ আয়োজন করেছেন াজা ক্যশেফ।',
  'আশীর্বাধ সিনেমায় মাহির বিপরীতে চুক্তিবদ্ধ হয়েছেন রোশান।',
  'অনেক জনপ্রিয় শিল্পী এতে অভিনয় করছেন।',
  'তার কণ্ঠে  অ সংখ্ য নজরুল সঙ ্ গী ত শ ্রোতারা শুনে ছেন।',
  'প্রসঙ্গত, ১৯৯৩ সালে মুক্তি পাওয়া ‘চাঁদের আলো’ সিনেমা দিয়ে আলোচনায় আসেন ওমর সানি।',
  'এদিকে ারতেও অনেকেই সন্দেহ প্রকাশ করছেনশ্রদেবী মৃত্যু মটে স্বাভবক ছিল ন।',
  'দদুর্ভাগ্যজননকভাববেে এমন একটি দৃশ্যপটের সঙ্গে  এইই গানটটি জুড়ে দেওয়া 

In [30]:
input_ids

tensor([[ 3244, 11324, 17746,   897,    70,   141,  3712,   348,    10,   135,
            60, 21410,     6,    75,    14,    13,   405,   393,    77,  1395,
           170,    55,     4,     4,   125,    96,   225,  1691,  2107, 12874,
           274,   346,    13,    97,   841, 23297,     3,     1]],
       device='cuda:0')

In [33]:
with torch.no_grad():
    ret = model.generate(input_ids=input_ids)
print(tokenizer.batch_decode(ret))

['<pad><extra_id_0> এমন একটি দৃশ্যপটের সঙ্গে এই গানটি জুড়ে দেওয়া হয়েছে, যা কেবল জঘন্যই নয় বরং অসহনীয়']
