## Translation from English to Hindi

In [1]:
!pip install datasets



In [2]:
from datasets import load_dataset

# Load the English-Hindi pair of the KDE4 corpus. `datasets` warns that
# `trust_remote_code=True` will be mandatory for this dataset, so it is
# passed explicitly instead of relying on the interactive/default behaviour.
raw_dataset = load_dataset("kde4", lang1="en", lang2="hi", trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
# Display the loaded DatasetDict to check its splits, features and row count.
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 97227
    })
})

In [4]:
# The corpus only ships a "train" split, so carve an evaluation set out of it:
# 90% is kept for training and the seed makes the split repeatable.
full_train = raw_dataset["train"]
split_dataset = full_train.train_test_split(train_size=0.9, seed=20)
split_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 87504
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 9723
    })
})

In [5]:
# Rename the held-out "test" split to "validation". The membership check keeps
# the cell safe to re-run: once the key has been moved, an unguarded
# pop("test") would raise KeyError.
if "test" in split_dataset:
    split_dataset["validation"] = split_dataset.pop("test")

In [6]:
# Peek at the first training record to see the layout of one example.
split_dataset['train'][0]

{'id': '14666',
 'translation': {'en': 'This button saves all your changes and exits the program.',
  'hi': 'यह बटन आपके सभी परिवर्तनों को सहेजता है तथा प्रोग्राम को बाहर कर देता है.'}}

In [7]:
!pip install sentencepiece



In [8]:
from transformers import pipeline

# Try the pretrained checkpoint on a single sentence before any fine-tuning.
model_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
translator = pipeline(task="translation", model=model_checkpoint)
translator("Default to expanded threads")



[{'translation_text': 'सभी लड़ी फैलाएँ (A)'}]

In [9]:
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the checkpoint. `return_tensors` is an
# argument of the tokenizer *call*, not of `from_pretrained`, so it is not
# passed here; request tensors at call time where they are needed.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [10]:
# Show only the "translation" field of the first training record: a dict
# holding the sentence under one key per language.
split_dataset["train"][0]["translation"]

{'en': 'This button saves all your changes and exits the program.',
 'hi': 'यह बटन आपके सभी परिवर्तनों को सहेजता है तथा प्रोग्राम को बाहर कर देता है.'}

In [11]:
# Tokenize one sentence pair: the English text is the model input and the
# Hindi text is passed as `text_target` (the result carries it as "labels").
example_pair = split_dataset["train"][1]["translation"]
en_sentence = example_pair["en"]
hi_sentence = example_pair["hi"]

inputs = tokenizer(en_sentence, text_target=hi_sentence)
inputs

{'input_ids': [2866, 16910, 0], 'attention_mask': [1, 1, 1], 'labels': [8161, 10238, 0]}

In [12]:
max_length = 128  # token cap used when truncating source and target sequences


def preprocess_function(examples):
    """Tokenize a batch of English/Hindi translation pairs.

    Args:
        examples: A batch whose ``"translation"`` entry is a list of dicts
            with ``"en"`` and ``"hi"`` strings.

    Returns:
        The tokenizer output for the batch, with the English sentences as
        inputs and the Hindi sentences as ``text_target``, truncated to
        ``max_length``.
    """
    source_texts, target_texts = [], []
    for pair in examples["translation"]:
        source_texts.append(pair["en"])
        target_texts.append(pair["hi"])

    return tokenizer(
        source_texts,
        text_target=target_texts,
        max_length=max_length,
        truncation=True,
    )

In [13]:
# Tokenize every split in batches, dropping the original columns so that only
# the tokenizer outputs remain.
original_columns = split_dataset["train"].column_names
tokenized_datasets = split_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=original_columns,
)

## Fine-tuning using Trainer API


In [14]:
from transformers import AutoModelForSeq2SeqLM

# Load the pretrained sequence-to-sequence model from the same checkpoint that
# the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [15]:
from transformers import DataCollatorForSeq2Seq

# Collator that turns lists of tokenized examples into batches. It is given
# the model in addition to the tokenizer; the sample batch produced from it
# also carries `decoder_input_ids`.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [16]:
# Collate two training examples (indices 1 and 2) and list the fields of the
# resulting batch.
sample_features = [tokenized_datasets["train"][idx] for idx in (1, 2)]
batch = data_collator(sample_features)

batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [17]:
!pip install sacrebleu



In [18]:
!pip install evaluate



In [19]:
import evaluate

# Load the SacreBLEU metric through the `evaluate` library; it is used by
# `compute_metrics` to score decoded translations.
metric = evaluate.load("sacrebleu")

In [20]:
# Sanity-check the metric on one hand-written pair. `predictions` holds one
# string per sample; `references` holds, per sample, a list of reference strings.
predictions = ["This plugin lets you translate web pages between several languages automatically."]
references = [["This plugin allows you to automatically translate web pages between several languages."]]
metric.compute(predictions=predictions, references=references)

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

In [21]:
import numpy as np


def compute_metrics(eval_preds):
    """Compute the SacreBLEU score for one evaluation pass.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays. The
            predictions may arrive wrapped in a tuple, in which case only the
            first element is used.

    Returns:
        dict: ``{"bleu": score}``, where ``score`` is the ``"score"`` entry of
        the result returned by ``metric.compute``.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker rather than a vocabulary id, so it cannot
    # be decoded. It can show up in the gathered predictions as well as in the
    # labels, so replace it with the pad token id in both arrays.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: trim whitespace, and wrap each reference in
    # its own list because the metric takes a list of references per prediction.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [22]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub from inside the notebook; the training
# arguments later set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [23]:
!pip install transformers[torch]



In [24]:
from transformers import Seq2SeqTrainingArguments

# Training configuration for the fine-tuning run. The output directory is a
# plain string (the previous f-string had no placeholders) and is passed by
# keyword so its role is explicit.
args = Seq2SeqTrainingArguments(
    output_dir="marian-finetuned-kde4-en-to-hi",
    evaluation_strategy="no",  # no automatic evaluation; evaluate() is called by hand
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,  # needed so compute_metrics receives generated token ids
    fp16=True,
    push_to_hub=True,
)

In [25]:
from transformers import Seq2SeqTrainer

# Wire the model, training arguments, tokenized splits, collator, tokenizer and
# BLEU computation into a single trainer object.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [26]:
# Score the pretrained model on the validation split before fine-tuning, to
# have a baseline to compare against.
trainer.evaluate(max_length=max_length)

{'eval_loss': 2.1637542247772217,
 'eval_bleu': 56.63623190376266,
 'eval_runtime': 301.6515,
 'eval_samples_per_second': 32.233,
 'eval_steps_per_second': 0.504}

In [27]:
# Run fine-tuning with the configuration held in `args`.
trainer.train()

Step,Training Loss
500,1.5658
1000,1.369
1500,1.2491
2000,1.2601
2500,1.2559
3000,1.0989
3500,1.0469
4000,1.0538
4500,1.0349
5000,1.0789


TrainOutput(global_step=8205, training_loss=1.1062491928928149, metrics={'train_runtime': 926.7001, 'train_samples_per_second': 283.276, 'train_steps_per_second': 8.854, 'total_flos': 1896668602564608.0, 'train_loss': 1.1062491928928149, 'epoch': 3.0})

In [29]:
# Re-score the validation split after fine-tuning.
# NOTE(review): in the recorded run eval_loss improves (2.16 -> 0.96) while
# BLEU drops (56.6 -> 48.9) relative to the pre-training evaluation. This is
# surprising and should be investigated before trusting the fine-tuned model.
trainer.evaluate(max_length=max_length)

{'eval_loss': 0.9627804160118103,
 'eval_bleu': 48.920803517377614,
 'eval_runtime': 512.5312,
 'eval_samples_per_second': 18.971,
 'eval_steps_per_second': 0.297,
 'epoch': 3.0}