In [2]:
!pip install datasets
!pip install sentencepiece
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.16.0-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.16.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x

# Test pipeline

In [1]:
from transformers import pipeline

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
translator = pipeline("translation", model=model_checkpoint)
translator("Default to expanded threads")

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]



[{'translation_text': 'Par défaut pour les threads élargis'}]

# Finetuning

## Load dataset

In [12]:
from datasets import load_dataset, load_metric

raw_datasets = load_dataset("kde4", "en-fr")
raw_datasets = raw_datasets.filter(lambda examle, idx: idx % 100 == 0, with_indices=True)
split_datasets = raw_datasets["train"].train_test_split(train_size=0.9, seed=20)
split_datasets["validation"] = split_datasets.pop("test")

## Tokenize

In [13]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")




In [14]:
en_sentence = split_datasets["train"][1]["translation"]["en"]
fr_sentence = split_datasets["train"][1]["translation"]["fr"]

inputs = tokenizer(en_sentence)
with tokenizer.as_target_tokenizer():
    targets = tokenizer(fr_sentence)




In [15]:
wrong_targets = tokenizer(fr_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(targets["input_ids"]))


['▁La', '▁m', 'é', 'th', 'ode', '▁est', '▁la', '▁m', 'ê', 'me', '▁pour', '▁con', 'stru', 'ire', '▁d', "'", 'au', 'tres', '▁ob', 'jet', 's', '▁&', '▁#1', '60', ';', ':', '▁c', 'lique', 'z', '▁sur', '▁l', "'", 'ent', 'ré', 'e', '▁dé', 's', 'ir', 'ée', '▁de', '▁la', '▁bar', 're', '▁de', '▁menu', ',', '▁ou', '▁sur', '▁le', '▁bou', 'ton', '▁de', '▁la', '▁bar', 're', '▁d', "'", 'out', 'ils', ',', '▁et', '▁s', 'él', 'ection', 'n', 'ez', '▁les', '▁', 'élé', 'ments', '▁', 'né', 'cess', 'aires', '▁pour', '▁con', 'stru', 'ire', '▁l', "'", 'ob', 'jet', '.', '</s>']
['▁La', '▁méthode', '▁est', '▁la', '▁même', '▁pour', '▁construire', '▁d', "'", 'autres', '▁objets', '▁&', '▁#160;:', '▁cliquez', '▁sur', '▁l', "'", 'entrée', '▁désirée', '▁de', '▁la', '▁barre', '▁de', '▁menu', ',', '▁ou', '▁sur', '▁le', '▁bouton', '▁de', '▁la', '▁barre', '▁d', "'", 'outils', ',', '▁et', '▁sélectionnez', '▁les', '▁éléments', '▁nécessaires', '▁pour', '▁construire', '▁l', "'", 'objet', '.', '</s>']


In [16]:
max_input_length = 128
max_target_length = 128


def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [17]:
tokenized_datasets = split_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=split_datasets["train"].column_names,
)


Map:   0%|          | 0/1891 [00:00<?, ? examples/s]

Map:   0%|          | 0/211 [00:00<?, ? examples/s]

## Load model

In [18]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)


## Data collator

In [19]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [20]:
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()


dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [21]:
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[80, 2823, 43, 8, 223, 27, 8354, 20, 6, 362, 6117, 402, 38492, 10170, 36, 14, 6, 2133, 36086, 5, 8, 11831, 5, 2805, 2, 59, 36, 19, 9376, 5, 8, 11831, 20, 6, 14847, 2, 11, 20899, 16, 1224, 1145, 27, 8354, 14, 6, 1954, 3, 0]
[38123, 2383, 685, 4076, 1072, 23772, 0]


## Metrics

In [22]:
!python3 -m pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.0-py3-none-any.whl (106 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m81.9/106.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.3/106.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-2.8.2 sacrebleu-2.4.0


In [23]:
from datasets import load_metric

metric = load_metric("sacrebleu")


  metric = load_metric("sacrebleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [24]:
predictions = [
    "This plugin lets you translate web pages between several languages automatically."
]

references = [
    [
        "This plugin allows you to automatically translate web pages between several languages."
    ]
]
metric.compute(predictions=predictions, references=references)


{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

In [25]:
import numpy as np


def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # other predictions
    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # pass -100
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # postprocess
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}


## Train

In [26]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [27]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    f"marian-finetuned-kde4-en-to-fr",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)


In [28]:
from transformers import Seq2SeqTrainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [29]:
trainer.evaluate(max_length=max_target_length)

{'eval_loss': 1.8624041080474854,
 'eval_bleu': 39.3071646478729,
 'eval_runtime': 13.5578,
 'eval_samples_per_second': 15.563,
 'eval_steps_per_second': 0.295}

In [30]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=180, training_loss=1.4560743543836805, metrics={'train_runtime': 40.4776, 'train_samples_per_second': 140.151, 'train_steps_per_second': 4.447, 'total_flos': 111624667398144.0, 'train_loss': 1.4560743543836805, 'epoch': 3.0})

In [31]:
trainer.evaluate(max_length=max_target_length)

{'eval_loss': 1.5832053422927856,
 'eval_bleu': 43.76421445795146,
 'eval_runtime': 17.7221,
 'eval_samples_per_second': 11.906,
 'eval_steps_per_second': 0.226,
 'epoch': 3.0}

In [32]:
trainer.push_to_hub(tags="tanslation", commit_message="Training complete")

model.safetensors:   0%|          | 0.00/299M [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1703284780.62c5a87291a0.1446.0:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

events.out.tfevents.1703284849.62c5a87291a0.1446.1:   0%|          | 0.00/407 [00:00<?, ?B/s]

'https://huggingface.co/naninya/marian-finetuned-kde4-en-to-fr/tree/main/'

## Without trainer

In [33]:
import torch
torch.cuda.empty_cache()

In [34]:
from torch.utils.data import DataLoader

tokenized_datasets.set_format("torch")
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)
eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    collate_fn=data_collator,
    batch_size=8
)


In [35]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [36]:
from transformers import AdamW

optimizer = AdamW(model.parameters(), lr=2e-5)



In [37]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [38]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)


In [39]:
from huggingface_hub import Repository, get_full_repo_name, create_repo

model_name = "marian-finetuned-kde4-en-to-fr-accelerate"
repo_name = get_full_repo_name(model_name)
repo_name


'naninya/marian-finetuned-kde4-en-to-fr-accelerate'

In [40]:
create_repo(repo_name)
output_dir = "marian-finetuned-kde4-en-to-fr-accelerate"
repo = Repository(output_dir, clone_from=repo_name)


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/naninya/marian-finetuned-kde4-en-to-fr-accelerate into local empty directory.


In [41]:
def postprocess(predictions, labels):
    predictions = predictions.cpu().numpy()
    labels = labels.cpu().numpy()

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]
    return decoded_preds, decoded_labels


In [51]:
outputs.logits.shape

torch.Size([3, 6, 59514])

In [42]:
from tqdm.auto import tqdm
import torch
import numpy as np

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # train
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # eval
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                max_length=128,
            )
        labels = batch["labels"]

        # padding
        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        labels = accelerator.pad_across_processes(labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(generated_tokens)
        labels_gathered = accelerator.gather(labels)

        decoded_preds, decoded_labels = postprocess(predictions_gathered, labels_gathered)
        metric.add_batch(predictions=decoded_preds, references=decoded_labels)

    results = metric.compute()
    print(f"epoch {epoch}, BLEU score: {results['score']:.2f}")

    # save & upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )


  0%|          | 0/711 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

epoch 0, BLEU score: 43.92


Adding files tracked by Git LFS: ['source.spm', 'target.spm']. This may take a bit of time if the files are large.


  0%|          | 0/27 [00:00<?, ?it/s]

epoch 1, BLEU score: 44.55


  0%|          | 0/27 [00:00<?, ?it/s]

epoch 2, BLEU score: 45.06


In [43]:
from transformers import pipeline

model_checkpoint = "naninya/marian-finetuned-kde4-en-to-fr"
translator = pipeline("translation", model=model_checkpoint)
translator("Default to expanded threads")


config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/299M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/842 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.46M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]



[{'translation_text': 'Par défaut pour les threads élargis'}]

In [44]:
translator(
    "Unable to import %1 using the OFX importer plugin. This file is not the correct format."
)

[{'translation_text': "Impossible d'importer %1 en utilisant le module externe d'importation OFX. Ce fichier n'est pas le bon format."}]