In [None]:
import os

import numpy as np
import pandas as pd
from datasets import load_metric
from transformers import AutoTokenizer
from datasets import Dataset
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer




In [3]:
# Source article noted for this corpus: https://cs.wikipedia.org/wiki/Logika
dataset_file_path = './data/translation.csv'
# Stop here, before any parsing, if the corpus file is not where we expect it.
assert os.path.exists(dataset_file_path)
# Fields in the file are separated by "|" rather than by commas.
df = pd.read_csv(filepath_or_buffer=dataset_file_path, delimiter="|")

In [31]:
# Display the loaded frame; the rendered output below shows a Czech and an English column.
df

Unnamed: 0,Czech,English
0,Formální (matematická) logika je exaktní věda ...,Formal (mathematical) logic is an exact scien...
1,"Jazyk formální logiky je umělý formální, pro k...",The language of formal logic is an artificial...
2,Umělými formálními jazyky jsou jazyky všech ty...,Artificial formal languages are languages of ...
3,Nelze tedy např. v jakékoli formální logice po...,"Thus, for example, natural language cannot be..."
4,Je to překročení hranic exaktního světa poruše...,It is crossing the boundaries of the exact wo...
5,Logika v pojetí filosofie s podporou psycholog...,Logic as conceived by philosophy with the sup...
6,"Logika filosofická není exaktní věda, vypovídá...",Philosophical logic is not an exact science; ...
7,Jejím centrálním motivem je lidské usuzování (...,Its central motif is human reasoning (not thi...
8,"Zkoumá podmínky vyplývání, tedy hledá požadavk...","It examines the conditions of entailment, i.e..."
9,Cílem logiky je určit množinu korektních argum...,The goal of logic is to determine the set of ...


In [None]:
# Hub identifier of the pretrained checkpoint that the rest of the notebook loads.
model_checkpoint = "Helsinki-NLP/opus-mt-mul-en"
# SacreBLEU scorer; compute_metrics() calls metric.compute() on decoded text.
metric = load_metric("sacrebleu")

In [None]:
# Tokenizer and seq2seq model are both restored from the same checkpoint name.
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_checkpoint),
    AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint),
)

In [6]:
# Record the translation direction on the tokenizer: Czech ("cs") in, English ("en") out.
# NOTE(review): whether this tokenizer class actually reads `src_lang` / `tgt_lang`
# is not visible from this notebook — confirm these assignments have an effect.
tokenizer.src_lang = "cs"
tokenizer.tgt_lang = "en"

In [7]:
# One {"cs": ..., "en": ...} record per row. The first data column is taken as
# the Czech text and the second as the English text (selection is positional).
d = [
    {'cs': str(fields[0]), 'en': str(fields[1])}
    for fields in df.itertuples(index=False, name=None)
]
# Wrap the records in a single "translation" column, the layout that
# preprocess_function() reads.
dataset = Dataset.from_dict({'translation': d})

In [8]:
# Language codes used as keys into each translation record.
source_lang, target_lang = "cs", "en"
# Token limits applied (with truncation) to source and target sentences.
max_input_length = max_target_length = 128
# Text prepended to every source sentence; empty for this checkpoint.
prefix = ""

def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Batch mapping whose "translation" entry is a list of dicts
            keyed by language code (``source_lang`` and ``target_lang``).

    Returns:
        The tokenizer output for the source sentences, with an added
        "labels" entry holding the token ids of the target sentences.
    """
    inputs = [prefix + ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # `text_target=` tokenizes in target mode; it supersedes the deprecated
    # `as_target_tokenizer()` context manager. A separate call keeps the
    # target length limit independent of the source limit.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Tokenize the whole corpus; batched=True hands preprocess_function lists of examples.
tokenized_datasets = dataset.map(function=preprocess_function, batched=True)


In [10]:
# Per-device batch size, shared by training and evaluation.
batch_size = 2
# Checkpoint name without the organisation prefix (the text after the last "/").
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Mixed precision (fp16) is intentionally left at its default; it was disabled
# in the original configuration.
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-{source_lang}-to-{target_lang}",
    # Optimisation schedule.
    num_train_epochs=30,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate once per epoch on generated text so BLEU can be computed.
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # Checkpoint housekeeping.
    save_total_limit=3,
    push_to_hub=False,
)

In [11]:
# Batch collator for seq2seq examples, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [12]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace and shape references for the metric.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(predictions, references)`` tuple: a list of stripped prediction
        strings, and a list in which every stripped reference is wrapped in
        its own single-element list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_refs = []
    for text in labels:
        # Each prediction gets a list of references; here there is exactly one.
        wrapped_refs.append([text.strip()])

    return cleaned_preds, wrapped_refs

def compute_metrics(eval_preds):
    """Score generated translations against their references with SacreBLEU.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. The
            predictions may themselves be a tuple, in which case only its
            first element is used.

    Returns:
        Dict with "bleu" (the metric's "score") and "gen_len" (mean number of
        non-padding tokens per prediction), both rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a real token id, so it cannot be
    # decoded. Replace it with the pad token in BOTH arrays; predictions can
    # carry it too when batches are padded to a common length.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = tokens that are not padding (former -100 entries are
    # now pad ids, so they are no longer counted as generated tokens).
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [13]:
# The same tokenized corpus is passed as both training and evaluation data,
# so the reported metrics describe fit on the training sentences.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the trainer configured above.
trainer.train()


#### Results on training data
{'eval_loss': 0.026741966605186462, 'eval_bleu': 100.0, 'eval_gen_len': 28.0, 'eval_runtime': 43.9061, 'eval_samples_per_second': 0.228, 'eval_steps_per_second': 0.114, 'epoch': 18.0}


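After 18 epochs the model reaches a SacreBLEU score of 100 on the training dataset. Because the evaluation set is the same set of sentences the model was trained on, this shows that the model has memorised the training data; it says nothing about how well it translates unseen text.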

## Inference Example

In [29]:
sentence = "Formální (matematická) logika je exaktní věda používající umělý formální jazyk, který je schopný vypovídat jen a jen o entitách exaktního světa."
# generate() requires the input ids on the same device as the model weights;
# after training the model may no longer be on the CPU, so move them explicitly.
inputs = tokenizer(sentence, return_tensors="pt").input_ids.to(model.device)
# do_sample=True makes decoding stochastic, so repeated runs can give different text.
outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)


Generate config GenerationConfig {
  "bad_words_ids": [
    [
      64171
    ]
  ],
  "bos_token_id": 0,
  "decoder_start_token_id": 64171,
  "eos_token_id": 0,
  "forced_eos_token_id": 0,
  "max_length": 512,
  "num_beams": 6,
  "pad_token_id": 64171,
  "transformers_version": "4.26.1"
}



In [30]:
# Turn the first generated sequence back into text, omitting special tokens.
tokenizer.decode(outputs[0], skip_special_tokens=True)


'Formal (mathematical) logic is an exact science using an artificial formal language that is capable of telling only and only about entities of the exact world.'

## Sources
- [Translation tutorial HF](https://huggingface.co/docs/transformers/tasks/translation)
- [Translation tutorial #2 HF](https://github.com/huggingface/notebooks/blob/main/examples/translation.ipynb)
