# Translation english to ukrainian

In [1]:
!pip install datasets transformers peft evaluate sacrebleu accelerate bitsandbytes bert_score



In [2]:
import numpy as np
import pandas as pd
import torch
import evaluate
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, default_data_collator
from peft import get_peft_model, get_peft_config, PrefixTuningConfig, TaskType, LoraConfig, PeftModel

## Selecting model
Using nllb because it was pretrained on 100+ languages and knows ukrainian tokens

In [3]:
dataset_name = 'opus100'
model_name = 'facebook/nllb-200-distilled-600M'
model_save_path = 'en-uk-nllb/'
max_length = 32
batch_size = 32
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Loading data

In [4]:
train_dataset = load_dataset(dataset_name, 'en-uk', split='train[:2%]')
test_dataset = load_dataset(dataset_name, 'en-uk', split='test')
validation_dataset = load_dataset(dataset_name, 'en-uk', split='validation')

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Vectorizing dataset

In [6]:
def preprocess(examples):
    input = [e['en'] for e in examples['translation']]
    target = [e['uk'] for e in examples['translation']]
    input = tokenizer(input, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')
    target = tokenizer(target, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt').input_ids
    target[target == tokenizer.pad_token_id] = -100
    input['labels'] = target
    return input

In [7]:
processed_train_dataset = train_dataset.map(
    preprocess,
    batched=True,
    remove_columns=train_dataset.column_names
)
processed_test_dataset = test_dataset.map(
    preprocess,
    batched=True,
    remove_columns=test_dataset.column_names
)

In [8]:
train_dataloader = DataLoader(
    processed_train_dataset,
    batch_size=batch_size,
    collate_fn=default_data_collator,
    shuffle=True,
    pin_memory=True
)
test_dataloader = DataLoader(
    processed_test_dataset,
    batch_size=batch_size,
    collate_fn=default_data_collator,
    shuffle=False, # don't shuffle to calculate metrics
    pin_memory=True
)

## Lora config
Using LoRA because it saves gpu memory and has much better score then prefix tuning

In [13]:
prefix_config = PrefixTuningConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    num_virtual_tokens=100
)

lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    target_modules=['q_proj', 'v_proj'],
    bias='none',
    task_type='SEQ_2_SEQ_LM'
)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# model = get_peft_model(model, prefix_config)
model = get_peft_model(model, lora_config)
model.to(device);

## Metrics

In [14]:
bleu = evaluate.load('bleu')
chrf = evaluate.load('chrf')
bert = evaluate.load('bertscore')
references = [e['uk'] for e in test_dataset['translation']]
def compute_metrics(model):
  model.eval()
  eval_loss = 0
  eval_preds = []

  for eval_batch in test_dataloader:
    eval_batch = {k: v.to(device) for k, v in eval_batch.items()}
    with torch.no_grad():
      out = model(**eval_batch)
    eval_loss += out.loss.detach().cpu().item()
    eval_preds.extend(
        tokenizer.batch_decode(
            torch.argmax(out.logits, -1).detach().cpu().numpy(),
            skip_special_tokens=True
        )
    )

  bleu_score = bleu.compute(predictions=eval_preds, references=references)['bleu']
  chrf_score = chrf.compute(predictions=eval_preds, references=references)['score']
  bert_score = bert.compute(predictions=eval_preds, references=references, lang='uk')
  bert_f1 = np.mean(bert_score['f1'])

  return eval_loss / len(test_dataloader), bleu_score, chrf_score, bert_f1

## Training

In [15]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[1], gamma=0.1)

In [16]:
eval_steps = 200
results_df = pd.DataFrame({
    'step': [],
    'train_loss': [],
    'test_loss': [],
    'bleu_score': [],
    'chrf_score': [],
    'bert_f1': []
    })
train_loss = 0

model.train()
for step, batch in enumerate(tqdm(train_dataloader), start=1):
  batch = {k: v.to(device) for k, v in batch.items()}
  out = model(**batch)
  loss = out.loss
  loss.backward()
  optimizer.step()
  optimizer.zero_grad()
  train_loss += loss.detach().cpu().item()

  if step % eval_steps == 0:
    test_loss, bleu_score, chtf_score, bert_f1 = compute_metrics(model)
    metrics = [
        step,
        train_loss/eval_steps,
        test_loss,
        bleu_score,
        chtf_score,
        bert_f1
    ]

    results_df.loc[len(results_df)] = metrics
    print(f'\nStep: {metrics[0]}, train loss: {metrics[1]}, test loss: {metrics[2]}, bleu: {metrics[3]}, chtf: {metrics[4]}, bert: {metrics[5]}')

    train_loss = 0
    model.save_pretrained(model_save_path + f'step-{step}')
    scheduler.step()
    model.train()

results_df

 32%|███▏      | 200/625 [03:17<1:34:58, 13.41s/it]


Step: 200, train loss: 2.4943442064523698, test loss: 2.1185390646495517, bleu: 0.16230913081816817, chtf: 40.1785590147101, bert: 0.7864089741408825


 64%|██████▍   | 400/625 [06:26<41:39, 11.11s/it]


Step: 400, train loss: 2.1118676936626435, test loss: 2.0991179621408858, bleu: 0.17170187669657797, chtf: 40.629937145397385, bert: 0.7936508451998234


 96%|█████████▌| 600/625 [09:34<04:29, 10.79s/it]


Step: 600, train loss: 2.0796703481674195, test loss: 2.0881263396096608, bleu: 0.180446705133743, chtf: 41.010826436402176, bert: 0.7974831014275551


100%|██████████| 625/625 [09:53<00:00,  1.05it/s]


Unnamed: 0,step,train_loss,test_loss,bleu_score,chrf_score,bert_f1
0,200.0,2.494344,2.118539,0.162309,40.178559,0.786409
1,400.0,2.111868,2.099118,0.171702,40.629937,0.793651
2,600.0,2.07967,2.088126,0.180447,41.010826,0.797483


## Inference
Comparing original and trained model. Loading model in 8bit to save gpu memory and do faster predictions

In [20]:
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True)
trained_model = PeftModel.from_pretrained(original_model, model_save_path + 'step-600')
trained_model.to(device);

In [33]:
examples = pd.DataFrame(validation_dataset['translation']).sample(5)['en'].to_list()

In [34]:
for i, example in enumerate(examples, start=1):
  print(f'{i} {example}')
  example_for_original = tokenizer(example, return_tensors='pt').input_ids.to(device)
  example_for_trained = tokenizer(example, return_tensors='pt').input_ids.to(device)
  original_translation = tokenizer.decode(
      original_model.generate(input_ids=example_for_original, forced_bos_token_id=tokenizer.lang_code_to_id['rus_Cyrl'], max_new_tokens=100)[0],
      skip_special_tokens=True
  )
  trained_translation = tokenizer.decode(
      trained_model.generate(input_ids=example_for_trained, max_new_tokens=100)[0],
      skip_special_tokens=True
  )
  print('Original model translation: ')
  print(original_translation)
  print("Trained model translation: ")
  print(trained_translation)

1 I got $500.
Original model translation: 
У меня есть 500 долларов.
Trained model translation: 
У мене 500 доларів.
2 And there is one more thing, Mr Briggs.
Original model translation: 
И еще одна вещь, мистер Бригггс.
Trained model translation: 
І є ще одне, містер Бріггс.
3 But it's here, it's getting in.
Original model translation: 
Но он здесь, он входит.
Trained model translation: 
Но он здесь, он входит.
4 Make sure that so duplicate symbol exists already in the row/ column/ section you are entering it to.
Original model translation: 
Убедитесь, что символы, которые вы вводят в строку/ столб/ раздел, уже имеются.
Trained model translation: 
Перевіртесь, що такий дубликаційний символ вже існує в строці/ колоні/ розділі, в який ви його вводите.
5 That was hard to believe.
Original model translation: 
Это было трудно поверить.
Trained model translation: 
В это было трудно поверить.


As we can see, model after training can translate english sentences into ukrainian even without adding forced_bos_token_id.
Also, sometimes model can translate into russian, it probably happens because original nllb model wasn't trained on ukrainian and it tries to translate into closest language it knows. This can be solved by training model longer.

As for different metrics: bleu, chrF and bert score are all increasing by similar proportion, the difference is absolute score.
This means that all metrics can be used interchangeably. But I think that chrF shows score that, in absolute value, better correlates with model performance.

The problem with the bleu metric is that it matches whole tokens and penilizes when model uses correct word in incorrect form. As we can see from predicted examples, model can succesfully translate overall sentence meaning, but makes some mistakes in used words. This means that bleu score underestimates model performance.

The problem with bert score is that, while it much better estimates similar meaning of sentence, it doesn't pay much attention to word spelling. And since russian and ukrainian have similar spelling for many words, this causes bert to not recognize that model correctly translates sentence, but into the wrong language. And since biggest problem with model is that it translates into russian and not ukrainian, bert overestimates model performance.

On the other hand, chrF calculates all character and word n-grams matches. That alows this metric to evaluate word spelling, while beeing less sensetive to smaller mistakes. And from predicted examples we can see that model translates into russian about half of the sentences and makes some mistakes in ukrainian translations. Therefore we can estimate model score at about 40%, which is similar to chrF evaluation.