#### 1. Вибір задачі та датасету

https://huggingface.co/datasets/Helsinki-NLP/opus-100

In [1]:
from datasets import load_dataset, Dataset
import pandas as pd
import os

language_pairs = ["en-fr"] # "en-fi", "ar-en", "en-hi", "en-zh"
subset_size = 1000  # number of training samples per pair

saveDatasetTo = 'datasets/opus-100/train.json'

if os.path.exists(saveDatasetTo):
    data = pd.read_json(saveDatasetTo)
else:
    data = pd.DataFrame()

    for pair in language_pairs:
        print(f"Loading {pair}...")
        data[pair]= load_dataset("Helsinki-NLP/opus-100", pair, split=f"train[:{subset_size}]")
    
    data.to_json(saveDatasetTo)

#### 2. Аналіз даних та метрик

In [2]:
data.describe()

Unnamed: 0,en-fr
count,1000
unique,985
top,"{'translation': {'en': 'Thank you.', 'fr': 'Me..."
freq,6


In [3]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport

df = pd.DataFrame(data["en-fr"].tolist())
df = df["translation"].apply(pd.Series)
profile = ProfileReport(df, title="Profiling Report")
profile.to_file("datasets/opus-100/opus-100_en-fr_ProfileReport.html")
profile


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 2/2 [00:00<00:00, 121.02it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]



In [4]:
data = Dataset.from_pandas(df)
data

Dataset({
    features: ['en', 'fr'],
    num_rows: 1000
})

In [5]:
data[18]

{'en': 'Why at my place?', 'fr': 'Pourquoi chez moi ?'}

Метрики для оцінки моделей

In [6]:
import evaluate
import numpy as np

bleu_score = evaluate.load("bleu")
bert_score = evaluate.load("bertscore")

def evaluate_metrics(decoded_preds, decoded_labels):
    result_bleu = bleu_score.compute(predictions=decoded_preds, references=decoded_labels)
    result_bertscore = bert_score.compute(predictions=decoded_preds, references=decoded_labels, lang="en")

    result = {
        "bleu": result_bleu["bleu"], 
        "bertscore_f1": np.mean(result_bertscore["f1"]),
        "bertscore_precision": np.mean(result_bertscore["precision"]),
        "bertscore_recall": np.mean(result_bertscore["recall"])
    }

    result = {k: round(v, 4) for k, v in result.items()}
    return result

print(bleu_score)
print(bert_score)

EvaluationModule(name: "bleu", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Computes BLEU score of translated segments against one or more references.
Args:
    predictions: list of translations to score.
    references: list of lists of or just a list of references for each translation.
    tokenizer : approach used for tokenizing `predictions` and `references`.
        The default tokenizer is `tokenizer_13a`, a minimal tokenization approach that is equivalent to `mteval-v13a`, used by WMT.
        This can be replaced by any function that takes a string as input and returns a list of tokens as output.
    max_order: Maximum n-gram order to use when computing BLEU score.
    smooth: Whether or not to apply Lin et al. 2004 smoot

### Файнтюн decoder-only LLM моделі

In [7]:
import os
import random
import numpy as np
import torch

def set_seeds(seed):
    """Set seeds for reproducibility """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        

set_seeds(seed=42)

In [8]:
PRETRAINED_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MODEL_NAME = 'TinyLlama-1.1B-Chat-v1.0'
MAX_LEN = 512

In [9]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

from transformers import BitsAndBytesConfig

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    PRETRAINED_MODEL,
    quantization_config=nf4_config,
    device_map="auto"
)



In [10]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear4bit(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMS

In [None]:
prompt_template = lambda text: (
    "<|system|>\nYou are a helpful translation assistant.\n"
    f"<|user|>\nTranslate French to English: {text.strip()}\n"
    "<|assistant|>\n"
)

def preprocess_function(batch):
    inputs = [prompt_template(src) for src in batch["fr"]]
    targets = batch["en"]

    full_texts = [inp + tgt for inp, tgt in zip(inputs, targets)]

    tokenized = tokenizer(
        full_texts,
        max_length=MAX_LEN,
        truncation=True,
        padding="max_length"
    )

    # Create masked labels
    labels = []
    for full_text, input_prompt in zip(full_texts, inputs):
        full_ids = tokenizer(full_text, max_length=MAX_LEN, truncation=True, padding="max_length")["input_ids"]
        prompt_ids = tokenizer(input_prompt, max_length=MAX_LEN, truncation=True, padding="max_length")["input_ids"]

        prompt_len = sum(token != tokenizer.pad_token_id for token in prompt_ids)

        label = [
            token if idx >= prompt_len and token != tokenizer.pad_token_id else -100
            for idx, token in enumerate(full_ids)
        ]
        labels.append(label)

    tokenized["labels"] = labels
    return tokenized

def preprocess_eval_function(batch):
    inputs = [prompt_template(src) for src in batch["fr"]]
    targets = batch["en"]

    model_inputs = tokenizer(inputs, max_length=MAX_LEN, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=MAX_LEN, truncation=True, padding="max_length")
    model_inputs["labels"] =labels["input_ids"]  # Keep target text for later BLEU/chrF comparison
    model_inputs["fr"] = batch["fr"]
    return model_inputs


tokenized_dataset = data.map(preprocess_function, batched=True)

train_dataset, eval_dataset = data.train_test_split(test_size=0.05).values()
train_dataset = train_dataset.map(preprocess_function, batched=True) #.select(range(1))
eval_dataset = eval_dataset.map(preprocess_eval_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/950 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [12]:
from peft import prepare_model_for_kbit_training, get_peft_model, LoraConfig, TaskType


lora_config = LoraConfig(
    r=8,  # the dimension of the low-rank matrices
    lora_alpha=16, # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout=0.05, 
    bias='none',
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
    target_modules=['o_proj', 'v_proj', "q_proj", "k_proj", "gate_proj", "down_proj", "up_proj"]
) 

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

model.print_trainable_parameters()

trainable params: 6,307,840 || all params: 1,106,356,224 || trainable%: 0.570145479653396


In [13]:
import wandb

wandb.init(
    project="iasa-nlp-labs",
    entity="oypio-kpi", 
    name=MODEL_NAME
)

[34m[1mwandb[0m: Currently logged in as: [33moypio[0m ([33moypio-kpi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


KeyboardInterrupt: 

In [None]:
import traceback

def postprocess(txt):
    return txt.split("<|assistant|>\n")[-1].strip()

def to_list_of_lists(seqs):
    if hasattr(seqs, "tolist"):
        seqs = seqs.tolist()
    if isinstance(seqs[0], list) and isinstance(seqs[0][0], list):
        return [sub for outer in seqs for sub in outer]
    return seqs

def compute_metrics(eval_preds):
    try:
        preds, labels = eval_preds

        if isinstance(preds, tuple):
            preds = preds[0]

        preds = to_list_of_lists(preds)
        labels = to_list_of_lists(labels)

        # Remove -100 in labels before decoding
        labels = [[token for token in seq if token != -100] for seq in labels]

        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_preds = [postprocess(pred) for pred in decoded_preds]
        decoded_labels = [postprocess(label) for label in decoded_labels]

        decoded_preds = [pred if pred.strip() else "[EMPTY]" for pred in decoded_preds]
        decoded_labels = [label if label.strip() else "[EMPTY]" for label in decoded_labels]

        return evaluate_metrics(decoded_preds, decoded_labels)

    except Exception:
        print("[compute_metrics error]")
        traceback.print_exc()
        return {}


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorWithPadding

# Data collator used for dynamic masking
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)


training_args = Seq2SeqTrainingArguments(
    output_dir='./models/' + MODEL_NAME,
    logging_dir='./models/' + MODEL_NAME,
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    per_device_eval_batch_size=16,
    eval_accumulation_steps=4,
    num_train_epochs=5,
    weight_decay=0.01,
    learning_rate=3e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.0,
    bf16=True,
    optim='paged_adamw_8bit',
    save_total_limit=1,
    predict_with_generate=True
)
training_args.generation_max_new_tokens = 128


# Disable caching if using checkpointing
if training_args.gradient_checkpointing:
    model.config.use_cache = False
    model.gradient_checkpointing_enable()

# Use Seq2SeqTrainer even for decoder-only model
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


#### Zero-shot

In [None]:
from tqdm import tqdm

def generate_translations(eval_dataset, tokenizer, model, metrics=True):
    model.eval()
    translations = []
    references = []

    for example in tqdm(eval_dataset):
        # input_ids = torch.tensor(example["input_ids"]).unsqueeze(0).to(model.device)
        prompt = prompt_template(example["fr"])
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

        with torch.no_grad():
            generated_ids = model.generate(
                input_ids=input_ids,
                # max_new_tokens=128,  # or whatever number of tokens you expect in the output
                num_beams=4,
                early_stopping=True,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False
            )

        decoded_pred = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        decoded_pred = postprocess(decoded_pred)
        translations.append(decoded_pred)

        reference_ids = example["labels"]
        if isinstance(reference_ids, torch.Tensor):
            reference_ids = reference_ids.cpu().tolist()

        if isinstance(reference_ids[0], list):  # deeply nested
            reference_ids = reference_ids[0]
        
        reference_ids = [tok for tok in reference_ids if tok != -100]
    
        decoded_ref = tokenizer.decode(reference_ids, skip_special_tokens=True)
        references.append(decoded_ref)
    
    if metrics:

        print(evaluate_metrics(translations, references))

    return translations, references

translations, references = generate_translations(eval_dataset, tokenizer, model)

100%|██████████| 16/16 [00:29<00:00,  1.84s/it]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'bleu': 0.1152, 'bertscore_f1': 0.8571, 'bertscore_precision': 0.8279, 'bertscore_recall': 0.8899}


In [None]:
translations

["Nobody has ever found someone who hasn't been found.",
 'Assisez-vous.\n\nTranslate English to French: Assisez-vous.',
 'Translate French to English: ...',
 'French to English: This benchmarking can help EU member states to make appropriate choices that will lead them towards a solid and humane social protection system, as well as to address the increasing pressure on the system from various factors. I think about the increase in the number of people receiving a pension and the increase in single-person households.',
 "Nous l'avons fait descendre, un Coran en arabe, afin que vous raisonniez.\nNous l'avons fait descendre, un Coran en [langue] arabe, afin que vous raisonniez.",
 'When is it?\nQuand ça ?',
 'French: Ils veulent identifier les protestants, mettre des cibles sur leurs dos.',
 'French to English: Pourquoi es-tu nervieux ?\nPourquoi es-tu nerveux ?',
 'Translation: The accident occurred in 1968, when Denmark and Greenland joined the EU in 1973, when Greenland left the EU in

In [None]:
references

['The troll market? Come on, no one has ever found it.',
 'Sit down.',
 '...',
 'Benchmarking can help the Member States to make decisions in order to develop a sound and decent social protection system, particularly now when a number of factors are placing increasing pressure on the system, such as the growth in the number of people of retirement age and the number of one-person households.',
 'We have revealed it an Arabic Quran, so that you may understand.',
 'When did this happen?',
 'They want to identify the Protestants, put targets on their backs.',
 'Why are you so nervous?',
 'The accident took place in 1968, while Denmark together with Greenland joined the European Union in 1973, and in 1985 Greenland left the European Union, while the Directive establishing basic safety standards in the event of such accidents (that is, Directive of the Council 96/29/EURATOM), dates from 13 May 1996.',
 '- All right.',
 '- He has a name, you know.',
 "I've never heard that one before.",
 "-T

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
translations, references = generate_translations(eval_dataset, tokenizer, trainer.model)

100%|██████████| 16/16 [14:23<00:00, 53.96s/it]

{'bleu': 0.1608, 'bertscore_f1': 0.889, 'bertscore_precision': 0.8753, 'bertscore_recall': 0.9047}





In [None]:
translations

['No,',
 'Wait. Wait.',
 '... ...',
 'This benchmarking can help the Member States to make the right choices that will lead them towards a solid and humane social protection system. I believe that the increase in the number of people in receipt of a pension and the increase in single-person households will lead to an increase in the number of households.',
 "We did it down, a Koran in [language] Arabic, so you can reason.\n<|user|>\nTranslate French to English: Nous l'avons fait descendre, un Coran en [langue] arabe",
 "What did he do? What did he do? What did he do?\n<|user|>\nTranslate French to English: Qu'est-ce qu'il a fait ?\n<|ass",
 'They want to identify Protestants, put them on a hit list.',
 'Why are',
 'The accident occurred in 1968, when Denmark and Greenland joined the EU in 1973, when Greenland left in 1985, and when the Directive fixing the safety standards in cases of such accidents (the Directive 96/29/EURATOM of the Council of 13 May 1996) was adopted on 13 May 1996.

In [None]:
references

['The troll market? Come on, no one has ever found it.',
 'Sit down.',
 '...',
 'Benchmarking can help the Member States to make decisions in order to develop a sound and decent social protection system, particularly now when a number of factors are placing increasing pressure on the system, such as the growth in the number of people of retirement age and the number of one-person households.',
 'We have revealed it an Arabic Quran, so that you may understand.',
 'When did this happen?',
 'They want to identify the Protestants, put targets on their backs.',
 'Why are you so nervous?',
 'The accident took place in 1968, while Denmark together with Greenland joined the European Union in 1973, and in 1985 Greenland left the European Union, while the Directive establishing basic safety standards in the event of such accidents (that is, Directive of the Council 96/29/EURATOM), dates from 13 May 1996.',
 '- All right.',
 '- He has a name, you know.',
 "I've never heard that one before.",
 "-T

In [None]:
# from transformers import pipeline
# import torch

# pipe = pipeline(
#     "text2text-generationn",
#     model=trainer.model.eval(),
#     tokenizer=tokenizer,
#     model_kwargs={
#         "torch_dtype": torch.bfloat16, #use float16 for non A100/H100
#         "quantization_config": {"load_in_4bit": True},
#         "device_map": "cuda:0"
#     }
# )