### ЛАБОРАТОРНА РОБОТА 2

#### 1. Вибір задачі та датасету

https://huggingface.co/datasets/Helsinki-NLP/opus-100

In [1]:
from datasets import load_dataset, Dataset
import pandas as pd
import os

language_pairs = ["en-fr"] # "en-fi", "ar-en", "en-hi", "en-zh"
subset_size = 1000  # number of training samples per pair

saveDatasetTo = 'datasets/opus-100/train.json'

if os.path.exists(saveDatasetTo):
    data = pd.read_json(saveDatasetTo)
else:
    data = pd.DataFrame()

    for pair in language_pairs:
        print(f"Loading {pair}...")
        data[pair]= load_dataset("Helsinki-NLP/opus-100", pair, split=f"train[:{subset_size}]")
    
    data.to_json(saveDatasetTo)

#### 2. Аналіз даних та метрик

In [41]:
data.describe()

Unnamed: 0,en-fr
count,1000
unique,985
top,"{'translation': {'en': 'Thank you.', 'fr': 'Me..."
freq,6


In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   en-fr   1000 non-null   object
dtypes: object(1)
memory usage: 15.6+ KB


In [43]:
data.isnull().sum()

en-fr    0
dtype: int64

In [2]:
df = pd.DataFrame(data["en-fr"].tolist())
df = df["translation"].apply(pd.Series)
data = Dataset.from_pandas(df)
data

Dataset({
    features: ['en', 'fr'],
    num_rows: 1000
})

In [45]:
data[18]

{'en': 'Why at my place?', 'fr': 'Pourquoi chez moi ?'}

Метрики для оцінки моделей

In [3]:
import evaluate
import numpy as np

bleu_score = evaluate.load("bleu")
bert_score = evaluate.load("bertscore")

def evaluate_metrics(decoded_preds, decoded_labels):
    result_bleu = bleu_score.compute(predictions=decoded_preds, references=decoded_labels)
    result_bertscore = bert_score.compute(predictions=decoded_preds, references=decoded_labels, lang="en")

    result = {
        "bleu": result_bleu["bleu"], 
        "bertscore_f1": np.mean(result_bertscore["f1"]),
        "bertscore_precision": np.mean(result_bertscore["precision"]),
        "bertscore_recall": np.mean(result_bertscore["recall"])
    }

    result = {k: round(v, 4) for k, v in result.items()}
    return result

print(bleu_score)
print(bert_score)

EvaluationModule(name: "bleu", module_type: "metric", features: [{'predictions': Value(dtype='string', id='sequence'), 'references': Sequence(feature=Value(dtype='string', id='sequence'), length=-1, id='references')}, {'predictions': Value(dtype='string', id='sequence'), 'references': Value(dtype='string', id='sequence')}], usage: """
Computes BLEU score of translated segments against one or more references.
Args:
    predictions: list of translations to score.
    references: list of lists of or just a list of references for each translation.
    tokenizer : approach used for tokenizing `predictions` and `references`.
        The default tokenizer is `tokenizer_13a`, a minimal tokenization approach that is equivalent to `mteval-v13a`, used by WMT.
        This can be replaced by any function that takes a string as input and returns a list of tokens as output.
    max_order: Maximum n-gram order to use when computing BLEU score.
    smooth: Whether or not to apply Lin et al. 2004 smoot

Бенчмарки

#### 3. Експериментальна частина


Бейслайн: **grok3**

Варто зазначити деякі недоліки в результатах від grok3:
- в перекладі був пропущений 1 рядок (елемент), тому довелось власноруч його заповнити аби усі результати не з'їхали на один індекс
- деякі речення розбились на декілька колонок, тож їх потрібно було об'єднати

In [None]:
labels = pd.read_csv('datasets/opus-100/en-fr_labels.csv')
predicted = pd.read_csv('datasets/opus-100/grok3_en-fr_preds.csv')

labels = labels['en-fr']
predicted = predicted['en-fr']


for label, pred in list(zip(labels, predicted))[:5]:
    print('label:', label)
    print('pred:', pred)
    print()

evaluate_metrics(labels, predicted)

label: Open the gate!
pred: Open the gate quickly!

label: No! Tara?
pred: Tara?

label: Who do you work for?
pred: Who do you work for?

label: Where have you been?
pred: Where were you then?

label: "Tetsutaro.
pred: Tetsutaro.



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'bleu': 0.3667,
 'bertscore_f1': 0.9395,
 'bertscore_precision': 0.9344,
 'bertscore_recall': 0.945}

### Файнтюн encoder-decoder моделі

In [4]:
import os
import random
import numpy as np
import torch

def set_seeds(seed):
    """Set seeds for reproducibility """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        

set_seeds(seed=42)

In [12]:
import wandb

wandb.init(
    project="iasa-nlp-labs",
    entity="oypio-kpi", 
    name='lab2-t5-base-fr-en'
)

[34m[1mwandb[0m: Currently logged in as: [33moypio[0m ([33moypio-kpi[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import torch
from datasets import load_dataset
import evaluate
from transformers import (
    MT5ForConditionalGeneration,
    MT5Tokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)

model_name = "google/mt5-base"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)
MAX_LEN = 256

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
def preprocess_function(batch):
    prefix = "translate French to English: "
    inputs = [prefix + text for text in batch['fr']]
    targets = batch['en']
    # Tokenize without padding (dynamic padding will be applied in the collator)
    model_inputs = tokenizer(inputs, max_length=MAX_LEN, truncation=True, padding=False)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=MAX_LEN, truncation=True, padding=False)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the dataset
tokenized_dataset = data.map(preprocess_function, batched=True)

# Split the dataset into training and evaluation sets
split_datasets = tokenized_dataset.train_test_split(test_size=0.05)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



In [15]:
train_dataset[0]

{'en': 'Here it is.',
 'fr': 'Le voilà.',
 'input_ids': [37194, 21273, 288, 5413, 267, 764, 3133, 23068, 260, 1],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 'labels': [10421, 609, 339, 260, 1]}

In [10]:
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]

    return preds, labels

def compute_metrics(eval_preds):
    try:
        preds, labels = eval_preds
        if isinstance(preds, tuple):
            preds = preds[0]

        preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
        decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=False)
    
        # Replace -100 in the labels as we can't decode them.
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=False)
    
        # Some simple post-processing
        decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    
        result_bleu = bleu_score.compute(predictions=decoded_preds, references=decoded_labels)
        result_bertscore = bert_score.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
        prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]

        result = {
            "bleu": result_bleu["bleu"], 
            "bertscore_f1": np.mean(result_bertscore["f1"]),
            "bertscore_precision": np.mean(result_bertscore["precision"]),
            "bertscore_recall": np.mean(result_bertscore["recall"]),
            "gen_len": np.mean(prediction_lens)
        }
    
        result = {k: round(v, 4) for k, v in result.items()}
        return result
    except Exception as exc:
        print(str(exc))

In [17]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5_fr2en",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    predict_with_generate=True,  # This ensures that model.generate() is used for predictions. Think about where do we need it?
    num_train_epochs=15,
    learning_rate=1e-4,
    weight_decay=0.01,
    save_total_limit=1,
    push_to_hub=False
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [18]:
inference_output = trainer.predict(eval_dataset)


# Decode generated predictions
generated_texts = tokenizer.batch_decode(
    inference_output.predictions,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True
)




Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
inference_output.metrics

{'test_loss': 12.52834415435791,
 'test_bleu': 0.335,
 'test_bertscore_f1': 0.8312,
 'test_bertscore_precision': 0.8412,
 'test_bertscore_recall': 0.8248,
 'test_gen_len': 5.34,
 'test_runtime': 3.0481,
 'test_samples_per_second': 16.403,
 'test_steps_per_second': 1.312}

In [20]:
generated_texts

['<extra_id_0> / Archives',
 '<extra_id_0> - Peter',
 '<extra_id_0> -',
 '<extra_id_0>',
 '<extra_id_0> : -',
 '<extra_id_0>.',
 '<extra_id_0> - french.',
 '<extra_id_0>!',
 '<extra_id_0>.',
 '<extra_id_0> -',
 '<extra_id_0>?',
 '<extra_id_0> french to English',
 '<extra_id_0> french to english',
 '<extra_id_0>?',
 '<extra_id_0> asthm',
 '<extra_id_0>',
 '<extra_id_0> - french',
 '<extra_id_0>.',
 '<extra_id_0> french dictionary',
 '<extra_id_0> French (CA)',
 '<extra_id_0> -?',
 '<extra_id_0> non non non',
 '<extra_id_0> - eHow',
 '<extra_id_0> -?',
 '<extra_id_0> à droit',
 '<extra_id_0> - Wikimedia Foundation',
 '<extra_id_0> - french to English:',
 '<extra_id_0> -  <extra_id_1> -',
 '<extra_id_0> English - French',
 '<extra_id_0>?',
 '<extra_id_0> - french.',
 '<extra_id_0>.',
 '<extra_id_0>:?',
 '<extra_id_0> - french.',
 '<extra_id_0>.',
 '<extra_id_0> great great great great great',
 '<extra_id_0>.',
 '<extra_id_0> :',
 '<extra_id_0> French to English',
 '<extra_id_0> french to 

In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Bertscore F1,Bertscore Precision,Bertscore Recall,Gen Len
1,11.7445,6.100876,0.3168,0.8289,0.8338,0.8269,5.78
2,7.0016,3.74366,0.3662,0.8417,0.8848,0.805,4.36
3,4.1789,2.621285,0.1604,0.852,0.8838,0.8242,9.74
4,3.3169,2.36967,0.1642,0.8632,0.8772,0.8511,9.4
5,2.8257,2.255475,0.2309,0.8777,0.8868,0.8697,7.42
6,2.4896,2.239475,0.2367,0.8755,0.8904,0.8622,7.26
7,2.3434,2.174262,0.2134,0.8736,0.8889,0.86,7.94
8,2.1923,2.149492,0.221,0.8763,0.8885,0.8653,7.74
9,2.0642,2.178177,0.2066,0.8723,0.8874,0.8587,8.14
10,1.9392,2.158738,0.2105,0.8738,0.8916,0.8579,8.06




TrainOutput(global_step=900, training_loss=3.2612314860026044, metrics={'train_runtime': 1116.0188, 'train_samples_per_second': 12.769, 'train_steps_per_second': 0.806, 'total_flos': 1962112532649984.0, 'train_loss': 3.2612314860026044, 'epoch': 15.0})

In [22]:
eval_dataset

Dataset({
    features: ['en', 'fr', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 50
})

In [23]:
model.to("cuda")

inference_output = trainer.predict(eval_dataset)


# Decode generated predictions
generated_texts = tokenizer.batch_decode(
    inference_output.predictions,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True
)


In [24]:
inference_output.metrics

{'test_loss': 2.141897201538086,
 'test_bleu': 0.2227,
 'test_bertscore_f1': 0.8721,
 'test_bertscore_precision': 0.8875,
 'test_bertscore_recall': 0.8583,
 'test_gen_len': 7.66,
 'test_runtime': 3.1738,
 'test_samples_per_second': 15.754,
 'test_steps_per_second': 1.26}

In [25]:
from transformers import pipeline

generator = pipeline(
    "text2text-generation", model=trainer.model.eval(), tokenizer=trainer.tokenizer
)

In [26]:
outputs = generator(
    "translate French to English: " + "Bonjour comment allez-vous?",
    max_length=128,
    do_sample=True,
    temperature=0.8
)
outputs

[{'generated_text': 'Hello, how are you gonna go?'}]

Завантаження збереженої моделі

In [12]:
import torch
from datasets import load_dataset
from transformers import pipeline
import evaluate
from transformers import (
    MT5ForConditionalGeneration,
    MT5Tokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)

model_name = "./mt5_fr2en_copy/checkpoint-500"
tokenizer = MT5Tokenizer.from_pretrained(model_name)
model = MT5ForConditionalGeneration.from_pretrained(model_name)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5_fr2en",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    predict_with_generate=True,  # This ensures that model.generate() is used for predictions. Think about where do we need it?
    num_train_epochs=15,
    learning_rate=1e-4,
    weight_decay=0.01,
    save_total_limit=1,
    push_to_hub=False
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

model.to("cuda")

inference_output = trainer.predict(eval_dataset)

generated_texts = tokenizer.batch_decode(
    inference_output.predictions,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True
)



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
inference_output.metrics

{'test_loss': 1.1302889585494995,
 'test_bleu': 0.0114,
 'test_bertscore_f1': 0.8398,
 'test_bertscore_precision': 0.8674,
 'test_bertscore_recall': 0.8162,
 'test_gen_len': 8.58,
 'test_runtime': 3.6516,
 'test_samples_per_second': 13.692,
 'test_steps_per_second': 1.095}

In [15]:
generator = pipeline(
    "text2text-generation", model=trainer.model.eval(), tokenizer=trainer.tokenizer
)

outputs = generator(
    "translate French to English: " + "Bonjour comment allez-vous?",
    max_length=128,
    do_sample=True,
    temperature=0.8
)
outputs


[{'generated_text': 'Hello, how are you doing now?'}]

### Файнтюн decoder-only LLM моделі

In [6]:
PRETRAINED_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
MAX_LEN = 512

In [7]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer

from transformers import BitsAndBytesConfig

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    PRETRAINED_MODEL,
    torch_dtype=torch.bfloat16,
    quantization_config=nf4_config
)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [8]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear4bit(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMS

In [9]:
def preprocess_function(batch):
    prefix = "translate French to English: "
    inputs = [prefix + text for text in batch['fr']]
    targets = batch['en']
    # Tokenize without padding (dynamic padding will be applied in the collator)
    model_inputs = tokenizer(inputs, max_length=MAX_LEN, truncation=True, padding=False)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=MAX_LEN, truncation=True, padding=False)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the dataset
tokenized_dataset = data.map(preprocess_function, batched=True)

# Split the dataset into training and evaluation sets
split_datasets = tokenized_dataset.train_test_split(test_size=0.05)
train_dataset = split_datasets["train"]
eval_dataset = split_datasets["test"]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



In [11]:
from transformers import BitsAndBytesConfig
from peft import get_peft_config, prepare_model_for_kbit_training, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType


lora_config = LoraConfig(
    r=8,  # the dimension of the low-rank matrices
    lora_alpha=16, # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout=0.05, 
    bias='none',
    inference_mode=False,
    task_type=TaskType.CAUSAL_LM,
    target_modules=['o_proj', 'v_proj', "q_proj", "k_proj", "gate_proj", "down_proj", "up_proj"]
) 

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# Trainable Parameters
model.print_trainable_parameters()

ImportError: cannot import name 'clear_device_cache' from 'accelerate.utils.memory' (/home/yaroslavp/workspace/master_IASA/iasa_nlp_course/.venv/lib/python3.11/site-packages/accelerate/utils/memory.py)

In [2]:
import accelerate
# import peft

print("Accelerate version:", accelerate.__version__)
# print("PEFT version:", peft.__version__)

Accelerate version: 0.23.0


In [None]:
import wandb

# Initialize with team/entity
wandb.init(
    project="iasa-nlp-labs",
    entity="oypio-kpi", 
    name='llama3-1b-pretrain'
)

In [None]:
from transformers import XLMRobertaConfig, XLMRobertaTokenizer, XLMRobertaForMaskedLM
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments


# Data collator used for dynamic masking
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./model_checkpoints_llama3_pretrain',
    logging_dir='./model_logs_llama3_pretrain',
    weight_decay=0.01,
    learning_rate=1e-4,
    lr_scheduler_type='cosine',
    warmup_ratio=0.0,
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    do_train=True,
    do_eval=False,
    bf16=True,
    report_to="wandb",
    optim='adamw_8bit',
    save_strategy="steps",
    logging_steps=5
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train']
)

In [None]:
trainer.train()

In [None]:
from transformers import pipeline
import torch

pipe = pipeline(
    "text2text-generationn",
    model=trainer.model.eval(),
    tokenizer=tokenizer,
    model_kwargs={
        "torch_dtype": torch.bfloat16, #use float16 for non A100/H100
        "quantization_config": {"load_in_4bit": True},
        "device_map": "cuda:0"
    }
)