## Задание 1: Fine-tuning модели с использованием LoRA

In [3]:
# !pip install datasets

In [None]:
"""
Задание 1: Fine-tuning модели с использованием LoRA

Цель: провести дообучение LLM с применением метода Low-Rank Adaptation (LoRA)
для решения задачи классификации текстов.

Пункты выполнения:
1. Подготовка данных:
   - Выбран датасет BoolQ (вопросы с да/нет ответами)
   - Данные предобработаны для задачи бинарной классификации

2. Выбор LLM:
   - Использована предобученная модель BERT (bert-base-uncased)
   - Модель адаптирована для задачи классификации

3. Настройка LoRA:
   - Использован метод LoRA для уменьшения числа обучаемых параметров
   - Настроены гиперпараметры LoRA (ранг, альфа, целевые слои)

4. Процесс обучения:
   - Реализовано с использованием Hugging Face Transformers
   - Настроены параметры обучения (learning rate, batch size, epochs)

5. Оценка результатов:
   - Измерены метрики качества (accuracy, F1-score)
   - Отслеживается время обучения и потребление памяти

6. Сравнение с полной донастройкой:
   - Реализованы оба подхода (LoRA и полный fine-tuning)
   - Сравниваются метрики качества, время и память
"""

In [27]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
from peft import get_peft_config, get_peft_model, LoraConfig, TaskType
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import time
import psutil
import os
import warnings

warnings.filterwarnings('ignore', category=FutureWarning)

def prepare_dataset():
    dataset = load_dataset("boolq")

    def preprocess_function(examples):
        texts = [q + " [SEP] " + p for q, p in zip(examples["question"], examples["passage"])]
        tokenized = tokenizer(
            texts,
            truncation=True,
            max_length=512,
            padding=True,
            return_tensors=None  # Изменено для работы с батчами
        )

        # Преобразуем boolean метки в числовые (0 или 1)
        labels = [1 if ans else 0 for ans in examples["answer"]]
        tokenized["labels"] = labels

        return tokenized

    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=dataset["train"].column_names
    )

    return tokenized_dataset

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    return {
        "accuracy": accuracy_score(labels, predictions),
        "f1": f1_score(labels, predictions, average='binary')
    }

def get_memory_usage():
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.get("labels")
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.logits

        loss_fct = torch.nn.CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                       labels.view(-1))

        return (loss, outputs) if return_outputs else loss

def train_model(use_lora=True):
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels=2,
        ignore_mismatched_sizes=True
    )

    if use_lora:
        peft_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=8,
            lora_alpha=32,
            lora_dropout=0.1,
            bias="none",
            target_modules=["query", "value"]
        )

        model = get_peft_model(model, peft_config)
        model.print_trainable_parameters()

    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=3,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        report_to="none",
        run_name=None,
        remove_unused_columns=False  # Добавлено для предотвращения удаления столбцов
    )

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer)
    )

    start_time = time.time()
    initial_memory = get_memory_usage()

    trainer.train()

    training_time = time.time() - start_time
    memory_used = get_memory_usage() - initial_memory

    eval_results = trainer.evaluate()

    return {
        "eval_results": eval_results,
        "training_time": training_time,
        "memory_used": memory_used
    }

if __name__ == "__main__":
    print("Initializing tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    print("Preparing dataset...")
    tokenized_dataset = prepare_dataset()

    print("\nTraining with LoRA...")
    lora_results = train_model(use_lora=True)

    print("\nTraining with full fine-tuning...")
    full_results = train_model(use_lora=False)

    print("\nResults comparison:")
    print("LoRA results:", lora_results)
    print("Full fine-tuning results:", full_results)

Initializing tokenizer...
Preparing dataset...


Map:   0%|          | 0/3270 [00:00<?, ? examples/s]


Training with LoRA...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 296,450 || all params: 109,780,228 || trainable%: 0.2700


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6649,0.662805,0.621713,0.766736
2,0.6538,0.656188,0.625076,0.766298
3,0.6537,0.655186,0.623242,0.765779



Training with full fine-tuning...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6312,0.615742,0.676147,0.760461
2,0.5338,0.574272,0.708869,0.773225
3,0.3989,0.676665,0.711621,0.773589



Results comparison:
LoRA results: {'eval_results': {'eval_loss': 0.6551863551139832, 'eval_accuracy': 0.6232415902140673, 'eval_f1': 0.7657794676806083, 'eval_runtime': 99.6828, 'eval_samples_per_second': 32.804, 'eval_steps_per_second': 4.103, 'epoch': 3.0}, 'training_time': 2347.093881368637, 'memory_used': 37.72265625}
Full fine-tuning results: {'eval_results': {'eval_loss': 0.5742719769477844, 'eval_accuracy': 0.708868501529052, 'eval_f1': 0.7732253454025727, 'eval_runtime': 97.1285, 'eval_samples_per_second': 33.667, 'eval_steps_per_second': 4.211, 'epoch': 3.0}, 'training_time': 3074.156408071518, 'memory_used': 61.21875}


In [23]:
# !pip install evaluate