<a href="https://colab.research.google.com/github/mariagrandury/unia-2024/blob/main/ai_alignment_sft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Supervised Fine Tuning (SFT)

Vamos a adaptar Llama 3.1 8B para seguir instrucciones en español.

## 0. Preparar el entorno

In [1]:
!pip install huggingface_hub -q
!pip install --no-deps "argilla==2.0.0" -q
!pip install --no-deps "xformers<0.0.27" bitsandbytes -q
!pip install --no-deps datasets transformers trl peft accelerate -q
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -q
!pip install wandb -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m133.7/133.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m222.7/222.7 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.8/245.8 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m296.4/296.4 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m656.0 kB/s[0m

In [2]:
# Interactive login to the Hugging Face Hub: prompts for an access token.
# NOTE(review): the push_to_hub calls later in this notebook presumably need a token with write permission.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) 
Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your term

In [3]:
# Print the GPU attached to this runtime (model, driver/CUDA version, memory usage).
!nvidia-smi

Tue Aug 20 20:06:59 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0              45W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

Variables extraídas por simplicidad

In [4]:
HF_USERNAME = "mariagrandury"

ORIGINAL_MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B"

IT_DATASET_NAME = "mariagrandury/elgrancorpus-it"
IT_DATASET_SPLIT = "train"
INSTRUCTION_COL = "pregunta"
OUTPUT_COL = "respuesta"

PROMPT = """A continuación hay una instrucción que describe una tarea, escribe una respuesta apropiada.

### Instrucción:
{}

### Respuesta:
{}"""

SFT_MODEL_NAME = f"{HF_USERNAME}/{ORIGINAL_MODEL_NAME.split('/')[1]}-sft-LoRA-{IT_DATASET_NAME.split('/')[1]}"

## 1. Configurar modelo y tokenizador

In [5]:
from transformers import AutoTokenizer
from trl import ModelConfig, get_peft_config

# Model / LoRA settings for the base checkpoint, expressed through TRL's ModelConfig.
model_config = ModelConfig(
    model_name_or_path=ORIGINAL_MODEL_NAME,
    model_revision="main",
    torch_dtype="bfloat16",
    trust_remote_code=True,
    # Switch between a regular fine-tune and a PEFT one; PEFT is selected here.
    use_peft=True,
    lora_r=64,
    lora_alpha=16,
)

# PEFT configuration derived from the settings above.
peft_config = get_peft_config(model_config)

# Keyword arguments later handed to SFTConfig(model_init_kwargs=...).
model_kwargs = {
    "revision": model_config.model_revision,
    "trust_remote_code": model_config.trust_remote_code,
    "torch_dtype": model_config.torch_dtype,
    # Original author's note: the cache must be off when gradient checkpointing is used.
    "use_cache": False,
}

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

In [None]:
# Load the fast tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_config.model_name_or_path,
    use_fast=True,
    trust_remote_code=model_config.trust_remote_code,
)

# Keep the end-of-sequence token in EOS_TOKEN (formatting_prompts_func appends it
# to every training text) and reuse it as the padding token.
EOS_TOKEN = tokenizer.pad_token = tokenizer.eos_token

## 2. Preparar el dataset

In [6]:
from datasets import load_dataset

# Fetch the configured split of the instruction-tuning dataset.
dataset = load_dataset(path=IT_DATASET_NAME, split=IT_DATASET_SPLIT)

Downloading readme:   0%|          | 0.00/328 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/339M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/126M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/238M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/286M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4052104 [00:00<?, ? examples/s]

In [7]:
# Preview the first five rows; note that to_pandas() converts the whole split.
df = dataset.to_pandas()
df.head(n=5)

Unnamed: 0,pregunta,respuesta
0,¿Cuándo empezó a operar Virgin Australia?\n\nV...,Virgin Australia inició sus servicios el 31 de...
1,¿Cuál es una especie de pez? Topo o cuerda\n\n,Tope
2,¿Por qué los camellos pueden sobrevivir mucho ...,Los camellos utilizan la grasa de sus jorobas ...
3,"Los padres de Alice tienen tres hijas: Amy, Je...",El nombre de la tercera hija es Alicia
4,¿Cuándo nació Tomoaki Komorida?\n\nKomorida na...,Tomoaki Komorida nació el 10 de julio de 1981.


In [8]:
def formatting_prompts_func(examples):
    """Build the training text for a batch of examples.

    Args:
        examples: Batch mapping column names to lists of values; the
            INSTRUCTION_COL and OUTPUT_COL entries are read.

    Returns:
        A dict with a single "text" key holding one string per
        (instruction, output) pair: PROMPT filled with both values,
        followed by EOS_TOKEN.
    """
    pairs = zip(examples[INSTRUCTION_COL], examples[OUTPUT_COL])
    return {
        "text": [PROMPT.format(question, answer) + EOS_TOKEN for question, answer in pairs],
    }

In [9]:
# Add the "text" column by applying the prompt formatter batch-wise.
dataset = dataset.map(function=formatting_prompts_func, batched=True)

Map:   0%|          | 0/4052104 [00:00<?, ? examples/s]

In [10]:
# Preview the first five rows again, now including the generated "text" column.
df = dataset.to_pandas()
df.head(n=5)

Unnamed: 0,pregunta,respuesta,text
0,¿Cuándo empezó a operar Virgin Australia?\n\nV...,Virgin Australia inició sus servicios el 31 de...,A continuación hay una instrucción que describ...
1,¿Cuál es una especie de pez? Topo o cuerda\n\n,Tope,A continuación hay una instrucción que describ...
2,¿Por qué los camellos pueden sobrevivir mucho ...,Los camellos utilizan la grasa de sus jorobas ...,A continuación hay una instrucción que describ...
3,"Los padres de Alice tienen tres hijas: Amy, Je...",El nombre de la tercera hija es Alicia,A continuación hay una instrucción que describ...
4,¿Cuándo nació Tomoaki Komorida?\n\nKomorida na...,Tomoaki Komorida nació el 10 de julio de 1981.,A continuación hay una instrucción que describ...


## 3. Adaptar el LLM usando SFT

Vamos a utilizar el módulo `SFTTrainer` de la librería [TRL (Transformer Reinforcement Learning)](https://github.com/huggingface/trl).

In [None]:
from trl import SFTConfig, SFTTrainer

# Training configuration for TRL's SFTTrainer.
sft_config = SFTConfig(
    # SFT-specific options.
    model_init_kwargs=model_kwargs,
    dataset_text_field="text",  # column produced by formatting_prompts_func
    max_seq_length=1024,  # 2048 was the alternative considered
    # From here on these are the parameters usually passed to TrainingArguments.
    learning_rate=1.41e-5,
    per_device_train_batch_size=64,
    gradient_checkpointing=True,
    gradient_accumulation_steps=1,
    num_train_epochs=1,  # original note: 3 -> OOM
    max_steps=-1,  # no step cap; num_train_epochs decides the length of the run
    logging_steps=1,
    report_to="wandb",
    # Relative path (was "/meta-llama-sft"): an absolute path at the filesystem root
    # needs root permissions and put a leading "/" in the name that the original
    # note says is reused as the wandb run name.
    output_dir="meta-llama-sft",
)

# The model is passed by name, so the trainer instantiates it itself using
# sft_config.model_init_kwargs; peft_config makes it train LoRA adapters.
trainer = SFTTrainer(
    model=model_config.model_name_or_path,
    tokenizer=tokenizer,
    train_dataset=dataset,
    peft_config=peft_config,
    args=sft_config,
)

Entrenar el modelo

In [None]:
# Launch the fine-tuning run with the trainer configured above.
trainer.train()

Push al Hub de Hugging Face

In [None]:
# Trainer.push_to_hub() takes a commit message as its first positional argument,
# not a repository id, so passing SFT_MODEL_NAME there would upload the model to a
# repo named after output_dir. Pushing through the model itself puts the trained
# weights in the same repository as the tokenizer.
trainer.model.push_to_hub(SFT_MODEL_NAME)
tokenizer.push_to_hub(SFT_MODEL_NAME)

## 3'. Optimizar utilizando `unsloth`

Cargar el modelo y configurar LoRA

In [None]:
from unsloth import FastLanguageModel

# Maximum sequence length, reused below when building the SFTTrainer.
# The original line ended with a trailing comma, which made this the tuple (2048,)
# instead of an int. Original author's note: any number works, handled automatically.
max_seq_length = 2048

# Load the base model and its tokenizer through unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=ORIGINAL_MODEL_NAME,
    max_seq_length=max_seq_length,
    dtype=None,  # None = auto; Float16 for Tesla T4, V100; Bfloat16 for Ampere+
    load_in_4bit=True,  # 4-bit quantization to reduce memory usage
)

In [None]:
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Wrap the loaded model with LoRA adapters.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    target_modules=lora_target_modules,
    lora_dropout=0,  # original note: optimized setting
    bias="none",  # original note: optimized setting
    use_gradient_checkpointing=True,
    random_state=3407,
)

Configurar el SFT Trainer

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Choose the mixed-precision mode once: bf16 when supported, fp16 otherwise.
bf16_ok = is_bfloat16_supported()

training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    # Batching
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Schedule: a short run of 60 steps with 5 warmup steps
    max_steps=60,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    # Optimizer
    optim="adamw_8bit",
    weight_decay=0.01,
    # Precision
    bf16=bf16_ok,
    fp16=not bf16_ok,
    logging_steps=1,
)

# Trainer over the unsloth model, reading the "text" column of the formatted dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    packing=False,
)

Entrenar el modelo

In [None]:
# Run training; the value returned by train() is kept in trainer_stats.
trainer_stats = trainer.train()

Guardar el adapter

In [None]:
# Disabled by default: uncomment to push the model and tokenizer to SFT_MODEL_NAME on the Hub.
# model.push_to_hub(SFT_MODEL_NAME)
# tokenizer.push_to_hub(SFT_MODEL_NAME)

También se puede guardar una versión cuantizada

In [None]:
# Disabled by default: uncomment to save and/or push a GGUF export using the "q4_k_m" quantization method.
# model.save_pretrained_gguf("mariagrandury/Meta-Llama-3.1-8B-ft-LoRA-elgrancorpus-it-v2-gguf-q4_k_m", tokenizer, quantization_method = "q4_k_m")
# model.push_to_hub_gguf("mariagrandury/Meta-Llama-3.1-8B-ft-LoRA-elgrancorpus-it-v2-gguf-q4_k_m", tokenizer, quantization_method = "q4_k_m")