In [1]:
!nvidia-smi

Fri Sep 20 03:33:45 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   39C    P8              9W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

**En este notebook y tutorial, realizaremos un fine-tune [Llama-8k](https://huggingface.co/microsoft/Phi-3-small-8k-instruct) modelo relativamente pequeño de 7 mil millones de parametros - que ha 'demostrado un rendimiento casi de última generación entre los modelos con menos de 13 mil millones de parámetros' - *en tus propios datos!!***

**Aqui usaremos [QLoRA (Efficient Finetuning of Quantized LLMs)](https://arxiv.org/abs/2305.14314), una técnica de fine-tunning altamente eficiente que consiste en cuantizar un LLM preentrenado a solo 4 bits y agregar pequeños 'Adaptadores de Bajo Rango'. Este enfoque único permite realizar el fine-tunning de LLMs utilizando solo una GPU. Esta técnica está respaldada por el/la... [PEFT library](https://huggingface.co/docs/peft/index).**

# Tabla de Contenido

- [1- Instalar librerias requeridas](#1)
- [ 2 - Cargar dataset](#2)
- [ 3 - Crear configuración de bitsandbytes](#3)
- [ 4 - Cargar Modelo Base](#4)
- [ 5 - Tokenizar](#5)
- [ 6 - Testear el modelo con Zero Shot Inferencing](#6)
- [ 7 - Pre-procesando el dataset](#7)
- [ 8 - Configurar el modelo PEFT/LoRA para realizar Fine-Tuning](#8)
- [ 9 - Entrenar Adaptador PEFT](#9)
- [ 10 - Evaluar el Modelo Qualitativamente (Evaluacion Humana)](#10)
- [ 11 - Evaluar el Modelo Quantitaviamente (con Metrica ROUGE)](#11)

<a name='1'></a>
#### 1. Instalar librerias requeridas

In [2]:
%%time
!pip install -U transformers
!pip install -U bitsandbytes
!pip install -U peft
!pip install -U accelerate
!pip install -U datasets
!pip install -U scipy
!pip install -U einops
!pip install -U evaluate
#!pip install -U trl
!pip install -U rouge_score
!pip install -U torch

Collecting transformers
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m69.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.42.3
    Uninstalling transformers-4.42.3:
      Successfully uninstalled transformers-4.42.3
Successfully installed transformers-4.44.2
Collecting bitsandbytes
  Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.43.3-py3-none-manylinux_2_24_x86_64.whl (137.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.5/137.5 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:0

In [3]:
%%time
import os
import gc
import torch
import time
import pandas as pd
import numpy as np
import transformers
import multiprocessing

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from huggingface_hub import interpreter_login
from pynvml import *
from functools import partial
from transformers import set_seed
from datasets import load_dataset, DatasetDict
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

#interpreter_login()

2024-09-20 03:39:12.198581: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-20 03:39:12.198706: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-20 03:39:12.349043: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


CPU times: user 10.3 s, sys: 1.41 s, total: 11.7 s
Wall time: 19 s


In [4]:
DATASET_FOLDER = os.path.join("/kaggle/input", "drugs-load-dataset")
DATASET_PATH = os.path.join(DATASET_FOLDER, "dataset/drugs_data.parquet")

if not (os.path.exists(DATASET_PATH)):
    print('Dataset no existe!!')


<a name='2'></a>
#### 2. Cargar el dataset

In [5]:
%%time
#Cargar tu dataset
dataset = load_dataset('parquet', data_files=DATASET_PATH)

# Dividir en 70% train y 30% (test + validation)
train_test_valid = dataset['train'].train_test_split(test_size=0.3, seed=42)

# Dividir el 30% restante en 15% test y 15% validation
test_valid = train_test_valid['test'].train_test_split(test_size=0.5, seed=42)

# Reunir los conjuntos en un DatasetDict
dataset = DatasetDict({
    'train': train_test_valid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']
})
dataset


Generating train split: 0 examples [00:00, ? examples/s]

CPU times: user 15 s, sys: 5.05 s, total: 20.1 s
Wall time: 17.2 s


DatasetDict({
    train: Dataset({
        num_rows: 160299
    })
    test: Dataset({
        num_rows: 34350
    })
    validation: Dataset({
        num_rows: 34350
    })
})

In [6]:
# Funcion para imprimir la utilización de la memoria de la GPU
def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


# Función para reemplazar NaN con cadena vacía
def replace_nan_with_empty_string(example):
    for key, value in example.items():
        if value is None or pd.isna(value) or (value == 'nan'):
            example[key] = ''
    return example


def create_prompt_formats(sample):
    '''
    
    '''
    #===========================================================================================
    try:
        # Construir las partes iniciales
        instruct_key = '### Instruct: Generate a detailed description of the medication for healthcare professionals and patients. Maintain a professional and concise tone throughout all responses. Do not fabricate information, and if a specific field regarding the safety in sensitive groups (pregnant women, children, elderly) is not present, simply state "No specific information available."'
        context_key = '### Context: You are a pharmaceutical chemist specialized in the in-depth understanding of drug descriptions. Your task is to generate a professional and accurate response based on the information provided. If a specific field lacks information, state "No specific information available" instead of providing unconfirmed details.'
        input_key = f"### Input: Provide a detailed description of the medication {sample.get('generic_name', '')} using the available data."
        end_key = "### End"

        # Lista de campos a procesar
        fields = [
            ("brand_name", "Brand Name", "What is the brand name of the medication?"),
            ("generic_name", "Generic Name", "What is the generic name of the medication?"),
            ("substance_name", "Active Ingredient", "What is the active ingredient of the medication?"),
            ("manufacturer_name", "Manufacturer Name", "Who is the manufacturer of the medication?"),
            ("product_type", "Product Type", None),
            ("route", "Route of Administration", None),
            ("dosage_and_administration", "Dosage and Administration", "What is the recommended dosage for this medication?"),
            ("indications_and_usage", "Indications and Usage", "What is this medication used for?"),
            ("contraindications", "Contraindications", "What are the contraindications of the medication?"),
            ("warnings", "Warnings", "What warnings are associated with this medication?"),
            ("precautions", "Precautions", None),
            ("adverse_reactions", "Adverse Reactions", "What adverse reactions are associated with this medication?"),
            ("controlled_substance", "Controlled Substance", None),
            ("active_ingredient", "Chemical Substance", None),
            ("last_update", "Last Update", None)
        ]

        drugs = []
        questions = []

        # Procesar los campos
        for field, label_name, question_text in fields:
            field_value = sample.get(field)
            if field_value:
                drugs.append(f'<{field}> {label_name}: {field_value} </{field}>')
                if question_text:
                    questions.append(f'<question> {question_text}</question><answer> {field_value}</answer>')

        # Construir las partes finales
        output_key = f"### Output: {sample.get('description', '')}"
        if drugs:
            output_key += "\n" + "\n".join(drugs)

        question_key = '### Questions: ' + ("\n".join(questions) if questions else "")

        # Construir el texto final
        parts = [instruct_key, context_key, input_key, output_key, question_key, end_key]
        sample["text"] = "\n\n".join(parts)

    except Exception as ex:
        raise Exception(f'Ocurrió un error inesperado al cargar el prompt [line: {ex.__traceback__.tb_lineno}] - {ex}')
        
    return sample


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )

    
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    """
    
    try:
        # Añadir un prompt a cada muestra
        print("Preprocessing dataset...")
        
        num_cores = multiprocessing.cpu_count()
        print(f"Número de núcleos de la CPU disponibles: {num_cores}")
        
        # Usar todos menos uno o dos núcleos para no sobrecargar el sistema
        num_proc = max(1, num_cores - 1)
        
        dataset = dataset.map(create_prompt_formats
                              #num_proc=num_proc
                             )#, batched=True)
        
        _preprocessing_function = partial(preprocess_batch,
                                          max_length = max_length,
                                          tokenizer = tokenizer
                                         )

        dataset = dataset.map(_preprocessing_function, 
                              remove_columns=[col for col in dataset.column_names if col != "text"],
                              #num_proc=num_proc
                             )

        # Filtrar las muestras que tienen input_ids que exceden la longitud máxima (max_length).
        dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

        # Shuffle dataset
        dataset = dataset.shuffle(seed=seed)

        return dataset
    except Exception as ex:
        raise Exception(f'Ocurrió un error inesperado al pre-procesar el dataset [line: {ex.__traceback__.tb_lineno}] - {ex}')

        
def print_number_of_trainable_model_parameters(model):
    try:
        trainable_model_params = 0
        all_model_params = 0
        for _, param in model.named_parameters():
            all_model_params += param.numel()
            if param.requires_grad:
                trainable_model_params += param.numel()
        return f"all model parameters: {all_model_params}\ntrainable model parameters: {trainable_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"
    except Exception as ex:
        print(f'Ocurrió un error inesperado al imprimir los parametros del modelo [line: {ex.__traceback__.tb_lineno}] - {ex}')


In [7]:
class ModelAnalizer:
    '''
    '''
    
    def __init__(self, model_name_or_path):
        self.model_name_or_path = model_name_or_path
        self.model = None
        self.tokenizer = None
        self._load_qtz_config()
    
    
    def _load_qtz_config(self):
        try:
            compute_dtype = getattr(torch, "float16")
            self.bnb_config = BitsAndBytesConfig(load_in_4bit=True,
                                                 bnb_4bit_quant_type='nf4',
                                                 bnb_4bit_compute_dtype=compute_dtype,
                                                 bnb_4bit_use_double_quant=True,
                                                )
        except Exception as ex:
            raise Exception(f'Ocurrió un error inesperado al cargar quantization-config [line: {ex.__traceback__.tb_lineno}] - {ex}')
        
        
    def _load_model(self):
        try:
            device_map = {"": 0}
            self.model = AutoModelForCausalLM.from_pretrained(self.model_name_or_path, 
                                                              device_map=device_map,
                                                              quantization_config=self.bnb_config,
                                                              trust_remote_code=True,
                                                              token="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"
                                                              )
            # Carga el tokenizador
            self._tokenizer()
        except Exception as ex:
            raise Exception(f'Ocurrió un error inesperado al cargar el modelo [line: {ex.__traceback__.tb_lineno}] - {ex}')
            
    
    def _tokenizer(self):
        # https://ai.stackexchange.com/questions/41485/while-fine-tuning-a-decoder-only-llm-like-llama-on-chat-dataset-what-kind-of-pa
        try:
            print(f'self.model_name_or_path : {self.model_name_or_path}')
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, 
                                                          trust_remote_code=True, 
                                                          add_bos_token=True,
                                                          use_fast=False, 
                                                          add_eos_token=True, 
                                                          padding_side="left",
                                                          token="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"
                                                         )
            if not(self.tokenizer):
                raise Exception(f'No se ha definido el atributo self.tokenizer')
            
            self.tokenizer.pad_token = self.tokenizer.eos_token
            
        except Exception as ex:
            raise Exception(f'Ocurrió un error inesperado al cargar el tokenizador [line: {ex.__traceback__.tb_lineno}] - {ex}')
        
    
    
    def gen(self, prompt, maxlen=512, sample=True):
        try:
            '''
            eval_tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path,
                                                           trust_remote_code=True,
                                                           add_bos_token=True,
                                                           use_fast=False
                                                          )
            eval_tokenizer.pad_token = eval_tokenizer.eos_token
            
            toks = eval_tokenizer(p, return_tensors="pt")
            '''
            
            toks = self.tokenizer(prompt, return_tensors="pt").to("cuda")
            res = self.model.generate(**toks.to("cuda"), 
                                      max_new_tokens=maxlen,
                                      do_sample=sample,
                                      num_return_sequences=1,
                                      temperature=0.7,
                                      num_beams=1,
                                      top_p=0.95
                                     ).to('cpu')
            return self.tokenizer.batch_decode(res, skip_special_tokens=True)
        
        except Exception as ex:
            raise Exception(f'Ocurrió un error inesperado al procesar la inferencia en el modelo [line: {ex.__traceback__.tb_lineno}] - {ex}')
    
    
    def get_max_length(self):
        try:
            max_length = None
            for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
                max_length = getattr(self.model.config, length_setting, None)
                if max_length:
                    print(f"Found max length: {max_length}")
                    break
            if not max_length:
                max_length = 1024
                print(f"Using default max length: {max_length}")
            return max_length
        
        except Exception as ex:
            raise Exception(f'Ocurrió un error inesperado al obtener tamaño del modelo [line: {ex.__traceback__.tb_lineno}] - {ex}')
    


In [8]:
dataset['train'][120]

{'abuse': '',
 'abuse_table': '',
 'active_ingredient': 'Active ingredients Aspirin 500 mg (NSAID*) Caffeine 60 mg *Nonsteroidal anti-inflammatory drug',
 'active_ingredient_table': '',
 'adverse_reactions': '',
 'adverse_reactions_table': '',
 'alarms': '',
 'ask_doctor_or_pharmacist': 'Ask a doctor or phamacist before use if you are taking a prescription drug for diabetes gout arthritis',
 'ask_doctor_or_pharmacist_table': '',
 'ask_doctor_table': '',
 'brand_name': 'Blowfish',
 'carcinogenesis_and_mutagenesis_and_impairment_of_fertility': '',
 'clinical_pharmacology_table': '',
 'clinical_studies_table': '',
 'components_table': '',
 'contraindications': '',
 'controlled_substance': '',
 'dependence': '',
 'dependence_table': '',
 'description': '',
 'description_table': '',
 'do_not_use_table': '',
 'dosage_and_administration': 'Directions upon waking, fully dissolve 2 tablets in 16 oz. of water and drink do not exceed recommended dosage Adults and children 12 years and over (up to

#### Imprime el consumo de GPU antes de cargar el modelo pre-entrenado

In [9]:
print_gpu_utilization()

GPU memory occupied: 265 MB.


In [10]:
%%time
#model_name='meta-llama/Meta-Llama-3-8B'
model_name = 'meta-llama/Llama-2-7b-hf'
try:
    llm = ModelAnalizer(model_name)
    llm._load_model()
except Exception as ex:
    print(f"Ocurrió un error inesperado [line: {ex.__traceback__.tb_lineno}] - {ex}")
    


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

self.model_name_or_path : meta-llama/Llama-2-7b-hf


tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

CPU times: user 21.7 s, sys: 20.9 s, total: 42.6 s
Wall time: 1min 15s


In [18]:
print_gpu_utilization()

GPU memory occupied: 4843 MB.


#### 6. Prueba el modelo con inferencia Zero Shot

In [13]:
%%time
seed = 42
index = 120
set_seed(seed)
max_tokens = 100

try:
    prompt = dataset['train'][index]


    # Instrucción: Resume la siguiente conversación
    formatted_prompt = f'Instruct: Generate a detailed description of the medication for healthcare professionals and patients. Maintain a professional and concise tone throughout all responses. Do not fabricate information, and if a specific field regarding the safety in sensitive groups (pregnant women, children, elderly) is not present, simply state "No specific information available".\n Provide a detailed description of the medication {prompt["generic_name"]} using the available data.\n Output:\n'
    res = llm.gen(formatted_prompt, max_tokens)
    #print(res[0])
    output = res[0].split('Output:\n')[1]

    dash_line = '-'.join('' for x in range(100))
    print(dash_line)
    print(f'Input Prompt:\n{formatted_prompt}')
    print(dash_line)
    print(f'Model Generation - Zero Shot:\n{output}')

except Exception as ex:
    print(f"Ocurrió un error inesperado [line: {ex.__traceback__.tb_lineno}] - {ex}")

---------------------------------------------------------------------------------------------------
Input Prompt:
Instruct: Generate a detailed description of the medication for healthcare professionals and patients. Maintain a professional and concise tone throughout all responses. Do not fabricate information, and if a specific field regarding the safety in sensitive groups (pregnant women, children, elderly) is not present, simply state "No specific information available".
 Provide a detailed description of the medication ASPIRIN, CAFFEINE using the available data.
 Output:

---------------------------------------------------------------------------------------------------
Model Generation - Zero Shot:
 nobody is coming to save us
nobody is coming to save us, and nobody is coming to save us.
We are going to have to do it ourselves.
We are going to have to find the resources to do it ourselves.
We are going to have to find the courage to do it ourselves.
We are going to have to find 

#### 7. Pre-procesando el dataset

In [14]:
%%time
try:
    max_length = llm.get_max_length()
    
    train_dataset = preprocess_dataset(tokenizer=llm.tokenizer, 
                                       max_length=max_length,
                                       seed=seed,
                                       dataset=dataset['train']
                                      )
    
    eval_dataset = preprocess_dataset(tokenizer=llm.tokenizer, 
                                      max_length=max_length,
                                      seed=seed,
                                      dataset=dataset['validation']
                                     )
except Exception as ex:
    print(f"Error [line: {ex.__traceback__.tb_lineno}] - {ex}")

Found max length: 4096
Preprocessing dataset...
Número de núcleos de la CPU disponibles: 4


Map:   0%|          | 0/160299 [00:00<?, ? examples/s]

Map:   0%|          | 0/160299 [00:00<?, ? examples/s]

Filter:   0%|          | 0/160299 [00:00<?, ? examples/s]

Preprocessing dataset...
Número de núcleos de la CPU disponibles: 4


Map:   0%|          | 0/34350 [00:00<?, ? examples/s]

Map:   0%|          | 0/34350 [00:00<?, ? examples/s]

Filter:   0%|          | 0/34350 [00:00<?, ? examples/s]

CPU times: user 1h 49min 38s, sys: 1min 2s, total: 1h 50min 41s
Wall time: 1h 50min 51s


In [19]:
print(f"Shapes of the datasets:")
print(f"Training: {train_dataset.shape}")
print(f"Validation: {eval_dataset.shape}")
print(train_dataset)

Shapes of the datasets:
Training: (112701, 3)
Validation: (24177, 3)
Dataset({
    features: ['text', 'input_ids', 'attention_mask'],
    num_rows: 112701
})


#### 8. Configura el modelo PEFT/LoRA para el Fine-Tuning
Ahora, vamos a realizar un ajuste fino eficiente en parámetros (PEFT). PEFT es una forma de ajuste fino por instrucciones que es mucho más eficiente que el ajuste fino completo. PEFT es un término genérico que incluye Adaptación de Bajo Rango (LoRA) y ajuste por indicaciones (¡que NO ES LO MISMO que la ingeniería de prompts!). En la mayoría de los casos, cuando alguien menciona PEFT, generalmente se refieren a LoRA. LoRA, en esencia, permite un ajuste fino eficiente del modelo utilizando menos recursos computacionales, a menudo realizable con solo una GPU. Después del ajuste fino con LoRA para una tarea o caso de uso específico, el resultado es un LLM original sin cambios y la aparición de un "adaptador LoRA" considerablemente más pequeño, que a menudo representa un porcentaje de un solo dígito del tamaño del LLM original (en MBs en lugar de GBs).

Durante la inferencia, el adaptador LoRA debe combinarse con su LLM original. La ventaja radica en la capacidad de muchos adaptadores LoRA para reutilizar el LLM original, reduciendo así los requisitos generales de memoria cuando se manejan múltiples tareas y casos de uso.

Nota el hiperparámetro de rango (r), que define el rango/dimensión del adaptador a ser entrenado. r es el rango de la matriz de bajo rango utilizada en los adaptadores, lo que controla el número de parámetros entrenados. Un rango mayor permitirá mayor expresividad, pero hay una compensación en términos de cómputo.

alpha es el factor de escalado para los pesos aprendidos. La matriz de pesos se escala por alpha/r, y por lo tanto, un valor más alto de alpha asigna más peso a las activaciones de LoRA.

In [20]:
print(print_number_of_trainable_model_parameters(llm.model))

all model parameters: 3500412928
trainable model parameters: 262410240
percentage of trainable model parameters: 7.50%


In [21]:
print(llm.model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=

In [22]:
peft_config = LoraConfig(r=64, #32
                         lora_alpha=16, #32,
                         target_modules=['q_proj','k_proj','v_proj','o_proj'], #dense
                         bias="none",
                         lora_dropout=0.1, #0.05,  # Conventional
                         task_type="CAUSAL_LM",
                        )

# 1 - Habilitando el registro de puntos de control de gradiente para reducir el uso de memoria 
# durante el fine-tuning
llm.model.gradient_checkpointing_enable()

# 2 - Utilizando el método prepare_model_for_kbit_training de PEFT.
llm.model = prepare_model_for_kbit_training(llm.model)

peft_model = get_peft_model(llm.model, peft_config)

Una vez que todo esté configurado y el modelo base esté preparado, podemos utilizar la función auxiliar print_trainable_parameters() para ver cuántos parámetros entrenables hay en el modelo.

In [23]:
print(print_number_of_trainable_model_parameters(peft_model))

all model parameters: 3567521792
trainable model parameters: 67108864
percentage of trainable model parameters: 1.88%


In [24]:
# Observa cómo se ve diferente el modelo ahora, con los adaptadores LoRA añadidos:
print(peft_model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): l

#### 9. Entrenando el Adaptador PEFT

Define los argumentos de entrenamiento y crea una instancia de Trainer.

In [26]:
from torch import amp

output_dir = './drugs-final-checkpoint'

'''
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=1000,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    evaluation_strategy="steps",
    eval_steps=25,
    do_eval=True,
    gradient_checkpointing=True,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)
'''

# Cambiar torch.cuda.amp.GradScaler a torch.amp.GradScaler
#scaler = amp.GradScaler('cuda')

peft_training_args = TrainingArguments(
    do_eval=True,
    eval_strategy="steps",
    fp16=True,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    learning_rate=2.0e-05,
    logging_steps=25,
    log_level="info",
    logging_strategy="steps",
    lr_scheduler_type="cosine",
    max_steps=1000,
    #num_train_epochs=1,
    output_dir = output_dir,
    overwrite_output_dir = True,
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    report_to="none",
    save_strategy="steps",
    eval_steps=25,
    group_by_length=True,
    logging_dir="./logs",
    optim="paged_adamw_8bit",
    save_steps=25,
    warmup_steps=50,
    save_total_limit=None,
    seed=42
)

peft_model.config.use_cache = False

peft_trainer = transformers.Trainer(model=peft_model,
                                    train_dataset=train_dataset,
                                    eval_dataset=eval_dataset,
                                    args=peft_training_args,
                                    data_collator=transformers.DataCollatorForLanguageModeling(llm.tokenizer, mlm=False),
                                    )

PyTorch: setting up devices
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs
Using auto half precision backend


In [27]:
peft_training_args.device
#print(f"GPUs disponibles: {torch.cuda.device_count()}")

device(type='cuda', index=0)

In [None]:
peft_trainer.train()

The following columns in the training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text. If text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 112,701
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Training with DataParallel so batch size has been adjusted to: 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 4
  Total optimization steps = 1,000
  Number of trainable parameters = 67,108,864
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss


The following columns in the evaluation set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: text. If text are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 24177
  Batch size = 2


In [None]:
print_gpu_utilization()

In [31]:
# Liberar memoria para la fusión de pesos
del llm.model
del peft_trainer
torch.cuda.empty_cache()

AttributeError: model

In [None]:
print_gpu_utilization()

#### 10. Evaluar el modelo cualitativamente (Evaluación Humana)

In [None]:
try:
    llm = ModelAnalizer(model_name)
    llm._load_model()
except Exception as ex:
    print(f"Ocurrió un error inesperado [line: {ex.__traceback__.tb_lineno}] - {ex}")
    


In [None]:

ft_model = PeftModel.from_pretrained(llm.model, 
                                     "/kaggle/working/peft-dialogue-summary-training/final-checkpoint/checkpoint-1000",
                                     torch_dtype=torch.float16,
                                     is_trainable=False
                                    )

In [None]:
%%time
seed = 42
index = 120
set_seed(seed)
max_tokens = 512

try:
    prompt = dataset['train'][index]

    # Instrucción: Resume la siguiente conversación
    formatted_prompt = f'Instruct: Generate a detailed description of the medication for healthcare professionals and patients. Maintain a professional and concise tone throughout all responses. Do not fabricate information, and if a specific field regarding the safety in sensitive groups (pregnant women, children, elderly) is not present, simply state "No specific information available".\n Provide a detailed description of the medication {prompt["generic_name"]} using the available data.\n Output:\n'
    res = ft_model.gen(formatted_prompt, max_tokens)
    #print(res[0])
    output = res[0].split('Output:\n')[1]

    dash_line = '-'.join('' for x in range(100))
    print(dash_line)
    print(f'Input Prompt:\n{formatted_prompt}')
    print(dash_line)
    print(f'Peft Model Generation:\n{output}')

except Exception as ex:
    print(f"Ocurrió un error inesperado [line: {ex.__traceback__.tb_lineno}] - {ex}")

#### 10. Evaluar el modelo cuantitativamente (con la Metrica ROUGE)

In [None]:
def data_process(dataset):
    try:
        # Añadir un prompt a cada muestra
        print("Preprocessing dataset...")

        num_cores = multiprocessing.cpu_count()
        print(f"Número de núcleos de la CPU disponibles: {num_cores}")

        # Usar todos menos uno o dos núcleos para no sobrecargar el sistema
        num_proc = max(1, num_cores - 1)

        dataset = dataset.map(create_prompt_formats_v1,
                              num_proc=num_proc
                             )#, batched=True)
    except Exception as ex:
        raise Exception(f'Ocurrió un error inesperado al pre-procesar el dataset [line: {ex.__traceback__.tb_lineno}] - {ex}')

    return dataset

In [None]:
%%time
try:
    train_dataset=data_process(dataset['train'])
    eval_dataset=data_process(dataset['validation'])

    print(f"Shapes of the datasets:")
    print(f"Training: {train_dataset.shape}")
    print(f"Validation: {eval_dataset.shape}")
    
except Exception as ex:
    print(f'Ocurrió un error inesperado [line: {ex.__traceback__.tb_lineno}] - {ex}')



In [None]:
display(train_dataset[120])

In [None]:
eval_dataset[120]