In [None]:
!pip install transformers torch accelerate bitsandbytes peft trl datasets

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  

In [None]:
from huggingface_hub import login
from google.colab import userdata

# Read the Hugging Face token from the Colab "Secrets" panel (it is not an environment variable).
# NOTE(review): userdata.get may itself raise when the secret is missing or notebook
# access has not been granted - confirm against the Colab documentation.
HF_TOKEN = userdata.get('HF_TOKEN')

# Reject a missing token as well as an empty one; both would make login() fail later
# with a less helpful message.
if not HF_TOKEN:
    raise ValueError(
        "HF_TOKEN secret is missing or empty. Add it in the Colab 'Secrets' panel "
        "and grant this notebook access to it."
    )

# Authenticate this session against the Hugging Face Hub.
login(token=HF_TOKEN)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
import json
import random
import numpy as np
from transformers import set_seed
import time
import matplotlib.pyplot as plt
import os
from datasets import Dataset, load_dataset
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score


# Seed every random number generator the notebook touches so runs are repeatable.
trn_seed = 42
for sembrar in (
    set_seed,
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    sembrar(trn_seed)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)


cuda


Procesar archivos pubmed_QA_train.json y pubmed_QA_eval.json

In [None]:
def procesar_pubmed_qa(ruta_archivo_json: str) -> list[dict]:
  """Read a JSON Lines file and turn each record into a multiple-choice example.

  Every non-blank line must hold one JSON object. The fields 'id', 'excerpt',
  'question', 'statement' (the correct answer) and 'distractors' (the wrong
  answers) are read from it. The statement is inserted at a random position
  among the distractors, using the global `random` module, so seeding
  `random` beforehand makes the result repeatable.

  Args:
    ruta_archivo_json: Path to the JSON Lines file (UTF-8).

  Returns:
    A list of dicts with the keys 'id', 'context', 'question', 'options'
    (the distractors plus the statement) and 'id_answer' (the index of the
    statement inside 'options').

  Raises:
    json.JSONDecodeError: A non-blank line is not valid JSON; the message
      includes the file path and the line number.
    TypeError: A record's 'distractors' field is missing or is not a list.
  """
  dataset_procesado = []

  with open(ruta_archivo_json, 'r', encoding='utf-8') as f:
    for linea_num, linea in enumerate(f, 1):
      linea = linea.strip()
      # Blank lines (for example a trailing newline at the end of the file) carry no record.
      if not linea:
        continue

      try:
        dato_original = json.loads(linea)
      except json.JSONDecodeError as exc:
        # Re-raise the same exception type, adding where the bad record lives.
        raise json.JSONDecodeError(
            f"{exc.msg} ({ruta_archivo_json}, line {linea_num})", exc.doc, exc.pos
        ) from exc

      statement = dato_original.get('statement')  # Correct answer
      distractors = dato_original.get('distractors')  # Incorrect answers
      if not isinstance(distractors, list):
        # A missing field or a plain string would otherwise fail obscurely or be split into characters.
        raise TypeError(
            f"{ruta_archivo_json}, line {linea_num}: 'distractors' must be a list, "
            f"got {type(distractors).__name__}"
        )

      # random.randint is inclusive on both ends, so the statement can land at
      # any position from 0 to len(distractors).
      id_answer = random.randint(0, len(distractors))
      options = list(distractors)  # Copy so the parsed record is not mutated
      options.insert(id_answer, statement)

      dataset_procesado.append({
          'id': dato_original.get('id'),
          'context': dato_original.get('excerpt'),  # 'excerpt' is exposed as 'context'
          'question': dato_original.get('question'),
          'options': options,
          'id_answer': id_answer,
      })

  return dataset_procesado

In [None]:
# Project folder inside Google Drive that holds the PubMed QA files.
DATA_DIR = '/content/drive/MyDrive/UniAndes/MAIA-202411/3. modelos-avanzados-para-el-procesamiento-de-lenguaje-natural/W7/maia-pln-2025'

# Mount Google Drive when it is not mounted yet, so the notebook also works
# after "Restart and run all" on a fresh Colab runtime.
if not os.path.isdir('/content/drive/MyDrive'):
    from google.colab import drive
    drive.mount('/content/drive')

# Build the multiple-choice examples for both files.
dataset_train = procesar_pubmed_qa(f'{DATA_DIR}/pubmed_QA_train.json')
dataset_eval = procesar_pubmed_qa(f'{DATA_DIR}/pubmed_QA_eval.json')

# Check the sizes of the resulting datasets
print(f"Tamaño de dataset_train: {len(dataset_train)}")
print(f"Tamaño de dataset_eval: {len(dataset_eval)}")

Tamaño de dataset_train: 16890
Tamaño de dataset_eval: 5000


In [None]:
# Preview the first five examples. Long strings (the abstracts) are shortened so
# the cell output stays readable instead of dumping whole documents.
for ejemplo in dataset_train[:5]:
    print({
        clave: (valor[:100] + '...' if isinstance(valor, str) and len(valor) > 100 else valor)
        for clave, valor in ejemplo.items()
    })

[{'id': 'pubmed23n0002_11617', 'context': 'Effects of ethanol on the permeability of toad urinary bladder epithelium. Ethanol (9%) decreases the potential difference across the toad bladder when present at the mucosal surface, the short-circuit current was unchanged. The electrical resistance decreased indicating a change in ion movements across the bladder. Unidirectional 22Na and 36Cl flux measurements showed an increase in the movement of Cl, but no change in Na. The vasopressin-induced increase in Na transport (natriferic response) was also unaffected by the presence of ethanol. It is suggested that ethanol may be altering the apical tight junctions and affecting an anion selective pathway. The hydro-osmotic response of the toad bladder to vasopressin was decreased by 70% in the presence of 3% ethanol. The hydro-osmotic action of cyclic adenosine monophosphate was also inhibited by ethanol, indicating an action subsequent to the endogenous formation of this nucleotide. Tritiated wa

In [None]:
# Hold out the first 20% of the training examples as a test split; the
# remaining examples stay in the training split.
# NOTE: this cell rebinds `dataset_train`, so running it a second time shrinks
# the training split again. Re-run the loading cell first if it must be repeated.
len_test = int(len(dataset_train) * 0.20)
dataset_test, dataset_train = dataset_train[:len_test], dataset_train[len_test:]

# Check the sizes of the resulting datasets
for nombre_split, ejemplos_split in (
    ("dataset_train", dataset_train),
    ("dataset_test", dataset_test),
    ("dataset_eval", dataset_eval),
):
    print(f"Tamaño de {nombre_split}: {len(ejemplos_split)}")


Tamaño de dataset_train: 13512
Tamaño de dataset_test: 3378
Tamaño de dataset_eval: 5000


In [None]:
# Wrap each list of examples in a Hugging Face Dataset (same order: train, test, eval).
train_dataset_hf, test_dataset_hf, eval_dataset_hf = (
    Dataset.from_list(ejemplos)
    for ejemplos in (dataset_train, dataset_test, dataset_eval)
)

In [None]:
# Base checkpoint that is fine-tuned in the rest of the notebook.
model_id = "meta-llama/Llama-3.2-1B" # As specified by the project

print(f"Cargando el tokenizador para {model_id}...")
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)

print(f"Cargando el modelo Llama: {model_id} ...")
# Load the model
model = AutoModelForCausalLM.from_pretrained(
  model_id,
  device_map="auto", # Places the model automatically (on the GPU when one is available)
)

# When the tokenizer defines no padding token, reuse the end-of-sequence token
# for padding and mirror that choice in the model configuration.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = model.config.eos_token_id

print("Modelo Llama cargado.")

# NOTE(review): max_seq_length is not referenced by any other visible cell -
# confirm whether it is still needed (and whether 5 is the intended value).
max_seq_length = 5

Cargando el tokenizador para meta-llama/Llama-3.2-1B...


tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

Cargando el modelo Llama: meta-llama/Llama-3.2-1B ...


config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Modelo Llama cargado.


In [None]:
# Formats one example as the tokenized prompt + answer pair the LLM is trained on
def format_training_example(example):
  """Tokenize one multiple-choice example for supervised fine-tuning.

  The prompt lists the context, the question and the numbered options and ends
  with "Answer:"; the completion is the index of the correct option. Uses the
  module-level `tokenizer`.

  Args:
    example: Mapping with the keys 'context', 'question', 'options' (list of
      option texts) and 'id_answer' (index of the correct option).

  Returns:
    A dict with 'input_ids' (prompt tokens followed by answer tokens) and
    'labels' of the same length, where every prompt position is -100 and the
    answer positions hold the answer token ids.
  """
  opciones_texto = "\n".join(
      f"{i}: {opcion_txt}" for i, opcion_txt in enumerate(example['options'])
  ).strip()

  # Must stay identical to the prompt built in generate_answer.
  input_text = (
      f"Context: {example['context']}\n\n"
      f"Question: {example['question']}\n\n"
      f"Options:\n{opciones_texto}\n\n"
      f"Based on the context and the question, what is the ID of the correct option?\n"
      f"Answer:"
  )
  output_text = f"{example['id_answer']}"

  # The prompt is tokenized WITH special tokens, exactly as generate_answer does
  # at inference time. Tokenizing it without them (as before) dropped the
  # leading special token(s), so the model was trained on inputs that differ
  # from the ones it is evaluated on.
  prompt_ids = tokenizer(input_text)["input_ids"]
  # The answer is a continuation of the prompt, so it gets no special tokens.
  answer_ids = tokenizer(output_text, add_special_tokens=False)["input_ids"]

  return {
      "input_ids": prompt_ids + answer_ids,
      # -100 marks the prompt positions as ignored by the loss.
      "labels": [-100] * len(prompt_ids) + answer_ids,
  }


In [None]:
# Tokenize every split with the same prompt/label formatting (order: train, test, eval).
train_dataset_formatted, test_dataset_formatted, eval_dataset_formatted = (
    particion.map(format_training_example)
    for particion in (train_dataset_hf, test_dataset_hf, eval_dataset_hf)
)

Map:   0%|          | 0/13512 [00:00<?, ? examples/s]

Map:   0%|          | 0/3378 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Comienza el proceso de fine-tuning

Preparación de LoRA

In [None]:
from peft import LoraConfig, get_peft_model

# Modules that receive LoRA adapters: the four attention projections (the names vary by model).
modulos_lora = ["q_proj", "k_proj", "v_proj", "o_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=modulos_lora,
    r=16,           # LoRA rank (typical values: 8, 16, 32, 64)
    lora_alpha=32,  # LoRA alpha
    lora_dropout=0.05,
    bias="none",
)


Configurar Argumentos de Entrenamiento

In [None]:
from transformers import TrainingArguments

# Directory where results and checkpoints are saved.
output_dir = "/content/drive/MyDrive/UniAndes/MAIA-202411/3. modelos-avanzados-para-el-procesamiento-de-lenguaje-natural/W7/maia-pln-2025/decoder"

# Tunable hyperparameters (2 examples per device x 4 accumulation steps per optimizer update).
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
learning_rate = 0.0003
num_train_epochs = 2
logging_steps = 25

# Collect every setting in one mapping, then build the arguments object from it.
opciones_entrenamiento = dict(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    max_grad_norm=0.3,
    fp16=True,
    logging_steps=logging_steps,
    report_to="wandb",
)
training_arguments = TrainingArguments(**opciones_entrenamiento)

In [None]:
from trl import SFTTrainer

# Build the supervised fine-tuning trainer from the loaded model, the training
# arguments and the LoRA configuration.
# NOTE(review): training_arguments sets no evaluation strategy, so eval_dataset is
# presumably never evaluated during training - confirm.
# NOTE(review): the logged training loss stays around 1.47 while accuracy is ~0.99;
# this suggests the loss may not be restricted to the answer token, i.e. the custom
# `labels` column may not be honored by the installed TRL version - verify.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset_formatted, # The formatted (tokenized) training split
    peft_config=lora_config,         # LoRA configuration
    eval_dataset=test_dataset_formatted,
)

print("Iniciando fine-tuning...")
trainer.train()
print("Fine-tuning completado.")

# Persist the fine-tuned adapter next to the training checkpoints.
adapter_output_dir = f"{output_dir}/final_adapter"
trainer.model.save_pretrained(adapter_output_dir) # Saves only the LoRA adapter weights
tokenizer.save_pretrained(adapter_output_dir) # Also saves the tokenizer, for convenience
print(f"Adaptador LoRA y tokenizador guardados en: {adapter_output_dir}")

Truncating train dataset:   0%|          | 0/13512 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/3378 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Iniciando fine-tuning...


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mperdomopaula[0m ([33mperdomopaula-universidad-de-los-andes[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
25,1.8927
50,1.5807
75,1.53
100,1.4706
125,1.4659
150,1.4896
175,1.4712
200,1.4721
225,1.4378
250,1.4923


Fine-tuning completado.
Adaptador LoRA y tokenizador guardados en: /content/drive/MyDrive/UniAndes/MAIA-202411/3. modelos-avanzados-para-el-procesamiento-de-lenguaje-natural/W7/maia-pln-2025/decoder//final_adapter


In [None]:
def generate_answer(model, tokenizer, example, max_new_tokens=5):
    """Pick the option ID ("0".."3") the model scores highest for one example.

    No text is generated: the model runs once on the prompt and only the
    next-token logits of the four candidate IDs are compared.

    Args:
        model: Causal language model; called as ``model(**inputs)`` and expected
            to expose ``.device``, ``.eval()`` and an output with ``.logits``.
        tokenizer: Tokenizer used for both the prompt and the candidate IDs.
        example: Mapping with the keys 'context', 'question' and 'options'.
        max_new_tokens: Accepted for interface compatibility; not used.

    Returns:
        The chosen option ID as a string.
    """
    valid_choices = ["0", "1", "2", "3"]

    # First token id of every candidate answer.
    choice_token_ids = []
    for etiqueta in valid_choices:
        ids_etiqueta = tokenizer(etiqueta, add_special_tokens=False)["input_ids"]
        choice_token_ids.append(ids_etiqueta[0])

    # One "index: text" line per option.
    lineas_opciones = [
        f"{idx}: {texto}" for idx, texto in enumerate(example['options'])
    ]
    bloque_opciones = "\n".join(lineas_opciones).strip()

    plantilla = (
        "Context: {context}\n\n"
        "Question: {question}\n\n"
        "Options:\n{options}\n\n"
        "Based on the context and the question, what is the ID of the correct option?\n"
        "Answer:"
    )
    prompt = plantilla.format(
        context=example['context'],
        question=example['question'],
        options=bloque_opciones,
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    model.eval()
    with torch.no_grad():
        # Scores for the token that would follow "Answer:".
        next_token_logits = model(**inputs).logits[:, -1, :]
        # Keep only the candidate IDs and normalise over them.
        candidate_probs = torch.softmax(next_token_logits[:, choice_token_ids], dim=-1)
        mejor = torch.argmax(candidate_probs, dim=-1).item()
    return valid_choices[mejor]

In [None]:
# Sanity check: run generate_answer on one test example and confirm the reply has
# the expected format (a number between 0 and 3).
indice_ejemplo = 2
ejemplo_prueba = test_dataset_formatted[indice_ejemplo]
generated_answer_output = generate_answer(model, tokenizer, ejemplo_prueba)
# The printed index now matches the example that is actually used (it used to say [0]).
print(f"Generated answer for test_dataset_formatted[{indice_ejemplo}]: {generated_answer_output}")
print(f"Expected answer for test_dataset_formatted[{indice_ejemplo}]: {ejemplo_prueba['id_answer']}")

Generated answer for test_dataset_formatted[0]: 2
Expected answer for test_dataset_formatted[0]: 2


In [None]:
def evaluate_model(model, tokenizer, dataset):
  """Score the model on a dataset and return its accuracy.

  Args:
    model: Model handed to generate_answer.
    tokenizer: Tokenizer handed to generate_answer.
    dataset: Iterable of examples; each needs an 'id_answer' entry plus the
      fields generate_answer reads.

  Returns:
    A tuple (accuracy, predictions, references), where predictions and
    references are lists of strings in dataset order.
  """
  # One (prediction, reference) pair per example; the reference is cast to a
  # string so it is comparable with the predicted ID.
  resultados = [
      (generate_answer(model, tokenizer, ejemplo), str(ejemplo['id_answer']))
      for ejemplo in dataset
  ]
  predictions = [prediccion for prediccion, _ in resultados]
  references = [referencia for _, referencia in resultados]

  return accuracy_score(references, predictions), predictions, references

# Move the model to the GPU when one is available
model.to(device)

print("Evaluando en test_dataset_formatted...")
resultado_test = evaluate_model(model, tokenizer, test_dataset_formatted)
accuracy_test, predictions_test, references_test = resultado_test
print(f"Precisión en test_dataset_formatted: {accuracy_test:.4f}")

print("\nEvaluando en eval_dataset_formatted...")
resultado_eval = evaluate_model(model, tokenizer, eval_dataset_formatted)
accuracy_eval, predictions_eval, references_eval = resultado_eval
print(f"Precisión en eval_dataset_formatted: {accuracy_eval:.4f}")

# Optional: show the first few predictions next to their references for inspection
print("\nEjemplos de predicciones y referencias (test_dataset_formatted):")
for prediccion, referencia in zip(predictions_test[:5], references_test[:5]):
    print(f"Predicción: {prediccion}, Referencia: {referencia}")

print("\nEjemplos de predicciones y referencias (eval_dataset_formatted):")
for prediccion, referencia in zip(predictions_eval[:5], references_eval[:5]):
    print(f"Predicción: {prediccion}, Referencia: {referencia}")

Evaluando en test_dataset_formatted...
Precisión en test_dataset_formatted: 0.9988

Evaluando en eval_dataset_formatted...
Precisión en eval_dataset_formatted: 0.9968

Ejemplos de predicciones y referencias (test_dataset_formatted):
Predicción: 0, Referencia: 0
Predicción: 0, Referencia: 0
Predicción: 2, Referencia: 2
Predicción: 1, Referencia: 1
Predicción: 1, Referencia: 1

Ejemplos de predicciones y referencias (eval_dataset_formatted):
Predicción: 1, Referencia: 1
Predicción: 2, Referencia: 2
Predicción: 2, Referencia: 2
Predicción: 1, Referencia: 1
Predicción: 2, Referencia: 2
