In [1]:
!pip install transformers torch accelerate bitsandbytes peft trl datasets sentence_transformers faiss-gpu-cu11==1.10.0 numpy pandas

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting faiss-gpu-cu11==1.10.0
  Downloading faiss_gpu_cu11-1.10.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting numpy
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-runtime-cu11>=11.8.89 (from faiss-gpu-cu11==1.10.0)
  Downloading nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cublas-cu11>=11.11.3.6 (from faiss-gpu-cu11==1.10.0)
  Downloading nvidia_cublas_cu11-11.11.3.6-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_

In [1]:
from huggingface_hub import login
from google.colab import userdata

# Read the Hugging Face access token from the Colab secrets store (`userdata`).
HF_TOKEN = userdata.get('HF_TOKEN')

# Reject a missing (None) as well as an empty token before contacting the Hub.
# The token comes from Colab `userdata`, not from an environment variable, so
# the error message names the right place to fix it.
if not HF_TOKEN:
    raise ValueError("HF_TOKEN secret not set in Colab userdata.")

# Authenticate this session against the Hugging Face Hub.
login(token=HF_TOKEN)

# Cargando el decoder tuneado

In [2]:
import torch
import numpy  as np
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Load the fine-tuned decoder: base Llama model + tokenizer + LoRA adapter.

# ID of the BASE Llama model
base_model_id = "meta-llama/Llama-3.2-1B"
# Directory that holds both the LoRA adapter and the tokenizer saved with it.
adapter_and_tokenizer_path = "/content/drive/MyDrive/UniAndes/MAIA-202411/3. modelos-avanzados-para-el-procesamiento-de-lenguaje-natural/W7/maia-pln-2025/decoder/final_adapter"

# Determine the device. In this cell `device` is only printed; the actual
# placement of the model is delegated to `device_map="auto"` below.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Usando dispositivo: {device}")


# NOTE(review): `trust_remote_code=True` allows code shipped with the model
# repository to run locally — confirm it is really needed for this model.
model_base = AutoModelForCausalLM.from_pretrained(
    base_model_id,
    device_map="auto",
    trust_remote_code=True
)

print("Modelo base cargado.")

# --- Load the tokenizer ---
# The tokenizer was saved together with the adapter.
print(f"Cargando el tokenizador desde: {adapter_and_tokenizer_path}...")
tokenizer = AutoTokenizer.from_pretrained(adapter_and_tokenizer_path, trust_remote_code=True)
# When the tokenizer defines no padding token, reuse EOS for padding and mirror
# that choice in the model config.
# NOTE(review): the config copies its own eos_token_id rather than
# tokenizer.pad_token_id — confirm the two IDs agree.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model_base.config.pad_token_id = model_base.config.eos_token_id
print("Tokenizador cargado.")

# --- Load and apply the LoRA adapter ---
print(f"Cargando y aplicando el adaptador LoRA desde: {adapter_and_tokenizer_path}...")
model_tuned = PeftModel.from_pretrained(model_base, adapter_and_tokenizer_path)
print("Adaptador LoRA cargado y aplicado al modelo base.")

# --- Prepare the model for inference ---
# Switch to evaluation mode before running predictions.
model_tuned.eval()
print("Modelo fine-tuneado listo para inferencia.")


Usando dispositivo: cuda


config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

Modelo base cargado.
Cargando el tokenizador desde: /content/drive/MyDrive/UniAndes/MAIA-202411/3. modelos-avanzados-para-el-procesamiento-de-lenguaje-natural/W7/maia-pln-2025/decoder/final_adapter...
Tokenizador cargado.
Cargando y aplicando el adaptador LoRA desde: /content/drive/MyDrive/UniAndes/MAIA-202411/3. modelos-avanzados-para-el-procesamiento-de-lenguaje-natural/W7/maia-pln-2025/decoder/final_adapter...
Adaptador LoRA cargado y aplicado al modelo base.
Modelo fine-tuneado listo para inferencia.


# Cargando el encoder construido

In [3]:
def load_json_to_dataset(file_path):
  """Load a JSON file and wrap its contents in a `Dataset`.

  The file is parsed with `json.load` and the parsed object is handed to
  `Dataset.from_dict`.

  Args:
    file_path: Path of the JSON file to read.

  Returns:
    The `Dataset` built from the file, or None (after printing an error
    message) when the file does not exist.
  """
  try:
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
      data = json.load(f)
  except FileNotFoundError:
    print(f"Error: File not found at {file_path}")
    return None

  return Dataset.from_dict(data)

# Path of the PubMed corpus JSON file.
file_path = '/content/drive/MyDrive/UniAndes/MAIA-202411/3. modelos-avanzados-para-el-procesamiento-de-lenguaje-natural/W7/maia-pln-2025/pubmed_500K.json'
pubmed_dataset = load_json_to_dataset(file_path)
# load_json_to_dataset returns None when the file is missing, so only preview
# the first record when the dataset is truthy.
if pubmed_dataset:
  print(pubmed_dataset[0])

{'id': 'pubmed23n0001_0', 'title': "[Biochemical studies on camomile components/III. In vitro studies about the antipeptic activity of (--)-alpha-bisabolol (author's transl)].", 'content': '(--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic action of bisabolol only occurs in case of direct contact. In case of a previous contact with the substrate, the inhibiting effect is lost.', 'contents': "[Biochemical studies on camomile components/III. In vitro studies about the antipeptic activity of (--)-alpha-bisabolol (author's transl)]. (--)-alpha-Bisabolol has a primary antipeptic action depending on dosage, which is not caused by an alteration of the pH-value. The proteolytic activity of pepsin is reduced by 50 percent through addition of bisabolol in the ratio of 1/0.5. The antipeptic act

In [4]:
import random
import faiss
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
from peft import PeftModel
import re # Para parsear la salida del LLM

# Configuration for the retrieval side of the pipeline.
ENCODER_MODEL_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
FAISS_INDEX_PATH = "/content/drive/MyDrive/UniAndes/MAIA-202411/3. modelos-avanzados-para-el-procesamiento-de-lenguaje-natural/W7/maia-pln-2025/encoder/pubmed_faiss_pubmedbert_20250516_020433.index" # The FAISS index you created
PUBMED_CORPUS_FILE_PATH = '/content/drive/MyDrive/UniAndes/MAIA-202411/3. modelos-avanzados-para-el-procesamiento-de-lenguaje-natural/W7/maia-pln-2025/pubmed_500K.json' # Used to load the original texts

# Test questions file
TEST_QUESTIONS_FILE_PATH = '/content/drive/MyDrive/UniAndes/MAIA-202411/3. modelos-avanzados-para-el-procesamiento-de-lenguaje-natural/W7/maia-pln-2025/pubmed_QA_test_questions.json'

# Other parameters
TOP_K_RETRIEVAL = 5 # Number of documents to retrieve
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Usando dispositivo general: {DEVICE}")


# --- 1. LOAD THE ENCODER (RETRIEVAL) COMPONENTS ---
print("\n--- Cargando Componentes del Encoder (Retrieval) ---")

# Sentence encoder used to embed queries, placed on DEVICE.
encoder_model = SentenceTransformer(ENCODER_MODEL_NAME, device=DEVICE)
print(f"Encoder '{ENCODER_MODEL_NAME}' cargado.")

# Read the FAISS index from disk and report how many vectors it holds.
faiss_index = faiss.read_index(FAISS_INDEX_PATH)
print(f"Índice FAISS cargado desde '{FAISS_INDEX_PATH}'. Total de vectores: {faiss_index.ntotal}")

# Load the original corpus texts. They are later looked up by the row numbers
# that the FAISS index returns, so their order matters.

# Fail early with a clear message: load_json_to_dataset returns None when the
# corpus file is missing, and subscripting None below would otherwise raise an
# opaque TypeError.
if pubmed_dataset is None:
    raise ValueError(f"El corpus no fue cargado. Verifica el procesamiento de {PUBMED_CORPUS_FILE_PATH}.")

# Extract the texts and the IDs (IDs are normalised to strings).
texts_list = pubmed_dataset['contents']
doc_ids_list = [str(pmid) for pmid in pubmed_dataset['PMID']]

# Filter out texts that are None, empty, or whitespace-only.
# NOTE(review): dropping entries shifts the positions of later documents; the
# length check below only catches a count mismatch with the index — confirm
# the index was built from the same filtered list.
filtered_texts = []
filtered_doc_ids = []

for text, doc_id in zip(texts_list, doc_ids_list):
    if text and text.strip():
        filtered_texts.append(text)
        filtered_doc_ids.append(doc_id)
    else:
        print(f"Advertencia: Documento con ID {doc_id} tiene campo 'contents' vacío o nulo. Será omitido.")

all_doc_texts = filtered_texts
all_doc_ids_corpus = filtered_doc_ids

# The corpus must be non-empty and contain exactly one text per index vector.
if not all_doc_texts or len(all_doc_texts) != faiss_index.ntotal:
    raise ValueError(f"El número de textos cargados ({len(all_doc_texts)}) no coincide con el total del índice FAISS ({faiss_index.ntotal}). Verifica el procesamiento de {PUBMED_CORPUS_FILE_PATH}.")
print(f"Textos del corpus ({len(all_doc_texts)} documentos) cargados y listos.")


Usando dispositivo general: cuda

--- Cargando Componentes del Encoder (Retrieval) ---




config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

Encoder 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext' cargado.
Índice FAISS cargado desde '/content/drive/MyDrive/UniAndes/MAIA-202411/3. modelos-avanzados-para-el-procesamiento-de-lenguaje-natural/W7/maia-pln-2025/encoder/pubmed_faiss_pubmedbert_20250516_020433.index'. Total de vectores: 500000
Textos del corpus (500000 documentos) cargados y listos.


# Construcción del sistema de búsqueda de información RAG

In [12]:
def retrieve_relevant_documents(query_text: str) -> str:
  """Retrieve the text of the single most relevant document for a query.

  The query is embedded with `encoder_model`, the FAISS index is searched for
  the nearest neighbours, and the text of the top-ranked hit is looked up in
  `all_doc_texts`.

  Args:
    query_text: The question or query to search for.

  Returns:
    The text of the best-matching document, or an empty string (after printing
    a warning) when the top hit is not a valid position in `all_doc_texts`.
    The result is always a `str`, so it can be interpolated into the prompt and
    tested for truthiness by the caller.
  """
  INTERNAL_SEARCH_K = 5  # Always search for the 5 best hits internally
  query_embedding = encoder_model.encode([query_text], convert_to_numpy=True)
  _, faiss_indices = faiss_index.search(query_embedding, INTERNAL_SEARCH_K)

  # Only the top-ranked hit is used as context.
  best_doc_original_index = int(faiss_indices[0][0])

  # The range check also rejects negative positions.
  if 0 <= best_doc_original_index < len(all_doc_texts):
    return all_doc_texts[best_doc_original_index]

  print(f"Advertencia: No se encontraron documentos relevantes para la consulta '{query_text}'.")
  return ""


In [13]:
def format_inference_prompt(question: str, options: list[str], retrieved_context) -> str:
  """Assemble the multiple-choice prompt given to the language model.

  The prompt lists the retrieved context, the question, the options numbered
  from 0 (one per line), and ends with an "Answer:" cue.

  Args:
    question: The question text.
    options: The candidate answers, numbered by their position in the list.
    retrieved_context: The context to show; it is interpolated as-is.

  Returns:
    The full prompt string.
  """
  # One "<index>: <text>" line per option; surrounding whitespace of the whole
  # listing is stripped before it is embedded in the prompt.
  numbered_options = "".join(
    f"{position}: {option_text}\n" for position, option_text in enumerate(options)
  )

  sections = [
    f"Context: {retrieved_context}\n\n",
    f"Question: {question}\n\n",
    f"Options:\n{numbered_options.strip()}\n\n",
    "Based on the context and the question, what is the ID of the correct option?\n",
    "Answer:",
  ]
  return "".join(sections)

def get_predicted_option_id(
    question_text: str,
    option_texts: list[str],
    retrieved_contexts: "str | list[str]"
) -> str:
    """Predict the ID of the correct option for a multiple-choice question.

    The prompt is built with `format_inference_prompt`, the model is run once,
    and the next-token logits at the last position are compared only across the
    tokens of the candidate IDs. The candidate with the highest logit wins.

    Args:
        question_text: The question to answer.
        option_texts: The candidate answers; their positions ("0", "1", ...)
            are the IDs that can be predicted.
        retrieved_contexts: The retrieved context, passed through unchanged to
            `format_inference_prompt`.

    Returns:
        The predicted option ID as a string, always one of the positions of
        `option_texts`.

    Raises:
        ValueError: If `option_texts` is empty.
    """
    if not option_texts:
        raise ValueError("option_texts must contain at least one option.")

    inference_prompt = format_inference_prompt(question_text, option_texts, retrieved_contexts)

    # One candidate ID per option actually offered, matching the 0-based
    # numbering used in the prompt, so a nonexistent option is never predicted.
    # Only the first token of each ID string is compared.
    # NOTE(review): for 10+ options, confirm multi-digit IDs map to distinct
    # first tokens with this tokenizer.
    valid_choices = [str(i) for i in range(len(option_texts))]
    choice_token_ids = [tokenizer(x, add_special_tokens=False)["input_ids"][0] for x in valid_choices]

    inputs = tokenizer(inference_prompt, return_tensors="pt").to(model_tuned.device)

    model_tuned.eval()
    with torch.no_grad():
        outputs = model_tuned(**inputs)
        # Logits for the token that would follow the prompt.
        next_token_logits = outputs.logits[:, -1, :]
        choice_logits = next_token_logits[:, choice_token_ids]
        # Softmax is monotonic, so the argmax over raw logits is the same pick.
        pred_index = torch.argmax(choice_logits, dim=-1).item()

    return valid_choices[pred_index]


In [14]:
# Load the test questions from pubmed_QA_test_questions.json. The file is read
# as JSON Lines: one JSON object per line.
test_questions_data = []
try:
    with open(TEST_QUESTIONS_FILE_PATH, 'r', encoding='utf-8') as f_test:
        for line in f_test:
            line = line.strip()
            # Skip blank lines (e.g. a trailing newline); json.loads("") would fail.
            if not line:
                continue
            test_questions_data.append(json.loads(line))
except (OSError, ValueError) as e:
    # OSError covers missing/unreadable files; ValueError covers malformed JSON
    # and decoding errors. Re-raise instead of calling exit(), which would shut
    # down the notebook kernel and discard the loaded models.
    print(f"Error cargando el archivo de preguntas de prueba: {e}")
    raise

print(f"Cargadas {len(test_questions_data)} preguntas de prueba.")
# Preview the first record only when at least one question was loaded.
if test_questions_data:
    print(test_questions_data[0])


Cargadas 1000 preguntas de prueba.
{'id': 26, 'question': 'What were the findings regarding alpha-1-antitrypsin deficiency of genotype PiZ in an autopsy series of 238 individuals?', 'option': ['Alpha-1-antitrypsin deficiency of genotype PiZ was found in 30 cases, with no significant association with pulmonary emphysema.', 'In an autopsy series of 238 individuals, alpha-1-antitrypsin deficiency of genotype PiZ was identified in 15 cases, with a higher prevalence of pulmonary emphysema among heterozygous individuals.', 'In the autopsy series, alpha-1-antitrypsin deficiency of genotype PiZ was identified in 5 cases, all of whom were homozygous.', 'The study found that alpha-1-antitrypsin deficiency of genotype PiZ was present in 20 cases, with a higher prevalence of liver disease among heterozygous individuals.']}


In [16]:
# PROCESS THE TEST QUESTIONS
# For every test question: retrieve a context, ask the model for the option ID,
# and collect {"ID", "answer"} records.
print("\n--- Procesando Preguntas de Prueba ---")
results = []
for i, test_item in enumerate(test_questions_data):
    question_id = test_item.get("id")
    question_text = test_item.get("question")
    option_texts = test_item.get("option")  # List of 4 strings

    # Check the ID against None/"" explicitly: a numeric ID of 0 is falsy but
    # valid, and a plain truthiness test would silently drop that question.
    is_incomplete = question_id in (None, "") or not question_text or not option_texts
    if is_incomplete or len(option_texts) != 4:
        print(f"Saltando ítem de prueba inválido o incompleto (índice {i}): {test_item}")
        continue

    print(f"\nProcesando Pregunta ID: {question_id} ({i+1}/{len(test_questions_data)})")
    print(f"  Pregunta: {question_text}")

    # a. Retrieve the context
    retrieved_context = retrieve_relevant_documents(question_text)
    if not retrieved_context:
        # The prediction still runs below, just without a useful context.
        print(f"  No se recuperaron contextos para la pregunta ID: {question_id}. Llama podría no tener suficiente información.")

    # b. Get Llama's predicted answer ID
    predicted_id = get_predicted_option_id(question_text, option_texts, retrieved_context)
    print(f"  ID de respuesta predicho por Llama: {predicted_id}")

    results.append({
        "ID": question_id,
        "answer": predicted_id
    })

# --- 4. SHOW OR SAVE THE RESULTS ---
print("\n\n--- Resultados Finales ---")
for result in results:
    print(f"Pregunta ID: {result['ID']}, Predicción ID: {result['answer']}")

# Optional: save the results to a JSON file
results_output_path = "/content/drive/MyDrive/UniAndes/MAIA-202411/3. modelos-avanzados-para-el-procesamiento-de-lenguaje-natural/W7/maia-pln-2025/rag_test_predictions.json"
with open(results_output_path, 'w', encoding='utf-8') as f_out:
    json.dump(results, f_out, indent=2, ensure_ascii=False)
print(f"\nResultados completos guardados en: {results_output_path}")

print("\n--- Proceso Completado ---")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  ID de respuesta predicho por Llama: 3

Procesando Pregunta ID: 37 (3/1000)
  Pregunta: What effects do antipsychotic drugs have on rats with biochemically induced abnormal behavior, as shown by the administration of L-Dopa and pimozide?
  ID de respuesta predicho por Llama: 2

Procesando Pregunta ID: 70 (4/1000)
  Pregunta: What are the characteristics and behaviors of gonadotroph-rich cell lines derived from multipotential pituitary clonal cells (2A8) when implanted under the kidney capsule of hypophysectomized female rats?
  ID de respuesta predicho por Llama: 1

Procesando Pregunta ID: 109 (5/1000)
  Pregunta: What were the findings regarding urinary calcium excretion in healthy males with varying protein intake and calcium supplementation in a study on calcium metabolism?
  ID de respuesta predicho por Llama: 2

Procesando Pregunta ID: 182 (6/1000)
  Pregunta: What were the findings regarding the association of clin

In [17]:
import pandas as pd
# Save the collected predictions to CSV: one row per entry of `results`,
# written without the DataFrame index column.
results_csv_output_path = "/content/drive/MyDrive/UniAndes/MAIA-202411/3. modelos-avanzados-para-el-procesamiento-de-lenguaje-natural/W7/maia-pln-2025/rag_test_predictions.csv"
results_df = pd.DataFrame(results)
results_df.to_csv(results_csv_output_path, index=False)
print(f"\nResultados completos guardados en: {results_csv_output_path}")



Resultados completos guardados en: /content/drive/MyDrive/UniAndes/MAIA-202411/3. modelos-avanzados-para-el-procesamiento-de-lenguaje-natural/W7/maia-pln-2025/rag_test_predictions.csv
