In [3]:
import os
import gdown
import zipfile

# 1. Instalacion de librerias
os.system('pip install -q transformers gdown')

# 2. Descargar eLife Dataset (Test Set para Evaluacion de Generalizacion)
elife_id = '1WKW8BAqluOlXrpy1B9mV3j3CtAK3JdnE'
elife_url = f'https://drive.google.com/uc?id={elife_id}'
elife_zip = 'eLife_dataset.zip'

if not os.path.exists(elife_zip):
    print("[INFO] Descargando eLife Dataset...")
    gdown.download(elife_url, elife_zip, quiet=False)

extract_elife = "eLife_data"
if not os.path.exists(extract_elife):
    with zipfile.ZipFile(elife_zip, 'r') as zip_ref:
        zip_ref.extractall(extract_elife)

# 3. Descargar PLOS Dataset (Para Enriquecer el Entrenamiento)
plos_id = '1lZ6PCAtXvmGjRZyp3vQQCEgO_yerH62Q'
plos_url = f'https://drive.google.com/uc?id={plos_id}'
plos_zip = 'plos_dataset.zip'

if not os.path.exists(plos_zip):
    print("[INFO] Descargando PLOS Dataset...")
    gdown.download(plos_url, plos_zip, quiet=False)

extract_plos = "plos_data"
if not os.path.exists(extract_plos):
    with zipfile.ZipFile(plos_zip, 'r') as zip_ref:
        zip_ref.extractall(extract_plos)

print("[INFO] Descargas y extracciones completadas.")

[INFO] Descargas y extracciones completadas.


In [5]:
import os

# Limpieza previa por si se ejecuta varias veces
if os.path.exists("cochrane_data"):
    import shutil
    shutil.rmtree("cochrane_data")

os.makedirs("cochrane_data")
os.chdir("cochrane_data")

print("[INFO] Inicializando Git y descargando Cochrane...")
os.system("git init")
os.system("git remote add -f origin https://github.com/feliperussi/bridging-the-gap-in-health-literacy.git")
os.system("git config core.sparseCheckout true")

# Configuramos el sparse checkout
with open(".git/info/sparse-checkout", "a") as f:
    f.write("data_collection_and_processing/Data Sources/Cochrane/train/pls\n")
    f.write("data_collection_and_processing/Data Sources/Cochrane/train/non_pls\n")

os.system("git pull origin main")

# Volver al directorio raiz para no romper las rutas siguientes
os.chdir("..")
print("[INFO] Descarga de Cochrane completada.")

[INFO] Inicializando Git y descargando Cochrane...
[INFO] Descarga de Cochrane completada.


In [8]:
import os
import glob
import pandas as pd
import re

# 1. Asegurar que estamos en el directorio raiz /content
try:
    os.chdir("/content")
    print("[INFO] Directorio de trabajo restablecido a /content")
except:
    pass

print("[INFO] Buscando archivos de Cochrane...")

# 2. Definir rutas (Buscamos recursivamente por si acaso)
# Intentamos localizar d√≥nde quedaron los archivos descargados por git
search_path_pls = "**/data_collection_and_processing/Data Sources/Cochrane/train/pls/*.txt"
search_path_non = "**/data_collection_and_processing/Data Sources/Cochrane/train/non_pls/*.txt"

pls_files = sorted(glob.glob(search_path_pls, recursive=True))
non_pls_files = sorted(glob.glob(search_path_non, recursive=True))

# Filtro de seguridad (eliminar archivos 'accumulated' y duplicados)
pls_files = [f for f in pls_files if "accumulated" not in os.path.basename(f).lower()]
non_pls_files = [f for f in non_pls_files if "accumulated" not in os.path.basename(f).lower()]

if not pls_files:
    print("‚ùå [ERROR CRITICO] No se encontraron archivos .txt.")
    print("Verifica que la Celda 2 (Descarga de Git) se haya ejecutado sin errores.")
    print("Ruta actual:", os.getcwd())
    print("Contenido de carpeta actual:", os.listdir("."))
    if os.path.exists("cochrane_data"):
        print("Contenido de cochrane_data:", os.listdir("cochrane_data"))
else:
    print(f"‚úÖ Encontrados: {len(pls_files)} PLS y {len(non_pls_files)} Non-PLS")

    # 3. Emparejar archivos
    def extract_id(filename):
        base = os.path.basename(filename)
        match = re.match(r"(.+?)\.pub\d+", base)
        return match.group(1) if match else base.split(".txt")[0]

    pls_dict = {extract_id(p): p for p in pls_files}
    non_pls_dict = {extract_id(n): n for n in non_pls_files}

    data = []
    for base_id, non_path in non_pls_dict.items():
        if base_id in pls_dict:
            pls_path = pls_dict[base_id]
            try:
                with open(pls_path, "r", encoding="utf-8") as f1, open(non_path, "r", encoding="utf-8") as f2:
                    data.append({
                        "id": base_id,
                        "pls": f1.read().strip(),
                        "non_pls": f2.read().strip()
                    })
            except Exception as e:
                print(f"Error leyendo {base_id}")

    # 4. Guardar CSV
    if data:
        df = pd.DataFrame(data)
        output_path = "cochrane_pairs_clean.csv"
        df.to_csv(output_path, index=False, encoding="utf-8")
        print(f"\nüéâ [EXITO] Dataset creado con {len(df)} pares.")
        print(f"üìÇ Archivo guardado en: {os.path.abspath(output_path)}")
    else:
        print("‚ùå [ERROR] Se encontraron archivos pero no se pudieron emparejar.")

[INFO] Directorio de trabajo restablecido a /content
[INFO] Buscando archivos de Cochrane...
‚úÖ Encontrados: 4797 PLS y 7251 Non-PLS

üéâ [EXITO] Dataset creado con 3426 pares.
üìÇ Archivo guardado en: /content/cochrane_pairs_clean.csv


In [6]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
import numpy as np
import json

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"[INFO] Dispositivo: {device}")

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

class BertDataset(Dataset):
    def __init__(self, df, tokenizer, max_len=256):
        self.texts = df['text'].astype(str).values
        self.labels = df['label'].values
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = self.texts[item]
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

[INFO] Dispositivo: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [9]:
print("[INFO] Preparando datasets...")

# 1. Cargar Cochrane
if os.path.exists("cochrane_pairs_clean.csv"):
    df_raw = pd.read_csv("cochrane_pairs_clean.csv")

    data_cochrane = []
    for _, row in df_raw.iterrows():
        if pd.notna(row['pls']) and pd.notna(row['non_pls']):
            data_cochrane.append({'text': str(row['pls']), 'label': 1})
            data_cochrane.append({'text': str(row['non_pls']), 'label': 0})

    df_cochrane_full = pd.DataFrame(data_cochrane)

    # SPLIT CRITICO: 20% Reservado para Test
    df_cochrane_train, df_cochrane_test_holdout = train_test_split(
        df_cochrane_full, test_size=0.2, random_state=42, stratify=df_cochrane_full['label']
    )

    # Guardar Hold-out en disco
    df_cochrane_test_holdout.to_csv("cochrane_holdout_test.csv", index=False)
    print(f"[INFO] Cochrane Train: {len(df_cochrane_train)} muestras")
    print(f"[INFO] Cochrane Hold-out Test: {len(df_cochrane_test_holdout)} muestras")
else:
    raise FileNotFoundError("Falta cochrane_pairs_clean.csv")

# 2. Cargar PLOS Train
data_plos = []
plos_train_path = None
for root, dirs, files in os.walk("plos_data"):
    if "train.json" in files:
        plos_train_path = os.path.join(root, "train.json")
        break

if plos_train_path:
    with open(plos_train_path, 'r') as f:
        plos_json = json.load(f)

    # Muestra de 5000 pares de PLOS
    limit = 5000
    for item in plos_json[:limit]:
        pls = " ".join(item['summary']) if isinstance(item['summary'], list) else str(item['summary'])
        tech = " ".join(item['abstract']) if isinstance(item['abstract'], list) else str(item['abstract'])
        data_plos.append({'text': pls, 'label': 1})
        data_plos.append({'text': tech, 'label': 0})
    print(f"[INFO] PLOS Train agregado: {len(data_plos)} muestras")

# 3. Fusionar Cochrane Train + PLOS
df_final_train = pd.concat([df_cochrane_train, pd.DataFrame(data_plos)])
print(f"[RESUMEN] Total Entrenamiento: {len(df_final_train)} textos")

# Dataloaders
train_dataset = BertDataset(df_final_train, tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

[INFO] Preparando datasets...
[INFO] Cochrane Train: 5481 muestras
[INFO] Cochrane Hold-out Test: 1371 muestras
[INFO] PLOS Train agregado: 10000 muestras
[RESUMEN] Total Entrenamiento: 15481 textos


In [13]:
print("[INFO] Iniciando Entrenamiento...")
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)

epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs} - Loss Promedio: {total_loss/len(train_loader):.4f}")

print("[INFO] Modelo entrenado.")

[INFO] Iniciando Entrenamiento...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 - Loss Promedio: 0.3154
Epoch 2/5 - Loss Promedio: 0.2205
Epoch 3/5 - Loss Promedio: 0.1480
Epoch 4/5 - Loss Promedio: 0.0797
Epoch 5/5 - Loss Promedio: 0.0515
[INFO] Modelo entrenado.


In [14]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Cargar eLife para Calibracion
elife_path = None
for root, dirs, files in os.walk("eLife_data"):
    if "test.json" in files:
        elife_path = os.path.join(root, "test.json")
        break

best_thresh = 0.5

if elife_path:
    print(f"[INFO] Calibrando modelo con eLife (Dominio desconocido)...")
    df_elife = pd.read_json(elife_path)
    df_elife['pls'] = df_elife['summary'].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))
    df_elife['tech'] = df_elife['abstract'].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))

    probs, trues = [], []
    model.eval()

    with torch.no_grad():
        for _, row in df_elife.iterrows():
            # PLS
            inputs = tokenizer(row['pls'], return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
            probs.append(F.softmax(model(**inputs).logits, dim=1)[0][1].item())
            trues.append(1)
            # Tech
            inputs = tokenizer(row['tech'], return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
            probs.append(F.softmax(model(**inputs).logits, dim=1)[0][1].item())
            trues.append(0)

    # Buscar mejor umbral
    best_acc = 0
    for th in np.arange(0.5, 0.99, 0.01):
        preds = [1 if p > th else 0 for p in probs]
        acc = accuracy_score(trues, preds)
        if acc > best_acc:
            best_acc = acc
            best_thresh = th

    print(f"Resultados Calibracion eLife:")
    print(f"Mejor Umbral: {best_thresh:.2f}")
    print(f"Accuracy Maximo en eLife: {best_acc*100:.2f}%")
else:
    print("[ERROR] No se encontro test.json de eLife")

# 2. Evaluacion Final en Cochrane Hold-out
print("\n" + "="*40)
print("[INFO] EVALUACION FINAL EN COCHRANE (Datos Nunca Vistos)")
print("="*40)

if os.path.exists("cochrane_holdout_test.csv"):
    df_test_c = pd.read_csv("cochrane_holdout_test.csv")

    probs_c, trues_c = [], []
    model.eval()

    print(f"Procesando {len(df_test_c)} textos...")

    with torch.no_grad():
        for _, row in df_test_c.iterrows():
            text = str(row['text'])
            inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
            prob = F.softmax(model(**inputs).logits, dim=1)[0][1].item()
            probs_c.append(prob)
            trues_c.append(row['label'])

    # Aplicar el umbral calculado
    preds_c = [1 if p > best_thresh else 0 for p in probs_c]

    print(f"\nMETRICAS FINALES (Umbral {best_thresh:.2f})")
    print(classification_report(trues_c, preds_c, target_names=['Tecnico', 'PLS']))

    cm = confusion_matrix(trues_c, preds_c)
    print("Matriz de Confusion:")
    print(f"TN: {cm[0][0]} | FP: {cm[0][1]}")
    print(f"FN: {cm[1][0]} | TP: {cm[1][1]}")

else:
    print("[ERROR] No se encontro cochrane_holdout_test.csv")

[INFO] Calibrando modelo con eLife (Dominio desconocido)...
Resultados Calibracion eLife:
Mejor Umbral: 0.98
Accuracy Maximo en eLife: 87.14%

[INFO] EVALUACION FINAL EN COCHRANE (Datos Nunca Vistos)
Procesando 1371 textos...

METRICAS FINALES (Umbral 0.98)
              precision    recall  f1-score   support

     Tecnico       0.99      1.00      0.99       686
         PLS       1.00      0.99      0.99       685

    accuracy                           0.99      1371
   macro avg       0.99      0.99      0.99      1371
weighted avg       0.99      0.99      0.99      1371

Matriz de Confusion:
TN: 683 | FP: 3
FN: 5 | TP: 680


In [15]:
print("[INFO] Evaluando en PLOS Test Set (Datos Cl√≠nicos)...")

# 1. Buscar el archivo test.json de PLOS
plos_test_path = None
for root, dirs, files in os.walk("plos_data"):
    if "test.json" in files:
        plos_test_path = os.path.join(root, "test.json")
        break

if plos_test_path:
    df_plos_test = pd.read_json(plos_test_path)

    # Limpieza de texto
    df_plos_test['pls'] = df_plos_test['summary'].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))
    df_plos_test['tech'] = df_plos_test['abstract'].apply(lambda x: " ".join(x) if isinstance(x, list) else str(x))

    print(f"[INFO] Procesando {len(df_plos_test)*2} textos de PLOS...")

    probs_p, trues_p = [], []
    model.eval()

    with torch.no_grad():
        for _, row in df_plos_test.iterrows():
            # PLS
            inputs = tokenizer(row['pls'], return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
            probs_p.append(F.softmax(model(**inputs).logits, dim=1)[0][1].item())
            trues_p.append(1)

            # T√©cnico
            inputs = tokenizer(row['tech'], return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
            probs_p.append(F.softmax(model(**inputs).logits, dim=1)[0][1].item())
            trues_p.append(0)

    # 2. Evaluaci√≥n con el Umbral Calibrado en eLife
    # Si no existe la variable, usamos 0.5 (est√°ndar) o 0.98 (el que suele salir alto)
    thresh_final = best_thresh if 'best_thresh' in globals() else 0.98

    preds_p = [1 if p > thresh_final else 0 for p in probs_p]

    print(f"\nRESULTADOS PLOS TEST (Umbral {thresh_final:.2f})")
    print(classification_report(trues_p, preds_p, target_names=['T√©cnico', 'PLS']))

    cm = confusion_matrix(trues_p, preds_p)
    print(f"Matriz de Confusi√≥n:\nTN: {cm[0][0]} | FP: {cm[0][1]}\nFN: {cm[1][0]} | TP: {cm[1][1]}")

    # 3. (Opcional) Ver si PLOS prefiere su propio umbral
    best_acc_plos = 0
    best_th_plos = 0
    for th in np.arange(0.5, 0.99, 0.01):
        ps = [1 if p > th else 0 for p in probs_p]
        if accuracy_score(trues_p, ps) > best_acc_plos: best_acc_plos, best_th_plos = accuracy_score(trues_p, ps), th

    print(f"\n[DATO] El umbral ideal espec√≠fico para PLOS hubiera sido: {best_th_plos:.2f} (Acc: {best_acc_plos*100:.2f}%)")

else:
    print("[ERROR] No se encontr√≥ test.json en la carpeta plos_data")

[INFO] Evaluando en PLOS Test Set (Datos Cl√≠nicos)...
[INFO] Procesando 2752 textos de PLOS...

RESULTADOS PLOS TEST (Umbral 0.98)
              precision    recall  f1-score   support

     T√©cnico       0.76      0.92      0.83      1376
         PLS       0.90      0.71      0.79      1376

    accuracy                           0.82      2752
   macro avg       0.83      0.82      0.81      2752
weighted avg       0.83      0.82      0.81      2752

Matriz de Confusi√≥n:
TN: 1267 | FP: 109
FN: 400 | TP: 976

[DATO] El umbral ideal espec√≠fico para PLOS hubiera sido: 0.82 (Acc: 83.68%)


In [16]:
import json
import torch
import shutil
import os

# 1. Configurar directorio de salida
output_dir = "modelo_final_exportado"
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir)

print(f"[INFO] Guardando artefactos en: {output_dir}")

# 2. Guardar Modelo y Tokenizer (Formato Hugging Face)
# Es importante guardarlo mapeado a CPU para m√°xima compatibilidad en inferencia
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.cpu().save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# 3. Guardar Metadatos Cr√≠ticos (El Umbral)
# Si la variable 'best_thresh' no existe, usamos 0.98 por defecto basado en tus resultados
umbral_final = best_thresh if 'best_thresh' in globals() else 0.98

config_inferencia = {
    "threshold": umbral_final,
    "model_type": "distilbert-base-uncased",
    "labels": {"0": "Tecnico", "1": "PLS"}
}

with open(os.path.join(output_dir, "inference_config.json"), "w") as f:
    json.dump(config_inferencia, f)

print(f"[EXITO] Modelo guardado.")
print(f"Umbral registrado: {umbral_final}")

[INFO] Guardando artefactos en: modelo_final_exportado
[EXITO] Modelo guardado.
Umbral registrado: 0.9800000000000004


In [17]:
# Comprimir carpeta
shutil.make_archive("clasificador_pls_distilbert", 'zip', output_dir)

print("[INFO] Archivo listo: clasificador_pls_distilbert.zip")

# Si estas en Google Colab, esto inicia la descarga automatica
try:
    from google.colab import files
    files.download("clasificador_pls_distilbert.zip")
except ImportError:
    print("Descarga el archivo 'clasificador_pls_distilbert.zip' manualmente desde el explorador de archivos.")

[INFO] Archivo listo: clasificador_pls_distilbert.zip


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>