# PubMed RCT - Entrenamiento con MLflow

**entrenar modelos** y registrar en **MLflow** (servidor en EC2).

- **MLflow**: tracking en `http://54.205.108.123:5000`


In [4]:
import sys

In [5]:
if "google.colab" in sys.modules:
    print("Colab detected")
else:
    print("Not Colab")

Colab detected


In [6]:
# 1. Instalar dependencias primero (Colab), luego configurar MLflow
import subprocess
import sys

packages = ['mlflow', 'transformers', 'torch', 'scikit-learn', 'accelerate']
for pkg in packages:
    try:
        __import__('sklearn' if pkg == 'scikit-learn' else pkg)
        print(f"{pkg} ya instalado")
    except ImportError:
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pkg, '-q'])
        print(f"{pkg} instalado")

import mlflow
MLFLOW_TRACKING_URI = "http://54.205.108.123:5000"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")
print("Listo.")

mlflow instalado
transformers ya instalado
torch ya instalado
scikit-learn ya instalado
accelerate ya instalado
MLflow tracking URI: http://54.205.108.123:5000
Listo.


In [7]:
# 2. Imports
import os
import random
from collections import defaultdict
from datetime import datetime

import torch
from torch.utils.data import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
)
from sklearn.metrics import (
    f1_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
)
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow

from datasets import load_dataset

print(f"Device: {'cuda' if torch.cuda.is_available() else 'cpu'}")

Device: cuda


In [8]:
# 3. Cargar dataset y mapeo de etiquetas
print("Cargando PubMed RCT...")
train_dataset = load_dataset("armanc/pubmed-rct20k", split="train")
val_dataset = load_dataset("armanc/pubmed-rct20k", split="validation")
test_dataset = load_dataset("armanc/pubmed-rct20k", split="test")

unique_labels = sorted(set(train_dataset['label']))
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}
num_labels = len(unique_labels)

print(f"Train: {len(train_dataset):,} | Val: {len(val_dataset):,} | Test: {len(test_dataset):,}")
print(f"Etiquetas: {list(label2id.keys())}")

Cargando PubMed RCT...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md:   0%|          | 0.00/646 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


dataset_infos.json: 0.00B [00:00, ?B/s]

train.jsonl:   0%|          | 0.00/40.7M [00:00<?, ?B/s]

dev.jsonl: 0.00B [00:00, ?B/s]

test.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/176642 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/29672 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/29578 [00:00<?, ? examples/s]

Repo card metadata block was not found. Setting CardData to empty.
Repo card metadata block was not found. Setting CardData to empty.


Train: 176,642 | Val: 29,672 | Test: 29,578
Etiquetas: ['background', 'conclusions', 'methods', 'objective', 'results']


In [9]:
# 4. Downsampling por clase (opcional)
def downsample_dataset_by_class(dataset, max_sentences_per_class=3, balance_to_min_class=False, seed=42):
    random.seed(seed)
    abstract_class_sentences = defaultdict(list)
    class_all_indices = defaultdict(list)
    for idx, ex in enumerate(dataset):
        key = (ex['abstract_id'], ex['label'])
        info = {'index': idx, 'sentence_id': ex.get('sentence_id', idx), 'text': ex.get('text', '')}
        abstract_class_sentences[key].append(info)
        class_all_indices[ex['label']].append(info)
    selected_indices = []
    if balance_to_min_class:
        min_size = min(len(v) for v in class_all_indices.values())
        for label, sentences in class_all_indices.items():
            selected = sentences if len(sentences) <= min_size else random.sample(sentences, min_size)
            selected_indices.extend([s['index'] for s in selected])
    else:
        for (_, _), sentences in abstract_class_sentences.items():
            if len(sentences) <= max_sentences_per_class:
                selected_indices.extend([s['index'] for s in sentences])
            else:
                selected_indices.extend([s['index'] for s in random.sample(sentences, max_sentences_per_class)])
    selected_indices.sort()
    return selected_indices

def get_downsampled_dataset(dataset, max_sentences_per_class=3, balance_to_min_class=False, seed=42):
    indices = downsample_dataset_by_class(dataset, max_sentences_per_class, balance_to_min_class, seed)
    return dataset.select(indices)

In [10]:
# 5. Crear dataset downsampled (para use_downsampling=True)
downsampled_train = get_downsampled_dataset(train_dataset, balance_to_min_class=True, seed=42)
print(f"Train completo: {len(train_dataset):,} | Downsampled: {len(downsampled_train):,}")

Train completo: 176,642 | Downsampled: 69,190


In [11]:
# 6. Dataset PyTorch para el Trainer
class SentenceLevelDataset(Dataset):
    def __init__(self, hf_dataset, tokenizer, max_length=128):
        self.dataset = hf_dataset
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        ex = self.dataset[idx]
        text = ex['text']
        label = ex['label']
        label_id = label2id[label] if isinstance(label, str) else label
        enc = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length, return_tensors='pt')
        return {
            'input_ids': enc['input_ids'].squeeze(),
            'attention_mask': enc['attention_mask'].squeeze(),
            'labels': torch.tensor(label_id, dtype=torch.long)
        }

In [12]:
# 7. M칠tricas para el Trainer
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    macro_f1 = f1_score(labels, preds, average='macro')
    micro_f1 = f1_score(labels, preds, average='micro')
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average=None, labels=range(num_labels))
    metrics = {'macro_f1': macro_f1, 'micro_f1': micro_f1}
    for idx, name in id2label.items():
        if idx < len(precision):
            metrics[f'precision_{name}'] = precision[idx]
            metrics[f'f1_{name}'] = f1[idx]
    return metrics

In [14]:
# 8. Funci칩n de entrenamiento con MLflow
def train_model(
    model_name: str,
    learning_rate: float = 2e-5,
    batch_size: int = 16,
    num_epochs: int = 3,
    max_length: int = 128,
    experiment_name: str = "microproyecto-entrega-Nata",
    use_downsampling: bool = False,
    training_dataset=None,
):
    mlflow.set_experiment(experiment_name)
    training_data = training_dataset if training_dataset is not None else (downsampled_train if use_downsampling else train_dataset)

    author = "Juan Pablo Perez"
    strategy = "downsampling" if use_downsampling else "baseline"
    model_short = model_name.split("/")[-1]
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_name = f"{author}-{strategy}-{model_short}-{timestamp}"

    with mlflow.start_run(run_name=run_name):
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("learning_rate", learning_rate)
        mlflow.log_param("batch_size", batch_size)
        mlflow.log_param("num_epochs", num_epochs)
        mlflow.log_param("max_length", max_length)
        mlflow.log_param("use_downsampling", use_downsampling)
        mlflow.log_param("train_size", len(training_data))

        print(f"Training: {model_name} | samples: {len(training_data):,}")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, id2label=id2label, label2id=label2id)
        train_ds = SentenceLevelDataset(training_data, tokenizer, max_length)
        val_ds = SentenceLevelDataset(val_dataset, tokenizer, max_length)
        test_ds = SentenceLevelDataset(test_dataset, tokenizer, max_length)

        output_dir = f"./results/{model_name.replace('/', '_')}"
        args = TrainingArguments(
            output_dir=output_dir,
            eval_strategy="epoch",
            save_strategy="epoch",
            learning_rate=learning_rate,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            num_train_epochs=num_epochs,
            weight_decay=0.01,
            load_best_model_at_end=True,
            metric_for_best_model="macro_f1",
            greater_is_better=True,
            push_to_hub=False,
            logging_steps=100,
            report_to=["none"],
            save_total_limit=2,
        )
        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            compute_metrics=compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
        )
        train_result = trainer.train()
        mlflow.log_metrics({"train_loss": train_result.training_loss, "train_runtime": train_result.metrics["train_runtime"], "train_samples_per_second": train_result.metrics["train_samples_per_second"]})

        val_metrics = trainer.evaluate()
        for k, v in val_metrics.items():
            if k.startswith("eval_"):
                mlflow.log_metric(f"val_{k[5:]}", v)
        test_metrics = trainer.evaluate(test_ds)
        for k, v in test_metrics.items():
            if k.startswith("eval_"):
                mlflow.log_metric(f"test_{k[5:]}", v)

        preds = trainer.predict(test_ds)
        y_pred = preds.predictions.argmax(axis=-1)
        y_true = preds.label_ids
        report = classification_report(y_true, y_pred, target_names=[id2label[i] for i in range(num_labels)], digits=4)
        print("\n" + report)
        os.makedirs(output_dir, exist_ok=True)
        report_path = f"{output_dir}/classification_report.txt"
        with open(report_path, 'w') as f:
            f.write(report)
        mlflow.log_artifact(report_path)

        cm = confusion_matrix(y_true, y_pred)
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=[id2label[i] for i in range(num_labels)], yticklabels=[id2label[i] for i in range(num_labels)], ax=ax)
        ax.set_xlabel('Predicted')
        ax.set_ylabel('True')
        ax.set_title(f'Confusion Matrix - {model_name}')
        plt.tight_layout()
        cm_path = f"{output_dir}/confusion_matrix.png"
        plt.savefig(cm_path, dpi=150, bbox_inches='tight')
        mlflow.log_artifact(cm_path)
        plt.close()

        trainer.save_model(f"{output_dir}/model")
        mlflow.log_artifact(f"{output_dir}/model")
        print(f"Test macro_f1: {test_metrics['eval_macro_f1']:.4f} | micro_f1: {test_metrics['eval_micro_f1']:.4f}")
        return trainer, test_metrics

## Entrenar modelo

Cada ejecuci칩n se registra en MLflow en **http://54.205.108.123:5000**.

In [18]:
# Solo cambia MODEL_NAME para probar otro modelo (no hace falta tocar el procesamiento)
MODEL_NAME = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"  # PubMedBERT: entrenado en PubMed
# MODEL_NAME = "allenai/scibert_scivocab_uncased"   # SciBERT
# MODEL_NAME = "bert-base-uncased"                  # BERT general (m치s ligero)

trainer, test_metrics = train_model(
    model_name=MODEL_NAME,
    learning_rate=2e-5,
    batch_size=32,
    num_epochs=3,
    max_length=128,
    experiment_name="microproyecto-entrega-Nata",
    use_downsampling=False,
)
# use_downsampling=True  -> dataset balanceado (m치s r치pido). use_downsampling=False -> baseline

Training: microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext | samples: 176,642


Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.decoder.bias               | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- 

Epoch,Training Loss,Validation Loss,Macro F1,Micro F1,Precision Background,F1 Background,Precision Conclusions,F1 Conclusions,Precision Methods,F1 Methods,Precision Objective,F1 Objective,Precision Results,F1 Results
1,0.35588,0.321186,0.821925,0.881976,0.627733,0.716383,0.889412,0.855978,0.953049,0.947847,0.786674,0.658674,0.924079,0.930744
2,0.288349,0.324605,0.823513,0.883998,0.626012,0.7195,0.888863,0.851359,0.940323,0.952315,0.802758,0.662216,0.942092,0.932176
3,0.234841,0.332446,0.830898,0.888144,0.661999,0.727833,0.880452,0.865105,0.9459,0.951853,0.78101,0.677942,0.934663,0.931757


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.weight', 'bert.encoder.layer.2.attention.output.LayerNorm.bias', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.2.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.output.LayerNorm.weight', 'bert.encoder.layer.3.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.output.LayerNorm.weight', 'bert.encoder.layer.3.output.LayerNorm.bias', 'bert.encoder.layer.4.attention.output.La


              precision    recall  f1-score   support

  background     0.6662    0.7946    0.7248      3077
 conclusions     0.8696    0.8416    0.8554      4571
     methods     0.9359    0.9565    0.9461      9884
   objective     0.7585    0.5911    0.6644      2333
     results     0.9313    0.9171    0.9242      9713

    accuracy                         0.8801     29578
   macro avg     0.8323    0.8202    0.8230     29578
weighted avg     0.8821    0.8801    0.8796     29578



Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Test macro_f1: 0.8230 | micro_f1: 0.8801
游끢 View run Juan Pablo Perez-baseline-BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext-20260222_020729 at: http://54.205.108.123:5000/#/experiments/989624885271817029/runs/efb1d2ccd421405a9cb7531d2d9e7c41
游빍 View experiment at: http://54.205.108.123:5000/#/experiments/989624885271817029
