L4


In [None]:
# 1. Create the working folder and move into it
!mkdir cochrane_data
%cd cochrane_data

# 2. Initialise an empty git repository
!git init

# 3. Add the remote; -f fetches it immediately
#    (the recorded run received 72074 objects, 315.90 MiB, at this step)
!git remote add -f origin https://github.com/feliperussi/bridging-the-gap-in-health-literacy.git

# 4. Turn on sparse-checkout mode
!git config core.sparseCheckout true

# 5. List ONLY the folders we want in the working tree
!echo "data_collection_and_processing/Data Sources/Cochrane/train/pls" >> .git/info/sparse-checkout
!echo "data_collection_and_processing/Data Sources/Cochrane/train/non_pls" >> .git/info/sparse-checkout

# 6. Pull the main branch with the sparse-checkout paths above in effect
!git pull origin main


/content/cochrane_data
[33mhint: Using 'master' as the name for the initial branch. This default branch name[m
[33mhint: is subject to change. To configure the initial branch name to use in all[m
[33mhint: [m
[33mhint: 	git config --global init.defaultBranch <name>[m
[33mhint: [m
[33mhint: Names commonly chosen instead of 'master' are 'main', 'trunk' and[m
[33mhint: 'development'. The just-created branch can be renamed via this command:[m
[33mhint: [m
[33mhint: 	git branch -m <name>[m
Initialized empty Git repository in /content/cochrane_data/.git/
Updating origin
remote: Enumerating objects: 72074, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 72074 (delta 0), reused 2 (delta 0), pack-reused 72071 (from 2)[K
Receiving objects: 100% (72074/72074), 315.90 MiB | 26.10 MiB/s, done.
Resolving deltas: 100% (2991/2991), done.
From https://github.com/feliperussi/bridging-the-gap-in-health-literacy
 * 

In [None]:
import os
from getpass import getpass

# Do not store the token in the notebook: reuse HF_TOKEN when the environment
# already provides it, otherwise ask for it interactively (input is not echoed).
if not os.environ.get("HF_TOKEN"):
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

In [None]:
from huggingface_hub import login
# Log in to the Hugging Face Hub with the token stored in the HF_TOKEN environment variable.
login(token=os.environ["HF_TOKEN"])

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [None]:
import glob, pandas as pd
import re

# Maximum number of PLS / non-PLS pairs to keep
pares = 200

# === 1. List the candidate files, sorted so the order is deterministic ===
pls_files = glob.glob("data_collection_and_processing/Data Sources/Cochrane/train/pls/*.txt")
pls_files.sort()
non_pls_files = glob.glob("data_collection_and_processing/Data Sources/Cochrane/train/non_pls/*.txt")
non_pls_files.sort()

# === 2. Drop, in BOTH lists, every file whose name contains 'accumulated' ===
pls_files = list(
    filter(lambda path: "accumulated" not in os.path.basename(path).lower(), pls_files)
)
non_pls_files = list(
    filter(lambda path: "accumulated" not in os.path.basename(path).lower(), non_pls_files)
)

print(f"📚 Total PLS (sin accumulated): {len(pls_files)}")
print(f"📂 Total non-PLS (sin accumulated): {len(non_pls_files)}")

# === 3️⃣ Función para extraer el ID base ===
def extract_id(filename):
    """Return the base review ID encoded in a file name.

    The directory part is ignored. When the name contains a ``.pub<digits>``
    marker, everything before the first such marker is returned; otherwise
    everything before the first ``.txt`` is returned.
    """
    name = os.path.basename(filename)
    found = re.match(r"(.+?)\.pub\d+", name)
    if found is None:
        # No publication-version marker: fall back to the name up to ".txt".
        return name.partition(".txt")[0]
    return found.group(1)

# === 4. Lookup tables {base_id: path} ===
# NOTE(review): if several files share a base ID, only the last path (in sorted
# order) is kept for that ID — confirm this is the intended file for each review.
pls_dict = dict((extract_id(path), path) for path in pls_files)
non_pls_dict = dict((extract_id(path), path) for path in non_pls_files)

# === 5. Pair each non-PLS file with the PLS file that has the same base ID ===
data = []
for base_id, non_path in non_pls_dict.items():
    pls_path = pls_dict.get(base_id)
    if pls_path is not None:
        with open(pls_path, "r", encoding="utf-8") as pls_handle, \
                open(non_path, "r", encoding="utf-8") as non_pls_handle:
            pls_text = pls_handle.read().strip()
            non_pls_text = non_pls_handle.read().strip()
        data.append({
            "id": base_id,
            "file_pls": os.path.basename(pls_path),
            "file_non_pls": os.path.basename(non_path),
            "pls": pls_text,
            "non_pls": non_pls_text
        })
    # Stop as soon as the requested number of pairs has been collected.
    if len(data) >= pares:
        break

# === 6. DataFrame with one row per pair ===
df = pd.DataFrame(data)

# === 7. Save to CSV ===
output_path = "cochrane_pairs_clean.csv"
df.to_csv(output_path, index=False, encoding="utf-8")

print(f"\n✅ Emparejados {len(df)} pares por nombre base (sin 'accumulated' en ninguno)")
print(f"💾 Guardado en: {output_path}")
df.head(3)


📚 Total PLS (sin accumulated): 4797
📂 Total non-PLS (sin accumulated): 7251

✅ Emparejados 200 pares por nombre base (sin 'accumulated' en ninguno)
💾 Guardado en: cochrane_pairs_clean.csv


Unnamed: 0,id,file_pls,file_non_pls,pls,non_pls
0,10.1002-14651858.CD000006,10.1002-14651858.CD000006.pub2-pls.txt,10.1002-14651858.CD000006.pub2-abstract.txt,Absorbable stitches for repair of episiotomy a...,Background\nApproximately 70% of women will ex...
1,10.1002-14651858.CD000009,10.1002-14651858.CD000009.pub4-pls.txt,10.1002-14651858.CD000009.pub4-abstract.txt_se...,Do acupuncture and related therapies help smok...,We included 38 studies. Based on three studies...
2,10.1002-14651858.CD000012,10.1002-14651858.CD000012.pub4-pls.txt,10.1002-14651858.CD000012.pub4-abstract.txt_se...,Alternative versus conventional institutional ...,"Ten trials involving 11,795 women met the incl..."


In [None]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.11-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.11-py3-none-any.whl (176 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/176.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m176.4/176.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m68.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.17.2 textstat-0.7.11


In [None]:
!pip install bert-score

Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


In [None]:
import os
from getpass import getpass

# Do not store the key in the notebook: reuse OPENAI_API_KEY when the environment
# already provides it, otherwise ask for it interactively (input is not echoed).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [None]:
# ============================================================
# QUICK MODEL COMPARISON FOR PLS GENERATION (5 abstracts)
# Models: Llama 3.2 3B, Phi-3 Mini 3.8B, Qwen2.5 3B, Mistral 7B, Gemma 2 (see MODELS)
# Metrics: structure, jargon, word count, BERTScore, readability,
#          conciseness, coherence (LLM judge), fluency, weighted final score
# ============================================================

import re
import torch
import textstat
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from bert_score import score as bert_score
import pandas as pd

# ------------------------------------------------------------
# 1. CONFIGURAR MODELOS A COMPARAR
# ------------------------------------------------------------

# Short label -> Hugging Face Hub repository ID of each instruction-tuned model to compare.
MODELS = {
    "llama3.2_3b":  "meta-llama/Llama-3.2-3B-Instruct",
    "phi3_3.8b":    "microsoft/phi-3-mini-4k-instruct",
    "qwen2.5_3b":   "Qwen/Qwen2.5-3B-Instruct",
    # NOTE(review): loaded in float16, this model hit CUDA out-of-memory on the
    # 14.74 GiB GPU of the recorded run.
    "mistral_7b":   "mistralai/Mistral-7B-Instruct-v0.2",
    # The label is kept, but the Hub repository is named "gemma-2-2b-it";
    # "google/gemma-2-2.6b-it" is not a valid repository ID.
    "gemma_2.6b":   "google/gemma-2-2b-it"
}

# Target device for inference.
DEVICE = "cuda"

# ------------------------------------------------------------
# 2. TU PROMPT FIJO PARA PLS
# ------------------------------------------------------------

# Fixed instruction prompt for PLS generation. The only placeholder is {abstract},
# which generate_pls() fills in through str.format().
PROMPT_TEMPLATE = """Using the following abstract of a biomedical study as input, generate a Plain Language Summary
(PLS) understandable by any patient, regardless of their health literacy. Ensure that the generated text
adheres to the following instructions which should be followed step-by-step:
a. Specific Structure: The generated PLS should be presented in a logical order, using the following
order:
1. Plain Title
2. Rationale
3. Trial Design
4. Results
b. Sections should be authored following these parameters:
1. Plain Title: Simplified title understandable to a layperson that summarizes the research that was
done.
2. Rationale: Include: background or study rationale providing a general description of the
condition, what it may cause or why it is a burden for the patients; the reason and main hypothesis
for the study; and why the study is needed, and why the study medication has the potential to
treat the condition.
3. Trial Design: Answer ‘How is this study designed?’ Include the description of the design,
description of study and patient population (age, health condition, gender), and the expected
amount of time a person will be in the study.
4. Results: Answer ‘What were the main results of the study’, include the benefits for the patients,
how the study was relevant for the area of study, and the conclusions from the investigator.
c. Consistency and Replicability: The generated PLS should be consistent regardless of the order of
sentences or the specific phrasing used in the input protocol text.
d. Compliance with Plain Language Guidelines: The generated PLS must follow all these plain
language guidelines:
• Have readability grade level of 6 or below.
• Do not have jargon. All technical or medical words or terms should be defined or broken down
into simple and logical explanations.
• Active voice, not passive.
• Mostly one or two syllable words.
• Sentences of 15 words or less.
• Short paragraphs of 3-5 sentences.
• Simple numbers (e.g., ratios, no percentages).
e. Do not invent Content: The AI model should not invent information. If the AI model includes data
other than the one given in the input abstract, the AI model should guarantee such data is verified and
real.
f. Aim for an approximate PLS length of 500-900 words.

Input abstract:

{abstract}

Output PLS:
"""


# ------------------------------------------------------------
# 1. TOMAR 5 ABSTRACTS REALES DEL DATAFRAME original df
# ------------------------------------------------------------

# Draw 5 abstracts from the paired DataFrame; the fixed seed keeps the draw repeatable.
sampled_abstracts = df["non_pls"].sample(n=5, random_state=42)
abstracts = sampled_abstracts.tolist()

# Show the first 300 characters of each selected abstract.
print("📄 Abstracts seleccionados:")
for number, abstract_text in enumerate(abstracts, start=1):
    preview = abstract_text[:300]
    print(f"\n--- Abstract #{number} ---\n{preview}...")


# ------------------------------------------------------------
# 2. FUNCIÓN PARA CARGAR MODELOS
# ------------------------------------------------------------

def load_model(model_name):
    """Load the tokenizer and causal language model for `model_name`.

    The model is loaded in float16 and its device map places every module
    on "cuda".

    Returns:
        A `(tokenizer, model)` tuple.
    """
    load_options = {
        "torch_dtype": torch.float16,
        "device_map": {"": "cuda"},   # force EVERYTHING onto the GPU
    }
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_name)
    loaded_model = AutoModelForCausalLM.from_pretrained(model_name, **load_options)
    return loaded_tokenizer, loaded_model


# ------------------------------------------------------------
# 3. FUNCIÓN DE GENERACIÓN DE PLS
# ------------------------------------------------------------

def generate_pls(model_name, tokenizer, model, abstract, max_new_tokens=700, temperature=0.7):
    """Generate a Plain Language Summary for one abstract.

    Args:
        model_name: Label of the model; not used by the generation itself.
        tokenizer: Tokenizer matching `model`.
        model: Causal language model exposing `generate()`.
        abstract: Source abstract inserted into PROMPT_TEMPLATE.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature.

    Returns:
        Only the newly generated text. For decoder-only models `generate()`
        returns the prompt tokens followed by the continuation, so the prompt
        is sliced off before decoding; otherwise every metric would be
        computed on the instructions and the abstract as well.
    """
    prompt = PROMPT_TEMPLATE.format(abstract=abstract)
    # Put the inputs on the device the model actually lives on.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    # NOTE(review): the prompt is sent as plain text; the models' chat templates are not applied.
    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,            # temperature only takes effect when sampling is enabled
        temperature=temperature
    )

    generated_tokens = output[0][prompt_length:]
    return tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()


# ------------------------------------------------------------
# 4. MÉTRICAS (ACTUALIZADAS CON LLM-JUDGE)
# ------------------------------------------------------------
import textstat
import numpy as np
from openai import OpenAI

client = OpenAI()   # assumes OPENAI_API_KEY is set in the environment

# Basic medical jargon terms to penalise; count_jargon() counts how many of them
# appear in a text (entries are lowercase and may contain spaces)
MEDICAL_JARGON = [
    "ischemia","subarachnoid","hemorrhage","cerebrovascular","neuronal",
    "cardiovascular","pathogenesis","randomized","relative risk","cohort",
    "double blind","etiology","pharmacology","symptomatic","morbidity"
]


def check_structure(text):
    """Return True when `text` contains all four required section headings.

    The check is a case-sensitive substring search for "Plain Title",
    "Rationale", "Trial Design" and "Results".
    """
    for heading in ("Plain Title", "Rationale", "Trial Design", "Results"):
        if heading not in text:
            return False
    return True


def count_jargon(text):
    """Count how many MEDICAL_JARGON terms occur in `text`.

    Matching is a case-insensitive substring search; each term counts at most
    once, however many times it appears.
    """
    lowered = text.lower()
    hits = 0
    for term in MEDICAL_JARGON:
        if term in lowered:
            hits += 1
    return hits


def count_words(text):
    """Return the number of whitespace-separated tokens in `text`."""
    return sum(1 for _ in text.split())

# -----------------------------
# ⭐ 1) Factuality (BERTScore)
# -----------------------------
def factuality(text, abstract):
    """Return the BERTScore F1 of `text` scored against `abstract`.

    `text` is passed as the single candidate and `abstract` as the single
    reference (English setting). The third element of the result, named F in
    the original code, is converted to a plain float.
    """
    scores = bert_score([text], [abstract], lang="en", verbose=False)
    f1_values = scores[2]
    return float(f1_values[0])


# -----------------------------
# ⭐ 2) Readability Score
# -----------------------------
def readability_score(text):
    """Map the Flesch-Kincaid grade of `text` to a score in (0, 1].

    The score is 1 / (1 + grade), so lower grades (easier text) score higher.
    Grades below zero are treated as zero: without this, a grade of -1 divided
    by zero and grades below -1 produced a negative value, so the easiest
    texts ended up with 0.0.

    Returns:
        A float in (0, 1], or 0.0 if the grade cannot be computed (the error
        is printed instead of being silently swallowed).
    """
    try:
        grade = float(textstat.flesch_kincaid_grade(text))
    except Exception as exc:
        print("Error computing readability:", exc)
        return 0.0

    grade = max(grade, 0.0)
    return 1.0 / (1.0 + grade)


# -----------------------------
# ⭐ 3) Conciseness Score
# -----------------------------
def conciseness(output_text, input_text):
    """Score how short `output_text` is relative to `input_text`, in [0, 1].

    With ratio = output words / input words, capped at 2, the score is
    1 - ratio / 2: an empty output scores 1.0, an output as long as the input
    scores 0.5, and an output at least twice as long scores 0.0.
    An empty input yields 0.0.
    """
    n_out, n_in = len(output_text.split()), len(input_text.split())
    if n_in == 0:
        return 0.0

    capped_ratio = min(n_out / n_in, 2.0)   # a ratio above 1 means the output is longer
    raw_score = 1 - capped_ratio / 2.0
    return float(max(min(raw_score, 1.0), 0.0))


# -----------------------------
# ⭐ 4) Coherence Score (LLM Judge)
# -----------------------------
def coherence_score(text):
    """
    Ask GPT-4o-mini to rate the clarity, coherence and structure of `text`.

    The reply is expected to be a number between 0 and 1. If it is not a bare
    number (for example "Score: 0.9"), the first number found in the reply is
    used instead.

    Returns:
        A float clamped to [0, 1]. If the request fails or the reply contains
        no number, the error is printed and 0.0 is returned, so a failed call
        can be told apart from a genuinely low rating.
    """
    prompt = f"""
You are an expert biomedical editor.

Rate the COHERENCE and CLARITY of the following text
from 0 to 1, where:

- 1 = perfectly coherent, clear, and well structured
- 0 = incoherent or badly structured

Text:
\"\"\"{text}\"\"\"

Only output a NUMBER between 0 and 1.
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0
        )
        raw = (response.choices[0].message.content or "").strip()

        try:
            value = float(raw)
        except ValueError:
            # Tolerate extra words or punctuation around the number.
            found = re.search(r"\d*\.?\d+", raw)
            if found is None:
                raise ValueError(f"no numeric score in reply: {raw!r}")
            value = float(found.group())

        return max(0.0, min(1.0, value))

    except Exception as exc:
        print("Error computing coherence:", exc)
        return 0.0

def perplexity_score(text):
    """
    Derive a 0-1 "fluency" score from token log-probabilities returned by GPT-4o-mini.

    `text` is sent as the user message of a chat completion limited to one
    generated token, with `logprobs=True`. The log-probabilities found in
    `response.choices[0].logprobs.content` are averaged, converted to
    ppl = exp(-mean) and mapped to 1 / (1 + ppl), clamped to [0, 1].

    NOTE(review): the request only asks for log-probabilities of the reply and
    caps the reply at one token, so this presumably reflects the model's
    confidence in its first reply token rather than the perplexity of `text`
    itself — confirm before relying on this metric.

    Returns 0.0 when no log-probabilities are available or when the request
    fails (the error is printed).
    """
    try:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": text}],
            temperature=0.0,
            max_tokens=1,
            logprobs=True
        )

        token_entries = completion.choices[0].logprobs.content

        # Keep only the entries that actually carry a log-probability.
        values = []
        for entry in token_entries:
            if entry.logprob is not None:
                values.append(entry.logprob)

        if not values:
            return 0.0

        # exp of the negated mean log-probability
        ppl = np.exp(-np.mean(values))

        # Lower ppl -> higher score; clamp to [0, 1].
        bounded = float(1.0 / (1.0 + ppl))
        return max(0.0, min(1.0, bounded))

    except Exception as e:
        print("Error computing perplexity:", e)
        return 0.0

# -----------------------------
# ⭐ 5) Final Composite Score (ACTUALIZADO)
# -----------------------------
def final_score(bert_f1, readability, coherence, concise, fluency):
    """Combine the five metrics into one weighted score.

    Weights: BERTScore F1 0.40, coherence 0.25, fluency 0.15,
    readability 0.10, conciseness 0.10.
    """
    # Terms are accumulated in a fixed order so the floating-point result is stable.
    total = 0.40 * bert_f1
    total = total + 0.25 * coherence
    total = total + 0.15 * fluency
    total = total + 0.10 * readability
    total = total + 0.10 * concise
    return total



# ------------------------------------------------------------
# 5. CARGA SECUENCIAL → SOLO 1 MODELO A LA VEZ
# ------------------------------------------------------------

def generate_for_all_models(abstracts, MODELS):
    """Generate and score a PLS for every (model, abstract) combination.

    Models are loaded one at a time so only one sits on the GPU at once.
    If a model cannot be loaded (OSError) or runs out of GPU memory, the
    problem is printed, the rows already produced are kept, and the run
    continues with the next model instead of losing all results. GPU memory
    is released after each model whether or not it failed.

    Args:
        abstracts: List of abstract texts.
        MODELS: Mapping of model label -> model path/ID passed to load_model().

    Returns:
        A DataFrame with one row per generated PLS, holding the text and its
        metrics. A model that failed part-way contributes fewer rows.
    """
    import gc

    results = []

    for model_label, model_path in MODELS.items():

        print(f"\n==============================")
        print(f"🔵 CARGANDO MODELO: {model_label}")
        print(f"==============================")

        # Defined up front so the cleanup below works even if loading fails.
        tokenizer = model = None
        try:
            tokenizer, model = load_model(model_path)

            for idx, abstract in enumerate(abstracts):
                print(f"\n→ Generating for abstract #{idx+1} with {model_label}...")

                pls = generate_pls(model_label, tokenizer, model, abstract)

                # ---- individual metrics ----
                bert_f1 = factuality(pls, abstract)
                read = readability_score(pls)
                conc = conciseness(pls, abstract)
                coh = coherence_score(pls)
                flu = perplexity_score(pls)

                # ---- composite score ----
                final = final_score(
                    bert_f1=bert_f1,
                    readability=read,
                    coherence=coh,
                    concise=conc,
                    fluency=flu
                )

                results.append({
                    "abstract_id": idx+1,
                    "model": model_label,
                    "pls": pls,
                    "structure_ok": check_structure(pls),
                    "jargon_count": count_jargon(pls),
                    "word_count": count_words(pls),

                    "bert_factuality": bert_f1,
                    "readability": read,
                    "conciseness": conc,
                    "coherence": coh,
                    "fluency": flu,
                    "final_score": final
                })

        except (torch.cuda.OutOfMemoryError, OSError) as exc:
            print(f"Skipping the rest of {model_label}: {type(exc).__name__}: {exc}")

        finally:
            # Drop the references, collect garbage, then release cached GPU blocks.
            del tokenizer
            del model
            gc.collect()
            torch.cuda.empty_cache()
            print(f"🧹 GPU cleaned for {model_label}")

    return pd.DataFrame(results)


# ------------------------------------------------------------
# 6. LLAMADA FINAL
# ------------------------------------------------------------
# Release cached GPU memory, then run the comparison over every model in MODELS.
torch.cuda.empty_cache()
df_out = generate_for_all_models(abstracts, MODELS)


# ------------------------------------------------------------
# 7. SAVE RESULTS
# ------------------------------------------------------------

df_out.to_csv("model_comparison_results.csv", index=False)
df_out



📄 Abstracts seleccionados:

--- Abstract #1 ---
For response in depression, combined therapy was more effective than antidepressants alone in the early phase (RR 1.34, 95% CI 1.13 to 1.58; 10 studies, 731 participants), but there was no evidence of a difference in the acute phase (RR 1.12, 95% CI 0.93 to 1.35; 7 studies, 383 participants) or in t...

--- Abstract #2 ---
Background
External cephalic version (ECV) of the breech fetus at term (after 37 weeks) has been shown to be effective in reducing the number of breech presentations and caesarean sections, but the rates of success are relatively low. This review examines studies initiating ECV prior to term (before...

--- Abstract #3 ---
The review now includes four trials (total 75 people, one additional trial since 2006, 21 people) randomising inpatients and outpatients in China and the USA. Risk of bias was mostly unclear as reporting was poor. We are uncertain about all the effects as all evidence was graded at very low quality...

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



→ Generating for abstract #1 with llama3.2_3b...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



→ Generating for abstract #2 with llama3.2_3b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



→ Generating for abstract #3 with llama3.2_3b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



→ Generating for abstract #4 with llama3.2_3b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



→ Generating for abstract #5 with llama3.2_3b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🧹 GPU cleaned for llama3.2_3b

🔵 CARGANDO MODELO: phi3_3.8b


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



→ Generating for abstract #1 with phi3_3.8b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



→ Generating for abstract #2 with phi3_3.8b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



→ Generating for abstract #3 with phi3_3.8b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



→ Generating for abstract #4 with phi3_3.8b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



→ Generating for abstract #5 with phi3_3.8b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🧹 GPU cleaned for phi3_3.8b

🔵 CARGANDO MODELO: qwen2.5_3b


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]


→ Generating for abstract #1 with qwen2.5_3b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



→ Generating for abstract #2 with qwen2.5_3b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



→ Generating for abstract #3 with qwen2.5_3b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



→ Generating for abstract #4 with qwen2.5_3b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



→ Generating for abstract #5 with qwen2.5_3b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🧹 GPU cleaned for qwen2.5_3b

🔵 CARGANDO MODELO: mistral_7b


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



→ Generating for abstract #1 with mistral_7b...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



→ Generating for abstract #2 with mistral_7b...


OutOfMemoryError: CUDA out of memory. Tried to allocate 92.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 84.12 MiB is free. Process 2672 has 14.66 GiB memory in use. Of the allocated memory 14.39 GiB is allocated by PyTorch, and 142.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from google.colab import userdata
# NOTE(review): 'secretName' looks like a placeholder, and the value is not assigned
# to anything. As the last expression of the cell, whatever it returns is shown in
# the cell output — confirm this cell is still needed.
userdata.get('secretName')

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so results can be written there.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Write the comparison results to the mounted Drive, without the index column.
df_out.to_csv("/content/drive/MyDrive/model_comparison_results.csv", index=False)

In [None]:
import pandas as pd

def compute_leaderboard(df):
    """Summarise the per-model metrics as a ranked table.

    For each value of the "model" column, computes the mean and standard
    deviation of every metric column, rounds to 4 decimals, and sorts the
    models by mean final_score, best first.

    Returns:
        A DataFrame indexed by model with (metric, "mean"/"std") columns.
    """
    score_columns = [
        "final_score",
        "bert_factuality",
        "coherence",
        "fluency",
        "readability",
        "conciseness",
    ]

    per_model = df.groupby("model")[score_columns]
    summary = per_model.agg(["mean", "std"])
    rounded = summary.round(4)
    ranked = rounded.sort_values(("final_score", "mean"), ascending=False)
    return ranked

# Rank the models in df_out by mean final_score and display the table.
leaderboard = compute_leaderboard(df_out)
leaderboard

Unnamed: 0_level_0,final_score,final_score,bert_factuality,bert_factuality,coherence,coherence,fluency,fluency,readability,readability,conciseness,conciseness
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std,mean,std
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
qwen2.5_3b,0.6158,0.035,0.7778,0.0084,0.9,0.1414,0.4673,0.053,0.0901,0.0084,0.0053,0.0117
llama3.2_3b,0.6112,0.0487,0.7778,0.0084,0.86,0.1949,0.4993,0.0005,0.0886,0.008,0.0133,0.0298
phi3_3.8b,0.6093,0.0448,0.7778,0.0084,0.88,0.1789,0.4547,0.0528,0.0896,0.0121,0.0104,0.0232
