In [None]:
!pip install evaluate datasets

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from evaluate import load as load_metric
import numpy as np
from tqdm import tqdm

In [None]:
# Configuration: where the fine-tuned model lives and which device to run on
MODEL_PATH = "./modelo"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# The tokenizer comes from the base "gpt2" checkpoint; EOS is reused as the padding token
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Half precision on GPU, full precision on CPU
if DEVICE == "cuda":
    model_dtype = torch.float16
else:
    model_dtype = torch.float32

# Load the weights strictly from the local directory (no hub download), then move them to DEVICE
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    local_files_only=True,
    torch_dtype=model_dtype,
)
model = model.to(DEVICE)
model.eval()


In [None]:
# Load the train / validation / test splits from the local JSONL files
data_files = {
    'train': 'dataset_train_filtrado.jsonl',
    'validation': 'dataset_valid_filtrado.jsonl',
    'test': 'dataset_test_filtrado.jsonl'
}
raw_datasets = load_dataset('json', data_files=data_files)
test_dataset = raw_datasets["test"]

# Fraction of the test split to evaluate on. 1.0 evaluates the whole split;
# lower it (e.g. 1 / 3) for a quicker run on the first part of the split only.
TEST_FRACTION = 1.0
# Clamp so that a fraction above 1.0 can never index past the end of the split
subset_size = min(len(test_dataset), int(len(test_dataset) * TEST_FRACTION))
test_subset = test_dataset.select(range(subset_size))


In [None]:
def comentar_codigo(snippet):
    """Generate a commented version of a Python snippet with the loaded model.

    The snippet is wrapped in the instruction prompt, the model continues the
    prompt with greedy decoding, and only the continuation is returned.

    Args:
        snippet: Python source code (str) to be commented.

    Returns:
        The text generated after the prompt, with Markdown code-fence markers
        removed and surrounding whitespace stripped.
    """
    prompt = f"Comenta este código Python:\n```python\n{snippet}\n```\nCódigo comentado:\n```python\n"
    # NOTE(review): the prompt is capped at 512 tokens; for very long snippets the
    # trailing "Código comentado:" cue may be what gets truncated — confirm.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(DEVICE)
    prompt_len = inputs["input_ids"].shape[1]

    with torch.no_grad():
        # Greedy decoding (do_sample=False): no temperature is passed because it
        # only affects sampling.
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # The returned sequence starts with the prompt tokens. Decode only what was
    # generated after them, so neither the prompt text nor its "```python" fence
    # tag leaks into the prediction, and no string marker has to be searched for.
    generated_ids = outputs[0][prompt_len:]
    comentado = tokenizer.decode(generated_ids, skip_special_tokens=True)
    return comentado.replace("```", "").strip()


In [None]:
# Load the ROUGE and METEOR metrics by name through `evaluate`
rouge, meteor = (load_metric(nombre) for nombre in ("rouge", "meteor"))

In [None]:
# Run the model over every example of the test subset, keeping each prediction
# aligned with its reference (same index in both lists).
refs = []
preds = []

for ejemplo in tqdm(test_subset):
    # "code_clean" is the model input and "code" the reference output
    # (presumably the uncommented and commented versions of the same snippet).
    entrada, referencia = ejemplo["code_clean"], ejemplo["code"]

    prediccion = comentar_codigo(entrada)

    refs.append(referencia)
    preds.append(prediccion)


In [None]:
# BLEU: one smoothed sentence-level score per (reference, prediction) pair,
# computed on whitespace-separated tokens and then averaged.
smoothie = SmoothingFunction().method4
bleu_scores = []
for referencia, prediccion in zip(refs, preds):
    tokens_ref = referencia.split()
    tokens_pred = prediccion.split()
    bleu_scores.append(
        sentence_bleu([tokens_ref], tokens_pred, smoothing_function=smoothie)
    )
bleu_avg = np.mean(bleu_scores)

# ROUGE over the whole set of predictions (stemming enabled)
rouge_result = rouge.compute(
    predictions=preds,
    references=refs,
    use_stemmer=True,
)

# METEOR over the whole set of predictions
meteor_result = meteor.compute(
    predictions=preds,
    references=refs,
)

# Report the three scores
print(f"\nBLEU score promedio: {bleu_avg:.4f}")
print(f"METEOR: {meteor_result['meteor']:.4f}")
print(f"ROUGE-L: {rouge_result['rougeL']:.4f}")

