In [None]:
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download, login
import os

# Read the Hugging Face token from the environment instead of hardcoding it,
# so the secret never ends up inside the saved notebook.
# NOTE(review): presumably the model below is public, in which case HF_TOKEN
# can simply be left unset — confirm.
HF_TOKEN = os.environ.get("HF_TOKEN")
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
LOCAL_DIR = "/content/drive/My Drive/modelo_embeddings/"

# Log in to Hugging Face only when a token is available. After login() the
# Hub client reuses the stored token, so it is not passed again to
# SentenceTransformer (its `use_auth_token` argument is deprecated in recent
# sentence-transformers releases).
if HF_TOKEN:
    login(token=HF_TOKEN)

# Create the target folder if it does not exist
os.makedirs(LOCAL_DIR, exist_ok=True)

print("Descargando modelo y guardando localmente...")

# Load the model (it is downloaded automatically and kept in the cache)
model = SentenceTransformer(MODEL_NAME)

# Save a local copy
model.save(LOCAL_DIR)

print(f"✅ Modelo descargado y guardado en: {LOCAL_DIR}")


In [None]:
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# ===== NLTK RESOURCE DOWNLOAD =====
nltk.download('punkt')
nltk.download('punkt_tab')

# ===== CONFIGURATION =====
CSV_LLM = "/content/land_transport_definitions_zero_shot_mistral.csv"  # LLM CSV
# The stray leading "//" of the original path was a typo; a single "/" is the intended root.
CSV_EXPERT = "/content/glossary_land_transport_with_info_semantica_es (1).csv"  # manual CSV
# NOTE(review): the output name says "natural_sciences" while both inputs are
# "land_transport" files — confirm this is the intended file name.
OUTPUT = "/content/evaluacion_definiciones_natural_sciences_mistral_es_batch.csv"
LOCAL_MODEL_DIR = "/content/drive/My Drive/modelo_embeddings/"

# ===== LOAD MODEL =====
print("Cargando modelo desde local...")
model = SentenceTransformer(LOCAL_MODEL_DIR)

# ===== LOAD CSVs =====
# on_bad_lines='warn' still skips malformed rows (as 'skip' did) but reports
# each one, so rows silently missing from the evaluation become visible.
# dtype=str keeps every cell as text.
print("Cargando CSVs...")
df_llm = pd.read_csv(CSV_LLM, sep=';', engine='python', on_bad_lines='warn', dtype=str)
df_expert = pd.read_csv(CSV_EXPERT, sep=';', engine='python', on_bad_lines='warn', dtype=str)

# ===== NORMALISE COLUMN NAMES =====
# Strip surrounding whitespace so header lookups such as 'definition' match.
df_llm.columns = df_llm.columns.str.strip()
df_expert.columns = df_expert.columns.str.strip()
print("LLM columns:", df_llm.columns.tolist())
print("Expert columns:", df_expert.columns.tolist())

# ===== RENAME DEFINITION COLUMNS =====
# Each source must provide a 'definition' column; it gets a source-specific
# name so both definitions can sit side by side after the merge.
if 'definition' not in df_llm.columns:
    raise KeyError("No se encontró la columna 'definition' en CSV LLM")
df_llm = df_llm.rename(columns={'definition': 'definition_llm'})

if 'definition' not in df_expert.columns:
    raise KeyError("No se encontró la columna 'definition' en CSV Expert")
df_expert = df_expert.rename(columns={'definition': 'definition_expert'})

# ===== CLEAN-UP =====
# Replace missing values with "" and trim whitespace in every text column
# that is present in the given frame.
text_columns = ('term', 'definition_llm', 'definition_expert')
for frame in (df_llm, df_expert):
    for name in text_columns:
        if name in frame.columns:
            frame[name] = frame[name].fillna("").astype(str).str.strip()

# Drop rows with an empty term or definition, then keep the first row per term
has_llm_content = df_llm['term'].ne("") & df_llm['definition_llm'].ne("")
df_llm = df_llm[has_llm_content].drop_duplicates(subset=['term'])

has_expert_content = df_expert['term'].ne("") & df_expert['definition_expert'].ne("")
df_expert = df_expert[has_expert_content].drop_duplicates(subset=['term'])

# ===== MERGE ON TERM =====
# Inner join: only terms present in both sources are evaluated.
df = df_llm.merge(df_expert, on='term', how='inner')
print(f"{len(df)} términos listos para evaluar.")

# ===== BATCH ENCODING FOR EMBEDDINGS =====
import numpy as np  # local import: this cell's import block does not bring in numpy

# Stop with a clear message when the merge produced no common terms, instead
# of an obscure failure further down.
if df.empty:
    raise ValueError("No hay términos en común entre el CSV LLM y el CSV Expert; no hay nada que evaluar.")

print("Generando embeddings en batch...")
emb_llm = model.encode(df['definition_llm'].tolist(), batch_size=32, show_progress_bar=True)
emb_expert = model.encode(df['definition_expert'].tolist(), batch_size=32, show_progress_bar=True)

# ===== COSINE SIMILARITY =====
# Only the similarity between row i of one matrix and row i of the other is
# used, so compute it row-wise instead of building the full n x n matrix and
# reading its diagonal.
print("Calculando similitudes coseno...")
emb_llm = np.asarray(emb_llm)
emb_expert = np.asarray(emb_expert)
dot_products = np.einsum('ij,ij->i', emb_llm, emb_expert)
norm_products = np.linalg.norm(emb_llm, axis=1) * np.linalg.norm(emb_expert, axis=1)
norm_products[norm_products == 0] = 1.0  # a zero vector scores 0 instead of NaN
cosine_scores = (dot_products / norm_products).tolist()

# ===== BLEU SCORE =====
# Sentence-level BLEU of each LLM definition (candidate) against the expert
# definition (single reference), on lower-cased tokens, with method1 smoothing.
print("Calculando BLEU scores...")
smooth = SmoothingFunction().method1
bleu_scores = []
# Iterate over the two text columns directly; no need to build a Series per row.
definition_pairs = zip(df['definition_expert'], df['definition_llm'])
for expert_text, llm_text in tqdm(definition_pairs, total=len(df), desc="BLEU"):
    # The definitions are Spanish (see the "_es" input files), so use the
    # Spanish Punkt model rather than word_tokenize's English default.
    reference = word_tokenize(expert_text.lower(), language='spanish')
    candidate = word_tokenize(llm_text.lower(), language='spanish')
    bleu_scores.append(sentence_bleu([reference], candidate, smoothing_function=smooth))

# ===== STORE RESULTS =====
# Attach both metrics to the merged frame, one value per term.
df['cosine_similarity'] = cosine_scores
df['bleu_score'] = bleu_scores

# ===== GLOBAL METRICS AND BEST/WORST TERMS =====
cosine_col, bleu_col = df['cosine_similarity'], df['bleu_score']
cosine_mean, bleu_mean = cosine_col.mean(), bleu_col.mean()

# Full rows of the terms with the highest / lowest score for each metric
best_cosine, worst_cosine = df.loc[cosine_col.idxmax()], df.loc[cosine_col.idxmin()]
best_bleu, worst_bleu = df.loc[bleu_col.idxmax()], df.loc[bleu_col.idxmin()]

# ===== PRINT RESULTS =====
print(
    f"\nCosine similarity promedio: {cosine_mean:.4f}",
    f"BLEU promedio: {bleu_mean:.4f}\n",
    sep="\n",
)

print(
    f"Mejor término (cosine): {best_cosine['term']} -> {best_cosine['cosine_similarity']:.4f}",
    f"Peor término (cosine): {worst_cosine['term']} -> {worst_cosine['cosine_similarity']:.4f}",
    f"Mejor término (BLEU): {best_bleu['term']} -> {best_bleu['bleu_score']:.4f}",
    f"Peor término (BLEU): {worst_bleu['term']} -> {worst_bleu['bleu_score']:.4f}",
    sep="\n",
)

# ===== FINAL CSV =====
# Semicolon-separated, without the index column.
df.to_csv(OUTPUT, sep=';', index=False)
print(f"\n✅ Evaluación completada. Archivo generado: {OUTPUT}")


In [None]:
import pandas as pd
import nltk
import re
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import html

# ===== NLTK RESOURCE DOWNLOAD =====
# Presumably the tokenizer data needed by word_tokenize further down — confirm.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

# ===== CONFIGURATION =====
# Input: definitions produced by the LLM
CSV_LLM = "/content/celex_rag_definitions_UN_mistral_semantic.csv"
# Input: expert glossary used as reference
CSV_EXPERT = "/content/glossary_UN_with_definitions_es_100.csv"
# Output: merged definitions plus their scores
OUTPUT = "/content/evaluacion_definiciones_filtrado.csv"
# Folder holding the locally saved embedding model
LOCAL_MODEL_DIR = "/content/drive/My Drive/modelo_embeddings/"

# ===== FUNCIÓN DE LIMPIEZA =====
def clean_text(text):
    """Return ``text`` as plain, single-spaced text.

    HTML entities are decoded, HTML tags and URLs are each replaced by a
    space, and every run of whitespace is collapsed into a single space.

    Args:
        text: Value to clean. Missing values (``None``/NaN, as detected by
            ``pd.isna``) yield an empty string; other non-string values are
            converted with ``str`` first.

    Returns:
        The cleaned string with leading and trailing whitespace removed.
    """
    if pd.isna(text):
        return ""
    # html.unescape only accepts str; coerce anything else (e.g. a numeric cell)
    text = str(text)
    # Decode HTML entities
    text = html.unescape(text)
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # Remove URLs; requiring "://" keeps ordinary words that merely start
    # with "http" (e.g. "https") from being deleted
    text = re.sub(r'https?://\S+', ' ', text)
    # Collapse whitespace runs into a single space and trim the ends
    return re.sub(r'\s+', ' ', text).strip()

# ===== LOAD MODEL =====
print("Cargando modelo de embeddings...")
model = SentenceTransformer(LOCAL_MODEL_DIR)

# ===== LOAD CSVs =====
# Only the first two columns are read and they are renamed positionally.
# dtype=str keeps every cell as text so the .str accessors below always work;
# on_bad_lines='warn' still skips malformed rows but reports them.
print("Cargando CSV de definiciones del LLM...")
df_llm = pd.read_csv(
    CSV_LLM, sep=';', usecols=[0,1], names=['term','definition_llm'],
    header=0, engine='python', on_bad_lines='warn', encoding='utf-8', dtype=str
)

# Keep only useful LLM definitions: rows with a term and a definition, no
# error / "not enough information" marker, and more than 10 characters.
df_llm = df_llm.dropna(subset=['term', 'definition_llm'])
df_llm = df_llm[~df_llm['definition_llm'].str.contains("ERROR|No hay información suficiente", case=False, na=False)]
df_llm = df_llm[df_llm['definition_llm'].str.len() > 10]
print(f"Términos con definición útil del LLM: {len(df_llm)}")

print("Cargando CSV experto...")
df_expert = pd.read_csv(
    CSV_EXPERT, sep=';', usecols=[0,1], names=['term','definition_expert'],
    header=0, engine='python', on_bad_lines='warn', encoding='utf-8', dtype=str
)
df_expert = df_expert.dropna(subset=['term', 'definition_expert'])
print(f"Términos disponibles del experto: {len(df_expert)}")

# ===== CLEAN EXPERT DEFINITIONS =====
# assign() builds a new frame, avoiding chained-assignment warnings on the
# filtered frames above.
df_expert = df_expert.assign(definition_expert=df_expert['definition_expert'].apply(clean_text))
# A definition made only of markup/URLs is empty after cleaning: nothing to compare against
df_expert = df_expert[df_expert['definition_expert'] != ""]

# ===== NORMALISE TERMS FOR THE MERGE =====
df_llm = df_llm.assign(term_norm=df_llm['term'].str.strip().str.lower())
df_expert = df_expert.assign(term_norm=df_expert['term'].str.strip().str.lower())

# Blank keys would all match each other, and repeated keys would multiply
# rows in the join, so drop blanks and keep the first row per normalized term.
df_llm = df_llm[df_llm['term_norm'] != ""].drop_duplicates(subset=['term_norm'])
df_expert = df_expert[df_expert['term_norm'] != ""].drop_duplicates(subset=['term_norm'])

# ===== MERGE COMMON TERMS ONLY =====
df = pd.merge(df_llm, df_expert, on='term_norm', how='inner', suffixes=('_llm','_expert'))
print(f"Términos listos para evaluación: {len(df)}")

# ===== COMPUTE METRICS =====
import numpy as np  # local import: this cell's import block does not bring in numpy

smooth = SmoothingFunction().method1

print("Calculando métricas de similitud...")
llm_texts = df['definition_llm'].tolist()
expert_texts = df['definition_expert'].tolist()

if llm_texts:
    # Encode each column in one batched call instead of two model calls per row.
    # NOTE(review): batched encoding may change scores in the last decimals
    # compared with encoding one text at a time — confirm this is acceptable.
    emb_llm = np.asarray(model.encode(llm_texts, batch_size=32, show_progress_bar=True))
    emb_expert = np.asarray(model.encode(expert_texts, batch_size=32, show_progress_bar=True))

    # Row-wise cosine similarity: dot product divided by the product of norms
    dot_products = np.einsum('ij,ij->i', emb_llm, emb_expert)
    norm_products = np.linalg.norm(emb_llm, axis=1) * np.linalg.norm(emb_expert, axis=1)
    norm_products[norm_products == 0] = 1.0  # a zero vector scores 0 instead of NaN
    cosine_scores = (dot_products / norm_products).tolist()
else:
    # No common terms: keep the empty-result behaviour (NaN means, empty CSV)
    cosine_scores = []

# Sentence-level BLEU of the LLM definition (candidate) against the expert
# definition (single reference), on lower-cased tokens. The texts are Spanish
# (see the "_es" expert file and the Spanish filter phrase used when loading),
# so use the Spanish Punkt model rather than word_tokenize's English default.
bleu_scores = []
for expert_text, llm_text in zip(expert_texts, llm_texts):
    reference = word_tokenize(expert_text.lower(), language='spanish')
    candidate = word_tokenize(llm_text.lower(), language='spanish')
    bleu_scores.append(sentence_bleu([reference], candidate, smoothing_function=smooth))

df['cosine_similarity'] = cosine_scores
df['bleu_score'] = bleu_scores

# ===== SUMMARY =====
print("\nResumen de métricas")
print(f"Cosine similarity promedio: {df['cosine_similarity'].mean():.4f}")
print(f"BLEU promedio: {df['bleu_score'].mean():.4f}")

# ===== SAVE =====
# Semicolon-separated, without the index column.
df.to_csv(OUTPUT, sep=';', index=False)
print(f"\n✅ Evaluación completada. Archivo guardado en: {OUTPUT}")
