# 📊 05_evaluation.ipynb

This notebook provides tools to explore, validate, and visualize the labels assigned to Bible verses during emotion and theme classification. It also visualizes the Spanish version (RV60) and compares its labels against the English (KJV) ones.

## 🧱 1. Setup Paths & Translation Maps

In [None]:
from pathlib import Path
import pandas as pd

# Identifiers of the two labeled Bible datasets (KJV and its Spanish
# counterpart RV60).
BIBLE = "bible_kjv"
BIBLE_ES = "bible_rv60"

# Per-book label CSVs are read from data/labeled/<bible>/emotion_theme.
EN_DIR = Path("data/labeled", BIBLE, "emotion_theme")
ES_DIR = Path("data/labeled", BIBLE_ES, "emotion_theme")

# English emotion label -> Spanish emotion label.
EMOTION_MAP = {
    "joy": "Alegría",
    "sadness": "Tristeza",
    "anger": "Ira",
    "fear": "Miedo",
    "trust": "Confianza",
    "surprise": "Sorpresa",
}

# English theme label -> Spanish theme label.
THEME_MAP = {
    "love": "amor",
    "faith": "fe",
    "hope": "esperanza",
    "forgiveness": "perdón",
    "fear": "miedo",
}


def _invert_lowercased(mapping):
    """Return a value -> key copy of *mapping* whose new keys are lower-cased."""
    return {spanish.lower(): english for english, spanish in mapping.items()}


# Lower-cased Spanish label -> English label, used when comparing the two sets.
INV_EMOTION_MAP = _invert_lowercased(EMOTION_MAP)
INV_THEME_MAP = _invert_lowercased(THEME_MAP)


## 🧪 2. Load & Compare One Example Book (e.g., Genesis)

In [None]:
# Pick one book and load its labeled verses from both translations.
book = "1_genesis"
en_file = EN_DIR / f"{book}_emotion_theme.csv"
es_file = ES_DIR / f"{book}_emotion_theme.csv"

df_en = pd.read_csv(en_file)
df_es = pd.read_csv(es_file)

# The comparisons in the following cells pair rows of the two frames by
# index, so both files must hold the same number of rows. Report the counts
# and paths when they do not, instead of failing with a bare AssertionError.
assert len(df_en) == len(df_es), (
    f"Row count mismatch for {book}: {len(df_en)} rows in {en_file} "
    f"vs {len(df_es)} rows in {es_file}"
)


## 🧠 3. Compare Emotions

In [None]:
# Translate the Spanish emotion labels back to English so both frames share
# one vocabulary; labels absent from INV_EMOTION_MAP map to NaN.
es_emotion_lower = df_es["emotion"].str.lower()
df_es["emotion_en"] = es_emotion_lower.map(INV_EMOTION_MAP)

# Row-by-row, case-insensitive agreement between the two translations.
en_emotion_lower = df_en["emotion"].str.lower()
emotion_matches = en_emotion_lower == df_es["emotion_en"]
emotion_accuracy = emotion_matches.mean()

print(f"🎭 Emotion agreement: {emotion_accuracy:.2%}")


## 🧩 4. Compare Themes (Multi-label, unordered)

In [None]:
def normalize_themes(series, inverse_map):
    """Turn a column of ``;``-separated theme strings into sets of labels.

    Each label is stripped and lower-cased, then translated through
    *inverse_map*; labels without an entry are kept in their lower-cased
    form. Missing cells (NaN/None) and blank fragments, such as the one left
    behind by a trailing ``;``, contribute no label.

    Args:
        series: pandas Series whose cells are theme strings or missing values.
        inverse_map: dict from lower-cased label to the label to use instead.

    Returns:
        A Series with the same index as *series* whose cells are sets of str.
    """
    def map_themes(cell):
        if pd.isna(cell):
            return set()
        labels = (part.strip().lower() for part in cell.split(";"))
        # Skip blank fragments so "love;" does not add a spurious "" theme.
        return {inverse_map.get(label, label) for label in labels if label}

    return series.apply(map_themes)

# Theme sets per verse: the English side needs no translation (empty map),
# the Spanish side is mapped back to English labels.
en_themes = normalize_themes(df_en["theme"], {})
es_themes = normalize_themes(df_es["theme"], INV_THEME_MAP)

# Exact agreement: both verses carry precisely the same set of themes.
theme_match = (en_themes == es_themes)

# Jaccard overlap per verse. Two empty sets agree completely, so they score
# 1.0; this keeps the overlap consistent with theme_match, which counts that
# case as an exact match.
theme_overlap = [
    len(en_set & es_set) / len(en_set | es_set) if (en_set or es_set) else 1.0
    for en_set, es_set in zip(en_themes, es_themes)
]

# Guard the average against an empty frame instead of dividing by zero.
avg_theme_overlap = (
    sum(theme_overlap) / len(theme_overlap) if theme_overlap else float("nan")
)

print(f"🧠 Exact theme match: {theme_match.mean():.2%}")
print(f"🔁 Avg. theme overlap: {avg_theme_overlap:.2%}")


## 📊 5. Show Mismatches (Optional Debug View)

In [None]:
# Rows where the two translations disagree on the emotion label.
disagreement = ~emotion_matches
mismatched = df_en.loc[disagreement].copy()
# Attach the untranslated Spanish label for side-by-side inspection.
mismatched["es_emotion"] = df_es.loc[disagreement, "emotion"]
debug_columns = ["chapter", "verse", "text", "emotion", "es_emotion"]
mismatched[debug_columns].head(10)

## 🧪 6. Manual Evaluation

This section evaluates the performance of the pretrained Hugging Face models using a small set of manually labeled examples. Each example includes an input sentence, an expected emotion, and an expected theme. The goal is to measure whether the models predict labels that align with human expectations.

This validation helps establish the reliability of the system before it is used as a recommender.


In [None]:
import pandas as pd

# Read the hand-labeled evaluation examples and preview the first rows.
eval_csv = "data/evaluation/eval_examples.csv"
df_eval = pd.read_csv(eval_csv)
df_eval.head()


In [None]:
from transformers import pipeline

# Hub identifiers of the two pretrained models.
emotion_model_id = "j-hartmann/emotion-english-distilroberta-base"
theme_model_id = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"

# Emotion classifier; top_k=None asks for the score of every label rather
# than only the best one.
emotion_model = pipeline("text-classification", model=emotion_model_id, top_k=None)

# Zero-shot classifier used to rank the candidate themes for a text.
theme_model = pipeline("zero-shot-classification", model=theme_model_id)

# Theme labels offered to the zero-shot classifier.
themes = ["Love", "Faith", "Hope", "Forgiveness", "Fear"]


In [None]:
def _labels_agree(predicted, expected):
    """Compare two labels ignoring case and surrounding whitespace."""
    if not isinstance(predicted, str) or not isinstance(expected, str):
        # A missing expectation (e.g. NaN from an empty CSV cell) never matches.
        return False
    return predicted.strip().lower() == expected.strip().lower()


def evaluate_row(row):
    """Run both models on one evaluation example and score the predictions.

    Args:
        row: record with "input_text", "expected_emotion" and
            "expected_theme" entries (one row of the evaluation DataFrame).

    Returns:
        pd.Series holding the predicted emotion and theme exactly as the
        models emit them, plus two booleans telling whether each prediction
        matches the expected label. Matching ignores case and surrounding
        whitespace, so "Love" and "love" count as equal.
    """
    text = row["input_text"]

    # Depending on how the pipeline is invoked, the classifier output is
    # either a list of {label, score} dicts or that list wrapped in an outer
    # list; unwrap it before picking the highest-scoring label.
    emotion_scores = emotion_model(text)
    if emotion_scores and isinstance(emotion_scores[0], list):
        emotion_scores = emotion_scores[0]
    pred_emotion = max(emotion_scores, key=lambda item: item["score"])["label"]

    # The zero-shot result lists the candidate labels best-first.
    pred_theme = theme_model(text, candidate_labels=themes)["labels"][0]

    return pd.Series({
        "pred_emotion": pred_emotion,
        "pred_theme": pred_theme,
        "emotion_match": _labels_agree(pred_emotion, row["expected_emotion"]),
        "theme_match": _labels_agree(pred_theme, row["expected_theme"]),
    })


In [None]:
# Score every curated example, then attach the predictions to the inputs.
predictions = df_eval.apply(evaluate_row, axis=1)
results = df_eval.join(predictions)
results.head()


In [None]:
# Share of evaluation examples whose prediction matched the expected label.
emotion_accuracy, theme_accuracy = (
    results[column].mean() for column in ("emotion_match", "theme_match")
)

print(f"🎯 Emotion accuracy: {emotion_accuracy:.2%}")
print(f"🏷️ Theme accuracy: {theme_accuracy:.2%}")
