In [None]:
# =====================================================
# Notebook_E — Evaluation Dashboard (3 Model Comparison)
# Models:
#   - Baseline BART (no training)
#   - BART + LoRA fine-tuned
#   - BART merged (LoRA merged into base)
#
# Computes:
#   ROUGE-1 / ROUGE-2 / ROUGE-L
#   BLEU
#   BERTScore-F1
#
# Saves:
#   per-model CSVs
#   summary comparison table
#   charts for each metric
# =====================================================

import os
import pandas as pd
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.bleu_score import corpus_bleu
import matplotlib.pyplot as plt

# -----------------------------------------------------
# Paths (EDIT THESE IF NEEDED)
# -----------------------------------------------------
# One prediction CSV per model under comparison. compute_metrics() reads the
# "model_summary" and "human_summary" columns from each of them.
BASELINE_CSV = "/content/llmed_certification_FineTuneFlow/metrics/baseline_predictions.csv"
LORA_CSV     = "/content/llmed_certification_FineTuneFlow/metrics/validation_predictions.csv"
MERGED_CSV   = "/content/llmed_certification_FineTuneFlow/metrics/validation_predictions_merged.csv"

# All CSVs and charts produced below are written here; created on demand.
OUTPUT_DIR = "/content/llmed_certification_FineTuneFlow/metrics/dashboard_outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Saving results to: {OUTPUT_DIR}")

# -----------------------------------------------------
# Load prediction files
# -----------------------------------------------------
# The status emoji was stored as mis-decoded bytes; it is restored here so the
# console output is readable.
print("\n📥 Loading prediction CSV files...")

df_base  = pd.read_csv(BASELINE_CSV)
df_lora  = pd.read_csv(LORA_CSV)
df_merge = pd.read_csv(MERGED_CSV)

# Report the sample counts so mismatched evaluation sets are easy to spot.
print(f"Baseline: {len(df_base)} samples")
print(f"LoRA:     {len(df_lora)} samples")
print(f"Merged:   {len(df_merge)} samples")


# -----------------------------------------------------
# Metric computation tools
# -----------------------------------------------------

# ROUGE variants reported by the dashboard: unigram, bigram and
# longest-common-subsequence overlap.
_ROUGE_TYPES = ["rouge1", "rouge2", "rougeL"]

# Shared scorer instance, built once and reused for every sample.
scorer = rouge_scorer.RougeScorer(_ROUGE_TYPES, use_stemmer=True)

def compute_metrics(df):
    """Compute ROUGE, BERTScore and BLEU for one model's predictions.

    Args:
        df: DataFrame with a ``model_summary`` column (predictions) and a
            ``human_summary`` column (references). It is modified in place:
            both text columns are normalised to ``str`` (missing values
            become ``""``) and per-sample ``rouge1``, ``rouge2``, ``rougeL``
            and ``bert_f1`` columns are added.

    Returns:
        A ``(df, summary)`` tuple. ``df`` is the same object that was passed
        in; ``summary`` is a dict holding the mean of each per-sample metric
        plus the corpus-level ``bleu`` score.

    Raises:
        KeyError: If either required text column is missing.
    """
    text_columns = ("model_summary", "human_summary")
    missing = [col for col in text_columns if col not in df.columns]
    if missing:
        raise KeyError(f"Missing required column(s): {missing}")

    # Fill missing values *before* casting. astype(str) turns NaN into the
    # literal text "nan", after which fillna("") has nothing left to fill and
    # the placeholder would be scored as if it were a real one-word summary.
    for col in text_columns:
        df[col] = df[col].fillna("").astype(str)

    predictions = df["model_summary"].tolist()
    references = df["human_summary"].tolist()

    print("🔍 Computing ROUGE...")
    # The scorer is called as score(reference, prediction), in that order.
    rouge_results = [
        scorer.score(ref, pred) for pred, ref in zip(predictions, references)
    ]
    for rouge_name in ("rouge1", "rouge2", "rougeL"):
        df[rouge_name] = [res[rouge_name].fmeasure for res in rouge_results]

    print("🔍 Computing BERTScore...")
    # Only the F1 component of (precision, recall, F1) is kept.
    _, _, F1 = bert_score(
        predictions,
        references,
        lang="en",
        rescale_with_baseline=True,
    )
    df["bert_f1"] = F1.numpy()

    print("🔍 Computing BLEU...")
    # corpus_bleu expects a list of reference token lists per candidate;
    # there is exactly one reference for each prediction here.
    bleu_references = [[ref.split()] for ref in references]
    bleu_candidates = [pred.split() for pred in predictions]
    bleu = corpus_bleu(bleu_references, bleu_candidates)

    summary = {
        "rouge1": df["rouge1"].mean(),
        "rouge2": df["rouge2"].mean(),
        "rougeL": df["rougeL"].mean(),
        "bert_f1": df["bert_f1"].mean(),
        "bleu": bleu,
    }

    return df, summary


# -----------------------------------------------------
# Compute Metrics for All Models
# -----------------------------------------------------
# compute_metrics() returns the frame with per-sample metric columns added,
# together with a dict of aggregate scores for that model. The banner emoji
# was stored as mis-decoded bytes and is restored here.
print("\n=== 🧮 Baseline BART ===")
df_base, s_base = compute_metrics(df_base)

print("\n=== 🧮 BART + LoRA ===")
df_lora, s_lora = compute_metrics(df_lora)

print("\n=== 🧮 BART Merged ===")
df_merge, s_merge = compute_metrics(df_merge)


# -----------------------------------------------------
# Build Summary Table
# -----------------------------------------------------
# Tag each aggregate-score dict with its model label so the rows of the
# comparison table can be told apart.
s_base["model"] = "Baseline-BART"
s_lora["model"] = "LoRA"
s_merge["model"] = "Merged"

# One row per model, one column per metric (plus the "model" label).
summary_df = pd.DataFrame([s_base, s_lora, s_merge])

# The header emoji was stored as mis-decoded bytes; restored for readability.
print("\n📊 Summary Comparison:")
print(summary_df)


# -----------------------------------------------------
# Save Detailed Metrics
# -----------------------------------------------------
# Per-model frames carry the per-sample scores; the summary frame carries the
# aggregate comparison. Paths are built with os.path.join rather than by
# string interpolation.
df_base.to_csv(os.path.join(OUTPUT_DIR, "baseline_metrics.csv"), index=False)
df_lora.to_csv(os.path.join(OUTPUT_DIR, "lora_metrics.csv"), index=False)
df_merge.to_csv(os.path.join(OUTPUT_DIR, "merged_metrics.csv"), index=False)
summary_df.to_csv(os.path.join(OUTPUT_DIR, "summary_metrics.csv"), index=False)

# The status emoji was stored as mis-decoded bytes; restored for readability.
print("\n💾 Saved all CSV outputs.")


# -----------------------------------------------------
# Visualization — Matplotlib ONLY
# -----------------------------------------------------
# One bar chart per metric, comparing the three models side by side.
metrics = ["rouge1", "rouge2", "rougeL", "bert_f1", "bleu"]

print("\n📈 Creating charts...")

for metric in metrics:
    # Explicit figure/axes objects keep each chart independent of pyplot's
    # implicit "current figure" state.
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.bar(summary_df["model"], summary_df[metric])
    ax.set_title(f"Comparison of {metric.upper()}")
    ax.set_xlabel("Model")
    ax.set_ylabel(metric)
    fig.tight_layout()

    chart_path = os.path.join(OUTPUT_DIR, f"{metric}_comparison.png")
    fig.savefig(chart_path)
    # Close the figure so it is released before the next chart is drawn.
    plt.close(fig)

    print(f"Chart saved: {chart_path}")

print("\n🎉 Notebook_E complete!")
print(f"All evaluation outputs stored in:\n{OUTPUT_DIR}")
