# Step 0: Setup — Imports, API key, and CSV loading

### Step 0.1: Libraries

In [103]:
# Standard Libraries
import os
import re
import time
from functools import lru_cache
from pathlib import Path
from typing import Tuple

# Third-Party Libraries
import numpy as np
import pandas as pd
import textstat
from sentence_transformers import SentenceTransformer

# API Libraries
from dotenv import load_dotenv

# LLM Libraries
import requests
import google.generativeai as genai

### Step 0.2: Loading API keys

In [3]:
# Load environment variables from the .env file
load_dotenv()

# Read API keys for both providers.
# os.getenv returns None when a variable is unset; nothing is validated here.
# run_batched_test_for_persona raises ValueError for the selected provider
# when its key is missing.
DEEPSEEK_KEY = os.getenv("DEEPSEEK_API_KEY")   # DeepSeek API key
GEMINI_KEY   = os.getenv("GEMINI_API_KEY")     # Gemini API key

### Step 0.3: Loading datasets

In [102]:
# Load persona onboarding data (one row per persona; looked up via its "Persona" column)
# NOTE: all paths are relative, so they resolve against the current working directory.
onboarding_df = pd.read_csv("../data/Onboarding_QnA.csv")

# Load training Q&A datasets for each persona
# (later code reads their "Setting", "Question" and "Answer" columns)
sarah_train = pd.read_csv("../data/Sarah_QnA.csv")
leo_train   = pd.read_csv("../data/Leo_QnA.csv")
urja_train  = pd.read_csv("../data/Urja_QnA.csv")

# Load test questions to evaluate the model outputs
# (later code reads their "Setting" and "Question" columns)
test_df = pd.read_csv("../data/Test_Qs.csv")

# Print a quick check on all dataset shapes
datasets = {
    "Onboarding": onboarding_df,
    "Sarah training": sarah_train,
    "Leo training": leo_train,
    "Urja training": urja_train,
    "Test questions": test_df,
}
# One summary line per dataset; the label is padded to 15 characters so the counts align
for name, df in datasets.items():
    print(f"{name:15}: {len(df)} rows, {len(df.columns)} columns")

Onboarding     : 4 rows, 48 columns
Sarah training : 90 rows, 3 columns
Leo training   : 90 rows, 3 columns
Urja training  : 90 rows, 3 columns
Test questions : 30 rows, 2 columns


# Step 1: Extracting and formatting persona onboarding data

### Step 1.1: Fetching a persona’s onboarding data by name

The function `get_persona_row()` looks up a persona in the onboarding dataset by name and returns that persona's row for use in Step 1.2. It also verifies that the persona exists in the file.

*P.S. Error checks like this are common throughout the file; they help us know exactly where to look when the code fails, without needing to debug.*

In [5]:
def get_persona_row(name: str) -> pd.Series:
    """Return the onboarding row for the persona called ``name``.

    The lookup compares the "Persona" column of ``onboarding_df`` against
    ``name`` after stripping surrounding whitespace and case-folding both sides.

    Raises:
        ValueError: if no row in ``onboarding_df`` matches ``name``.
    """
    # Normalise the stored persona names, then the requested name the same way
    known_names = onboarding_df["Persona"].astype(str).str.strip().str.casefold()
    wanted = name.strip().casefold()
    matches = onboarding_df.loc[known_names == wanted]

    # Fail loudly with the requested name so a typo is easy to spot
    if len(matches) == 0:
        raise ValueError(f"Persona '{name}' not found in onboarding_df['Persona']")

    # Duplicate names are tolerated: the first matching row wins
    return matches.iloc[0]

### Step 1.2: Building a persona text block from onboarding data

The function `build_persona_text()` takes a single persona's onboarding info and converts it into a structured text block. This provides a clear persona description for the LLM to use.

In [6]:
def build_persona_text(row: pd.Series) -> str:
    """Format one persona's onboarding row as "Column: value" lines.

    The row's own index supplies the field names, so the function works on any
    Series and does not depend on the global onboarding DataFrame.

    Args:
        row: A single onboarding record, indexed by column name.

    Returns:
        One "ColumnName: Value" line per populated field, joined with newlines.
        Missing values (None/NaN/NA), blank strings and the literal text "nan"
        are left out.
    """
    # Collect formatted "column: value" pairs
    lines = []

    # Walk the fields in the row's own order
    for col, raw in row.items():
        # Skip real missing values before str() turns them into "None"/"nan"/"<NA>"
        if pd.api.types.is_scalar(raw) and pd.isna(raw):
            continue

        val = str(raw).strip()  # Clean whitespace

        # Only keep non-empty values; "nan" text is treated as missing as well
        if val and val.lower() != "nan":
            lines.append(f"{col}: {val}")    # Format as "ColumnName: Value"

    # Join all entries into a multi-line string
    return "\n".join(lines)

# Step 2: Preparing contextual examples and constructing user prompts

### Step 2.1: Building an examples blob for a specific setting

The function `build_examples_blob()` filters a persona's training Q&A by setting, then formats the matching Q&A pairs into an examples block for the LLM to use as training examples.

In [7]:
def build_examples_blob(train_df: pd.DataFrame, setting: str) -> str:
    """Render the training Q&A pairs for one setting as a bulleted examples block.

    Rows are selected by comparing the "Setting" column to ``setting`` with
    whitespace stripped and case folded; rows lacking a question or an answer
    are discarded. The result starts with a header line naming the setting and
    the number of examples, followed by one "- Q: ... / A: ..." entry per pair.
    """
    # Normalise both sides of the comparison the same way
    normalized_settings = train_df["Setting"].astype(str).str.strip().str.casefold()
    wanted = setting.strip().casefold()

    # Keep only complete Q&A rows for the requested setting
    matching = train_df[normalized_settings == wanted].dropna(subset=["Question", "Answer"])

    # One "- Q: <question>\n  A: <answer>" entry per pair
    entries = []
    for question, answer in zip(matching["Question"], matching["Answer"]):
        entries.append(f"- Q: {question}\n  A: {answer}")

    header = f"Examples of past answers (Setting: {setting}, count={len(entries)}):\n"
    return header + "\n".join(entries)

### Step 2.2 : Trimming text to fit within character budget

The function `trim_to_char_budget()` ensures that long text does not exceed a character limit, which helps keep prompts within the LLM's context window. Our current data never reaches the limit, but a larger training/onboarding set or prompt could use this function with a specified character limit (i.e. it is a future-proofing function).

In [8]:
def trim_to_char_budget(text: str, max_chars: int = 60000) -> str:
    """Cut ``text`` down to at most ``max_chars`` characters on line boundaries.

    Text that already fits is returned unchanged. Otherwise whole lines are
    kept from the top while they fit, and a trailing marker is added so the
    reader can tell the text was shortened. Room for the marker is reserved
    up front, so the returned string never exceeds ``max_chars``.

    Args:
        text: The text to shorten.
        max_chars: Maximum length of the returned string.

    Returns:
        The original text, or the kept lines followed by the trim marker. When
        ``max_chars`` is too small to hold even the marker, a plain prefix of
        ``text`` is returned instead.
    """
    # If the text is already within the limit, return it as is
    if len(text) <= max_chars:
        return text

    marker = "\n...[trimmed due to context budget]..."

    # Each kept line costs len(line) + 1 (its newline), so the final string is
    # exactly `total + len(marker)` characters long; reserve the marker's share.
    line_budget = max_chars - len(marker)
    if line_budget < 0:
        # Not even the marker fits: fall back to a hard cut
        return text[:max(max_chars, 0)]

    # Buffer for kept lines + current total character count
    out, total = [], 0

    # Split the text into lines and iterate through them
    for ln in text.splitlines():

        # Line length (+1 for newline char)
        cost = len(ln) + 1

        # If adding the line exceeds the limit, stop adding more lines
        if total + cost > line_budget:
            break
        out.append(ln)
        total += cost

    # Always flag the truncation, even when no complete line could be kept
    out.append(marker)
    return "\n".join(out)

### Step 2.3: Composing the per-question user prompt

The function `build_full_user_prompt()` assembles the final prompt by combining the conversation setting, the relevant examples (training) blob, and clear instructions to return exactly 3 options. 3 options are used purely for evaluation purposes; the prototype allows selecting the number of options (e.g. 1-10).

In [9]:
def build_full_user_prompt(setting: str, question: str, examples_blob: str) -> str:
    """Assemble the prompt block for a single question.

    The block names the conversation setting, includes the examples (trimmed
    to a 60000-character budget), states the new question, and ends with the
    instruction to give exactly 3 labelled reply options.
    """
    # Keep the examples section within a safe character budget
    bounded_examples = trim_to_char_budget(examples_blob, 60000)

    # Ordered pieces of the prompt; concatenated without any extra separator
    segments = [
        f"Conversation setting: {setting}\n\n",
        f"{bounded_examples}\n\n",
        "Now answer the new question in the user's voice and preferences.\n",
        f"New question: {question}\n\n",
        "Provide exactly 3 concise reply options, labelled 1, 2, 3.",
    ]
    return "".join(segments)

# Step 3: Extracting and cleaning numbered reply options from model output

The function `split_numbered_options()` uses a regex pattern that recognises common numbering styles to split model outputs into 3 clean options.

*re Documentation: [LINK](https://docs.python.org/3/library/re.html)*

In [10]:
# Regex pattern to capture "numbered" or bullet-style options
_num_pat = re.compile(r"^\s*(?:\d+[\).\-:]|\-\s*|\•\s*)\s*(.+?)\s*$")

def split_numbered_options(text: str):
    """Extract exactly three reply options from a model response section.

    Strategy, in order:
      1. If at least three lines carry a number/bullet prefix, use those lines
         only, so a preamble such as "Here are the options:" is not mistaken
         for an option.
      2. Otherwise use every non-empty line, with any prefix removed.
      3. If that yields fewer than three options, try to split the whole text
         on inline markers ("1." / "2)" / "3:"), which covers replies written
         on a single line like "1) a 2) b 3) c".

    Args:
        text: Raw section text; non-string or blank input is accepted.

    Returns:
        A list of exactly three strings, padded with "" when fewer options
        were found and truncated when more were found.
    """
    # Handle invalid or empty input
    if not isinstance(text, str) or not text.strip():
        return ["", "", ""]

    # Split the text into lines and clean them
    lines = [ln.strip() for ln in text.strip().splitlines() if ln.strip()]

    # Match each line once and reuse the result below
    matches = [_num_pat.match(ln) for ln in lines]
    numbered = [m.group(1).strip() for m in matches if m]

    if len(numbered) >= 3:
        # Enough explicitly numbered/bulleted lines: ignore unnumbered chatter
        opts = numbered
    else:
        # Fall back to raw lines wherever the pattern did not match
        opts = [m.group(1).strip() if m else ln for m, ln in zip(matches, lines)]

    # If we have fewer than 3 options, try to split on inline numbering.
    # The (?!\d) guard keeps values such as "2.5" or "3:30" in one piece.
    if len(opts) < 3:
        chunks = re.split(r"(?:^|\s)[1-3][\).:](?!\d)\s*", text)
        chunks = [c.strip() for c in chunks if c.strip()]
        if len(chunks) == 3:
            opts = chunks

    # Return exactly 3 options, filling with empty strings if needed
    return (opts + ["", "", ""])[:3]

# Step 4: Running batched persona tests with DeepSeek or Gemini

### Step 4.1: Runner for the LLM calling

The function `run_batched_test_for_persona()` generates prompts for all test questions (30 here), sends them in a single batched request to the chosen LLM, parses the returned options, and saves the results into an Excel file. This is the core function of our pipeline.

In [None]:
def run_batched_test_for_persona(persona_text: str,                       # Chosen persona text
                                 train_df: pd.DataFrame,                  # Training data for the persona
                                 test_df: pd.DataFrame,                   # Test questions to evaluate the model outputs
                                 out_path: str,                           # Output path for the results
                                 provider: str = "deepseek",              # LLM provider to use ("deepseek" or "gemini")
                                 temperature: float = 1,                  # Temperature for DeepSeek defaults to 1
                                 gemini_model: str = "gemini-2.5-flash",  # Gemini model to use
                                 deepseek_model: str = "deepseek-chat",   # DeepSeek model to use
                                 persona_name: str = "Sarah Ahmed",       # Persona name for logging
                                 ):
    """Answer every test question in one batched LLM call and save the options to Excel.

    One prompt block is built per row of ``test_df`` (headed by a
    "### QID n | SETTING: ..." line), all blocks are sent to the chosen
    provider in a single request, and the reply is split back into one row per
    question holding three options.

    Args:
        persona_text: Persona description placed in the system prompt.
        train_df: Training Q&A with "Setting", "Question" and "Answer" columns.
        test_df: Test questions with "Setting" and "Question" columns.
        out_path: Target file; "_<provider>" is inserted before ".xlsx" (the
            extension is added when missing). Missing parent folders are created.
        provider: "deepseek" or "gemini" (case-insensitive).
        temperature: Sampling temperature; only sent in the DeepSeek payload.
        gemini_model: Model name used when provider is "gemini".
        deepseek_model: Model name used when provider is "deepseek".
        persona_name: Name inserted into the system prompt.

    Raises:
        ValueError: for an unknown provider or a missing API key.
        requests.HTTPError: when the DeepSeek request fails.
    """

    # (1) Normalize the provider input and validate
    provider = provider.lower().strip()
    if provider not in {"deepseek", "gemini"}:
        raise ValueError("provider must be 'deepseek' or 'gemini'")


    # (2) Build example blobs per setting from the training data
    example_blobs = {
        st: build_examples_blob(train_df, st)
        for st in sorted(test_df["Setting"].astype(str).str.strip().unique())
    }


    # (3) Construct blocks for each test question with QID (Question ID) headers
    sections_meta, user_blocks = [], []
    for i, r in test_df.reset_index(drop=True).iterrows():
        setting = str(r["Setting"]).strip()                                           # Setting
        question = str(r["Question"]).strip()                                         # Question
        blob = example_blobs.get(setting, "")                                         # Example blob
        per_q_block = build_full_user_prompt(setting, question, blob)                 # Build the user prompt block
        header = f"### QID {i+1} | SETTING: {setting}\n"                              # QID Header for the question block
        user_blocks.append(header + per_q_block)                                      # Append the full block to the user blocks
        sections_meta.append({"qid": i+1, "Setting": setting, "Question": question})  # Store data for each question

    # The number of questions drives the instructions and the final sanity check
    n_questions = len(sections_meta)

    # (4) Wrapper instructions: enforce structure across all QIDs
    wrapper = (
        "You will receive multiple question blocks, each preceded by a header:\n"
        "### QID n | SETTING: <name>\n\n"
        "For EVERY QID, generate exactly 3 numbered options in the user's voice.\n"
        f"Reply with {n_questions} sections in order (QID 1..{n_questions}), each section starting with the SAME header, "
        "followed by the three options below it.\n"
    )

    # Add the wrapper to the blocks, ensures the LLM knows to expect multiple blocks with headers
    combined_user_prompt = wrapper + "\n\n".join(user_blocks)

    # (5) System prompt
#### -------------------------------------------------- EDIT FOR EVALUATION -------------------------------------------------- ####
    system_prompt = (
            f"Task: Generate 3 responses as {persona_name}.\n\n"
            f"Context:\n{persona_text}\n\n"
            "Requirements:\n"
            "- First person\n"
            "- <30 words\n"
            "- Neutral tone\n"
            "- Format: 1) response 2) response 3) response\n"
    )
#### ------------------------------------------------------------------------------------------------------------------------- ####
    # (6) Single API call (branched by LLM provider)
    # DEEPSEEK #
    if provider == "deepseek":

        # Ensure the DeepSeek API key is set
        if not DEEPSEEK_KEY:
            raise ValueError("❌ DEEPSEEK_API_KEY not found in .env")

        # DeepSeek: role-based chat completion
        url = "https://api.deepseek.com/v1/chat/completions"  # Endpoint for DeepSeek's chat-completion API
        headers = {
            "Authorization": f"Bearer {DEEPSEEK_KEY}",        # API key auth
            "Content-Type": "application/json"                # Content type
        }

        # Prepare the payload with system and user prompts
        payload = {
#### -------------------------------------------------- EDIT FOR EVALUATION -------------------------------------------------- ####
            "model": deepseek_model,                               # Specific DeepSeek model name
#### ------------------------------------------------------------------------------------------------------------------------- ####
            "messages": [
                {"role": "system", "content": system_prompt},      # System prompt
                {"role": "user", "content": combined_user_prompt}  # User prompt (all questions)
            ],
            "temperature": temperature,  # Temperature controls randomness
            "max_tokens": 6000           # Limit response length
        }
        resp = requests.post(url, headers=headers, json=payload, timeout=180)  # Send the POST request to DeepSeek API
        resp.raise_for_status()                                                # Raise an error if the request failed
        big_text = resp.json()["choices"][0]["message"]["content"]             # Extract the generated content from the response

    # GEMINI #
    else:

        # Ensure the Gemini API key is set
        if not GEMINI_KEY:
            raise ValueError("❌ GEMINI_API_KEY not found in .env")

        # Gemini: configure client and send combined prompt
        genai.configure(api_key=GEMINI_KEY)                                       # Configure the Gemini client with the API key
#### -------------------------------------------------- EDIT FOR EVALUATION -------------------------------------------------- ####
        model = genai.GenerativeModel(model_name=gemini_model)                    # Select the Gemini model
#### ------------------------------------------------------------------------------------------------------------------------- ####
        prompt = f"{system_prompt}\n\n{combined_user_prompt}"                     # Build the full prompt by concatenating system + user parts
        response = model.generate_content(prompt)                                 # Send the prompt to the Gemini API
        big_text = response.text if hasattr(response, "text") else str(response)  # Extract the generated text output.


    # (7) Parse response by QID headers → map answers back to test questions
    # Split the full response into blocks using the exact header pattern
    parts = re.split(r"^###\s*QID\s+(\d+)\s*\|\s*SETTING:\s*(.+)$", big_text, flags=re.M)
    parsed_rows = []

    # We expect at least one triplet (qid, setting, section_text), requires length >= 4.
    if len(parts) >= 4:
        # Iterate the list in steps of 3 (1 = qid, 2 = setting, 3 = section_text)
        for idx in range(1, len(parts), 3):
            try:
                qid = int(parts[idx])                # Captured QID number
                section_text = parts[idx+2].strip()  # Section text

            # A malformed triplet is skipped rather than aborting the whole run
            except (ValueError, IndexError):
                continue

            # Ensure the QID is within range of our constructed metadata
            if 1 <= qid <= n_questions:
                meta = sections_meta[qid-1]                        # Map QID back to the original test question
                o1, o2, o3 = split_numbered_options(section_text)  # Extract exactly three numbered options from the section text

                # Append a structured row for saving later
                parsed_rows.append({
                    "Setting": meta["Setting"],
                    "Question": meta["Question"],
                    "Option_1": o1,
                    "Option_2": o2,
                    "Option_3": o3
                })


    # (8) Fallback parsing: if headers not respected (e.g. LLM ignored formatting instructions)
    # We attempt a simpler backup strategy, split the response into paragraphs
    if not parsed_rows:
        chunks = re.split(r"\n\s*\n+", big_text.strip())  # Split the text whenever there's a blank line

        # Iterate over each test question in order
        for i, meta in enumerate(sections_meta):
            raw = chunks[i] if i < len(chunks) else big_text  # If fewer chunks than questions, reuse the whole output as the "raw" block
            o1, o2, o3 = split_numbered_options(raw)          # Extract up to 3 numbered options from the chunk (using regex-based cleaner)

            # Append a structured row for saving later (even if imperfect)
            parsed_rows.append({
                "Setting": meta["Setting"],
                "Question": meta["Question"],
                "Option_1": o1,
                "Option_2": o2,
                "Option_3": o3
            })

    # Make a short or over-long parse visible instead of silently saving it
    if len(parsed_rows) != n_questions:
        print(f"[!] Parsed {len(parsed_rows)} response sections for {n_questions} questions; check the saved file.")

    # (9) Save results
    out_path = str(out_path)
    # Only strip the extension when it is really there (a blind [:-5] would eat part of the name)
    base = out_path[:-5] if out_path.lower().endswith(".xlsx") else out_path
    out_path = f"{base}_{provider}.xlsx"  # Auto-suffix filename by provider

    # Create the output folder on first use so the write below cannot fail on a missing directory
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    # Create a DataFrame from the parsed rows and save to Excel
    out_df = pd.DataFrame(parsed_rows)
    out_df.to_excel(out_path, index=False)
    print(f"[+_+] Saved batched {len(out_df)} responses: {out_path}")

### Step 4.2: Initialising personas

The block below retrieves each persona's onboarding data (row) and builds the formatted persona text used for prompts.

In [12]:
# Build the persona description text for each persona from its onboarding row.
# get_persona_row raises ValueError if a name is not present in onboarding_df.
sarah_persona_text = build_persona_text(get_persona_row("Sarah Ahmed"))
leo_persona_text   = build_persona_text(get_persona_row("Leonardo Carrey"))
urja_persona_text  = build_persona_text(get_persona_row("Urja Mir"))

# Quick check: report how many training rows are loaded for each persona
for name, df in [("Sarah", sarah_train), ("Leo", leo_train), ("Urja", urja_train)]:
    print(f"{name} Q&A: {len(df)} rows loaded.")

Sarah Q&A: 90 rows loaded.
Leo Q&A: 90 rows loaded.
Urja Q&A: 90 rows loaded.


# Step 5: Run batched evaluation for each persona

### Step 5.1: DeepSeek Batched Tests

The 3 blocks below run the batched evaluation using DeepSeek (one block per persona).

In [None]:
# SARAH (DeepSeek)
# temperature and deepseek_model are not passed, so the runner's defaults
# (1 and "deepseek-chat") apply. The runner appends "_deepseek" to the file
# name before the extension.
run_batched_test_for_persona(
    persona_text=sarah_persona_text,
    train_df=sarah_train,
    test_df=test_df,
    out_path="test_responses/Sarah_Test_Responses.xlsx",
    provider="deepseek",
    persona_name="Sarah Ahmed",
)

[+_+] Saved batched 30 responses: test_responses/Sarah_Test_Responses_deepseek.xlsx


In [None]:
# LEO (DeepSeek)
# temperature and deepseek_model are not passed, so the runner's defaults
# (1 and "deepseek-chat") apply. The runner appends "_deepseek" to the file
# name before the extension.
run_batched_test_for_persona(
    persona_text=leo_persona_text,
    train_df=leo_train,
    test_df=test_df,
    out_path="test_responses/Leo_Test_Responses.xlsx",
    provider="deepseek",
    persona_name="Leonardo Carrey",
)

[+_+] Saved batched 30 responses: test_responses/Leo_Test_Responses_deepseek.xlsx


In [None]:
# URJA (DeepSeek)
# temperature and deepseek_model are not passed, so the runner's defaults
# (1 and "deepseek-chat") apply. The runner appends "_deepseek" to the file
# name before the extension.
run_batched_test_for_persona(
    persona_text=urja_persona_text,
    train_df=urja_train,
    test_df=test_df,
    out_path="test_responses/Urja_Test_Responses.xlsx",
    provider="deepseek",
    persona_name="Urja Mir",
)

[+_+] Saved batched 30 responses: test_responses/Urja_Test_Responses_deepseek.xlsx


### Step 5.2: Gemini Batched Tests

The 3 blocks below run the batched evaluation using Gemini (one block per persona).

In [None]:
# SARAH (Gemini)
# gemini_model is not passed, so the runner's default ("gemini-2.5-flash")
# applies. Same out_path as the DeepSeek run; the runner appends "_gemini" to
# the file name, so the two result files do not overwrite each other.
run_batched_test_for_persona(
    persona_text=sarah_persona_text,
    train_df=sarah_train,
    test_df=test_df,
    out_path="test_responses/Sarah_Test_Responses.xlsx",
    provider="gemini",
    persona_name="Sarah Ahmed",
)

[+_+] Saved batched 30 responses: test_responses/Sarah_Test_Responses_gemini.xlsx


In [None]:
# LEO (Gemini)
# gemini_model is not passed, so the runner's default ("gemini-2.5-flash")
# applies. Same out_path as the DeepSeek run; the runner appends "_gemini" to
# the file name, so the two result files do not overwrite each other.
run_batched_test_for_persona(
    persona_text=leo_persona_text,
    train_df=leo_train,
    test_df=test_df,
    out_path="test_responses/Leo_Test_Responses.xlsx",
    provider="gemini",
    persona_name="Leonardo Carrey",
)

[+_+] Saved batched 30 responses: test_responses/Leo_Test_Responses_gemini.xlsx


In [None]:
# URJA (Gemini)
# gemini_model is not passed, so the runner's default ("gemini-2.5-flash")
# applies. Same out_path as the DeepSeek run; the runner appends "_gemini" to
# the file name, so the two result files do not overwrite each other.
run_batched_test_for_persona(
    persona_text=urja_persona_text,
    train_df=urja_train,
    test_df=test_df,
    out_path="test_responses/Urja_Test_Responses.xlsx",
    provider="gemini",
    persona_name="Urja Mir",
)

[+_+] Saved batched 30 responses: test_responses/Urja_Test_Responses_gemini.xlsx


# Step 6: Qualitative Evaluation

### 6.1: DeepSeek Model

Testing the different DeepSeek models available according to the [DeepSeek API docs](https://api-docs.deepseek.com/quick_start/pricing)

In [60]:
# Run the Sarah persona once per DeepSeek model and time each run.
# persona_name is not passed, so the runner's default ("Sarah Ahmed") applies.
for i in ["deepseek-chat", "deepseek-reasoner"]:
    start = time.time()  # wall-clock start of this model's run
    run_batched_test_for_persona(
        persona_text=sarah_persona_text,
        train_df=sarah_train,
        test_df=test_df,
        # The runner adds "_deepseek", giving "mdl-<model>_deepseek.xlsx",
        # the stem format that _meta_from_stem parses in Step 7.
        out_path=f"deepseekEval/mdl-{str(i)}.xlsx",
        provider="deepseek",
        deepseek_model=i
    )
    end = time.time()
    print(f"EXECUTION TIME: {end - start:.2f} seconds")
    print("_" * 80)  # visual separator between models

[+_+] Saved batched 30 responses: deepseekEval/mdl-deepseek-chat_deepseek.xlsx
EXECUTION TIME: 101.71 seconds
________________________________________________________________________________
[+_+] Saved batched 30 responses: deepseekEval/mdl-deepseek-reasoner_deepseek.xlsx
EXECUTION TIME: 112.74 seconds
________________________________________________________________________________


### 6.2: Gemini Model

Testing the different Gemini models available on the free tier according to the [Gemini API docs](https://ai.google.dev/gemini-api/docs/models) (pro doesn't seem to work)

In [46]:
# Run the Sarah persona once per Gemini model and time each run.
# persona_name is not passed, so the runner's default ("Sarah Ahmed") applies.
for i in ["gemini-2.5-flash-lite", "gemini-2.5-flash"]:
    start = time.time()  # wall-clock start of this model's run
    run_batched_test_for_persona(
        persona_text=sarah_persona_text,
        train_df=sarah_train,
        test_df=test_df,
        # The runner adds "_gemini", giving "mdl-<model>_gemini.xlsx",
        # the stem format that _meta_from_stem parses in Step 7.
        out_path=f"geminiEval/mdl-{str(i)}.xlsx",
        provider="gemini",
        gemini_model=i
    )
    end = time.time()
    print(f"EXECUTION TIME: {end - start:.2f} seconds")
    print("_" * 80)  # visual separator between models

[+_+] Saved batched 30 responses: geminiEval/mdl-gemini-2.5-flash-lite_gemini.xlsx
EXECUTION TIME: 6.62 seconds
________________________________________________________________________________
[+_+] Saved batched 30 responses: geminiEval/mdl-gemini-2.5-flash_gemini.xlsx
EXECUTION TIME: 25.71 seconds
________________________________________________________________________________


# Step 7: Quantitative Evaluation 

### Step 7.1: Helpers (embeddings + readability)

In [None]:
# Trim whitespace and coerce missing values (None/NaN) to empty string
def normalize_text(s: str) -> str:
    """Return ``s`` as a stripped string, mapping missing values to "".

    Cells read back from Excel are not always strings: empty cells arrive as
    NaN floats and purely numeric answers as numbers. Those used to crash on
    ``.strip()``; they are now converted instead.
    """
    if isinstance(s, str):
        return s.strip()
    # None, float NaN, pd.NA and NaT all count as "no text"
    if s is None or (pd.api.types.is_scalar(s) and pd.isna(s)):
        return ""
    return str(s).strip()

# Count alphabetic words (handles simple apostrophes)
def word_count(t: str) -> int:
    """Return the number of alphabetic words in ``t``.

    A word is a run of ASCII letters, optionally followed by one apostrophe
    and more letters (so "don't" counts once). Falsy input counts as zero words.
    """
    source = t if t else ""
    # Tally matches one by one rather than building the full list of words
    return sum(1 for _ in re.finditer(r"[A-Za-z]+(?:'[A-Za-z]+)?", source))

# Flesch Reading Ease via textstat (fallback to 0.0 on error)
def readability_score(t: str) -> float:
    """Return the Flesch Reading Ease of ``t``, or 0.0 when it cannot be computed.

    The failure path used to return the result of ``print`` (i.e. ``None``),
    which broke the ``-> float`` contract and the averaging done by callers.
    """
    try:
        return float(textstat.flesch_reading_ease(t or ""))
    except Exception:
        # Deliberate best effort: one unscorable text must not abort the evaluation
        print("Could not compute readability score")
        return 0.0

# MiniLM sentence embeddings (normalized so cosine == dot)
_emb = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Cached normalized embedding for a given text
@lru_cache(maxsize=4096)
def emb(text: str) -> np.ndarray:
    """Encode ``text`` with the shared MiniLM model, requesting normalized output.

    Results are memoised per distinct text (up to 4096 entries), so repeated
    calls with the same text return the cached object; treat it as read-only.
    Falsy input is encoded as the empty string.
    """
    source = text or ""
    return _emb.encode(source, normalize_embeddings=True)

# Cosine similarity with defensive normalization
def dot_cos(u: np.ndarray, v: np.ndarray) -> float:
    """Return the cosine similarity of ``u`` and ``v`` as a Python float.

    The dot product is divided by the product of both norms plus a small
    epsilon (1e-8), so zero vectors give 0.0 instead of a division error.
    """
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    scale = (norm_u * norm_v) + 1e-8
    cosine = np.dot(u, v) / scale
    return float(cosine)

### Step 7.2: Metrics & Runner

In [None]:
# compute (mean_similarity, diversity) across options; diversity = 1 - mean_similarity
def option_diversity(options: list[str]) -> Tuple[float, float]:
    """Return ``(mean_similarity, diversity)`` for a set of reply options.

    Options that are blank after normalisation are ignored. Every unordered
    pair of the remaining options is compared by cosine similarity of their
    embeddings; diversity is one minus the mean of those similarities. With
    fewer than two usable options the result is ``(1.0, 0.0)``.
    """
    # Embed only the options that contain text
    vectors = []
    for option in options:
        if normalize_text(option):
            vectors.append(emb(option))

    # Nothing to compare: report "identical" (no diversity)
    if len(vectors) < 2:
        return 1.0, 0.0

    # Similarity of each unordered pair, visited in (i, j > i) order
    pair_sims = []
    for position, left in enumerate(vectors):
        for right in vectors[position + 1:]:
            pair_sims.append(dot_cos(left, right))

    mean_sim = float(sum(pair_sims) / len(pair_sims))
    return mean_sim, float(1.0 - mean_sim)

# evaluate a single row: readability, average words, and diversity
def evaluate_row(row: pd.Series) -> dict:
    """Compute the per-question metrics for one result row.

    Reads "Option_1".."Option_3" from ``row`` and returns the mean Flesch
    Reading Ease and mean word count over the non-empty options (0.0 when all
    are empty), plus the embedding-based diversity and mean pairwise similarity.
    """
    options = [
        normalize_text(row.get(column, ""))
        for column in ("Option_1", "Option_2", "Option_3")
    ]
    filled = [option for option in options if option]

    # Per-option scores; an empty list falls back to a single 0.0 below
    readability_values = [readability_score(option) for option in filled]
    word_counts = [word_count(option) for option in filled]

    read = float(np.mean(readability_values or [0.0]))   # mean Flesch
    avg_words = float(np.mean(word_counts or [0.0]))     # mean word count
    mean_sim, diversity = option_diversity(options)      # embedding-based variety

    metrics = {}
    metrics["flesch_reading_ease"] = read
    metrics["avg_words"] = avg_words
    metrics["diversity"] = diversity                     # keep last in printed table
    metrics["mean_option_similarity"] = mean_sim         # diagnostic only
    return metrics

# parse model/provider from stem "mdl-{model}_{provider}"
def _meta_from_stem(stem: str) -> dict:
    """Split a result-file stem into its model and provider parts.

    The stem must look like "mdl-<model>_<provider>" with provider "gemini"
    or "deepseek" (any letter case; returned lower-cased). Anything else
    yields "unknown" for both fields.
    """
    match = re.match(r"^mdl-(.+)_(gemini|deepseek)$", stem, re.I)
    if match is None:
        return {"model": "unknown", "provider": "unknown"}

    model, provider = match.groups()
    return {"model": model, "provider": provider.lower()}

# print compact per-model mean metrics
def _print_model_summary(provider: str, model: str, df: pd.DataFrame):
    """Print a small metric/value table of column means for one model.

    Only the three headline metrics are shown; a metric whose column is
    absent from ``df`` is printed as NaN.
    """
    averages = df.mean(numeric_only=True)
    shown_metrics = ("flesch_reading_ease", "avg_words", "diversity")
    table = pd.DataFrame(
        [(metric, averages.get(metric, np.nan)) for metric in shown_metrics],
        columns=["metric", "value"],
    )
    print(f"\n=== {provider.upper()} | {model} ===")
    print(table.to_string(index=False))

# scan folders, evaluate files, print per-model summaries, return per-model means
def run_eval_from_folders(
    folders: tuple[str, ...] = ("geminiEval", "deepseekEval"),
    pattern: str = "mdl-*.xlsx",
) -> pd.DataFrame:
    """Evaluate every result workbook found in ``folders``.

    Each file matching ``pattern`` is read, scored row by row, and tagged with
    the provider/model parsed from its file name. A summary is printed per
    provider/model pair.

    Args:
        folders: Directories to scan (missing directories yield no files).
        pattern: Glob pattern for the result workbooks.

    Returns:
        One row per provider/model with the mean of each numeric metric,
        sorted by provider then model; an empty DataFrame when nothing could
        be evaluated.
    """
    paths = [p for folder in folders for p in sorted(Path(folder).glob(pattern))]  # collect files
    if not paths:
        print("No files found.")
        return pd.DataFrame()

    required = {"Setting", "Question", "Option_1", "Option_2", "Option_3"}

    recs = []
    for p in paths:
        try:
            df = pd.read_excel(p)

            # schema check: say which file was dropped and why instead of skipping silently
            missing = required - set(df.columns)
            if missing:
                print(f"!! Skipping {p.name}: missing columns {sorted(missing)}")
                continue

            out = pd.DataFrame([evaluate_row(r) for _, r in df.iterrows()])  # row metrics
            m = _meta_from_stem(p.stem)  # model/provider
            out["provider"] = m["provider"]
            out["model"] = m["model"]
            recs.append(out)
        except Exception as e:
            # Best effort: one unreadable workbook must not stop the others
            print(f"!! Skipping {p.name}: {e}")

    if not recs:
        print("No successful evaluations.")
        return pd.DataFrame()

    all_df = pd.concat(recs, ignore_index=True)  # stack all rows

    # Print provider/model summaries (no filenames)
    for (prov, mdl), grp in all_df.groupby(["provider", "model"]):
        _print_model_summary(prov, mdl, grp)

    # Return means per provider/model for downstream use
    return (
        all_df.groupby(["provider", "model"], as_index=False)  # per-model means
              .mean(numeric_only=True)
              .sort_values(["provider", "model"])
    )

In [None]:
# Run the quantitative evaluation over every result file produced in Step 6.
# Per-model summaries are printed; `results` holds the per-provider/model means.
results = run_eval_from_folders(
    folders=("geminiEval", "deepseekEval"),
    pattern="mdl-*.xlsx"
)


=== DEEPSEEK | deepseek-chat ===
             metric     value
flesch_reading_ease 64.365406
          avg_words 11.688889
          diversity       NaN

=== DEEPSEEK | deepseek-reasoner ===
             metric     value
flesch_reading_ease 60.568871
          avg_words 12.077778
          diversity       NaN

=== GEMINI | gemini-2.5-flash ===
             metric     value
flesch_reading_ease 69.449849
          avg_words 10.677778
          diversity       NaN

=== GEMINI | gemini-2.5-flash-lite ===
             metric     value
flesch_reading_ease 73.851537
          avg_words  9.066667
          diversity       NaN
