# Step 0: Setup — Imports, API key, and CSV loading

### Step 0.1: Libraries

In [None]:
# Standard Libraries
import os
import re
import pandas as pd
from pathlib import Path
import time

# API Libraries
from dotenv import load_dotenv

# LLM Libraries
import requests
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


### Step 0.2: Loading API keys

In [63]:
# Bring the variables defined in the .env file into the environment
load_dotenv()

# One API key per supported provider (each is None when its variable is unset)
DEEPSEEK_KEY = os.environ.get("DEEPSEEK_API_KEY")  # DeepSeek API key
GEMINI_KEY = os.environ.get("GEMINI_API_KEY")      # Gemini API key

### Step 0.3: Loading datasets

In [3]:
# Read the persona onboarding data
onboarding_df = pd.read_csv("../data/Onboarding_QnA.csv")

# Read the training Q&A file of each persona (Sarah, Leo, Urja — in that order)
sarah_train, leo_train, urja_train = (
    pd.read_csv(f"../data/{stem}_QnA.csv") for stem in ("Sarah", "Leo", "Urja")
)

# Quick sanity check: report the shape of every loaded dataset
datasets = dict(zip(
    ("Onboarding", "Sarah training", "Leo training", "Urja training"),
    (onboarding_df, sarah_train, leo_train, urja_train),
))
for label, frame in datasets.items():
    n_rows, n_cols = frame.shape
    print(f"{label:15}: {n_rows} rows, {n_cols} columns")

Onboarding     : 4 rows, 48 columns
Sarah training : 90 rows, 3 columns
Leo training   : 90 rows, 3 columns
Urja training  : 90 rows, 3 columns


# Step 1: Helper Functions (re-used)

The functions `get_persona_row()`, `build_persona_text()`, `build_examples_blob()`, and `trim_to_char_budget()` are reused directly from the `llm_evaluation.ipynb` notebook without modification. They handle persona loading, training Q&A formatting, and trimming prompts to a character limit. For detailed inline explanations, please refer to Sections 1 & 2 of the evaluation script (the step numbers in the inline comments refer to the sections the functions come from).

In [65]:
# Step 1.1: Fetching a persona’s onboarding data by name
def get_persona_row(name: str) -> pd.Series:
    """Return the first row of ``onboarding_df`` whose ``Persona`` matches *name*.

    The comparison ignores surrounding whitespace and letter case.

    Raises:
        ValueError: If no row matches.
    """
    persona_keys = onboarding_df["Persona"].astype(str).str.strip().str.casefold()
    wanted = name.strip().casefold()
    matches = onboarding_df.loc[persona_keys == wanted]
    if matches.empty:
        raise ValueError(f"Persona '{name}' not found in onboarding_df['Persona']")
    return matches.iloc[0]

# Step 1.2: Building a persona text block from onboarding data
def build_persona_text(row: pd.Series) -> str:
    """Render *row* as newline-separated ``column: value`` lines.

    Columns are visited in ``onboarding_df`` column order. A column is left
    out when its stringified, stripped value is empty or reads as "nan"
    (in any letter case).
    """
    rendered = ((col, str(row.get(col, "")).strip()) for col in onboarding_df.columns)
    return "\n".join(
        f"{col}: {text}" for col, text in rendered if text and text.lower() != "nan"
    )

# Step 2.1: Building an examples blob for a specific setting
def build_examples_blob(train_df: pd.DataFrame, setting: str) -> str:
    """Format the Q&A pairs of one setting as a bulleted examples block.

    Rows are selected by comparing the ``Setting`` column to *setting* with
    whitespace and letter case ignored; rows missing a ``Question`` or an
    ``Answer`` are dropped. The header line echoes *setting* as given and
    reports how many pairs follow.
    """
    setting_keys = train_df["Setting"].astype(str).str.strip().str.casefold()
    wanted = setting.strip().casefold()
    rows = train_df.loc[setting_keys == wanted].dropna(subset=["Question", "Answer"])

    entries = []
    for asked, answered in zip(rows["Question"], rows["Answer"]):
        entries.append(f"- Q: {asked}\n  A: {answered}")

    header = f"Examples of past answers (Setting: {setting}, count={len(entries)}):\n"
    return header + "\n".join(entries)

# Step 2.2 : Trimming text to fit within character budget
def trim_to_char_budget(text: str, max_chars: int = 60000) -> str:
    """Cut *text* at a line boundary once it exceeds *max_chars* characters.

    Text that already fits is returned unchanged. Otherwise whole lines are
    kept from the top for as long as their running length (each line counted
    with one newline) stays within *max_chars*, and a trim marker is appended.
    The marker is added on top of the kept lines, so the returned string can
    be slightly longer than *max_chars*.
    """
    if len(text) <= max_chars:
        return text

    kept = []
    used = 0
    for line in text.splitlines():
        used += len(line) + 1  # +1 accounts for the newline that joins lines
        if used > max_chars:
            break
        kept.append(line)

    return "\n".join(kept + ["\n...[trimmed due to context budget]..."])

# Step 2: Helper Functions (prototype-specific)

### Step 2.1: Allowed values for settings and providers

Define fixed sets of valid settings and LLM providers. These are used for validation so the prototype only accepts supported options.

In [66]:
# Conversation settings the prototype accepts; run_prototype() checks the
# `setting` argument against this set with an exact (case-sensitive) match.
ALLOWED_SETTINGS = {"Family & Friends", "Medical", "Work"}
# LLM providers the prototype accepts; run_prototype() lower-cases and strips
# the `provider` argument before checking it against this set.
ALLOWED_PROVIDERS = {"deepseek", "gemini"}

### Step 2.2: Map persona name to their training dataset

The function `get_train_df_for_persona()` selects the correct training Q&A DataFrame based on the persona name. This helps minimize the number of variables passed to the final prototype function.

In [67]:
def get_train_df_for_persona(persona_name: str) -> pd.DataFrame:
    """Return the training Q&A DataFrame that belongs to *persona_name*.

    The name is matched ignoring surrounding whitespace and letter case, the
    same way get_persona_row() matches the onboarding data, so a name accepted
    by one lookup is not rejected by the other.

    Raises:
        ValueError: If *persona_name* is not one of the known personas.
    """
    # Map known persona names to their corresponding training datasets
    mapping = {
        "Sarah Ahmed": sarah_train,
        "Leonardo Carrey": leo_train,
        "Urja Mir": urja_train,
    }

    # Normalise the requested name before comparing it with the known names
    wanted = str(persona_name).strip().casefold()
    for known_name, frame in mapping.items():
        if known_name.casefold() == wanted:
            return frame

    # No known persona matched: raise a clear error listing the valid choices
    raise ValueError(f"Unknown persona '{persona_name}'. Choose from: {list(mapping.keys())}")

### Step 2.3: Prototype-specific initial prompt builder

The function `build_full_user_prompt_proto()` constructs the primary prompt consisting of setting, relevant Q&A examples, question, sentiment, and number of responses for the LLM to use for response generation.

In [68]:
def build_full_user_prompt_proto(setting: str, question: str,
                                 examples_blob: str, num_responses: int,
                                 sentiment: str) -> str:
    """Assemble the user prompt sent to the LLM.

    The prompt consists of four sections separated by blank lines: the
    conversation setting, the examples block (trimmed to 60000 characters),
    the new question, and the output instructions (required sentiment and
    number of numbered reply options).
    """
    # Keep the examples section within a safe character budget
    trimmed_examples = trim_to_char_budget(examples_blob, 60000)

    instructions = (
        f"Use a HARD '{sentiment}' sentiment in every option.\n"
        f"Provide exactly {num_responses} concise reply option(s), labelled 1, 2, 3... (no extra text)."
    )
    sections = [
        f"Conversation setting: {setting}",
        f"{trimmed_examples}",
        f"New question: {question}",
        instructions,
    ]
    return "\n\n".join(sections)

### Step 2.4: Flexible option splitter

The function `split_numbered_options()` parses the LLM output text into exactly *n* clear options (*n* is clamped to 1–10). It extends the basic splitter we used in evaluation to allow flexible parsing of *n* responses.

In [69]:
# Regex pattern for a whole line that starts with a number or bullet marker
_num_pat = re.compile(r"^\s*(?:\d+[\).\-:]|\-\s*|\•\s*)\s*(.+?)\s*$")

# Regex pattern for an inline option marker ("1." or "1)", numbers 1-10) that is
# preceded by start-of-text/whitespace and followed by whitespace. Requiring the
# trailing whitespace keeps decimals such as "7.5" from being treated as markers.
_inline_marker_pat = re.compile(r"(?:^|\s)(?:10|[1-9])[.)]\s+")


def split_numbered_options(text: str, n: int):
    """Split raw LLM output into exactly *n* option strings.

    Args:
        text: Raw completion text, ideally one numbered/bulleted option per line.
        n: Number of options wanted; clamped to the supported range 1-10.

    Returns:
        A list of exactly *n* strings. Surplus options are dropped and missing
        ones are padded with "". Non-string or blank input yields *n* empty
        strings.
    """
    # Ensure the number of responses is within the supported range (1–10)
    n = max(1, min(10, int(n)))  # TODO: adjust "min([], int(n))" for higher/lower limits

    # Handle invalid or empty input early
    if not isinstance(text, str) or not text.strip():
        return [""] * n

    # First pass: one option per non-blank line, with its leading marker removed
    opts = []
    for raw_line in text.strip().splitlines():
        line = raw_line.strip()
        if not line:
            continue
        m = _num_pat.match(line)
        opts.append(m.group(1).strip() if m else line)

    # Fallback: the model may have put several options on one line
    # ("1) a 2) b 3) c"). Split on inline markers and use that result only when
    # it recovers more options than the line-wise pass; mixing both lists would
    # put the unsplit line ahead of the options it contains.
    if len(opts) < n:
        chunks = [c.strip() for c in _inline_marker_pat.split(text) if c.strip()]
        if len(chunks) > len(opts):
            opts = chunks

    # Normalise to exactly n options (truncate or pad with empties)
    return (opts + [""] * n)[:n]

# Step 3: Prototype Runner

In [None]:
def run_prototype(question: str,
                  provider: str = "gemini",
                  num_responses: int = 3,
                  sentiment: str = "Neutral",
                  persona_name: str = "Sarah Ahmed",
                  setting: str = "Family & Friends",
                  temperature: float = 1,
                  gemini_model: str = "gemini-2.5-flash",
                  deepseek_model: str = "deepseek-chat",
                  evaluate: bool = False,
                  ):
    """Generate persona-styled reply options for *question* with one LLM call.

    Builds the persona context and the setting-specific examples, sends a
    single request to the chosen provider, splits the raw completion into
    ``num_responses`` options and, unless ``evaluate`` is true, prints them.

    Args:
        question: The new question the persona should answer.
        provider: "deepseek" or "gemini" (case and surrounding whitespace are ignored).
        num_responses: Number of reply options to request (1-10).
        sentiment: Tone requested in both the system and the user prompt.
        persona_name: Persona to look up in the onboarding and training data.
        setting: One of ``ALLOWED_SETTINGS`` (exact match).
        temperature: Sampling temperature forwarded to the selected provider.
        gemini_model: Gemini model name, used when ``provider`` is "gemini".
        deepseek_model: DeepSeek model name, used when ``provider`` is "deepseek".
        evaluate: When true, nothing is printed (used by the timing cells).

    Returns:
        The parsed options: a list of ``num_responses`` strings, padded with
        "" when the completion contained fewer options.

    Raises:
        ValueError: On an unsupported provider or setting, an out-of-range
            ``num_responses``, an unknown persona, or a missing API key.
        requests.HTTPError: If the DeepSeek request returns an error status.
    """
    # (1) Validate provider
    provider = provider.lower().strip()
    if provider not in ALLOWED_PROVIDERS:
        raise ValueError(f"provider must be one of {ALLOWED_PROVIDERS}")

    # (2) Validate setting
    if setting not in ALLOWED_SETTINGS:
        raise ValueError(f"setting must be one of {ALLOWED_SETTINGS} (got '{setting}')")

    # (3) Validate number of responses
    num_responses = int(num_responses)
    if not (1 <= num_responses <= 10):                              # TODO: adjust "(1 <= num_responses <= [])" for higher/lower limits
        raise ValueError("num_responses must be between 1 and 10")  # TODO: adjust "num_responses must be between 1 and []" for higher/lower limits

    # (4) Persona context setup
    persona_row = get_persona_row(persona_name)             # Persona row from onboarding_df
    persona_text = build_persona_text(persona_row)          # Formatting persona_row into text block
    train_df = get_train_df_for_persona(persona_name)       # Load persona specific Q&A training data
    examples_blob = build_examples_blob(train_df, setting)  # Filter examples for the given setting & persona

    # (5) User prompt
    user_prompt = build_full_user_prompt_proto(setting, question, examples_blob,
                                               num_responses, sentiment)

    # (6) System prompt
    #### ---------------------------------------------- EDIT FOR EVALUATION ---------------------------------------------- ####
    system_prompt = (
            f"Task: Generate {num_responses} responses as {persona_name}.\n\n"
            f"Context:\n{persona_text}\n\n"
            "Requirements:\n"
            "- First person\n"
            "- <30 words\n"
            f"- {sentiment} tone\n"
            "- Format: 1) response 2) response 3) response ... n) response\n"
    )
    #### ------------------------------------------------------------------------------------------------------------------ ####

    # (7) Single API call (branched by LLM provider)
    # DEEPSEEK #
    if provider == "deepseek":

        # Ensure the DeepSeek API key is set
        if not DEEPSEEK_KEY:
            raise ValueError("❌ DEEPSEEK_API_KEY not found in .env")

        # DeepSeek: role-based chat completion
        url = "https://api.deepseek.com/v1/chat/completions"  # Endpoint for DeepSeek's chat-completion API
        headers = {
            "Authorization": f"Bearer {DEEPSEEK_KEY}",        # API key auth
            "Content-Type": "application/json",               # Content type
        }

        # Prepare the payload with system and user prompts
        payload = {
            #### ------------------------------------------ EDIT FOR EVALUATION ------------------------------------------ ####
            "model": deepseek_model,                           # Specific DeepSeek model
            #### ---------------------------------------------------------------------------------------------------------- ####
            "messages": [
                {"role": "system", "content": system_prompt},  # System prompt
                {"role": "user", "content": user_prompt},      # User prompt
            ],
            "temperature": temperature,  # Temperature controls randomness
            "max_tokens": 6000,          # Limit response length
        }
        resp = requests.post(url, headers=headers, json=payload, timeout=180)  # Send the POST request to DeepSeek API
        resp.raise_for_status()                                                # Raise an error if the request failed
        raw_text = resp.json()["choices"][0]["message"]["content"]             # Extract the generated content from the response

    # GEMINI #
    else:

        # Ensure the Gemini API key is set
        if not GEMINI_KEY:
            raise ValueError("❌ GEMINI_API_KEY not found in .env")

        # Gemini: configure client and send combined prompt
        genai.configure(api_key=GEMINI_KEY)                     # Configure the Gemini client with the API key
        #### ---------------------------------------------- EDIT FOR EVALUATION ---------------------------------------------- ####
        model = genai.GenerativeModel(model_name=gemini_model)  # Select the Gemini model
        #### ------------------------------------------------------------------------------------------------------------------ ####
        prompt = f"{system_prompt}\n\n{user_prompt}"            # Build the full prompt by concatenating system + user parts
        # Forward the temperature so the argument has the same effect for both providers
        response = model.generate_content(
            prompt,
            generation_config={"temperature": temperature},
        )
        raw_text = response.text if hasattr(response, "text") else str(response)  # Extract the generated text output.

    # (8) Parse results into N options
    options = split_numbered_options(raw_text, num_responses)

    # (9) Print results (skipped in evaluation mode so timing loops stay quiet)
    if not evaluate:
        print("=" * 100)
        print(f"Persona: {persona_name}  |  Provider: {provider}")
        print(f"Sentiment: {sentiment}  |  Responses: {num_responses}  |  Setting: {setting}")
        print("-" * 100)
        print(f"QUESTION: '{question}'")
        print("-" * 100)
        for i, opt in enumerate(options, 1):
            print(f"{i}. {opt}")
        print("=" * 100)

    # (10) Hand the parsed options back so callers can use them programmatically
    return options

# Step 4: Run the Prototype

In [35]:
# Time one end-to-end prototype call and print its options.
# Only keyword arguments that run_prototype() defines are passed; it has no
# `stm_prompt` parameter, and an unknown keyword raises TypeError.
start = time.time()
run_prototype(
    question="Do you feel well-rested after sleeping last night?",
    provider="deepseek",
    num_responses=5,
    sentiment="positive",
    persona_name="Sarah Ahmed",
    setting="Medical",
)
end = time.time()

print("_" * 30)
print(f" EXECUTION TIME: {end - start:.2f} seconds")
print("_" * 30)

Persona: Sarah Ahmed  |  Provider: deepseek
Sentiment: positive  |  Responses: 5  |  Setting: Medical
----------------------------------------------------------------------------------------------------
QUESTION: 'Do you feel well-rested after sleeping last night?'
----------------------------------------------------------------------------------------------------
1. Alhamdulillah, I slept wonderfully last night and feel refreshed and ready for the day! 🌞
2. Yes, I had a very restful night - feeling energized and grateful for good sleep.
3. Absolutely! Slept like a baby and woke up feeling peaceful and well-rested. 😊
4. Definitely well-rested - my morning prayers felt extra meaningful after such good rest.
5. I feel fantastic! Great sleep last night has me in a wonderful mood today. YOLO!
______________________________
 EXECUTION TIME: 9.54 seconds
______________________________


# Step 5: Time Evaluation

In [61]:
# Parameter grids swept by the timing cells of Step 5.

# DeepSeek model names, passed as `deepseek_model`
deepseek_mdl = ["deepseek-chat", "deepseek-reasoner"]
# Gemini model names, passed as `gemini_model`.
# NOTE(review): the recorded output of the Gemini model cell lists four models,
# while this list holds two — presumably it was edited between runs.
gemini_mdl = ["gemini-2.5-flash-lite", "gemini-2.5-flash"]
# Prompt variant numbers, passed as `stm_prompt` by the prompt-complexity cell
prompt = [1, 2, 3]
# Option counts, passed as `num_responses`
number_responses = [1, 3, 5, 10]

### Step 5.1: Number of responses (DeepSeek)

Evaluating how the number of responses generated affects output time.

In [37]:
# Average the wall-clock time of 10 DeepSeek calls for each option count
for n_options in number_responses:
    durations = []
    for _ in range(10):
        began = time.time()
        run_prototype(
            question="Do you feel well-rested after sleeping last night?",
            provider="deepseek",
            num_responses=n_options,
            sentiment="positive",
            persona_name="Sarah Ahmed",
            setting="Medical",
            evaluate=True,  # suppress printing while timing
        )
        durations.append(time.time() - began)

    mean_secs = sum(durations) / len(durations)  # average runtime

    print(f"N-Responses ={n_options} | AVERAGE EXECUTION TIME (10 runs): {mean_secs:.2f} seconds")
    print("_" * 100)

N-Responses =1 | AVERAGE EXECUTION TIME (10 runs): 5.89 seconds
____________________________________________________________________________________________________
N-Responses =3 | AVERAGE EXECUTION TIME (10 runs): 8.58 seconds
____________________________________________________________________________________________________
N-Responses =5 | AVERAGE EXECUTION TIME (10 runs): 9.88 seconds
____________________________________________________________________________________________________
N-Responses =10 | AVERAGE EXECUTION TIME (10 runs): 14.05 seconds
____________________________________________________________________________________________________


### 5.2: Prompt Complexity

Evaluating how prompt complexity affects output time.

In [38]:
# Average the wall-clock time of 10 DeepSeek calls for each prompt variant number.
# NOTE(review): run_prototype() as defined in this notebook has no `stm_prompt`
# parameter (and accepts no **kwargs), so this cell raises TypeError when run
# against it. The recorded timings presumably come from an earlier version of
# the function that selected a system-prompt variant by number — confirm and
# restore that parameter before re-running.
for i in prompt:
    times = []  # per-run durations in seconds for this prompt variant
    for _ in range(10):
        start = time.time()
        run_prototype(
            question="Do you feel well-rested after sleeping last night?",
            provider="deepseek",
            num_responses=5,
            sentiment="positive",
            persona_name="Sarah Ahmed",
            setting="Medical",
            stm_prompt=i,
            evaluate=True
        )
        end = time.time()
        times.append(end - start)

    avg_time = sum(times) / len(times)  # average runtime

    print(f"Prompt ={i} | AVERAGE EXECUTION TIME (10 runs): {avg_time:.2f} seconds")
    print("_" * 100)

Prompt =1 | AVERAGE EXECUTION TIME (10 runs): 9.62 seconds
____________________________________________________________________________________________________
Prompt =2 | AVERAGE EXECUTION TIME (10 runs): 9.76 seconds
____________________________________________________________________________________________________
Prompt =3 | AVERAGE EXECUTION TIME (10 runs): 9.94 seconds
____________________________________________________________________________________________________


### 5.3: DeepSeek Model

Evaluating how the DeepSeek model affects output time.

In [48]:
# Average the wall-clock time of 10 calls for each DeepSeek model
for model_name in deepseek_mdl:
    durations = []
    for _ in range(10):
        began = time.time()
        run_prototype(
            question="Do you feel well-rested after sleeping last night?",
            provider="deepseek",
            num_responses=5,
            sentiment="positive",
            persona_name="Sarah Ahmed",
            setting="Medical",
            deepseek_model=model_name,
            evaluate=True,  # suppress printing while timing
        )
        durations.append(time.time() - began)

    mean_secs = sum(durations) / len(durations)  # average runtime

    print(f"Model ={model_name} | AVERAGE EXECUTION TIME (10 runs): {mean_secs:.2f} seconds")
    print("_" * 100)

Model =deepseek-chat | AVERAGE EXECUTION TIME (10 runs): 9.56 seconds
____________________________________________________________________________________________________
Model =deepseek-reasoner | AVERAGE EXECUTION TIME (10 runs): 19.60 seconds
____________________________________________________________________________________________________


### Step 5.4: Gemini Model

Evaluating how the Gemini model affects output time.

In [59]:
# Average the wall-clock time of 10 calls for each Gemini model
for model_name in gemini_mdl:
    durations = []
    for _ in range(10):
        began = time.time()
        run_prototype(
            question="Do you feel well-rested after sleeping last night?",
            provider="gemini",
            num_responses=5,
            sentiment="positive",
            persona_name="Sarah Ahmed",
            setting="Medical",
            gemini_model=model_name,
            evaluate=True,  # suppress printing while timing
        )
        durations.append(time.time() - began)

    mean_secs = sum(durations) / len(durations)  # average runtime

    print(f"Model ={model_name} | AVERAGE EXECUTION TIME (10 runs): {mean_secs:.2f} seconds")
    print("_" * 100)

Model =gemini-2.5-flash-lite | AVERAGE EXECUTION TIME (10 runs): 1.07 seconds
____________________________________________________________________________________________________
Model =gemini-2.5-flash | AVERAGE EXECUTION TIME (10 runs): 7.16 seconds
____________________________________________________________________________________________________
Model =gemini-2.0-flash-lite | AVERAGE EXECUTION TIME (10 runs): 1.02 seconds
____________________________________________________________________________________________________
Model =gemini-2.0-flash | AVERAGE EXECUTION TIME (10 runs): 1.07 seconds
____________________________________________________________________________________________________


### 5.5: Number of responses (Gemini)

Evaluating how the number of responses generated affects output time.

In [71]:
# Average the wall-clock time of 10 Gemini calls for each option count
for n_options in number_responses:
    durations = []
    for _ in range(10):
        began = time.time()
        run_prototype(
            question="Do you feel well-rested after sleeping last night?",
            provider="gemini",
            num_responses=n_options,
            sentiment="positive",
            persona_name="Sarah Ahmed",
            setting="Medical",
            evaluate=True,  # suppress printing while timing
        )
        durations.append(time.time() - began)

    mean_secs = sum(durations) / len(durations)  # average runtime

    print(f"N-Responses ={n_options} | AVERAGE EXECUTION TIME (10 runs): {mean_secs:.2f} seconds")
    print("_" * 100)

N-Responses =1 | AVERAGE EXECUTION TIME (10 runs): 7.17 seconds
____________________________________________________________________________________________________
N-Responses =3 | AVERAGE EXECUTION TIME (10 runs): 6.86 seconds
____________________________________________________________________________________________________
N-Responses =5 | AVERAGE EXECUTION TIME (10 runs): 7.77 seconds
____________________________________________________________________________________________________
N-Responses =10 | AVERAGE EXECUTION TIME (10 runs): 7.20 seconds
____________________________________________________________________________________________________


I am not evaluating the prompt for Gemini, as it showed no effect with DeepSeek and is therefore unlikely to have any with Gemini either.