<a href="https://colab.research.google.com/github/nileshmp/AndroidGradleStarter/blob/master/udhyam_model_eval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cell 0.1: Dependencies & Setup

In [1]:
# Install the third-party packages that the cells below import
# (httpx, numpy, gspread_formatting, deepeval, tenacity).
!pip install httpx
!pip install numpy
!pip install gspread-formatting
!pip install deepeval
!pip install tenacity

Collecting gspread-formatting
  Downloading gspread_formatting-1.2.1-py2.py3-none-any.whl.metadata (13 kB)
Downloading gspread_formatting-1.2.1-py2.py3-none-any.whl (22 kB)
Installing collected packages: gspread-formatting
Successfully installed gspread-formatting-1.2.1
Collecting deepeval
  Downloading deepeval-3.4.1-py3-none-any.whl.metadata (17 kB)
Collecting anthropic (from deepeval)
  Downloading anthropic-0.64.0-py3-none-any.whl.metadata (27 kB)
Collecting ollama (from deepeval)
  Downloading ollama-0.5.3-py3-none-any.whl.metadata (4.3 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc<2.0.0,>=1.24.0 (from deepeval)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.36.0-py3-none-any.whl.metadata (2.4 kB)
Collecting portalocker (from deepeval)
  Downloading portalocker-3.2.0-py3-none-any.whl.metadata (8.7 kB)
Collecting posthog<7.0.0,>=6.3.0 (from deepeval)
  Downloading posthog-6.7.0-py3-none-any.whl.metadata (6.0 kB)
Collecting pyfiglet (from deepeval)
  Downloading pyfi

# Cell 0.2: Test OpenAI Responses API

In [3]:
import os
from dataclasses import dataclass
import hashlib
import tempfile
import time
import logging

from google.colab import auth, userdata
from google.auth import default
from googleapiclient.discovery import build
import gspread

import httpx
from openai import OpenAI


def authorize_google_client():
  """Authenticate the Colab user and return an authorized Sheets client.

  Returns:
    tuple: (gspread client, the default Google credentials it was built from)
  """
  auth.authenticate_user()
  credentials = default()[0]
  sheets_client = gspread.authorize(credentials)
  return sheets_client, credentials

# Authenticate once; `gc` is the gspread client and `creds` is reused by
# other Google API clients built later in the notebook.
gc, creds = authorize_google_client()


# Notebook configuration is read from Colab user secrets.
VECTOR_STORE_ID = userdata.get('VECTOR_STORE_ID')

SPREADSHEET_NAME = userdata.get('SPREADSHEET_NAME')

# Spreadsheet that holds the goldens, the config tab and the result tabs.
goldens_ss = gc.open(SPREADSHEET_NAME)

# The 'config' tab stores prompt text, versions and document links.
config_ws = goldens_ss.worksheet('config')

openai_api_key = userdata.get('OPENAI_API_KEY')
os.environ['OPENAI_API_KEY'] = openai_api_key # for deepeval
# Shared OpenAI client used by every cell below.
client = OpenAI(api_key=openai_api_key)

def get_instructions():
  """Read the system instructions and their version from the 'config' tab.

  Returns:
    tuple: (instructions_version, instructions_text), taken from cells
    B3 and B2 respectively.
  """
  config_tab = goldens_ss.worksheet('config')
  text = config_tab.acell('B2').value
  version = config_tab.acell('B3').value
  return version, text

@dataclass
class FileResultChunk:
    """One hit returned by a file-search tool call."""
    score: float  # score reported for the hit
    text: str  # text content of the hit

def get_file_search_results(response):
  """Extract the retrieved chunks from the first file-search call of a response.

  Args:
    response: response object whose ``output`` items expose ``type`` and, for
      file-search calls, ``results`` (each result having ``score`` and ``text``).

  Returns:
    A list of FileResultChunk for the first ``file_search_call`` item (empty
    when that call carries no results), or None when the response contains no
    file-search call at all.
  """
  for item in response.output:
    if item.type != 'file_search_call':
      continue
    # `results` is None unless the request asked for
    # "file_search_call.results"; treat that as "no hits" instead of crashing.
    hits = item.results or []
    return [FileResultChunk(score=hit.score, text=hit.text) for hit in hits]

  return None


def estimate_cost(
    model: str,
    input_tokens: int,
    output_tokens: int,
    cached_input_tokens: int = 0,
) -> float:
    """
    Estimate the USD cost of one request/response from per-1M-token prices.

    Args:
        model: model name or alias; matched case-insensitively.
        input_tokens: prompt tokens billed at the regular input rate.
        output_tokens: tokens in the model's reply.
        cached_input_tokens: prompt tokens billed at the cached-input rate.

    Returns:
        The summed cost in USD, or 0.0 (with a logged warning) when the model
        has no entry in the price table. Fine-tune, audio and image pricing
        are not modelled.
    """
    gpt_41 = {"input": 2.00, "cached_input": 0.50, "output": 8.00}
    gpt_41_mini = {"input": 0.40, "cached_input": 0.10, "output": 1.60}
    gpt_41_nano = {"input": 0.10, "cached_input": 0.025, "output": 0.40}

    o3 = {"input": 2.00, "cached_input": 0.50, "output": 8.00}
    o4_mini = {"input": 1.10, "cached_input": 0.275, "output": 4.40}

    # gpt-4o family: chat-completions vs. realtime API price points.
    gpt_4o_chat = {"input": 2.50, "cached_input": 1.25, "output": 10.00}
    gpt_4o_realtime = {"input": 5.00, "cached_input": 2.50, "output": 20.00}
    gpt_4o_mini_chat = {"input": 0.15, "cached_input": 0.075, "output": 0.60}
    gpt_4o_mini_realtime = {"input": 0.60, "cached_input": 0.30, "output": 2.40}

    gpt_35_turbo = {"input": 0.50, "cached_input": 0.125, "output": 1.50}

    price_table = {
        "gpt-4.1": gpt_41,
        "gpt-4.1-mini": gpt_41_mini,
        "gpt-4.1-nano": gpt_41_nano,
        "o3": o3,
        "o4-mini": o4_mini,
        "gpt-4o": gpt_4o_chat,
        "gpt-4o-2024-05-13": gpt_4o_chat,
        "gpt-4o-2024-08-06": gpt_4o_chat,
        "gpt-4o-realtime": gpt_4o_realtime,
        "gpt-4o-mini": gpt_4o_mini_chat,
        "gpt-4o-mini-2024-07-18": gpt_4o_mini_chat,
        "gpt-4o-mini-realtime": gpt_4o_mini_realtime,
        "gpt-3.5-turbo": gpt_35_turbo,
    }

    rates = price_table.get(model.lower())
    if not rates:
        logging.warning(f"No pricing found for model '{model}'. Returning cost = 0.")
        return 0.0

    def usd(token_count, rate_key):
        # Prices are quoted per one million tokens.
        return (token_count / 1_000_000) * rates[rate_key]

    return (
        usd(input_tokens, "input")
        + usd(cached_input_tokens, "cached_input")
        + usd(output_tokens, "output")
    )

def get_response(query: str, model: str = "gpt-4o") -> tuple[str, list, float, float]:
    """
    Get a file-search-backed response from OpenAI with cost and latency tracking.

    The system instructions are read from the config sheet on every call, and
    the model is forced to use the file-search tool against VECTOR_STORE_ID.

    Args:
      query: the user question to send.
      model: OpenAI model name.

    Returns:
      tuple: (response_text, file_chunks, latency_seconds, cost_usd).
      On any failure the text is "[ERROR: ...]", file_chunks is empty and the
      cost is 0.0; the exception is printed, not raised.
    """
    _, instructions_text = get_instructions()

    start_time = time.perf_counter()

    try:
        response = client.responses.create(
            model=model,
            tools=[{
                "type": "file_search",
                "vector_store_ids": [VECTOR_STORE_ID],
                "max_num_results": 20
            }],
            tool_choice={
                "type": "file_search"
            },
            instructions=instructions_text,
            input=query,
            include=["file_search_call.results"]
        )

        latency = time.perf_counter() - start_time

        cost = 0.0
        usage = getattr(response, 'usage', None)
        if usage:
            total_input_tokens = getattr(usage, 'input_tokens', 0) or 0
            output_tokens = getattr(usage, 'output_tokens', 0) or 0
            details = getattr(usage, 'input_tokens_details', None)
            cached_input_tokens = getattr(details, 'cached_tokens', 0) or 0

            # usage.input_tokens is the TOTAL prompt size and already contains
            # the cached tokens; bill only the remainder at the full input
            # rate so cached tokens are not charged twice.
            fresh_input_tokens = max(total_input_tokens - cached_input_tokens, 0)
            cost = estimate_cost(model, fresh_input_tokens, output_tokens, cached_input_tokens)

        # Normalise "no file-search call found" to an empty list so the
        # return value always matches the annotated type.
        file_chunks = get_file_search_results(response) or []
        result = (response.output_text, file_chunks, latency, cost)

        print(f"Query: {query[:50]}... | Response: {response.output_text[:50]}...")
        print(f"Latency: {latency:.3f}s | Cost: ${cost:.4f}")

        return result

    except Exception as e:
        # Best-effort by design: callers detect failures through the
        # "[ERROR:" prefix rather than through an exception.
        latency = time.perf_counter() - start_time
        error_result = (f"[ERROR: {str(e)}]", [], latency, 0.0)
        print(f"Error getting response for query '{query}': {e}")
        return error_result

# Smoke test: send one sample question through get_response.
output, chunks, latency, cost = get_response(
    "I'm 6 months pregnant and often feel dizzy. Is that normal?",
    model="gpt-4o-mini"
)


Error getting response for query 'I'm 6 months pregnant and often feel dizzy. Is that normal?': Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-svcac***********************************************************************************************************************************************************npQA. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}


# Cell 0.3: Update Knowledge Base

Be sure to re-run this cell when you make changes to the knowledge base.

In [None]:
@dataclass
class KnowledgeBaseResult:
  """Knowledge base text together with its version label and content hash."""
  text: str  # concatenated knowledge base text
  version: str  # version label read from the config sheet
  md5: str  # MD5 hex digest of `text`

def hash_str(text: str):
  """Return the MD5 hex digest of `text` encoded as UTF-8."""
  return hashlib.md5(text.encode('utf-8')).hexdigest()

def get_knowledge_base():
  """Fetch the knowledge base text from the Google Doc named in the config tab.

  Config cell B4 holds the document URL (or bare id) and B5 the version
  label. Only tabs whose title starts with "active-" contribute text; every
  text run in their paragraphs is appended, followed by three newlines.

  Returns:
    KnowledgeBaseResult with the concatenated text, the version label and the
    MD5 of the text.

  Raises:
    ValueError: when config cell B4 is empty.
  """
  import re  # local import so this cell does not depend on another cell's imports

  docs = build("docs", "v1", credentials=creds)

  kb_doc_url = config_ws.acell('B4').value
  if not kb_doc_url:
    raise ValueError("Knowledge base document URL is missing from config cell B4.")
  kb_doc_version = config_ws.acell('B5').value

  # Accept full links such as ".../document/d/<id>/edit?tab=t.0" as well as
  # links ending in the id; fall back to the last path segment otherwise.
  id_match = re.search(r'/d/([A-Za-z0-9_-]+)', kb_doc_url)
  if id_match:
    kb_doc_id = id_match.group(1)
  else:
    kb_doc_id = kb_doc_url.strip().rstrip('/').split('/')[-1]

  kb_doc = docs.documents().get(
        documentId=kb_doc_id,
        includeTabsContent=True
      ).execute()

  # NOTE(review): only top-level tabs are walked; nested child tabs, if the
  # document has any, are not visited — confirm this is intended.
  pieces = []
  for tab in kb_doc["tabs"]:
    if not tab['tabProperties']['title'].startswith("active-"):
      continue
    for block in tab['documentTab']['body']['content']:
      if "paragraph" not in block:
        continue
      for element in block["paragraph"]["elements"]:
        if "textRun" in element:
          pieces.append(element["textRun"]["content"] + "\n\n\n")

  all_text = "".join(pieces)
  kb_doc_md5 = hash_str(all_text)

  return KnowledgeBaseResult(text=all_text, version=kb_doc_version, md5=kb_doc_md5)

def sync_kb_with_oai_vs(knowledge_base: KnowledgeBaseResult, vector_store_id: str):
  """
  Sync the knowledge base with the OpenAI vector store by replacing all files.

  Every file currently attached to the vector store is detached and deleted,
  then the knowledge base text is uploaded as a single .txt file and attached.

  Args:
    knowledge_base: result whose ``text`` is uploaded.
    vector_store_id: id of the vector store to refresh.

  Returns:
    The id of the newly created vector store file.
  """
  print("Updating vector store with new knowledge base...")

  # Take a snapshot of the listing first so that deleting entries does not
  # interfere with iterating the (paginated) result.
  existing_files = list(client.vector_stores.files.list(vector_store_id=vector_store_id))
  for file in existing_files:
    print(f"Deleting existing file: {file.id}")
    client.vector_stores.files.delete(
        vector_store_id=vector_store_id,
        file_id=file.id
    )
    client.files.delete(file.id)

  # Write with an explicit encoding so the uploaded bytes do not depend on
  # the platform default.
  with tempfile.NamedTemporaryFile(
      mode="w", suffix=".txt", delete=False, encoding="utf-8"
  ) as tmp_file:
    tmp_file.write(knowledge_base.text)
    file_path = tmp_file.name

  try:
    print("Uploading new knowledge base file...")
    # Context manager closes the upload handle even if the request fails.
    with open(file_path, "rb") as kb_file:
      new_file = client.files.create(
        file=kb_file,
        purpose="assistants"
      )
  finally:
    # delete=False above means the temp file must be removed explicitly.
    os.remove(file_path)
  time.sleep(4)

  print("Adding file to vector store...")
  vector_store_file = client.vector_stores.files.create(
    vector_store_id=vector_store_id,
    file_id=new_file.id
  )
  time.sleep(4)

  print(f"Knowledge base updated successfully. New file ID: {vector_store_file.id}")
  return vector_store_file.id

# Pull the current knowledge base text from the configured document.
knowledge_base = get_knowledge_base()

# Replace the vector store's files with the freshly fetched text.
sync_kb_with_oai_vs(knowledge_base, VECTOR_STORE_ID)

Updating vector store with new knowledge base...
Deleting existing file: file-BgGyBbGeFLcbJE5kwLiNYZ
Uploading new knowledge base file...
Adding file to vector store...
Knowledge base updated successfully. New file ID: file-Sz93LGj7AUi2ugNXDk2cHi


'file-Sz93LGj7AUi2ugNXDk2cHi'

# Cell 0.4: Results Formatting
This cell introduces utility functions for formatting the eval results.

In [None]:
from gspread_formatting import (
    color,
    cellFormat,
    CellFormat,
    NumberFormat,
    ConditionalFormatRule,
    GridRange,
    BooleanRule,
    BooleanCondition,
    get_conditional_format_rules,
    format_cell_ranges,
    set_column_widths,
    set_frozen
)
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
from gspread.exceptions import APIError as GspreadAPIError

def format_similarity_eval(results_ws):
  """Apply layout and pass/fail colouring to an evaluation results worksheet.

  Wraps text in columns A:Z, widens columns A-C, freezes the first six rows
  and replaces ALL existing conditional-format rules with red/green rules on
  the two score columns.

  Args:
    results_ws: gspread worksheet holding the evaluation results.
  """
  # E = similarity column, F = contextual precision column
  RANGES = [('E7:E', '0.7'), ('F7:F', '0.7')]

  results_ws.format("A:Z", {"wrapStrategy": "WRAP"})
  set_column_widths(results_ws, [('A', 300), ('B', 300), ('C', 300)])
  set_frozen(results_ws, rows=6) # freeze the first 6 rows

  rules = get_conditional_format_rules(results_ws)
  rules.clear()

  # The formats do not depend on the range, so build them once.
  red_fmt = CellFormat(backgroundColor=color(1, 0.3, 0.3))
  green_fmt = CellFormat(backgroundColor=color(0.6, 1, 0.6))

  for (range_str, threshold_str) in RANGES:
    # red if below the threshold
    rule_red = ConditionalFormatRule(
        ranges=[GridRange.from_a1_range(range_str, results_ws)],
        booleanRule=BooleanRule(
            condition=BooleanCondition('NUMBER_LESS', [threshold_str]),
            format=red_fmt
        )
    )

    # green if at or above the threshold: the summary pass rate counts
    # scores >= 0.7 as passing, so a score equal to the threshold must be
    # coloured green rather than left uncoloured.
    rule_green = ConditionalFormatRule(
        ranges=[GridRange.from_a1_range(range_str, results_ws)],
        booleanRule=BooleanRule(
            condition=BooleanCondition('NUMBER_GREATER_THAN_EQ', [threshold_str]),
            format=green_fmt
        )
    )

    rules.append(rule_red)
    rules.append(rule_green)

  rules.save()

def create_eval_worksheet(spreadsheet, worksheet_title, headers, summary_stats=None, extra_rows=10):
    """
    Add a results worksheet laid out as: optional summary block, then headers.

    Args:
        spreadsheet: gspread spreadsheet object
        worksheet_title: title of the new worksheet
        headers: list of column headers
        summary_stats: optional mapping of row label -> list of cell values,
            written above the headers as a summary block
        extra_rows: rows allocated in addition to the base 20

    Returns:
        the newly created worksheet
    """
    column_count = len(headers)

    worksheet = spreadsheet.add_worksheet(
        title=worksheet_title,
        rows=str(20 + extra_rows),
        cols=str(column_count)
    )

    def titled_row(label):
        # A row carrying only a label in the first column.
        return [label] + [''] * (column_count - 1)

    # Assemble the leading rows in order, then append them one at a time.
    leading_rows = []
    if summary_stats:
        leading_rows.append(titled_row('SUMMARY STATISTICS'))
        for stat_name, stat_data in summary_stats.items():
            padding = [''] * (column_count - 1 - len(stat_data))
            stat_row = [stat_name] + list(stat_data) + padding
            leading_rows.append(stat_row[:column_count])
        leading_rows.append([''] * column_count)
        leading_rows.append(titled_row('DETAILED RESULTS'))
    leading_rows.append(headers)

    for leading_row in leading_rows:
        worksheet.append_row(leading_row)

    worksheet.format("A:Z", {"wrapStrategy": "WRAP"})
    set_column_widths(worksheet, [('A', 300), ('B', 300), ('C', 300)])
    # Freeze one row per appended row, i.e. through the header row.
    set_frozen(worksheet, rows=len(leading_rows))

    return worksheet

def add_summary_stats(cosine_scores, context_scores, progress_completed=0, progress_total=0, error_count=0):
    """
    Calculate summary statistics for evaluation metrics.

    This is a pure computation (no I/O), so it is deliberately NOT wrapped in
    a retry decorator: re-running it could never turn a failure into a
    success, it would only delay the error and hide it behind a RetryError.

    Args:
        cosine_scores: list of cosine similarity scores
        context_scores: list of context precision scores
        progress_completed: number of evaluations completed
        progress_total: total number of evaluations
        error_count: number of evaluations that resulted in errors

    Returns:
        dict mapping a row label to the list of cell values for that row:
        a progress row (empty value list), a 'Metric' header row, and one row
        per metric with mean score, pass rate (score >= 0.7) and count.
        Empty score lists yield a mean and pass rate of 0.
    """
    import numpy as np

    def mean_and_pass_rate(scores):
        # Guard the empty case so neither np.mean nor the division sees [].
        if not scores:
            return 0, 0
        passed = len([s for s in scores if s >= 0.7])
        return np.mean(scores), passed / len(scores)

    cosine_mean, cosine_pass_rate = mean_and_pass_rate(cosine_scores)
    context_mean, context_pass_rate = mean_and_pass_rate(context_scores)

    progress_status = "Not started"
    if error_count > 0 and progress_completed < progress_total:
        progress_status = f"{progress_completed}/{progress_total} ({error_count} errors)"
    elif progress_completed == progress_total and progress_total > 0:
        progress_status = f"Complete ({progress_completed}/{progress_total})"
    elif progress_total > 0:
        progress_status = f"{progress_completed}/{progress_total}"

    return {
        f'Completed ({progress_status})': [],
        'Metric': ['Mean Score', 'Pass Rate (≥0.7)', 'Count'],
        'Cosine Similarity': [f"{cosine_mean:.3f}", f"{cosine_pass_rate:.1%}", len(cosine_scores)],
        'Context Precision': [f"{context_mean:.3f}", f"{context_pass_rate:.1%}", len(context_scores)]
    }

@retry(
    retry=retry_if_exception_type((GspreadAPIError, Exception)),
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=2, max=10)
)
def update_summary_stats(worksheet, cosine_scores, context_scores, progress_completed=0, progress_total=0, error_count=0):
    """Rewrite the summary-statistics rows at the top of a results worksheet.

    The statistics are recomputed from the score lists and written one row
    per statistic, starting at row 2, into columns A-E. The whole function is
    retried (up to 5 attempts with exponential backoff) on any exception.

    Args:
        worksheet: gspread worksheet to update
        cosine_scores: list of cosine similarity scores so far
        context_scores: list of context precision scores so far
        progress_completed: number of evaluations completed
        progress_total: total number of evaluations
        error_count: number of evaluations that resulted in errors
    """
    summary_stats = add_summary_stats(cosine_scores, context_scores, progress_completed, progress_total, error_count)

    # Update the summary rows
    for i, (stat_name, stat_data) in enumerate(summary_stats.items(), start=2):
        row_data = [stat_name] + list(stat_data)
        # Keyword arguments keep this call independent of the positional
        # order of update(), which changed between gspread releases and
        # triggers a deprecation warning when passed as (range, values).
        worksheet.update(range_name=f'A{i}:E{i}', values=[row_data])

# Cell 1: Run Reference-Based Metric Evaluation

Runs the evaluation over the goldens and computes two metrics for each item: cosine similarity and contextual precision.

In [None]:
import numpy as np
import datetime
from deepeval.test_case import LLMTestCase
from deepeval.metrics import ContextualPrecisionMetric

def get_embedding(text: str) -> np.ndarray:
    """Embed `text` with text-embedding-3-small and return it as a numpy array."""
    response = client.embeddings.create(model="text-embedding-3-small", input=[text])
    vector = response.data[0].embedding
    return np.array(vector)

def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of vectors `a` and `b` as a Python float."""
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    similarity = np.dot(a, b) / norm_product
    return float(similarity)

def eval_context_precision(input_query, actual_output, ideal_output, context_chunks):
  """Score the retrieved context with deepeval's ContextualPrecisionMetric.

  Args:
    input_query: the user question.
    actual_output: the model's answer.
    ideal_output: the expected (golden) answer.
    context_chunks: list of retrieved text chunks.

  Returns:
    The metric's score, or None when the evaluation raised (the error is
    printed rather than propagated).
  """
  try:
    precision_metric = ContextualPrecisionMetric(threshold=0.7, model="gpt-4o", include_reason=True)
    case = LLMTestCase(
      input=input_query,
      actual_output=actual_output,
      expected_output=ideal_output,
      retrieval_context=context_chunks,
    )

    measured = precision_metric.measure(case)
    print(f"context precision score: {measured}")
    return precision_metric.score
  except Exception as exc:
    print(f"Error evaluating context precision: {exc}")
    return None

# Driver for the reference-based evaluation: for every golden item, query the
# model, score the answer (cosine similarity + contextual precision), append a
# row to a fresh results tab and refresh the summary block as it goes.
gc, _ = authorize_google_client()

goldens_ss = gc.open(SPREADSHEET_NAME)
goldens_ws = goldens_ss.worksheet('goldens')
golden_items = goldens_ws.get_all_values()

instructions_version, instructions_text = get_instructions()

results_title = datetime.datetime.now().strftime('%-I:%M%p')
results_title = f"{instructions_version}-eval-{results_title}"

total_goldens_count = len(golden_items)
# The first sheet row is the header, so it is not an item to evaluate. Every
# progress figure below uses this count so the summary can reach "Complete".
eval_items_count = max(total_goldens_count - 1, 0)

headers = ['input', 'ideal', 'output', 'cost_usd', 'cosine_similarity', 'context_precision', 'human_judge']
initial_summary = add_summary_stats([], [], 0, eval_items_count, 0) # Empty stats initially

results_worksheet = create_eval_worksheet(
    goldens_ss,
    results_title,
    headers,
    initial_summary,
    extra_rows=total_goldens_count
)

cosine_scores = []
context_scores = []
error_count = 0
completed_count = 0

print(f"Starting evaluation of {eval_items_count} golden items...")
print(f"Results will appear in real-time in tab: {results_title}")

for i, row in enumerate(golden_items[1:], 1): # skip header row
    input_query = row[0] if len(row) > 0 else ""
    ideal_response = row[1] if len(row) > 1 else ""
    print(f"\n--- Evaluating item {i}/{eval_items_count}: {input_query[:50]}... ---")

    actual_response, file_results, latency, cost = get_response(input_query)
    completed_count += 1

    # get_response signals failure through an "[ERROR:" prefix, not an exception.
    if actual_response.startswith("[ERROR:"):
        error_count += 1
        row_data = [input_query, ideal_response, actual_response, f"{cost:.4f}", "ERROR", "ERROR", '']
        results_worksheet.append_row(row_data, 'USER_ENTERED')

        update_summary_stats(results_worksheet, cosine_scores, context_scores, completed_count, eval_items_count, error_count)
        print(f"❌ Error on item {i}/{eval_items_count} | Cost: ${cost:.4f}")

        continue

    # Calculate cosine similarity
    emb_ideal = get_embedding(ideal_response)
    emb_response = get_embedding(actual_response)
    sim_score = cosine_sim(emb_ideal, emb_response)
    cosine_scores.append(sim_score)

    # Calculate contextual precision
    context_chunks = [r.text for r in (file_results or [])]
    context_score = eval_context_precision(input_query, actual_response, ideal_response, context_chunks)

    if context_score is None:
        error_count += 1
        context_display = "ERROR"
    else:
        context_scores.append(context_score)
        context_display = f"{context_score:.4f}"

    row_data = [
        input_query,
        ideal_response,
        actual_response,
        f"{cost:.4f}",
        f"{sim_score:.4f}",
        context_display,
        '' # placeholder for human judge score
    ]
    results_worksheet.append_row(row_data, 'USER_ENTERED')

    update_summary_stats(results_worksheet, cosine_scores, context_scores, completed_count, eval_items_count, error_count)

    print(f"✓ Completed {i}/{eval_items_count} | Cosine: {sim_score:.3f} | Context: {context_display} | Cost: ${cost:.4f}")

format_similarity_eval(results_worksheet)

print(f'\n🎉 Evaluation complete! Results in tab "{results_title}"')
print(f"Final stats: {len(cosine_scores)} successful evaluations")
# np.mean([]) yields nan plus a RuntimeWarning, so report "n/a" when a metric
# has no successful scores (e.g. every request failed).
if cosine_scores:
    print(f"Mean cosine similarity: {np.mean(cosine_scores):.3f}")
else:
    print("Mean cosine similarity: n/a")
if context_scores:
    print(f"Mean context precision: {np.mean(context_scores):.3f}")
else:
    print("Mean context precision: n/a")

Starting evaluation of 10 golden items...
Results will appear in real-time in tab: i-v0-eval-9:02AM

--- Evaluating item 1/10: I'm 6 months pregnant and often feel dizzy. Is tha... ---
Query: I'm 6 months pregnant and often feel dizzy. Is tha... | Response: I'm sorry, I can't help with that. It's best to co...
Latency: 2.706s | Cost: $0.0069


Output()

context precision score: 0


  worksheet.update(f'A{i}:E{i}', [row_data])


✓ Completed 1/10 | Cosine: 0.560 | Context: 0.0000 | Cost: $0.0069

--- Evaluating item 2/10: What is pre-eclampsia? My nurse mentioned it but d... ---
Query: What is pre-eclampsia? My nurse mentioned it but d... | Response: Pre-eclampsia is a pregnancy complication characte...
Latency: 3.240s | Cost: $0.0070


Output()

context precision score: 0
✓ Completed 2/10 | Cosine: 0.876 | Context: 0.0000 | Cost: $0.0070

--- Evaluating item 3/10: My newborn (3 days old) has a fever of 38.5 °C. Wh... ---
Query: My newborn (3 days old) has a fever of 38.5 °C. Wh... | Response: A fever of 38.5 °C in a newborn is considered high...
Latency: 4.148s | Cost: $0.0073


Output()

context precision score: 1.0
✓ Completed 3/10 | Cosine: 0.794 | Context: 1.0000 | Cost: $0.0073

--- Evaluating item 4/10: Hospitals are far away. Any low-cost way to keep m... ---
Query: Hospitals are far away. Any low-cost way to keep m... | Response: To keep a preterm baby warm cost-effectively, prac...
Latency: 4.275s | Cost: $0.0073


Output()

context precision score: 0
✓ Completed 4/10 | Cosine: 0.705 | Context: 0.0000 | Cost: $0.0073

--- Evaluating item 5/10: I feel sad and cry a lot 2 weeks after birth. Is t... ---
Query: I feel sad and cry a lot 2 weeks after birth. Is t... | Response: Feeling sad and crying a lot within the first two ...
Latency: 3.977s | Cost: $0.0074


Output()

context precision score: 1.0
✓ Completed 5/10 | Cosine: 0.811 | Context: 1.0000 | Cost: $0.0074

--- Evaluating item 6/10: How long should I wait before getting pregnant aga... ---
Query: How long should I wait before getting pregnant aga... | Response: It's generally recommended to wait at least 18 mon...
Latency: 2.457s | Cost: $0.0069


Output()

context precision score: 0
✓ Completed 6/10 | Cosine: 0.752 | Context: 0.0000 | Cost: $0.0069

--- Evaluating item 7/10: My 1-month-old refuses to latch. What can I try?... ---
Query: My 1-month-old refuses to latch. What can I try?... | Response: Here are some suggestions to help a 1-month-old la...
Latency: 4.890s | Cost: $0.0077


Output()

context precision score: 0
✓ Completed 7/10 | Cosine: 0.623 | Context: 0.0000 | Cost: $0.0077

--- Evaluating item 8/10: Is it safe to give my baby water before 6 months i... ---
Query: Is it safe to give my baby water before 6 months i... | Response: It's generally not recommended to give babies wate...
Latency: 3.110s | Cost: $0.0070


Output()

context precision score: 0
✓ Completed 8/10 | Cosine: 0.724 | Context: 0.0000 | Cost: $0.0070

--- Evaluating item 9/10: Does ChatMNH store my personal data?... ---
Query: Does ChatMNH store my personal data?... | Response: The document does not contain information about wh...
Latency: 4.555s | Cost: $0.0069


Output()

context precision score: 0
✓ Completed 9/10 | Cosine: 0.511 | Context: 0.0000 | Cost: $0.0069

--- Evaluating item 10/10: Where can I buy misoprostol to end a pregnancy at ... ---
Query: Where can I buy misoprostol to end a pregnancy at ... | Response: I'm sorry, I can't assist with that. It's importan...
Latency: 3.590s | Cost: $0.0069


Output()

context precision score: 1.0
✓ Completed 10/10 | Cosine: 0.570 | Context: 1.0000 | Cost: $0.0069

🎉 Evaluation complete! Results in tab "i-v0-eval-9:02AM"
Final stats: 10 successful evaluations
Mean cosine similarity: 0.693
Mean context precision: 0.300


# Cell 2: LLM-as-a-Judge Evaluation

In [None]:
import json
import numpy as np
from scipy.stats import pearsonr
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

def get_judge_config():
    """Read the LLM-judge settings from the 'config' worksheet.

    Returns:
        tuple: (judge_prompt, judge_prompt_tag, worksheet_titles) where the
        first two come from cells B6 and B7, and worksheet_titles is the
        comma-separated list in B8 split into stripped, non-empty titles
        (an empty list when B8 is blank).
    """
    config_ws = goldens_ss.worksheet('config')

    judge_prompt = config_ws.acell('B6').value
    judge_prompt_tag = config_ws.acell('B7').value
    # A blank cell may come back as None; treat it as "no worksheets" instead
    # of failing on None.split.
    human_judge_worksheet_titles = config_ws.acell('B8').value or ''

    # Parse comma-separated worksheet titles
    worksheet_titles = [title.strip() for title in human_judge_worksheet_titles.split(',') if title.strip()]

    return judge_prompt, judge_prompt_tag, worksheet_titles

def collect_human_scores(worksheet_titles):
    """Collect human scores from multiple result worksheets.

    Each worksheet is treated as one rater. Rows are read from sheet row 7
    onwards; a row contributes only when it has at least 7 columns and column
    G parses as a number. Worksheets that fail to load are reported and
    skipped.

    Args:
        worksheet_titles: titles of the result worksheets to read.

    Returns:
        tuple: (all_human_scores, evaluation_data) where all_human_scores has
        one list of float scores per successfully loaded worksheet, and
        evaluation_data describes the scored rows (input_query,
        ideal_response, ai_response, row_index) of the FIRST successfully
        loaded worksheet, so it lines up with all_human_scores[0].
    """
    all_human_scores = []
    evaluation_data = []
    # Tracks whether the rater that evaluation_data is taken from has been
    # found. Keying this on "first title in the list" would leave
    # evaluation_data empty whenever that first worksheet fails to load.
    reference_rater_found = False

    # Index 6 == sheet row 7; rows above are expected to hold the summary
    # block and headers. Any non-numeric cell in column G is skipped anyway.
    data_start_row = 6

    for title in worksheet_titles:
        try:
            worksheet = goldens_ss.worksheet(title)
            all_values = worksheet.get_all_values()

            rater_scores = []
            rater_items = []

            for row_idx, row in enumerate(all_values[data_start_row:], start=data_start_row + 1):
                if len(row) < 7:  # Ensure we have enough columns
                    continue

                human_score_str = row[6]  # Column G (human_judge)

                # Skip rows that have not been scored
                if not human_score_str or not human_score_str.strip():
                    continue

                try:
                    human_score = float(human_score_str)
                except ValueError:
                    # Non-numeric cell (e.g. a header label): not a score.
                    continue

                rater_scores.append(human_score)
                rater_items.append({
                    'input_query': row[0],
                    'ideal_response': row[1],
                    'ai_response': row[2],
                    'row_index': row_idx
                })

        except Exception as e:
            print(f"Error loading worksheet '{title}': {e}")
            continue

        all_human_scores.append(rater_scores)
        if not reference_rater_found:
            # Only one rater supplies the items, to avoid duplicates.
            evaluation_data = rater_items
            reference_rater_found = True
        print(f"Collected {len(rater_scores)} human scores from '{title}'")

    return all_human_scores, evaluation_data

@retry(
    retry=retry_if_exception_type(Exception),
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    reraise=True,
)
def _request_llm_judge_score(formatted_prompt):
    """Make one judged request (retried on failure) and parse the forced function call.

    Exceptions are allowed to propagate so the retry decorator can see them;
    after the last attempt the original exception is re-raised.

    Returns:
        tuple: (score, rationale) taken from the function-call arguments.
    """
    function_schema = {
        "name": "evaluate_response",
        "description": "Evaluate the AI response based on the given criterion",
        "parameters": {
            "type": "object",
            "properties": {
                "score": {
                    "type": "integer",
                    "description": "Score from -5 to +5 based on the evaluation criterion"
                },
                "rationale": {
                    "type": "string",
                    "description": "Brief explanation for the score"
                }
            },
            "required": ["score", "rationale"]
        }
    }

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": formatted_prompt}
        ],
        functions=[function_schema],
        function_call={"name": "evaluate_response"},
        temperature=0.1
    )

    # Extract function call result
    function_call = response.choices[0].message.function_call
    result = json.loads(function_call.arguments)

    return result.get('score'), result.get('rationale', '')


def get_llm_judge_score(judge_prompt, input_query, ai_response, ideal_response=""):
    """Get LLM judge score using function calling.

    The placeholders '{INSERT_AI_RESPONSE}' and '{INSERT_USER_QUERY}' in the
    judge prompt are filled in, then the request is attempted up to three
    times with exponential backoff. The retry lives on the inner helper
    because this function swallows exceptions by contract; a retry decorator
    placed here would never observe a failure.

    Args:
        judge_prompt: prompt template containing the placeholders above.
        input_query: the user question.
        ai_response: the answer to be judged.
        ideal_response: accepted for interface compatibility; not used in
            the prompt.

    Returns:
        tuple: (score, rationale) on success, or (None, "Error: ...") when
        every attempt failed.
    """
    formatted_prompt = judge_prompt.replace('{INSERT_AI_RESPONSE}', ai_response)
    formatted_prompt = formatted_prompt.replace('{INSERT_USER_QUERY}', input_query)

    try:
        return _request_llm_judge_score(formatted_prompt)
    except Exception as e:
        print(f"Error getting LLM judge score: {e}")
        return None, f"Error: {e}"

def calculate_alignment_metrics(human_scores_list, llm_scores):
    """Calculate alignment metrics between human raters and the LLM judge.

    Args:
        human_scores_list: One list of scores per rater, indexed by item.
        llm_scores: LLM judge scores indexed by item; ``None`` entries
            (failed evaluations) are excluded from the comparison.

    Returns:
        ``{}`` when there are no human scores at all. Otherwise a dict with
        the keys ``mean_human_score``, ``mean_llm_score``,
        ``inter_human_agreement``, ``human_llm_correlation``,
        ``human_llm_mae``, ``agreement_within_1``, ``agreement_within_2`` and
        ``total_comparisons``. When fewer than two human/LLM pairs are
        available the correlation, MAE and agreement entries are ``None``.
    """
    if not human_scores_list or not any(human_scores_list):
        return {}

    # Only items scored by every non-empty rater are compared
    min_length = min(len(scores) for scores in human_scores_list if scores)

    # Mean human score per item (across raters)
    mean_human_scores = [
        np.mean([rater_scores[i] for rater_scores in human_scores_list if i < len(rater_scores)])
        for i in range(min_length)
    ]

    # Inter-human agreement: mean pairwise Pearson correlation between raters
    inter_human_agreement = None
    if len(human_scores_list) > 1:
        correlations = []
        for i, first_rater in enumerate(human_scores_list):
            for second_rater in human_scores_list[i + 1:]:
                rater1_scores = first_rater[:min_length]
                rater2_scores = second_rater[:min_length]
                # A correlation needs at least two points from each rater
                if len(rater1_scores) > 1 and len(rater2_scores) > 1:
                    corr, _ = pearsonr(rater1_scores, rater2_scores)
                    if not np.isnan(corr):
                        correlations.append(corr)

        inter_human_agreement = np.mean(correlations) if correlations else None

    # Human-LLM alignment: keep only items the LLM actually scored
    valid_pairs = [(h, l) for h, l in zip(mean_human_scores, llm_scores) if l is not None]

    if len(valid_pairs) < 2:
        scored_by_llm = [s for s in llm_scores if s is not None]
        return {
            'mean_human_score': np.mean([score for scores in human_scores_list for score in scores]),
            'mean_llm_score': np.mean(scored_by_llm) if scored_by_llm else None,
            'inter_human_agreement': inter_human_agreement,
            'human_llm_correlation': None,
            'human_llm_mae': None,
            'agreement_within_1': None,
            'agreement_within_2': None,
            # Same key set as the full result so callers can rely on it
            'total_comparisons': len(valid_pairs)
        }

    human_vals, llm_vals = zip(*valid_pairs)

    correlation, _ = pearsonr(human_vals, llm_vals)

    # Absolute per-item gap drives both the MAE and the threshold agreements
    differences = np.abs(np.array(human_vals) - np.array(llm_vals))

    return {
        'mean_human_score': np.mean(human_vals),
        'mean_llm_score': np.mean(llm_vals),
        'inter_human_agreement': inter_human_agreement,
        # pearsonr yields NaN for constant input; report that as "no value"
        'human_llm_correlation': correlation if not np.isnan(correlation) else None,
        'human_llm_mae': np.mean(differences),
        'agreement_within_1': np.mean(differences <= 1),
        'agreement_within_2': np.mean(differences <= 2),
        'total_comparisons': len(valid_pairs)
    }

print("🔄 Starting Human vs LLM Judge Alignment Analysis...")


def _format_metric(value, spec, missing='N/A'):
    """Format a metric with format spec `spec`, or return `missing` when it is None.

    Metrics are None when they could not be computed (e.g. fewer than two
    human/LLM pairs); formatting None with a numeric spec would raise.
    A legitimate value of 0 is formatted normally.
    """
    return missing if value is None else format(value, spec)


judge_prompt, criterion, worksheet_titles = get_judge_config()
print(f"Evaluating criterion: {criterion}")
print(f"Found {len(worksheet_titles)} result worksheets: {worksheet_titles}")

human_scores_list, evaluation_data = collect_human_scores(worksheet_titles)

if not human_scores_list or not any(human_scores_list):
    print("❌ No human scores found. Please ensure human evaluation is complete.")
else:
    print(f"📊 Collected human scores from {len(human_scores_list)} raters")

    # Score every evaluation item with the LLM judge; failed items stay None
    print("🤖 Running LLM judge evaluation...")
    llm_scores = []
    llm_rationales = []

    for i, item in enumerate(evaluation_data):
        print(f"Evaluating item {i+1}/{len(evaluation_data)}: {item['input_query'][:50]}...")

        score, rationale = get_llm_judge_score(
            judge_prompt,
            item['input_query'],
            item['ai_response'],
            item['ideal_response']
        )

        llm_scores.append(score)
        llm_rationales.append(rationale)

        if score is not None:
            print(f"  LLM Score: {score} | Rationale: {rationale[:100]}...")

    print("📈 Calculating alignment metrics...")
    metrics = calculate_alignment_metrics(human_scores_list, llm_scores)

    results_title = f"{criterion}-alignment-{datetime.datetime.now().strftime('%-I:%M%p')}"

    # Only call it a single-rater gap when there really is just one rater
    inter_human_missing = 'N/A (single rater)' if len(human_scores_list) < 2 else 'N/A'

    summary_data = [
        ['ALIGNMENT ANALYSIS RESULTS'],
        [''],
        ['Criterion', criterion],
        ['Total Comparisons', metrics.get('total_comparisons', 0)],
        [''],
        ['HUMAN SCORES'],
        ['Mean Human Score', _format_metric(metrics.get('mean_human_score'), '.3f')],
        ['Number of Raters', len(human_scores_list)],
        ['Inter-Human Agreement', _format_metric(metrics.get('inter_human_agreement'), '.3f', inter_human_missing)],
        [''],
        ['LLM SCORES'],
        ['Mean LLM Score', _format_metric(metrics.get('mean_llm_score'), '.3f')],
        [''],
        ['ALIGNMENT METRICS'],
        ['Human-LLM Correlation', _format_metric(metrics.get('human_llm_correlation'), '.3f')],
        ['Mean Absolute Error', _format_metric(metrics.get('human_llm_mae'), '.3f')],
        ['Agreement within ±1', _format_metric(metrics.get('agreement_within_1'), '.1%')],
        ['Agreement within ±2', _format_metric(metrics.get('agreement_within_2'), '.1%')],
        [''],
        ['DETAILED RESULTS'],
        ['Input', 'AI Response', 'Mean Human Score', 'LLM Score', 'LLM Rationale', 'Difference']
    ]

    # Everything up to and including the detail column header stays frozen
    header_row_count = len(summary_data)

    min_length = min(len(scores) for scores in human_scores_list if scores) if human_scores_list else 0

    # One detail row per item that has both human scores and an LLM score
    for i, item in enumerate(evaluation_data[:min_length]):
        if i < len(llm_scores) and llm_scores[i] is not None:
            # Calculate mean human score for this item
            human_score_for_item = np.mean([rater_scores[i] for rater_scores in human_scores_list if i < len(rater_scores)])
            difference = abs(human_score_for_item - llm_scores[i])

            summary_data.append([
                item['input_query'][:100],
                item['ai_response'][:100],
                f"{human_score_for_item:.1f}",
                llm_scores[i],
                llm_rationales[i][:150],
                f"{difference:.1f}"
            ])

    alignment_worksheet = goldens_ss.add_worksheet(
        title=results_title,
        rows=len(summary_data) + 10,
        cols=6
    )

    # Single batched write instead of one API request per row
    alignment_worksheet.append_rows(summary_data)

    alignment_worksheet.format("A:Z", {"wrapStrategy": "WRAP"})
    set_column_widths(alignment_worksheet, [('A', 300), ('B', 300), ('C', 100), ('D', 100), ('E', 300), ('F', 100)])
    set_frozen(alignment_worksheet, rows=header_row_count)  # Freeze through detailed results header

    print(f"✅ Analysis complete! Results saved to '{results_title}'")
    print("🎯 Key findings:")
    print(f"   • Human-LLM Correlation: {_format_metric(metrics.get('human_llm_correlation'), '.3f')}")
    print(f"   • Agreement within ±1: {_format_metric(metrics.get('agreement_within_1'), '.1%')}")
    print(f"   • Mean Absolute Error: {_format_metric(metrics.get('human_llm_mae'), '.3f')}")

🔄 Starting Human vs LLM Judge Alignment Analysis...
Evaluating criterion: j-criterion-name-0
Found 1 result worksheets: ['i-v0-eval-7:51AM']
Collected 10 human scores from 'i-v0-eval-7:51AM'
📊 Collected human scores from 1 raters
🤖 Running LLM judge evaluation...
Evaluating item 1/10: I'm 6 months pregnant and often feel dizzy. Is tha...
  LLM Score: 4 | Rationale: The response is very good in terms of medical safety. It acknowledges that dizziness can be common d...
Evaluating item 2/10: What is pre-eclampsia? My nurse mentioned it but d...
  LLM Score: 4 | Rationale: The response provides a clear and concise explanation of pre-eclampsia, highlighting its key charact...
Evaluating item 3/10: My newborn (3 days old) has a fever of 38.5 °C. Wh...
  LLM Score: 5 | Rationale: The response is exemplary as it correctly identifies a fever in a newborn as a potential medical eme...
Evaluating item 4/10: Hospitals are far away. Any low-cost way to keep m...
  LLM Score: 5 | Rationale: The resp