In [None]:
# observation/sample_dashboard.ipynb

# ---
# Jupyter Notebook: Book Tracker - Phoenix Evaluation & Visualization
# ---

# 1. Load Data

import pandas as pd

# Load the evaluation test set from a CSV file resolved relative to the current
# working directory. Later sections read the 'question' and 'reference_answer'
# columns from this frame.
df = pd.read_csv("test_set.csv")
# If you also saved model outputs to a CSV (recommended!), load them as well:
# results_df = pd.read_csv("llm_results.csv")

# Preview the first rows to confirm the file loaded as expected.
display(df.head())

# 2. (Optional) Collect LLM Responses

# For demo, we'll fake LLM responses here (replace with your actual LLM outputs!)
# The list is assigned to the frame positionally: entry i becomes the response
# for row i of df, so it must contain exactly one entry per test-set row.
demo_responses = [
    "The Garden Primer by Barbara Damrosch",
    "Dune by Frank Herbert",
    "No real books found. Suggest 'The Quantum World' by Kenneth Ford for quantum science.",
    "No real books found. Suggest 'The Art of Yoga' for human yoga instead.",
    "Salt, Fat, Acid, Heat by Samin Nosrat",
    "No real books found. Suggest 'How to Train Your Dragon' (fiction) for entertainment."
]

# Fail early with an actionable message instead of the generic pandas
# length-mismatch error when the test set does not match the canned demo data.
if len(demo_responses) != len(df):
    raise ValueError(
        f"The demo provides {len(demo_responses)} canned responses but test_set.csv "
        f"has {len(df)} rows; replace demo_responses with your real LLM outputs "
        "(one per row)."
    )

df['llm_response'] = demo_responses

display(df)

# 3. Simple Manual Comparison

# Walk the prompt, reference and response columns in lockstep and print each
# test case as three labelled lines followed by a '---' separator.
for question, reference, answer in zip(
    df['question'], df['reference_answer'], df['llm_response']
):
    print(
        f"Prompt: {question}",
        f"Reference: {reference}",
        f"LLM Response: {answer}",
        "---",
        sep="\n",
    )

# 4. (Optional) Semantic Similarity Using OpenAI Embeddings

import openai
import os
import numpy as np

# Take the API key from the environment rather than hard-coding it in the notebook.
# os.environ.get() returns None when OPENAI_API_KEY is unset, so no error is raised
# on this line in that case.
# NOTE(review): presumably the embedding requests below then fail to authenticate —
# set the variable before running this section.
openai.api_key = os.environ.get("OPENAI_API_KEY")

def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding for a single piece of text.

    Sends ``text`` as a one-item input list to ``openai.embeddings.create`` and
    returns the ``embedding`` attribute of the first item in the response's
    ``data`` sequence.

    Args:
        text: The text to embed.
        model: Name of the embedding model to request. Defaults to
            "text-embedding-ada-002", the value this notebook previously
            hard-coded, so existing single-argument calls are unaffected.
            Use the same model for every vector you intend to compare.

    Returns:
        The ``embedding`` value of the first response item (presumably a list
        of floats — confirm against the OpenAI client documentation).
    """
    response = openai.embeddings.create(
        input=[text],
        model=model,
    )
    # Exactly one input was sent, so its result is the item at index 0.
    return response.data[0].embedding

def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity of two vectors.

    Both arguments are converted to NumPy arrays; the result is their dot
    product divided by the product of their Euclidean norms.
    """
    a, b = np.array(vec1), np.array(vec2)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / norm_product

# Score every row: embed the reference answer, then the LLM response (two
# get_embedding calls per row), and compare the two vectors.
sims = [
    cosine_similarity(get_embedding(reference), get_embedding(answer))
    for reference, answer in zip(df['reference_answer'], df['llm_response'])
]

# Attach the scores to the frame and show them next to the texts they compare.
df['similarity'] = sims
display(df[['question', 'reference_answer', 'llm_response', 'similarity']])

# 5. Simple Pass/Fail Visualization

# A response passes when its similarity to the reference reaches the threshold.
threshold = 0.80
df['pass'] = df['similarity'] >= threshold

import matplotlib.pyplot as plt

# value_counts() orders its result by frequency, so the position of the
# True/False bars would change with the data while the colors and tick labels
# below are applied by position. Pin the order to [Fail, Pass] and keep a
# zero-height bar for an outcome that did not occur, so every bar always
# matches its label and color.
pass_counts = df['pass'].value_counts().reindex([False, True], fill_value=0)

plt.figure(figsize=(8,3))
pass_counts.plot(kind='bar', color=['red', 'green'])  # Fail = red, Pass = green
plt.title('LLM Output Evaluation: Pass/Fail')
plt.xlabel('Pass')
plt.ylabel('Count')
plt.xticks([0, 1], ['Fail', 'Pass'], rotation=0)
plt.show()

# 6. (Optional) Phoenix Observability Integration

# Placeholder section: this notebook makes no Phoenix calls itself.
# If you have the Phoenix Python client, you can add a cell here to pull traces
# or visualize results from your Phoenix Cloud project!
# See: https://docs.arize.com/phoenix/

# Closing reminder printed at the end of the run.
print("Extend this notebook with Phoenix dashboard integration as needed!")
