In [None]:
import os

from dotenv import load_dotenv

# Location of the .env file holding the credentials this notebook needs.
# Defaults to the original developer path; set FRETCOACH_ENV_FILE to point
# somewhere else instead of editing the notebook.
ENV_FILE = os.environ.get(
    "FRETCOACH_ENV_FILE",
    "/Users/paddy/Documents/Github/FretCoach/web/web-backend/.env",
)

# load_dotenv returns False when nothing was loaded (e.g. the file is missing).
# Warn here rather than failing later with an opaque authentication error.
if not load_dotenv(ENV_FILE):
    print(f"WARNING: no environment variables were loaded from {ENV_FILE}")

# Kept after load_dotenv, matching the original statement order
# (presumably so Opik sees the loaded environment — TODO confirm).
from opik import Opik

# Connect to Opik
opik_client = Opik()

# Get source dataset
source_dataset = opik_client.get_dataset(name="FretCoach Live AI Feedback")
all_items = source_dataset.get_items()
# Transform items to optimizer format
def to_optimizer_format(items):
    """Flatten raw dataset items into ``{"input", "expected_output"}`` records.

    For each item this picks:
      * ``input``: the ``content`` of the last entry in the first list under
        ``item['input']['messages']``.
      * ``expected_output``: the ``text`` of the first entry in the first list
        under ``item['expected_output']['generations']``.

    Args:
        items: Iterable of dict-like items with the nesting described above.

    Returns:
        A new list with one two-key dict per input item, in the same order.
    """
    records = []
    for raw in items:
        prompt_text = raw['input']['messages'][0][-1]['content']
        reference_text = raw['expected_output']['generations'][0][0]['text']
        records.append({"input": prompt_text, "expected_output": reference_text})
    return records

# Split: 35 for train, 9 for validation (the first 44 items of the source dataset)
TRAIN_SIZE = 35
VAL_SIZE = 9

train_items = to_optimizer_format(all_items[:TRAIN_SIZE])
val_items = to_optimizer_format(all_items[TRAIN_SIZE:TRAIN_SIZE + VAL_SIZE])

# Slicing silently yields fewer items when the source dataset is short, which
# would make the size suffix in the dataset names misleading — surface it.
if len(train_items) != TRAIN_SIZE or len(val_items) != VAL_SIZE:
    print(
        f"WARNING: expected {TRAIN_SIZE} train / {VAL_SIZE} validation items, "
        f"got {len(train_items)} / {len(val_items)}"
    )

# Create train dataset
train_dataset = opik_client.create_dataset(name="fretcoach_live_ai_feedback_train_35")
train_dataset.insert(train_items)
print(f"Created train dataset with {len(train_items)} items")

# Create val dataset. The name is all lowercase so it follows the train
# dataset's naming and matches the name the later get_dataset call looks up.
val_dataset = opik_client.create_dataset(name="fretcoach_live_ai_feedback_val_9")
val_dataset.insert(val_items)
print(f"Created validation dataset with {len(val_items)} items")

print("\nSample train item:")
print(train_items[0])

----

In [32]:
# Load the train / validation datasets built in the dataset-preparation cell.
# The names must match the ones passed to create_dataset there; the train
# split holds 35 items, so the dataset is "..._train_35".
dataset = opik_client.get_dataset(name="fretcoach_live_ai_feedback_train_35")
validation_dataset = opik_client.get_dataset(name="fretcoach_live_ai_feedback_val_9")

In [33]:
# Using LLM-as-judge with Perplexity Sonar Pro
# Requires: PPLX_API_KEY in your .env file

In [34]:
import math
import os
import re
from typing import Any

from openai import OpenAI
from opik.evaluation.metrics.score_result import ScoreResult

# Initialize Perplexity client.
# Fail fast when the key is missing: with api_key=None the OpenAI client falls
# back to OPENAI_API_KEY, which would then be sent to the Perplexity endpoint.
pplx_api_key = os.getenv("PPLX_API_KEY")
if not pplx_api_key:
    raise RuntimeError(
        "PPLX_API_KEY is not set; add it to your .env file before running the LLM judge."
    )

perplexity_client = OpenAI(
    api_key=pplx_api_key,
    base_url="https://api.perplexity.ai"
)

# Matches a standalone number in [0, 1] ("0", "0.85", "1", "1.0", ".5") that is
# not part of a larger number such as "10", "1.5" or "-0.5".
_SCORE_PATTERN = re.compile(r"(?<![\d.\-])(?:0(?:\.\d+)?|1(?:\.0+)?|\.\d+)(?!\d|\.\d)")


def _parse_judge_score(raw_reply: Any):
    """Extract a score in [0.0, 1.0] from the judge's reply text.

    A reply that is a plain number is clamped to [0, 1]. Otherwise the first
    standalone in-range number found in the text is used, which tolerates
    replies such as "Score: 0.8" or "**0.8**".

    Returns:
        The score as a float, or None when the reply is empty/None, contains
        no usable number, or parses to a non-finite value (nan/inf).
    """
    text = (raw_reply or "").strip()
    if not text:
        return None
    try:
        score = float(text)
    except ValueError:
        found = _SCORE_PATTERN.search(text)
        if found is None:
            return None
        score = float(found.group())
    # float() accepts "nan"/"inf"; min/max clamping would turn nan into 1.0.
    if not math.isfinite(score):
        return None
    return max(0.0, min(1.0, score))  # Clamp to [0, 1]


def llm_judge_metric(
    dataset_item: dict[str, Any],
    llm_output: str
) -> ScoreResult:
    """
    Evaluates coaching feedback quality with emphasis on:
    - Not repeating metrics already shown in UI
    - Being actionable and specific
    - Adding coaching value beyond the numbers

    Args:
        dataset_item: Dataset record; its 'input' and 'expected_output' values
            are interpolated into the judge prompt.
        llm_output: The generated feedback to be scored.

    Returns:
        A ScoreResult named "coaching_quality" whose value is the judge's
        score clamped to [0.0, 1.0]. When the judge's reply cannot be parsed
        the value is 0.0 and the reason says so, including the raw reply.
    """

    judge_prompt = f"""You are evaluating guitar coaching feedback quality.

INPUT METRICS (already shown in UI):
{dataset_item['input']}

GENERATED FEEDBACK:
{llm_output}

EXPECTED FEEDBACK EXAMPLE:
{dataset_item['expected_output']}

Evaluate the GENERATED FEEDBACK on a scale of 0.0 to 1.0 based on:

1. **Non-redundancy (40%)**: Does it avoid repeating metric numbers already in the UI? (e.g., don't say "your pitch is 70%" - that's already shown)
2. **Actionability (30%)**: Does it give specific, actionable advice? (e.g., "move to 5th position" vs vague "improve your playing")
3. **Coaching value (30%)**: Does it interpret what the metrics mean and suggest concrete fixes?

CRITICAL: Heavy penalty if feedback just restates numbers from the input.

Respond with ONLY a number between 0.0 and 1.0, nothing else."""

    response = perplexity_client.chat.completions.create(
        model="sonar-pro",
        messages=[{"role": "user", "content": judge_prompt}],
        max_tokens=10,
        temperature=0
    )

    # content may be None, so do not call string methods on it directly
    raw_reply = response.choices[0].message.content
    score = _parse_judge_score(raw_reply)

    if score is None:
        # Keep the original default of 0.0, but make the failure visible
        # instead of reporting it as if the judge had scored 0.
        return ScoreResult(
            value=0.0,
            name="coaching_quality",
            reason=f"LLM judge reply could not be parsed as a score (defaulting to 0.0): {raw_reply!r}",
        )

    return ScoreResult(
        value=score,
        name="coaching_quality",
        reason=f"LLM judge score: {score:.3f} (emphasizing non-redundancy and actionability)",
    )

In [35]:
from opik_optimizer import ChatPrompt

# Define the prompt to optimize.
# The text is kept free of paste artifacts (trailing padding, stray indentation,
# mid-sentence line wraps) because every character here is sent to the model.
# NOTE(review): the first and third examples quote metric percentages, which the
# judge prompt in this notebook penalizes as redundant — consider revising them.
system_prompt = """You are a direct guitar coach giving quick real-time feedback.

Your feedback MUST be 1-2 sentences, maximum 30 words total.

Format: "[What's good], but [what's weak] - [specific actionable fix]"

Metric interpretations and specific fixes:
- Pitch Accuracy: How cleanly notes are fretted (low = finger pressure issues)
  → Fix: "ease finger pressure" or "focus on clean fretting"

- Scale Conformity: Playing correct scale notes across fretboard positions (low = stuck in one position or wrong notes)
  → Fix: "explore positions 5-7" or "move up the fretboard" or "try higher positions now"

- Timing Stability: Consistency of note spacing (low = rushing, dragging, uneven rhythm)
  → Fix: "use a metronome" or "practice with metronome at 60 BPM" or "slow down and count"

Examples:
- "Timing is solid at 98%, but scale conformity at 73% means you're stuck. Move up to the 5th position now."
- "Pitch is excellent and timing good, but scale conformity needs work. Explore different fretboard positions - try 7th and 9th frets."
- "Great scale coverage, but timing stability is low at 45%. Practice with a metronome to build consistency."

Be direct, conversational, and vary your wording. Maximum 30 words."""

# Wrap the system prompt in a chat-style ChatPrompt: one system message plus a
# user message whose single text part is the "{input}" placeholder.
prompt = ChatPrompt(
    messages=[
        dict(role="system", content=system_prompt),
        dict(role="user", content=[dict(type="text", text="{input}")]),
    ],
)

In [36]:
from opik_optimizer import HRPO

# Build the HRPO optimizer with its model name and that model's parameters
optimizer = HRPO(
    model="openai/gpt-4o-mini",
    model_parameters=dict(temperature=1),
)

In [None]:
# Run the optimization and keep the returned result for later inspection
optimization_result = optimizer.optimize_prompt(
    prompt=prompt,                          # the ChatPrompt built above
    dataset=dataset,                        # Opik dataset used for optimization
    validation_dataset=validation_dataset,  # optional hold-out dataset
    metric=llm_judge_metric,                # LLM judge: non-redundancy and actionability
    max_trials=10,                          # optional, number of runs
)