# In [None]:
import os
from dotenv import load_dotenv
load_dotenv("/Users/paddy/Documents/Github/FretCoach/web/web-backend/.env")

from opik import Opik
from opik.evaluation import evaluate
from opik.evaluation.metrics import base_metric, score_result
from typing import Any

from openai import OpenAI

# Initialize Opik client. No arguments are passed, so configuration is left to
# Opik's own defaults (presumably the environment loaded above - confirm).
client = Opik()

# Get validation dataset by name; it is handed to evaluate() further down.
dataset = client.get_dataset(name="fretcoach_live_ai_feedback_val_9")

# Initialize LLM for generating coaching feedback (used inside evaluation_task).
llm_client = OpenAI()

# System prompt from live_coach_service.py (optimized version).
# Sent verbatim as the "system" message in evaluation_task. The text is part of
# what is being evaluated, so edit it only when the prompt itself should change.
COACHING_SYSTEM_PROMPT = """You are a direct guitar coach giving quick real-time feedback. Your feedback MUST be 1-2 sentences, maximum 30 words total.

Format: "[What's good], but [what's weak] - [specific actionable fix based on performance context]"

To improve insight relevance, always relate feedback to specific performance scores, especially scores above 0.700. Include contextual details from the player's playing style to inform suggestions:

- Pitch Accuracy: How cleanly notes are fretted (low = finger pressure issues)
  → Fix: "ease finger pressure to improve note clarity" or "focus on clean fretting by adjusting finger placement"

- Scale Conformity: Playing correct scale notes across fretboard positions (low = stuck in one position or wrong notes)
  → Fix: "explore positions 5-7 to enhance versatility" or "move up the fretboard to discover new notes"

- Timing Stability: Consistency of note spacing (low = rushing, dragging, uneven rhythm)
  → Fix: "use a metronome at 60 BPM to develop timing" or "slow down and count to create consistent spacing"

Be direct and conversational, and vary your wording. Ensure your suggestions are anchored in the player's specific performance metrics and informed by previous high-quality outputs. Maximum 30 words."""

# Initialize Perplexity client for LLM judge.
# Fail fast when PPLX_API_KEY is missing: OpenAI(api_key=None) falls back to
# the OPENAI_API_KEY environment variable, which would then be sent to
# api.perplexity.ai and rejected with a confusing authentication error.
_pplx_api_key = os.getenv("PPLX_API_KEY")
if not _pplx_api_key:
    raise RuntimeError(
        "PPLX_API_KEY is not set; it is required for the Perplexity LLM judge."
    )

perplexity_client = OpenAI(
    api_key=_pplx_api_key,
    base_url="https://api.perplexity.ai"
)

class CoachingQualityMetric(base_metric.BaseMetric):
    """LLM judge metric evaluating coaching feedback quality.

    The judge prompt asks for a single number between 0.0 and 1.0 that weighs
    non-redundancy (40%), actionability (30%) and coaching value (30%). The
    request is sent through the module-level ``perplexity_client``.
    """

    def __init__(self):
        super().__init__(name="coaching_quality")

    @staticmethod
    def _parse_score(raw_text):
        """Extract a score in [0.0, 1.0] from the judge's reply.

        Args:
            raw_text: The reply text, or None when the API returned no content.

        Returns:
            The score as a float, or None when no usable number is present.
        """
        # Imported locally so the class does not depend on file-level imports.
        import math
        import re

        text = (raw_text or "").strip()
        if not text:
            return None

        try:
            value = float(text)
        except ValueError:
            # The judge often decorates the number ("0.85[1]", "Score: 0.8",
            # "**0.7**"). Accept only an explicit decimal that is not part of
            # a longer or negative number, and only if it is already in range,
            # so replies such as "8/10" or "1. Non-redundancy" are rejected.
            match = re.search(r"(?<![\d.\-])[01]?\.\d+(?!\d)", text)
            if match is None:
                return None
            value = float(match.group())
            return value if 0.0 <= value <= 1.0 else None

        # float() accepts "nan"/"inf"; clamping NaN with min/max yields 1.0,
        # so non-finite replies are treated as unparseable instead.
        if not math.isfinite(value):
            return None
        return max(0.0, min(1.0, value))

    def score(self, output: str, reference: str, input: str, **kwargs) -> score_result.ScoreResult:
        """
        Evaluate coaching feedback quality

        Args:
            output: Generated coaching feedback
            reference: Expected coaching feedback
            input: Input metrics (already shown in UI)
            **kwargs: Accepted and ignored.

        Returns:
            A ScoreResult whose value lies in [0.0, 1.0]. When the judge reply
            cannot be parsed the value is 0.0 and ``reason`` carries the raw
            reply so the failure is visible.

        The parameter names mirror the keys of the dict returned by
        evaluation_task, so they must not be renamed even though ``input``
        shadows the builtin.
        """
        judge_prompt = f"""You are evaluating guitar coaching feedback quality.

INPUT METRICS (already shown in UI):
{input}

GENERATED FEEDBACK:
{output}

EXPECTED FEEDBACK EXAMPLE:
{reference}

Evaluate the GENERATED FEEDBACK on a scale of 0.0 to 1.0 based on:

1. **Non-redundancy (40%)**: Does it avoid repeating metric numbers already in the UI?
2. **Actionability (30%)**: Does it give specific, actionable advice?
3. **Coaching value (30%)**: Does it interpret what the metrics mean and suggest concrete fixes?

CRITICAL: Heavy penalty if feedback just restates numbers from the input.

Respond with ONLY a number between 0.0 and 1.0, nothing else."""

        response = perplexity_client.chat.completions.create(
            model="sonar-pro",
            messages=[{"role": "user", "content": judge_prompt}],
            max_tokens=10,
            temperature=0
        )

        # content may be None; _parse_score handles that case.
        raw_reply = response.choices[0].message.content
        score_value = self._parse_score(raw_reply)

        if score_value is None:
            return score_result.ScoreResult(
                value=0.0,
                name=self.name,
                reason=f"LLM judge reply could not be parsed as a 0.0-1.0 score: {raw_reply!r}",
            )

        return score_result.ScoreResult(
            value=score_value,
            name=self.name,
            reason=f"LLM judge score: {score_value:.3f} (non-redundancy + actionability)",
        )

def evaluation_task(dataset_item):
    """Produce one coaching reply for a dataset item and package it for scoring.

    Args:
        dataset_item: Mapping that provides the 'input' and 'expected_output'
            keys read below.

    Returns:
        A dict with the keys 'input', 'output' and 'reference'.
    """
    metrics_text = dataset_item['input']

    # The system prompt sets the coaching style; the item's input is the user turn.
    chat_messages = [
        {"role": "system", "content": COACHING_SYSTEM_PROMPT},
        {"role": "user", "content": metrics_text},
    ]
    completion = llm_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        temperature=0.9,
        max_tokens=100,
    )
    feedback = completion.choices[0].message.content.strip()

    return {
        "input": metrics_text,
        "output": feedback,
        "reference": dataset_item['expected_output'],
    }

# Score with the custom LLM-judge metric only.
metrics = [CoachingQualityMetric()]

# Run the experiment: each dataset item goes through evaluation_task and the
# result is scored by every metric in `metrics`.
eval_results = evaluate(
    dataset=dataset,
    task=evaluation_task,
    scoring_metrics=metrics,
    experiment_name="fretcoach-coaching-feedback-eval",
)


import os
from opik import Opik
from opik.evaluation import evaluate


from opik.evaluation.metrics import (Hallucination, LevenshteinRatio, Moderation, AnswerRelevance, ContextRecall, ContextPrecision)
from openai import OpenAI

# Initialize clients. These rebind the `client` and `llm_client` names that
# were already created earlier in this file, using the same constructors.
client = Opik()
llm_client = OpenAI()

# Get validation dataset (same dataset name as the earlier lookup in this file).
dataset = client.get_dataset(name="fretcoach_live_ai_feedback_val_9")

# System prompt from live_coach_service.py (optimized version).
# This repeats the COACHING_SYSTEM_PROMPT assignment made earlier in the file;
# keep the two copies in sync. It is used below both as the "system" message
# and as the `context` entry returned by evaluation_task_default_metrics.
COACHING_SYSTEM_PROMPT = """You are a direct guitar coach giving quick real-time feedback. Your feedback MUST be 1-2 sentences, maximum 30 words total.

Format: "[What's good], but [what's weak] - [specific actionable fix based on performance context]"

To improve insight relevance, always relate feedback to specific performance scores, especially scores above 0.700. Include contextual details from the player's playing style to inform suggestions:

- Pitch Accuracy: How cleanly notes are fretted (low = finger pressure issues)
  → Fix: "ease finger pressure to improve note clarity" or "focus on clean fretting by adjusting finger placement"

- Scale Conformity: Playing correct scale notes across fretboard positions (low = stuck in one position or wrong notes)
  → Fix: "explore positions 5-7 to enhance versatility" or "move up the fretboard to discover new notes"

- Timing Stability: Consistency of note spacing (low = rushing, dragging, uneven rhythm)
  → Fix: "use a metronome at 60 BPM to develop timing" or "slow down and count to create consistent spacing"

Be direct and conversational, and vary your wording. Ensure your suggestions are anchored in the player's specific performance metrics and informed by previous high-quality outputs. Maximum 30 words."""

def evaluation_task_default_metrics(dataset_item):
    """Generate coaching feedback and return it with a context entry.

    Args:
        dataset_item: Mapping that provides the 'input' and 'expected_output'
            keys read below.

    Returns:
        A dict with the keys 'input', 'output', 'reference' and 'context',
        where 'context' is a one-element list holding the system prompt.
    """
    prompt_input = dataset_item['input']

    completion = llm_client.chat.completions.create(
        model="gpt-4o-mini",
        temperature=0.9,
        max_tokens=100,
        messages=[
            {"role": "system", "content": COACHING_SYSTEM_PROMPT},
            {"role": "user", "content": prompt_input},
        ],
    )
    generated = completion.choices[0].message.content.strip()

    return {
        "input": prompt_input,
        "output": generated,
        "reference": dataset_item['expected_output'],
        # The system prompt is the only context supplied alongside the output.
        "context": [COACHING_SYSTEM_PROMPT],
    }

# Built-in Opik metrics, each instantiated with its default arguments and in
# this order.
metrics = [
    metric_cls()
    for metric_cls in (
        Hallucination,
        LevenshteinRatio,
        Moderation,
        AnswerRelevance,
        ContextRecall,
        ContextPrecision,
    )
]

# Run a second experiment over the same dataset using the built-in metrics.
eval_results_default = evaluate(
    dataset=dataset,
    task=evaluation_task_default_metrics,
    scoring_metrics=metrics,
    experiment_name="fretcoach-default-metrics-eval",
)