# Weights & Biases is Great

In [None]:
!git clone https://github.com/morganmcg1/voice-judge.git && cp -r voice-judge/* .
!uv pip install -r pyproject.toml -q

In [14]:
from pprint import pprint
import weave

import ipywidgets as widgets
from IPython.display import display

from audio_utils import AudioRanker, wave_read_to_wav_bytes
from judge import (
    JudgeRanking,
    run_speech_llm,
    update_pairwise_comparison_history,
)
from preference_learner import PreferenceLearner, USER_CONTEXT_PROMPT

In [None]:
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
weave_client = weave.init("wandb-voice-ai/voice-judge")

In [None]:
# SPEECH_AUDIO_DATASET_URI = "weave:///wandb-voice-ai/voice-judge/object/generated_speech_audio:v1"
SPEECH_AUDIO_DATASET_URI = "weave:///wandb-voice-ai/voice-judge/object/generated_speech_audio_test"
N_SAMPLES_TO_RANK = 3
PREFERENCE_LEARNER_MODEL =  "gemini-2.0-flash" #"gemini-2.5-pro-preview-05-06",  # "gemini-2.0-flash"
JUDGE_MODEL =  "gemini-2.0-flash" # "gemini-2.5-pro-preview-05-06"

print("Downloading speech samples from Weave...")
ds_ref = weave.ref(SPEECH_AUDIO_DATASET_URI).get()
speech_samples = list(ds_ref.rows)

samples_to_rank = {}
for i in range(N_SAMPLES_TO_RANK):
    samples_to_rank[speech_samples[i]["voice_instructions_id"]] = {
        "audio": speech_samples[i]["audio"],
        "audio_bytes": wave_read_to_wav_bytes(speech_samples[i]["audio"]),
        "instructions": speech_samples[i]["voice_instructions"],
        "short_hash": None,
        "pairwise_comparison_history": {},
    }

ids_to_rank = list(samples_to_rank.keys())

[36m[1mweave[0m: Logged in as Weights & Biases user: morgan.
[36m[1mweave[0m: View Weave data at https://wandb.ai/wandb-voice-ai/voice-judge/weave


Downloading speech samples from Weave...


In [3]:
# Register our samples for ranking
@weave.op
def rank_audio(sample_1: dict, sample_2: dict) -> None:
    pass

sample_1_id = ids_to_rank[0]
sample_2_id = ids_to_rank[1]

sample_1 = {"id": sample_1_id, **samples_to_rank[sample_1_id]}
sample_2 = {"id": sample_2_id, **samples_to_rank[sample_2_id]}

_, target_call = rank_audio.call(sample_1, sample_2)

In [4]:
# Init Ranker

ranker = AudioRanker(
    [
        {
            "id": sample_1_id,
            "audio": samples_to_rank[sample_1_id]["audio"],
            "original_input_order": 1,
        },
        {
            "id": sample_2_id,
            # "audio": samples_to_rank[sample_2_id]["audio_bytes"],
            "audio": samples_to_rank[sample_2_id]["audio"],
            "original_input_order": 2,
        },
    ],
    weave_client=weave_client,
    target_call=target_call,
    image_path="assets/gta-vi_guy_in_front_of_truck_cropped.jpg",
)

[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/01971202-051b-78a0-a866-0cdd5ebc3146
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/01971202-2fed-7400-a505-aceaed904f25
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/01971202-5f05-7771-be74-6e740ff39963
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/01971202-77e6-7b83-b826-a12845cbb5fa
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/01971204-496b-7bd2-8c92-a96f3b07c9cf
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/0197121a-6639-7402-b721-95d247ab431d
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/0197121b-4ae5-7bf2-ab3d-170957559b92
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/0197121b-b288-7870-90a4-bd501382738d
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/0197121c-10f5-7fc3-ad53-13eca1360388


[36m[1mweave[0m: 📦 Published to https://wandb.ai/wandb-voice-ai/voice-judge/weave/objects/AudioRanker/versions/ym5a2rKViLvZtIjKh5VMKYzRxpZOmGhGDTLgSi8hDN8


In [5]:
# Do ranking

ranker.display_widget()

VBox(children=(HBox(children=(VBox(children=(HTML(value='<h3>Which voice sample suits this character?</h3>'), …

In [6]:
ranking = ranker.get_final_rankings()
samples_to_rank = update_pairwise_comparison_history(samples_to_rank, ranking)
ranking

{'rankings': [{'rank': 1,
   'id': 'weary_georgia_mutter_20250526_2033',
   'original_input_order': 2,
   'short_hash': 'c8ab9b2d'},
  {'rank': 2,
   'id': 'surprised_french_veteran_20250526_2033',
   'original_input_order': 1,
   'short_hash': '39de6b56'}],
 'completed_at': '2025-05-27_14-50',
 'preferred_id': 'weary_georgia_mutter_20250526_2033',
 'rejected_id': 'surprised_french_veteran_20250526_2033'}

In [11]:
learner = PreferenceLearner()
learner.patterns

{'strong': [], 'emerging': []}

In [16]:
await learner.update(ranking, samples_to_rank)
ranking_patterns =learner.patterns

Updating comparisons...
Running pattern update...


Pattern update result:
('reasoning: The provided pairwise comparisons consistently prefer voices with '
 'varied volume and pitch over monotonous voices. This reinforces the existing '
 'emerging pattern. Since this pattern is seen repeatedly, it is now promoted '
 'to a strong pattern. There are no new patterns contradicted or observed in '
 'the recent comparisons.')
"strong: ['Preference for voices with varied volume and pitch.']"
"emerging: ['Avoid monotonous voices.']"


## Update the Judge

In [18]:
BASIC_JUDGE_SYSTEM_INSTRUCTION = f"""Assess the generated voices provided. The task is to assess the appropriateness \
of the audio voice sample for the given task. If multiple voice samples are provided, rank them in order of preference.

Here is some context about the task:

{USER_CONTEXT_PROMPT}

"""

seed_judge_prompt = f"""Based on the following criteria, the task is to assess the appropriateness \
of the audio voice sample for a video game character in his late 50's. He should sound like a man in his late 50's \
and be a little on the wild side.

## Voice characteristics to consider
Consider the following aspects of the voice:
- style
- tone
- accent
- speed
- volume
- pitch
- intonation
and more
"""

judge_prompt_postfix = f"""## Assessment
- If a single voice sample is provided, assess it according to the above criteria and return a bool of whether \
it is appropriate.
- If multiple voice samples are provided, rank them in order of preference according to the above criteria.
"""

# ---------Analyzer Prompts ---------

ANALYZER_SYSTEM_INSTRUCTION = f"""The task is to help align a LLLM Judge towards user preferences. This will \
be acheived by analysing the current prompt as well as new preference data derived from the user, followed by \
making recommendations for how to update the prompt to align more with the user's preferences."""

judge_prompt_analyser_prompt = """The task is to optimze a given LLM judge prompt to align more with a human \
rater's preferences.

## Current prompt
Below is the current judge prompt: 

<current_judge_prompt>
{current_judge_prompt}
</current_judge_prompt>

## Rater preferences
Below are patterns that have been observed from a human rater's preferences when doing pairwise comparisons of \
voice samples.

<rater_preferences>
{ranking_patterns}
</rater_preferences>

## Analysis
Critially analyse the <current_judge_prompt> and <rater_preferences> and make recommendations for whether or not \
the <current_judge_prompt> needs to be updated. You are not required to make edits to the <current_judge_prompt> \
if you fell it is already aligned with the <rater_preferences>.

## Output
Based on your analysis, please output a full updated judge prompt below.

"""
from pydantic import BaseModel, Field

class JudgePromptAnalysis(BaseModel):
    reasoning: str = Field(description="A detailed explanation of your analysis of the <current_judge_prompt>, \
<rater_preferences> and what edits, if any, are needed to align the <current_judge_prompt> with the rater's preferences.")
    updated_judge_prompt: str = Field(description="The updated judge prompt based on the analysis.")

judge_prompt_analyser_prompt.format(current_judge_prompt=seed_judge_prompt, ranking_patterns=ranking_patterns)
# judge_prompt_analyser_prompt+=judge_prompt_postfix

ANALYZER_MODEL = "gemini-2.0-flash" #"gemini-2.5-pro-preview-05-06" "gemini-2.5-pro-preview-05-06",  # "gemini-2.0-flash",

@weave.op
async def run_judge_prompt_analyser():
    # Example with file path
    result = await run_speech_llm(
        system_instruction=ANALYZER_SYSTEM_INSTRUCTION,
        prompt=judge_prompt_analyser_prompt,
        model_name=ANALYZER_MODEL,
        temperature=1.0,
        response_model=JudgePromptAnalysis,
        initial_audio_parts_prompt="\n\nvoice_1:\n",
        audio_parts_prompt_divider="\n\nvoice_{input_order}:\n",
    )
    return result

In [21]:
analyzer_result = await run_judge_prompt_analyser()

print(f"**Reasoning:**\n{analyzer_result.reasoning}\n")
print(f"**Updated Judge Prompt:**\n{analyzer_result.updated_judge_prompt}")

**Reasoning:**
The current judge prompt lacks specific guidance on how to evaluate voice samples based on the identified rater preferences. The rater prioritizes naturalness, clarity, and expressiveness, and penalizes robotic or monotone voices, as well as background noise and audio artifacts. The prompt should be updated to explicitly incorporate these criteria to better align with the rater's judgment. Specifically, I will add instructions to explicitly prioritize samples with natural prosody and intonation, clear articulation, and expressive delivery. Also, I will add instructions to penalize samples with robotic or monotone delivery, background noise, and any audio artifacts (clipping, distortion, etc).

**Updated Judge Prompt:**
You are a highly skilled AI assistant that evaluates the quality of generated voice samples. Your evaluation should focus on the following criteria, with an emphasis on naturalness, clarity, and expressiveness:

1.  Naturalness: Assess how human-like the v

## Run the updated Judge

In [None]:
speech_samples_to_eval = speech_samples[-10]

In [15]:
speech_samples_to_eval = speech_samples[-10]

BASIC_JUDGE_SYSTEM_INSTRUCTION = "Assess the generated voices provided"
judge_prompt = f"""Based on the following criteria, your task is to rank the voices proivded:\

{}

"""


@weave.op
async def run_speech_judge():
    # Example with file path
    result = await run_speech_llm(
        system_instruction=BASIC_JUDGE_SYSTEM_INSTRUCTION,
        prompt=judge_prompt,
        model_name="gemini-2.5-pro-preview-05-06",  # "gemini-2.0-flash",
        temperature=0.1,
        response_model=JudgeRanking,
        audio_data=[
            samples_with_rankings[0]["audio_bytes"],
            samples_with_rankings[1]["audio_bytes"],
            samples_with_rankings[2]["audio_bytes"],
        ],
        initial_audio_parts_prompt="\n\nvoice_1:\n",
        audio_parts_prompt_divider="\n\nvoice_{input_order}:\n",
    )
    return result


# Run example
# result = asyncio.run(example())
result = await run_speech_judge()
pprint(result.thinking)
pprint(result.ranking)
print()

# for s in samples_with_rankings:
#     print(s.get("id"), s.get("short_hash"))
# print()

print("Human Ranking:")
for i in range(3):
    pprint(f"Voice {i + 1}: {final_rankings[i]}")

('The primary goal is to rank voices based on their suitability as a '
 "'frustrated gamer' along with other vocal qualities. \n"
 '\n'
 'Voice 1: Exhibits excellent clarity and naturalness. The pitch is slightly '
 "higher and has a strained quality that perfectly matches the 'frustrated "
 "gamer' persona. Intonation effectively conveys rising frustration. Pacing is "
 'good, and energy is high and engaging. The timbre supports the emotional '
 'state. Highly suitable for the context.\n'
 '\n'
 'Voice 2: Clear and natural, but the pitch is lower and the delivery is '
 'calmer, sounding more like general annoyance or disappointment rather than '
 'acute gamer frustration. While expressive, the energy level is lower '
 'compared to the others for this specific context. Timbre is pleasant but '
 "doesn't convey the same level of agitation. Less suitable for a 'frustrated "
 "gamer' but good for a generally annoyed tone.\n"
 '\n'
 'Voice 3: Excellent clarity and naturalness. The pitch is