# Weights & Biases is Great

In [None]:
# Clone the voice-judge repository and copy its contents into the current working directory.
# Presumably this is what makes the local modules imported below (audio_utils, judge,
# preference_learner) and the `assets/` folder available — confirm against the repo layout.
!git clone https://github.com/morganmcg1/voice-judge.git && cp -r voice-judge/* .

In [1]:
# Install the dependencies listed in pyproject.toml with uv (`-q` keeps the output quiet).
!uv pip install -r pyproject.toml -q

In [2]:
from pprint import pprint
from pydantic import BaseModel, Field
import weave

import ipywidgets as widgets
from IPython.display import display

from audio_utils import AudioRanker, wave_read_to_wav_bytes
from judge import (
    JudgeRanking,
    run_speech_llm,
    update_pairwise_comparison_history,
    # update_pairwise_comparison_history_from_feedback
)
from preference_learner import PreferenceLearner, USER_CONTEXT_PROMPT

In [3]:
# Initialise Weave for this project. The returned client is reused later in the notebook
# when constructing the AudioRanker and when re-fetching the ranked call.
weave_client = weave.init("wandb-voice-ai/voice-judge")

[36m[1mweave[0m: Logged in as Weights & Biases user: morgan.
[36m[1mweave[0m: View Weave data at https://wandb.ai/wandb-voice-ai/voice-judge/weave


In [4]:
# --- Configuration ---
# SPEECH_AUDIO_DATASET_URI = "weave:///wandb-voice-ai/voice-judge/object/generated_speech_audio:v1"
SPEECH_AUDIO_DATASET_URI = "weave:///wandb-voice-ai/voice-judge/object/generated_speech_audio_test"
N_SAMPLES_TO_RANK = 3
PREFERENCE_LEARNER_MODEL = "gemini-2.0-flash"  # alternative: "gemini-2.5-pro-preview-05-06"
JUDGE_MODEL = "gemini-2.0-flash"  # alternative: "gemini-2.5-pro-preview-05-06"

# --- Load the dataset rows from Weave ---
print("Downloading speech samples from Weave...")
ds_ref = weave.ref(SPEECH_AUDIO_DATASET_URI).get()
speech_samples = list(ds_ref.rows)

# Fail early with a readable message instead of an IndexError halfway through the loop.
if len(speech_samples) < N_SAMPLES_TO_RANK:
    raise ValueError(
        f"Dataset has {len(speech_samples)} rows but N_SAMPLES_TO_RANK={N_SAMPLES_TO_RANK}."
    )

# Map each sample id to the fields used by the ranking / learning steps below.
# `short_hash` and `pairwise_comparison_history` start empty and are filled in later.
samples_to_rank = {
    row["voice_instructions_id"]: {
        "audio": row["audio"],
        "audio_bytes": wave_read_to_wav_bytes(row["audio"]),
        "instructions": row["voice_instructions"],
        "short_hash": None,
        "pairwise_comparison_history": {},
    }
    for row in speech_samples[:N_SAMPLES_TO_RANK]
}

# Sample ids in dataset order (dicts preserve insertion order).
ids_to_rank = list(samples_to_rank)

Downloading speech samples from Weave...


[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/019712b1-92c5-7943-a36b-533e40d5a4e8
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/019712b1-e0fc-72d1-a2c5-924020c65cf4
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/019712b4-2177-7bc1-8ad8-9f7af54bccfb
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/019712b6-a97d-7c83-bf52-0e69c3f7fec8
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/019712b7-1463-79e2-804c-0775d5fb49d3
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/019712b9-608f-7bc0-a0a7-46061852d832


In [5]:
# Register our samples for ranking
@weave.op
def rank_audio(sample_1: dict, sample_2: dict) -> None:
    """Do nothing; exists so that calling it records a Weave call with both samples as inputs.

    The call object produced below is handed to the ranker as `target_call`.
    """
    return None


# The first two loaded samples form the pair to be compared.
sample_1_id, sample_2_id = ids_to_rank[0], ids_to_rank[1]

# Each payload is the sample's stored fields plus its own id.
sample_1, sample_2 = (
    {"id": sample_id, **samples_to_rank[sample_id]}
    for sample_id in (sample_1_id, sample_2_id)
)

# `.call` returns (result, call); only the call object is needed.
_, target_call = rank_audio.call(sample_1, sample_2)

In [6]:
# Init Ranker

# One entry per sample in the pair; `original_input_order` is 1-based and follows
# the (sample_1, sample_2) order.
ranker_inputs = [
    {
        "id": sample_id,
        "audio": samples_to_rank[sample_id]["audio"],
        "original_input_order": input_order,
    }
    for input_order, sample_id in enumerate((sample_1_id, sample_2_id), start=1)
]

ranker = AudioRanker(
    ranker_inputs,
    weave_client=weave_client,
    target_call=target_call,
    image_path="assets/gta-vi_guy_in_front_of_truck_cropped.jpg",
    mode="gradio",
)

[36m[1mweave[0m: 📦 Published to https://wandb.ai/wandb-voice-ai/voice-judge/weave/objects/AudioRanker/versions/ym5a2rKViLvZtIjKh5VMKYzRxpZOmGhGDTLgSi8hDN8


In [7]:
# Build the Gradio ranking UI from the ranker and start it.
# The ranking has to be submitted in this UI before the feedback cells below can find a result.
interface = ranker.create_gradio_interface()
interface.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




✅ BinaryVoiceRank applied successfully to weave call. Result: ApplyScorerSuccess(result={'preferred_sample_id': 'weary_georgia_mutter_20250526_2033', 'sample_one_preferred': False, 'sample_two_preferred': True, 'ranking_timestamp': '2025-05-27_18-02'}, score_call=Call(_op_name=<Future at 0x11b180a10 state=running>, trace_id='019712b1-e0fc-72d1-a2c5-9237d50b0985', project_id='wandb-voice-ai/voice-judge', parent_id=None, inputs={'self': ObjectRef(entity='wandb-voice-ai', project='voice-judge', name='BinaryVoiceRank', _digest=<Future at 0x11aee9390 state=pending>, _extra=()), 'output': None}, id='019712b1-e0fc-72d1-a2c5-924020c65cf4', output={'preferred_sample_id': 'weary_georgia_mutter_20250526_2033', 'sample_one_preferred': False, 'sample_two_preferred': True, 'ranking_timestamp': '2025-05-27_18-02'}, exception=None, summary={'status_counts': {<TraceStatus.SUCCESS: 'success'>: 1, <TraceStatus.ERROR: 'error'>: 0}}, _display_name=None, attributes=AttributesDict({'weave': {'python': {'type

In [None]:
# # Do ranking
# ranker.display_widget()

In [8]:
%%capture
# Re-fetch the call by id from Weave; presumably needed so that feedback attached after
# `target_call` was created is visible — confirm. `%%capture` silences this cell's output.
updated_target_call = weave_client.get_call(target_call.id)
updated_target_call.feedback

In [13]:
# Take the first feedback payload that carries a ranking result under "output".
# An explicit failure replaces the NameError (fresh kernel) or the silent reuse of a stale
# `ranking` (re-run kernel) that occurred when no ranking feedback existed yet.
ranking = next(
    (
        feedback.payload["output"]
        for feedback in updated_target_call.feedback.feedbacks.items
        if "output" in feedback.payload
    ),
    None,
)
if ranking is None:
    raise RuntimeError(
        "No ranking feedback found on the target call. Submit a ranking in the "
        "interface, re-fetch the call, then re-run this cell."
    )
print("Rankings retried from Weave")

preferred_id = ranking["preferred_sample_id"]

# The bookkeeping below is only meaningful if the preferred sample is one of the compared pair.
if preferred_id not in (sample_1_id, sample_2_id):
    raise ValueError(
        f"Preferred sample {preferred_id!r} is not one of the compared samples "
        f"({sample_1_id!r}, {sample_2_id!r})."
    )

# The rejected sample is whichever member of the pair was not preferred.
rank2_id = sample_2_id if preferred_id == sample_1_id else sample_1_id

# Kept as notebook-level names in case later cells refer to them.
rank1 = samples_to_rank[preferred_id]
rank2 = samples_to_rank[rank2_id]

res = [
    {
        "rank": 1,
        "id": preferred_id,
        "original_input_order": 1 if preferred_id == sample_1_id else 2,
    },
    {
        "rank": 2,
        "id": rank2_id,
        "original_input_order": 1 if rank2_id == sample_1_id else 2,
    },
]

final_rankings = {
    "rankings": res,
    "completed_at": ranking["ranking_timestamp"],
    "preferred_id": preferred_id,
    "rejected_id": rank2_id,
}

# Record this comparison in each sample's `pairwise_comparison_history`.
samples_to_rank = update_pairwise_comparison_history(samples_to_rank, final_rankings)

Rankings retried from Weave


In [16]:
# Inspect the comparison history now stored for the first sample of the pair.
sample_1_id, samples_to_rank[sample_1_id]["pairwise_comparison_history"]

('surprised_french_veteran_20250526_2033',
 {'2025-05-27_18-02': {'competitor_id': 'weary_georgia_mutter_20250526_2033',
   'sample_rank_in_this_pair': 2}})

## Start Learning

In [17]:
# Create a fresh preference learner and display its patterns before any update.
learner = PreferenceLearner()
learner.patterns

{'strong': [], 'emerging': []}

In [18]:
# Feed the human ranking and the sample data to the learner, then keep the resulting
# patterns; they are substituted into the analyser prompt further down.
await learner.update(final_rankings, samples_to_rank)
ranking_patterns = learner.patterns

Updating comparisons...
Running pattern update...


Pattern update result:
('reasoning: In the provided example, the preferred voice exhibits a slight '
 'increase in pitch and more varied intonation, making it sound more '
 'expressive and engaging. The rejected voice sounds flatter and less '
 'enthusiastic. This suggests that higher pitch and varied intonation may be '
 'preferred. Since there are no existing strong or emerging patterns, these '
 'are considered new.')
'strong: []'
"emerging: ['Higher pitch is preferred.', 'Varied intonation is preferred.']"


## Update the Judge

In [23]:
# System instruction for the judge, with the shared user-context prompt interpolated.
# NOTE(review): a later cell ("Run the updated Judge") rebinds this name to a shorter string.
BASIC_JUDGE_SYSTEM_INSTRUCTION = f"""Assess the generated voices provided. The task is to assess the appropriateness \
of the audio voice sample for the given task. If multiple voice samples are provided, rank them in order of preference.

Here is some context about the task:

{USER_CONTEXT_PROMPT}

"""

# Starting judge prompt, before any preference learning; it is the prompt the analyser
# is asked to revise. (The f-prefix is not needed here: there are no placeholders.)
seed_judge_prompt = f"""Based on the following criteria, the task is to assess the appropriateness \
of the audio voice sample for a video game character in his late 50's. He should sound like a man in his late 50's \
and be a little on the wild side.

## Voice characteristics to consider
Consider the following aspects of the voice:
- style
- tone
- accent
- speed
- volume
- pitch
- intonation
and more
"""

# Assessment instructions for the judge: a bool for a single sample, a ranking for several.
# NOTE(review): not referenced anywhere in the visible part of the notebook — confirm usage.
judge_prompt_postfix = f"""## Assessment
- If a single voice sample is provided, assess it according to the above criteria and return a bool of whether \
it is appropriate.
- If multiple voice samples are provided, rank them in order of preference according to the above criteria.
"""

# --------- Analyzer prompts ---------
# NOTE(review): the prompt texts below contain typos ("LLLM", "acheived", "optimze",
# "Critially", "fell"). They are left as-is here because the text is sent to the model;
# fix deliberately if desired.

# System instruction used when asking the analyser model to revise the judge prompt.
ANALYZER_SYSTEM_INSTRUCTION = f"""The task is to help align a LLLM Judge towards user preferences. This will \
be acheived by analysing the current prompt as well as new preference data derived from the user, followed by \
making recommendations for how to update the prompt to align more with the user's preferences."""

# Template (plain string, not an f-string): `{current_judge_prompt}` and `{ranking_patterns}`
# must be filled in with `str.format` before the prompt is sent.
judge_prompt_analyser_prompt = """The task is to optimze a given LLM judge prompt to align more with a human \
rater's preferences.

## Current prompt
Below is the current judge prompt: 

<current_judge_prompt>
{current_judge_prompt}
</current_judge_prompt>

## Rater preferences
Below are patterns that have been observed from a human rater's preferences when doing pairwise comparisons of \
voice samples.

<rater_preferences>
{ranking_patterns}
</rater_preferences>

## Analysis
Critially analyse the <current_judge_prompt> and <rater_preferences> and make recommendations for whether or not \
the <current_judge_prompt> needs to be updated. You are not required to make edits to the <current_judge_prompt> \
if you fell it is already aligned with the <rater_preferences>.

## Output
Based on your analysis, please output a full updated judge prompt below. Do not include any placeholder text \
for the input variables. Just focus on describing what makes a good and bad voice sample.

"""

class JudgePromptAnalysis(BaseModel):
    # Structured response requested from the analyser model (used as `response_model`).
    # Documented with comments rather than a docstring so the model definition itself
    # carries exactly the same metadata as before.

    # Free-text justification of the analysis.
    reasoning: str = Field(
        description=(
            "A detailed explanation of your analysis of the <current_judge_prompt>, "
            "<rater_preferences> and what edits, if any, are needed to align the "
            "<current_judge_prompt> with the rater's preferences."
        )
    )
    # The revised judge prompt text.
    updated_judge_prompt: str = Field(
        description=(
            "The updated judge prompt based on the analysis, focussed just on the "
            "characteristics of a good and bad voice sample."
        )
    )

### Pass the seed judge prompt plus preference analysis

In [24]:
@weave.op
async def run_judge_prompt_analyser(judge_prompt_analyser_prompt: str, analyzer_model: str) -> JudgePromptAnalysis:
    """Ask the analyser model to review the judge prompt and propose an update.

    Args:
        judge_prompt_analyser_prompt: Prompt text forwarded unchanged to `run_speech_llm`.
        analyzer_model: Model name forwarded as `model_name`.

    Returns:
        The value returned by `run_speech_llm` when called with
        `response_model=JudgePromptAnalysis`.
    """
    return await run_speech_llm(
        system_instruction=ANALYZER_SYSTEM_INSTRUCTION,
        prompt=judge_prompt_analyser_prompt,
        model_name=analyzer_model,
        temperature=1.0,
        response_model=JudgePromptAnalysis,
    )

In [None]:
judge_prompt_analyser_prompt.format(current_judge_prompt=seed_judge_prompt, ranking_patterns=ranking_patterns)
ANALYZER_MODEL = "gemini-2.0-flash" #"gemini-2.5-pro-preview-05-06"

analyzer_result = await run_judge_prompt_analyser(
    judge_prompt_analyser_prompt=judge_prompt_analyser_prompt,
    analyzer_model=ANALYZER_MODEL
)

pprint(f"**Reasoning:**\n{analyzer_result.reasoning}\n")
pprint(f"**Updated Judge Prompt:**\n{analyzer_result.updated_judge_prompt}")

## Run the updated Judge

In [None]:
# NOTE(review): `[-10]` selects a single row (the tenth from the end), not the last ten rows.
# If a slice was intended this should be `speech_samples[-10:]` — confirm.
speech_samples_to_eval = speech_samples[-10]

In [15]:
speech_samples_to_eval = speech_samples[-10]

BASIC_JUDGE_SYSTEM_INSTRUCTION = "Assess the generated voices provided"
judge_prompt = f"""Based on the following criteria, your task is to rank the voices proivded:\

{}

"""


@weave.op
async def run_speech_judge():
    # Example with file path
    result = await run_speech_llm(
        system_instruction=BASIC_JUDGE_SYSTEM_INSTRUCTION,
        prompt=judge_prompt,
        model_name="gemini-2.5-pro-preview-05-06",  # "gemini-2.0-flash",
        temperature=0.1,
        response_model=JudgeRanking,
        audio_data=[
            samples_with_rankings[0]["audio_bytes"],
            samples_with_rankings[1]["audio_bytes"],
            samples_with_rankings[2]["audio_bytes"],
        ],
        initial_audio_parts_prompt="\n\nvoice_1:\n",
        audio_parts_prompt_divider="\n\nvoice_{input_order}:\n",
    )
    return result


# Run example
# result = asyncio.run(example())
result = await run_speech_judge()
pprint(result.thinking)
pprint(result.ranking)
print()

# for s in samples_with_rankings:
#     print(s.get("id"), s.get("short_hash"))
# print()

print("Human Ranking:")
for i in range(3):
    pprint(f"Voice {i + 1}: {final_rankings[i]}")

('The primary goal is to rank voices based on their suitability as a '
 "'frustrated gamer' along with other vocal qualities. \n"
 '\n'
 'Voice 1: Exhibits excellent clarity and naturalness. The pitch is slightly '
 "higher and has a strained quality that perfectly matches the 'frustrated "
 "gamer' persona. Intonation effectively conveys rising frustration. Pacing is "
 'good, and energy is high and engaging. The timbre supports the emotional '
 'state. Highly suitable for the context.\n'
 '\n'
 'Voice 2: Clear and natural, but the pitch is lower and the delivery is '
 'calmer, sounding more like general annoyance or disappointment rather than '
 'acute gamer frustration. While expressive, the energy level is lower '
 'compared to the others for this specific context. Timbre is pleasant but '
 "doesn't convey the same level of agitation. Less suitable for a 'frustrated "
 "gamer' but good for a generally annoyed tone.\n"
 '\n'
 'Voice 3: Excellent clarity and naturalness. The pitch is