In [1]:
from pprint import pprint
import datetime
from audio_utils import AudioRanker, wave_read_to_wav_bytes
from judge import (
    BASE_SYSTEM_INSTRUCTION,
    JudgeBasics,
    JudgeRanking,
    run_speech_llm,
    update_pairwise_comparison_history,
    BinaryVoiceRank,
)
from preference_learner import PreferenceLearner

In [2]:
import weave

# --- Configuration -----------------------------------------------------------
# Dataset of generated speech clips to rank. A pinned alternative is:
#   "weave:///wandb-voice-ai/voice-judge/object/generated_speech_audio:v1"
SPEECH_AUDIO_DATASET_URI = "weave:///wandb-voice-ai/voice-judge/object/generated_speech_audio_test"
N_SAMPLES_TO_RANK = 3
# Model names; "gemini-2.5-pro-preview-05-06" is the alternative for either one.
PREFERENCE_LEARNER_MODEL = "gemini-2.0-flash"
JUDGE_MODEL = "gemini-2.0-flash"

# --- Load the dataset rows from Weave ----------------------------------------
weave_client = weave.init("wandb-voice-ai/voice-judge")
print("Downloading speech samples from Weave...")
ds_ref = weave.ref(SPEECH_AUDIO_DATASET_URI).get()
speech_samples = list(ds_ref.rows)

# --- Index the first N_SAMPLES_TO_RANK rows by their voice_instructions_id ----
# Rows are fetched by position, so a dataset with fewer than N_SAMPLES_TO_RANK
# rows raises IndexError here rather than silently ranking fewer samples.
samples_to_rank = {
    row["voice_instructions_id"]: {
        "audio": row["audio"],
        "audio_bytes": wave_read_to_wav_bytes(row["audio"]),
        "instructions": row["voice_instructions"],
        "short_hash": None,
        "pairwise_comparison_history": {},
    }
    for row in (speech_samples[position] for position in range(N_SAMPLES_TO_RANK))
}

ids_to_rank = list(samples_to_rank)

[36m[1mweave[0m: Logged in as Weights & Biases user: morgan.
[36m[1mweave[0m: View Weave data at https://wandb.ai/wandb-voice-ai/voice-judge/weave


Downloading speech samples from Weave...


[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/01971199-f0de-7783-a12a-7b763b85b906
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/0197119a-0e3d-7340-9988-8c68519c8c41
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/0197119c-973c-7bc2-994d-7816d7695dee
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/0197119c-d869-79e2-a37f-dd7b796684bd
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/0197119d-6274-7f72-8f5c-b68a054d0485
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/0197119d-d572-71a1-911c-23559aae6bca
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/0197119e-0a1e-7ef1-aa17-2d9440ce567a
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/0197119e-5ead-7b52-bbf4-075a2744f4ae
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/019711a2-a3a2-7240-a9a9-695e514c72a8
[36m[1mw

In [3]:
# Register our samples for ranking
@weave.op
def rank_audio(sample_1: dict, sample_2: dict) -> None:
    """No-op Weave op that takes the two samples being compared.

    The body intentionally does nothing; the op is invoked so that a call
    object exists for this pair of samples.
    """
    return None

# The first two loaded samples form the pair to compare.
sample_1_id, sample_2_id = ids_to_rank[0], ids_to_rank[1]

# Each payload is the stored sample record with its id placed first; a key
# named "id" inside the record would take precedence over the one set here.
sample_1, sample_2 = (
    {"id": sample_id, **samples_to_rank[sample_id]}
    for sample_id in (sample_1_id, sample_2_id)
)

# Keep only the call object (second element of the returned pair).
_, target_call = rank_audio.call(sample_1, sample_2)

In [4]:
# Init Ranker
# One entry per sample, numbered from 1 in the order they are passed in.
# The stored "audio" object is handed over (not the "audio_bytes" entry).
ranker = AudioRanker(
    [
        {
            "id": sample_id,
            "audio": samples_to_rank[sample_id]["audio"],
            "original_input_order": input_order,
        }
        for input_order, sample_id in enumerate((sample_1_id, sample_2_id), start=1)
    ],
    weave_client=weave_client,
    target_call=target_call,
    image_path="assets/gta-vi_guy_in_front_of_truck_cropped.jpg",
)

[36m[1mweave[0m: 📦 Published to https://wandb.ai/wandb-voice-ai/voice-judge/weave/objects/AudioRanker/versions/ym5a2rKViLvZtIjKh5VMKYzRxpZOmGhGDTLgSi8hDN8


In [5]:
# Do ranking
# Render the ranker's interactive widget so a person can pick between the samples.
# NOTE(review): the cells after this one read the choice made in the widget, so a
# plain "Restart & Run All" presumably cannot reproduce their outputs unattended.

ranker.display_widget()

VBox(children=(HBox(children=(VBox(children=(HTML(value='<h3>Which voice sample suits this character?</h3>'), …

In [6]:
# Fetch the final rankings from the ranker (the displayed result is a dict with
# "rankings", "completed_at", "preferred_id" and "rejected_id").
ranking = ranker.get_final_rankings()
# Rebinds samples_to_rank to the helper's return value.
# NOTE(review): re-running this cell feeds the already-updated mapping back in;
# whether that is idempotent depends on update_pairwise_comparison_history — confirm.
samples_to_rank = update_pairwise_comparison_history(samples_to_rank, ranking)
# Bare last expression so the notebook displays the ranking.
ranking

{'rankings': [{'rank': 1,
   'id': 'weary_georgia_mutter_20250526_2033',
   'original_input_order': 2,
   'short_hash': 'c8ab9b2d'},
  {'rank': 2,
   'id': 'surprised_french_veteran_20250526_2033',
   'original_input_order': 1,
   'short_hash': '39de6b56'}],
 'completed_at': '2025-05-27_12-56',
 'preferred_id': 'weary_georgia_mutter_20250526_2033',
 'rejected_id': 'surprised_french_veteran_20250526_2033'}

In [46]:
# Create a preference learner with its default settings.
# NOTE(review): PREFERENCE_LEARNER_MODEL is defined in the configuration cell but is
# not passed here — confirm whether PreferenceLearner is meant to receive it.
learner = PreferenceLearner()

# update() is a coroutine awaited directly at cell top level; it receives the
# ranking collected above together with the per-sample records.
await learner.update(ranking, samples_to_rank)

Updating comparisons...
Running pattern update...


Pattern update result:
('reasoning: The preferred voice in the provided comparison demonstrates a '
 'tone that is slightly more engaging and casual, achieved through variations '
 'in intonation and slight variations in speed. This suggests that a more '
 'conversational and relatable delivery is preferred. The rejected voice has a '
 'flatter affect and a slightly less relaxed delivery, hence the preference.')
'strong: []'
("emerging: ['Preference for a conversational and relatable tone.', "
 "'Preference for variations in intonation and speed to enhance engagement.', "
 "'Avoidance of flat affect and less relaxed delivery.']")


In [15]:
# System instruction and user prompt for the baseline speech judge.
# NOTE(review): `builder_result` is not created by any earlier cell visible in
# review — confirm it is defined before this cell runs on a fresh kernel.
BASIC_JUDGE_SYSTEM_INSTRUCTION = "Assess the generated voices provided"
# The trailing backslash joins the first line to the blank line that follows, so
# exactly one newline separates the sentence from the criteria text.
judge_prompt = f"""Based on the following criteria, your task is to rank the voices provided:\

{builder_result.how_to_judge_a_voice!s}

"""


@weave.op
async def run_speech_judge():
    """Ask the speech LLM to rank three voice samples against ``judge_prompt``.

    Takes the ``"audio_bytes"`` entry of the first three items of the
    notebook-level ``samples_with_rankings`` and sends them, labelled
    ``voice_1`` and then via the ``voice_{input_order}`` divider template, to
    ``run_speech_llm`` with ``JudgeRanking`` as the requested response model.

    Returns:
        Whatever ``run_speech_llm`` returns for this request.
    """
    voice_clips = [samples_with_rankings[position]["audio_bytes"] for position in range(3)]
    return await run_speech_llm(
        system_instruction=BASIC_JUDGE_SYSTEM_INSTRUCTION,
        prompt=judge_prompt,
        model_name="gemini-2.5-pro-preview-05-06",  # alternative: "gemini-2.0-flash"
        temperature=0.1,
        response_model=JudgeRanking,
        audio_data=voice_clips,
        initial_audio_parts_prompt="\n\nvoice_1:\n",
        audio_parts_prompt_divider="\n\nvoice_{input_order}:\n",
    )


# Run example
# result = asyncio.run(example())
result = await run_speech_judge()
pprint(result.thinking)
pprint(result.ranking)
print()

# for s in samples_with_rankings:
#     print(s.get("id"), s.get("short_hash"))
# print()

print("Human Ranking:")
for i in range(3):
    pprint(f"Voice {i + 1}: {final_rankings[i]}")

('The primary goal is to rank voices based on their suitability as a '
 "'frustrated gamer' along with other vocal qualities. \n"
 '\n'
 'Voice 1: Exhibits excellent clarity and naturalness. The pitch is slightly '
 "higher and has a strained quality that perfectly matches the 'frustrated "
 "gamer' persona. Intonation effectively conveys rising frustration. Pacing is "
 'good, and energy is high and engaging. The timbre supports the emotional '
 'state. Highly suitable for the context.\n'
 '\n'
 'Voice 2: Clear and natural, but the pitch is lower and the delivery is '
 'calmer, sounding more like general annoyance or disappointment rather than '
 'acute gamer frustration. While expressive, the energy level is lower '
 'compared to the others for this specific context. Timbre is pleasant but '
 "doesn't convey the same level of agitation. Less suitable for a 'frustrated "
 "gamer' but good for a generally annoyed tone.\n"
 '\n'
 'Voice 3: Excellent clarity and naturalness. The pitch is