In [1]:
import asyncio
from pprint import pprint
from audio_utils import AudioRankerSequential, wave_read_to_wav_bytes
from judge import BASE_SYSTEM_INSTRUCTION, JudgeBasics, JudgeRanking, run_speech_llm

In [2]:
import weave

# Weave URI of the dataset holding the generated speech samples. The ":v1"
# variant is kept commented out for reference; the "_test" dataset is the one loaded.
# SPEECH_AUDIO_DATASET_URI = "weave:///wandb-voice-ai/voice-judge/object/generated_speech_audio:v1"
SPEECH_AUDIO_DATASET_URI = "weave:///wandb-voice-ai/voice-judge/object/generated_speech_audio_test"

# Initialise Weave for the project; the returned client is passed to the ranker later on.
weave_client = weave.init("wandb-voice-ai/voice-judge")
print("Downloading speech samples from Weave...")
# Resolve the dataset reference and materialise its rows into a list so they can be indexed.
ds_ref = weave.ref(SPEECH_AUDIO_DATASET_URI).get()
speech_samples = list(ds_ref.rows)

# Take the first three speech samples, reduced to the fields used by the
# ranking flow (id, raw audio object, and the voice instructions).
samples_to_rank = [
    {
        "id": speech_samples[idx]["voice_instructions_id"],
        "audio": speech_samples[idx]["audio"],
        "instructions": speech_samples[idx]["voice_instructions"],
    }
    for idx in range(3)
]

[36m[1mweave[0m: Logged in as Weights & Biases user: morgan.
[36m[1mweave[0m: View Weave data at https://wandb.ai/wandb-voice-ai/voice-judge/weave


Downloading speech samples from Weave...


[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/01970e6b-22fc-7c03-ba45-3925608f1e5c
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/01970e6c-42fd-7263-b0d9-50ffb3349a4d
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/01970e6e-6128-74c1-88ca-cd2ee6553f03
[36m[1mweave[0m: 🍩 https://wandb.ai/wandb-voice-ai/voice-judge/r/call/01970e6e-a4c1-76a1-82b1-b06fb20ab92c


In [3]:
@weave.op
def rank_audio(sample_one, sample_two, sample_three):
    """Placeholder op with an empty body; it always returns ``None``.

    It is invoked once with the three samples so that a Weave call exists
    whose id can be handed to the ranker as ``target_call_id``.
    """
    pass

# Record one traced call of the placeholder op; its id is passed to the ranker below.
_, call = rank_audio.call(
    samples_to_rank[0],
    samples_to_rank[1],
    samples_to_rank[2],
)
target_call_id = call.id

# One entry per sample: id, WAV bytes, and the 1-based position in the input order.
ranker = AudioRankerSequential(
    [
        {
            "id": speech_samples[idx]["voice_instructions_id"],
            "audio": wave_read_to_wav_bytes(speech_samples[idx]["audio"]),
            "original_input_order": idx + 1,
        }
        for idx in range(3)
    ],
    weave_client=weave_client,
    target_call_id=target_call_id,
)

[36m[1mweave[0m: 📦 Published to https://wandb.ai/wandb-voice-ai/voice-judge/weave/objects/AudioSequentialRanking/versions/0hd2Wj2WfhQyILdb85TPQUd5YBnY7agbbYYdFEo6X0E


In [4]:
# Render the ranker's widget for listening to the audio samples.
# NOTE(review): the rankings read in the following cell appear to depend on manual
# interaction with this widget, so they may not be reproducible via Run All — confirm.
ranker.display_widget()

VBox(children=(VBox(children=(HTML(value='<h3>Listen to Audio Samples:</h3>'), VBox(children=(Label(value='Sam…

In [5]:
# Read the rankings back from the ranker and show them as the cell output.
final_rankings = ranker.get_final_rankings()
final_rankings

[{'rank': 1,
  'id': 'weary_georgia_mutter_20250526_2033',
  'original_input_order': 2,
  'short_hash': 'c8ab9b2d'},
 {'rank': 2,
  'id': 'hushed_georgia_rush_20250526_2033',
  'original_input_order': 3,
  'short_hash': 'adcc9dc3'},
 {'rank': 3,
  'id': 'surprised_french_veteran_20250526_2033',
  'original_input_order': 1,
  'short_hash': '39de6b56'}]

In [13]:
# Join every human ranking to the sample with the same id, carrying over the
# rank and short hash and attaching the sample's audio as WAV bytes.
# NOTE(review): these audio objects were already passed through
# wave_read_to_wav_bytes when the ranker was built; if that helper consumes the
# reader without rewinding, this second conversion could come back empty — confirm.
samples_with_rankings = [
    {
        "id": sample.get("id"),
        "audio": sample.get("audio"),
        "instructions": sample.get("instructions"),
        "rank": ranking.get("rank"),
        "audio_bytes": wave_read_to_wav_bytes(sample.get("audio")),
        "short_hash": ranking.get("short_hash"),
    }
    for ranking in final_rankings
    for sample in samples_to_rank
    if sample.get("id") == ranking.get("id")
]

# Order the joined entries by ascending rank (rank 1 first).
samples_with_rankings.sort(key=lambda entry: entry.get("rank"))

In [14]:
# Prompt asking the model to explain the human ranking in terms of voice traits.
task_prompt = (
    "Given a ranking of 3 voices by a human judge from most liked to least liked, "
    "describe the voice characteristics that lead to the ranking."
)

@weave.op
async def run_speech_analyzer():
    """Ask the speech LLM to describe what drives the human ranking.

    Sends ``task_prompt`` together with the audio bytes of the first three
    entries of the notebook-level ``samples_with_rankings`` and returns the
    result of ``run_speech_llm`` for the ``JudgeBasics`` response model.
    """
    ranked_audio = [samples_with_rankings[idx]["audio_bytes"] for idx in range(3)]
    return await run_speech_llm(
        system_instruction=BASE_SYSTEM_INSTRUCTION,
        prompt=task_prompt,
        model_name="gemini-2.5-pro-preview-05-06",  # alternative: "gemini-2.0-flash"
        temperature=0.1,
        response_model=JudgeBasics,
        audio_data=ranked_audio,
    )


# Run the analyzer and show its reasoning and the derived judging criteria.
# Top-level ``await`` is valid here only because IPython/Jupyter supports it in cells.
builder_result = await run_speech_analyzer()
pprint(builder_result.thinking)
pprint(builder_result.how_to_judge_a_voice)

('The ranking seems to prioritize naturalness, clarity, and an engaging tone '
 "that fits the context of the dialogue (which sounds like a gamer's "
 'frustration). \n'
 'Rank 1 voice is clear, energetic, and has a natural, expressive intonation. '
 'The pitch is moderate and pleasant. It sounds like a young adult, fitting '
 'the context of talking about GTA 6. The delivery has a good balance of '
 'exasperation and clarity.\n'
 "Rank 2 voice is deeper and more measured. It's clear, but less energetic and "
 'expressive than Rank 1. The pacing is slightly slower. While a good voice, '
 'it might sound a bit too serious or gruff for the specific line compared to '
 'the more animated Rank 1. It has a slightly gravelly quality.\n'
 'Rank 3 voice is higher-pitched and sounds younger, almost adolescent. The '
 'delivery feels a bit more forced or theatrical, and less natural than the '
 "other two. The intonation is more pronounced but doesn't sound as authentic. "
 'It also sounds sligh

In [15]:
# System instruction for the judging pass.
BASIC_JUDGE_SYSTEM_INSTRUCTION = "Assess the generated voices provided"

# Judge prompt: a one-line task statement, then the criteria produced by the
# analyzer step, then a blank line. Built from explicit pieces so the newline
# layout is visible instead of relying on a backslash continuation inside a
# triple-quoted string. The misspelling "proivded" in the task statement is corrected.
judge_prompt = (
    "Based on the following criteria, your task is to rank the voices provided:\n"
    f"{builder_result.how_to_judge_a_voice!s}\n"
    "\n"
)

@weave.op
async def run_speech_judge():
    """Ask the speech LLM to rank the three voices using ``judge_prompt``.

    Sends the audio bytes of the first three entries of the notebook-level
    ``samples_with_rankings``, labelled ``voice_1`` onwards through the
    audio-part prompt arguments, and returns the result of ``run_speech_llm``
    for the ``JudgeRanking`` response model.

    NOTE(review): ``samples_with_rankings`` is sorted by human rank, so the
    voices reach the judge in the human's preference order; a judge that simply
    echoes input order would agree with the human. Consider shuffling — confirm
    whether this ordering is intended.
    """
    ranked_audio = [samples_with_rankings[idx]["audio_bytes"] for idx in range(3)]
    return await run_speech_llm(
        system_instruction=BASIC_JUDGE_SYSTEM_INSTRUCTION,
        prompt=judge_prompt,
        model_name="gemini-2.5-pro-preview-05-06",  # alternative: "gemini-2.0-flash"
        temperature=0.1,
        response_model=JudgeRanking,
        audio_data=ranked_audio,
        initial_audio_parts_prompt="\n\nvoice_1:\n",
        audio_parts_prompt_divider="\n\nvoice_{input_order}:\n",
    )


# Run the judge and show its reasoning and the ranking it produced.
# Top-level ``await`` is valid here only because IPython/Jupyter supports it in cells.
result = await run_speech_judge()
pprint(result.thinking)
pprint(result.ranking)
print()

# Print the human ranking for comparison with the judge's output.
# Entries are ordered by rank here, with the same key used to sort
# samples_with_rankings, so "Voice N" lines up with the N-th audio clip sent
# to the judge even if final_rankings is not already rank-sorted.
# print() is used instead of pprint(), which would emit the quoted, wrapped
# repr of the formatted string.
print("Human Ranking:")
for position, ranking in enumerate(
    sorted(final_rankings, key=lambda entry: entry.get("rank")), start=1
):
    print(f"Voice {position}: {ranking}")

('The primary goal is to rank voices based on their suitability as a '
 "'frustrated gamer' along with other vocal qualities. \n"
 '\n'
 'Voice 1: Exhibits excellent clarity and naturalness. The pitch is slightly '
 "higher and has a strained quality that perfectly matches the 'frustrated "
 "gamer' persona. Intonation effectively conveys rising frustration. Pacing is "
 'good, and energy is high and engaging. The timbre supports the emotional '
 'state. Highly suitable for the context.\n'
 '\n'
 'Voice 2: Clear and natural, but the pitch is lower and the delivery is '
 'calmer, sounding more like general annoyance or disappointment rather than '
 'acute gamer frustration. While expressive, the energy level is lower '
 'compared to the others for this specific context. Timbre is pleasant but '
 "doesn't convey the same level of agitation. Less suitable for a 'frustrated "
 "gamer' but good for a generally annoyed tone.\n"
 '\n'
 'Voice 3: Excellent clarity and naturalness. The pitch is