# Align a Voice LLM Judge

In this notebook we'll learn how to align a multi-modal LLM judge that can take in audio, and guide it towards our preference for what a character from GTA-VI should sound like.

### Who Validates the Validators
This human-LLM collaboration is inspired by the [Who Validates the Validators paper](https://arxiv.org/pdf/2404.12272), which focussed on the minimum amount of labelling needed from a human to get the maximum out of aligning an LLM.

The repo for this notebook can be found here: https://github.com/morganmcg1/voice-judge

# Setup

In [None]:
# Clone the voice-judge repo and copy its contents into the current directory
# (presumably so the local modules imported below can be found - confirm).
# comment out if you already have the repo
!git clone https://github.com/morganmcg1/voice-judge.git && cp -r voice-judge/* .
# Install the dependencies listed in pyproject.toml (-q keeps the output quiet)
!uv pip install -r pyproject.toml -q

## Imports

In [1]:
from pprint import pprint
from pydantic import BaseModel, Field
import weave

import ipywidgets as widgets
from IPython.display import display

from audio_utils import AudioRanker, wave_read_to_wav_bytes
from judge import (
    JudgeRanking,
    run_speech_llm,
    update_pairwise_comparison_history,
    # update_pairwise_comparison_history_from_feedback
)
from preference_learner import PreferenceLearner, USER_CONTEXT_PROMPT

# Download data

### Weave login
You'll need a free Weights & Biases account to log in to Weave and run this notebook. Sign up here: https://www.wandb.ai/site

In [None]:
# Initialise Weave for the "voice-judge" project; the returned client is reused
# later in the notebook to hand to the ranker UI and to fetch calls by id.
weave_client = weave.init("voice-judge")

### Download Data

In [None]:
# Weave reference of the dataset that holds the generated speech samples
SPEECH_AUDIO_DATASET_URI = "weave:///wandb-voice-ai/voice-judge/object/generated_speech_audio_test:v1"

# Number of samples, taken from the start of the dataset, made available for human ranking
N_SAMPLES_TO_RANK = 16
# Number of samples, taken from the end of the dataset, that the judge is run on after each round
N_SAMPLES_TO_EVAL = 3

# Model names for the three LLM roles used in this notebook
PREFERENCE_LEARNER_MODEL =  "gemini-2.0-flash"  # alternative: "gemini-2.5-pro-preview-05-06"
JUDGE_MODEL =  "gemini-2.0-flash"  # alternative: "gemini-2.5-pro-preview-05-06"
ANALYZER_MODEL = "gemini-2.0-flash"  # alternative: "gemini-2.5-pro-preview-05-06"

In [None]:
print("Downloading speech samples from Weave...")
ds_ref = weave.ref(SPEECH_AUDIO_DATASET_URI).get()
speech_samples = list(ds_ref.rows)
print(f"{len(speech_samples)} speech samples downloaded from Weave")


def _as_eval_record(sample):
    """Return one dataset row as a dict of its audio, WAV bytes and voice instructions."""
    return {
        "audio": sample["audio"],
        "audio_bytes": wave_read_to_wav_bytes(sample["audio"]),
        "instructions": sample["voice_instructions"],
    }


# Samples offered for human pairwise ranking: the first N_SAMPLES_TO_RANK rows, keyed by
# `voice_instructions_id`, each carrying two extra bookkeeping fields.
samples_to_rank = {}
for i in range(N_SAMPLES_TO_RANK):
    sample = speech_samples[i]
    samples_to_rank[sample["voice_instructions_id"]] = {
        **_as_eval_record(sample),
        "short_hash": None,
        "pairwise_comparison_history": {},
    }
ids_to_rank = list(samples_to_rank)

# Samples the judge is run on after each round: the last N_SAMPLES_TO_EVAL rows.
samples_to_eval = {
    sample["voice_instructions_id"]: _as_eval_record(sample)
    for sample in speech_samples[-N_SAMPLES_TO_EVAL:]
}
ids_to_eval = list(samples_to_eval)

# Every downloaded sample, used by the final "Big Eval".
full_eval = {
    sample["voice_instructions_id"]: _as_eval_record(sample)
    for sample in speech_samples
}
full_ids_to_eval = list(full_eval)

# Round 1 - Rank and Update Judge
## Pick first samples to rank

In [4]:
# Register our samples for ranking
@weave.op
def rank_audio(sample_1: dict, sample_2: dict) -> None:
    """Placeholder op that takes the two samples being compared and does nothing.

    The body is intentionally empty: the call object returned by
    `rank_audio.call(...)` is what the ranker is given as `target_call` and what
    the ranking feedback is later read back from.
    """
    pass

# The first two ids in `ids_to_rank` form the Round 1 pair
sample_1_id = ids_to_rank[0]
sample_2_id = ids_to_rank[1]

# Each sample record is passed to the op together with its id
sample_1 = {"id": sample_1_id, **samples_to_rank[sample_1_id]}
sample_2 = {"id": sample_2_id, **samples_to_rank[sample_2_id]}

# `.call` returns a pair; the op's return value is discarded and only the call object is kept
_, target_call = rank_audio.call(sample_1, sample_2)

In [None]:
# Build the ranker UI helper for the current pair; each entry carries the sample id,
# its audio and its 1-based position in the pair.
ranker_entries = [
    {
        "id": entry_id,
        "audio": samples_to_rank[entry_id]["audio"],
        "original_input_order": input_order,
    }
    for input_order, entry_id in enumerate((sample_1_id, sample_2_id), start=1)
]

ranker = AudioRanker(
    ranker_entries,
    weave_client=weave_client,
    target_call=target_call,
    image_path="assets/gta-vi_guy_in_front_of_truck_cropped.jpg",
    mode="gradio",
)

## Run the ranker
First we'll run our ranker UI and rank the 2 voice samples selected

In [None]:
# Build the Gradio interface from the ranker and launch it so the pair can be ranked
interface = ranker.create_gradio_interface()
interface.launch()

### Download feedback from Weave

In [8]:
%%capture
# Re-fetch the target call by id so that it reflects feedback logged after the call was created.
# %%capture silences this cell's output.
updated_target_call = weave_client.get_call(target_call.id)
# NOTE(review): presumably accessed here to load the feedback ahead of the next cell - confirm
updated_target_call.feedback

In [None]:
# Fail fast if the ranking UI has not logged any feedback on this call yet.
assert updated_target_call.feedback is not None, "No feedback found for target call"

# Start from None so a `ranking` left over from an earlier run of this cell can never be
# reused silently when this call carries no ranking payload.
ranking = None
for f in updated_target_call.feedback.feedbacks.items:
    if "output" in f.payload:
        ranking = f.payload["output"]
        print("Rankings retrieved from Weave")
        break
if ranking is None:
    raise ValueError(
        "No ranking payload found in the feedback for target call - "
        "submit a ranking in the UI and re-run the feedback download cell"
    )

# The preferred sample must be one of the two samples shown in this round.
preferred_id = ranking["preferred_sample_id"]
if preferred_id not in (sample_1_id, sample_2_id):
    raise ValueError(
        f"Preferred sample {preferred_id!r} is not one of the samples ranked in this round"
    )
rank2_id = sample_2_id if preferred_id == sample_1_id else sample_1_id

# Full sample records of the preferred / rejected samples
rank1 = samples_to_rank[preferred_id]
rank2 = samples_to_rank[rank2_id]

# Rank 1 is the preferred sample, rank 2 the rejected one; `original_input_order`
# records which position (1 or 2) the sample had in the pair.
res = [
    {
        "rank": 1,
        "id": preferred_id,
        "original_input_order": 1 if preferred_id == sample_1_id else 2,
    },
    {
        "rank": 2,
        "id": rank2_id,
        "original_input_order": 1 if rank2_id == sample_1_id else 2,
    },
]

final_rankings = {
    "rankings": res,
    "completed_at": ranking["ranking_timestamp"],
    "preferred_id": preferred_id,
    "rejected_id": rank2_id,
}

# Record the outcome of this comparison on the ranked samples
samples_to_rank = update_pairwise_comparison_history(samples_to_rank, final_rankings)

In [None]:
# Show the comparison history now stored for the first sample of this round
sample_1_id, samples_to_rank[sample_1_id]["pairwise_comparison_history"]

## Analyze Preferences

Based on the ranking selected, an LLM will now try to identify the differences between the preferred and the rejected voice sample.

In [None]:
# Create the preference learner and show its patterns before any ranking has been passed to it
learner = PreferenceLearner(model_name=PREFERENCE_LEARNER_MODEL)
learner.patterns

In [None]:
# Pass this round's ranking to the learner, then keep its patterns for the analyzer prompt
await learner.update(final_rankings, samples_to_rank)
ranking_patterns = learner.patterns

## The Judge and The Analyzer
Below is a seed judge prompt as well as some other prompts for the analyzer

#### Basic judge prompts

In [16]:
# System instruction for the judge; USER_CONTEXT_PROMPT (imported from preference_learner)
# is interpolated when this cell runs.
BASIC_JUDGE_SYSTEM_INSTRUCTION = f"""Assess the generated voices provided. The task is to assess the appropriateness \
of the audio voice sample for the given task. If multiple voice samples are provided, rank them in order of preference.

Here is some context about the task:

{USER_CONTEXT_PROMPT}

"""

# Generic starting criteria; Round 1 hands this to the analyzer as the "current" judge prompt.
seed_judge_prompt = """Based on the following criteria, the task is to assess the appropriateness \
of the audio voice sample for a video game character in his late 50's. He should sound like a man in his late 50's \
and be a little on the wild side.

## Voice characteristics to consider
Consider the following aspects of the voice:
- style
- tone
- accent
- speed
- volume
- pitch
- intonation
and more
"""

# Output instructions appended to each round's judge prompt before the judge is run.
judge_prompt_postfix = """## Assessment
- If a single voice sample is provided, assess it according to the above criteria and return a bool of whether \
it is appropriate.
- If multiple voice samples are provided, rank them in order of preference according to the above criteria.

## Voice samples
Below are the voice samples to assess:

"""

#### Analyzer prompts

In [17]:
# System instruction for the analyzer model.
# NOTE(review): this text and the template below contain typos ("LLLM", "acheived", "optimze",
# "Critially", "fell") that are sent to the model verbatim.
ANALYZER_SYSTEM_INSTRUCTION = """The task is to help align a LLLM Judge towards user preferences. This will \
be acheived by analysing the current prompt as well as new preference data derived from the user, followed by \
making recommendations for how to update the prompt to align more with the user's preferences."""

# Template for the analyzer prompt. It has two `str.format` placeholders,
# {current_judge_prompt} and {ranking_patterns}, which are filled in before each analyzer run.
judge_prompt_analyser_prompt = """The task is to optimze a given LLM judge prompt to align more with a human \
rater's preferences.

## Current prompt
Below is the current judge prompt: 

<current_judge_prompt>
{current_judge_prompt}
</current_judge_prompt>

## Rater preferences
Below are patterns that have been observed from a human rater's preferences when doing pairwise comparisons of \
voice samples.

<rater_preferences>
{ranking_patterns}
</rater_preferences>

## Analysis
Critially analyse the <current_judge_prompt> and <rater_preferences> and make recommendations for whether or not \
the <current_judge_prompt> needs to be updated. You are not required to make edits to the <current_judge_prompt> \
if you fell it is already aligned with the <rater_preferences>.

## Output
Based on your analysis, please output a full updated judge prompt below. Do not include any placeholder text \
for the input variables. Just focus on describing what makes a good and bad voice sample purely \
based on the <rater_preferences>. Do not include mentions of the outputs needed such as scores or rankings. \
Just describe what makes a good and bad voice sample purely based on the <rater_preferences>.

Output your new judge prompt using markdown formatting.

"""

# Structured response requested from the analyzer (passed as `response_model`):
# the analyzer's reasoning plus the rewritten judge prompt.
class JudgePromptAnalysis(BaseModel):
    # Free-text explanation of the analysis and of any edits made
    reasoning: str = Field(description="A detailed explanation of your analysis of the <current_judge_prompt>, \
<rater_preferences> and what edits, if any, are needed to align the <current_judge_prompt> with the rater's preferences.")
    # The new judge prompt; each round stores this as `round_N_judge_prompt`
    updated_judge_prompt: str = Field(description="The updated judge prompt based on the analysis, focussed just on the \
characteristics of a good and bad voice sample.")

### Run the Analyzer to update the Judge

With these preferences, we'll update a basic "seed" LLM Judge prompt to try and make it less generic and more aligned with our personal preferences.

#### Pass the seed judge prompt plus preference analysis

In [18]:
@weave.op
async def run_judge_prompt_analyser(judge_prompt_analyser_prompt: str, analyzer_model: str) -> JudgePromptAnalysis:
    """Ask the analyzer model for an updated judge prompt.

    Args:
        judge_prompt_analyser_prompt: The fully formatted analyzer prompt.
        analyzer_model: Name of the model to run the analysis with.

    Returns:
        Whatever `run_speech_llm` returns for a `JudgePromptAnalysis` response model.
    """
    return await run_speech_llm(
        system_instruction=ANALYZER_SYSTEM_INSTRUCTION,
        prompt=judge_prompt_analyser_prompt,
        model_name=analyzer_model,
        temperature=1.0,
        response_model=JudgePromptAnalysis,
    )

In [19]:
def format_patterns(patterns):
    """Render the learner's preference patterns as a two-section bulleted text block.

    Args:
        patterns: Mapping with optional "strong" and "emerging" keys, each an
            iterable of pattern descriptions. A missing key is treated as empty.

    Returns:
        A string with a "Strong patterns:" section and an "Emerging patterns:"
        section, each header on its own line followed by one "  - " bullet per pattern.
    """
    def _bullets(items):
        # One indented bullet per pattern, each terminated by a newline
        return "".join(f"  - {pat}\n" for pat in items)

    strong_str = _bullets(patterns.get('strong', []))
    emerging_str = _bullets(patterns.get('emerging', []))
    # Both headers end with a newline so the first bullet of each section starts
    # on its own line, formatted like the rest of the list.
    return f"Strong patterns:\n{strong_str}\n\nEmerging patterns:\n{emerging_str}\n"

In [None]:
# Preview the patterns text that will be inserted into the analyzer prompt
format_patterns(ranking_patterns)

In [None]:
patterns_str =format_patterns(ranking_patterns)
judge_prompt_analyser_prompt = judge_prompt_analyser_prompt.format(
    current_judge_prompt=seed_judge_prompt, 
    ranking_patterns=patterns_str
)

analyzer_result = await run_judge_prompt_analyser(
    judge_prompt_analyser_prompt=judge_prompt_analyser_prompt,
    analyzer_model=ANALYZER_MODEL
)

print("**Reasoning:**")
pprint(analyzer_result.reasoning)
print()
pprint("**Updated Judge Prompt:**")
pprint(analyzer_result.updated_judge_prompt)

round_1_judge_prompt = analyzer_result.updated_judge_prompt

## Run the updated Judge with the Round 1 judge prompt

In [22]:
@weave.op
async def run_speech_judge(new_judge_prompt, speech_samples_to_eval, judge_model):
    """Run the judge model on a list of voice samples.

    Args:
        new_judge_prompt: Full judge prompt (criteria plus assessment instructions).
        speech_samples_to_eval: Non-empty sequence of sample dicts, each with an
            "audio_bytes" entry. Every sample is forwarded, in order.
        judge_model: Name of the model to judge with.

    Returns:
        Whatever `run_speech_llm` returns for a `JudgeRanking` response model.

    Raises:
        ValueError: If `speech_samples_to_eval` is empty.
    """
    if not speech_samples_to_eval:
        raise ValueError("speech_samples_to_eval must contain at least one sample")

    # Forward every sample instead of hard-coded indices 0..2, so longer lists are not
    # silently truncated and shorter ones do not raise IndexError.
    # NOTE(review): assumes run_speech_llm accepts any number of audio parts and fills
    # {input_order} in the divider for each additional part - confirm.
    audio_data = [sample["audio_bytes"] for sample in speech_samples_to_eval]

    result = await run_speech_llm(
        system_instruction=BASIC_JUDGE_SYSTEM_INSTRUCTION,
        prompt=new_judge_prompt,
        model_name=judge_model,
        temperature=0.1,
        response_model=JudgeRanking,
        audio_data=audio_data,
        initial_audio_parts_prompt="\n\nvoice_1:\n",
        audio_parts_prompt_divider="\n\nvoice_{input_order}:\n",
    )
    return result

In [None]:
new_judge_prompt = round_1_judge_prompt + judge_prompt_postfix
speech_samples_to_eval = []
for eval_id in ids_to_eval:
    speech_samples_to_eval.append(samples_to_eval[eval_id])

# Run new judge
result = await run_speech_judge(
    new_judge_prompt=new_judge_prompt,
    speech_samples_to_eval=speech_samples_to_eval,
    judge_model=JUDGE_MODEL
)
pprint(result.thinking)
pprint(result.ranking)
print()

# Round 2 - Rank and Update Judge

## Get next samples to rank

In [None]:
# Round 2 pair: the third and fourth ids in `ids_to_rank`
sample_1_id, sample_2_id = ids_to_rank[2], ids_to_rank[3]

# Register the pair as a Weave call; only the call object is kept
sample_1 = {"id": sample_1_id, **samples_to_rank[sample_1_id]}
sample_2 = {"id": sample_2_id, **samples_to_rank[sample_2_id]}
_, target_call = rank_audio.call(sample_1, sample_2)

# Build the ranker UI helper; each entry carries the sample id, its audio and its
# 1-based position in the pair.
ranker_entries = [
    {
        "id": entry_id,
        "audio": samples_to_rank[entry_id]["audio"],
        "original_input_order": input_order,
    }
    for input_order, entry_id in enumerate((sample_1_id, sample_2_id), start=1)
]

ranker = AudioRanker(
    ranker_entries,
    weave_client=weave_client,
    target_call=target_call,
    image_path="assets/gta-vi_guy_in_front_of_truck_cropped.jpg",
    mode="gradio",
)


## Run the ranker
First we'll run our ranker UI and rank the 2 voice samples selected

In [None]:
# Build the Gradio interface from the ranker and launch it so the pair can be ranked
interface = ranker.create_gradio_interface()
interface.launch()

In [26]:
%%capture
# Re-fetch the target call by id so that it reflects feedback logged after the call was created.
# %%capture silences this cell's output.
updated_target_call = weave_client.get_call(target_call.id)
# NOTE(review): presumably accessed here to load the feedback ahead of the next cell - confirm
updated_target_call.feedback

In [None]:
# Fail fast if the ranking UI has not logged any feedback on this call yet.
assert updated_target_call.feedback is not None, "No feedback found for target call"

# Start from None so the `ranking` left over from an earlier round can never be reused
# silently when this call carries no ranking payload.
ranking = None
for f in updated_target_call.feedback.feedbacks.items:
    if "output" in f.payload:
        ranking = f.payload["output"]
        print("Rankings retrieved from Weave")
        break
if ranking is None:
    raise ValueError(
        "No ranking payload found in the feedback for target call - "
        "submit a ranking in the UI and re-run the feedback download cell"
    )

# The preferred sample must be one of the two samples shown in this round.
preferred_id = ranking["preferred_sample_id"]
if preferred_id not in (sample_1_id, sample_2_id):
    raise ValueError(
        f"Preferred sample {preferred_id!r} is not one of the samples ranked in this round"
    )
rank2_id = sample_2_id if preferred_id == sample_1_id else sample_1_id

# Full sample records of the preferred / rejected samples
rank1 = samples_to_rank[preferred_id]
rank2 = samples_to_rank[rank2_id]

# Rank 1 is the preferred sample, rank 2 the rejected one; `original_input_order`
# records which position (1 or 2) the sample had in the pair.
res = [
    {
        "rank": 1,
        "id": preferred_id,
        "original_input_order": 1 if preferred_id == sample_1_id else 2,
    },
    {
        "rank": 2,
        "id": rank2_id,
        "original_input_order": 1 if rank2_id == sample_1_id else 2,
    },
]

final_rankings = {
    "rankings": res,
    "completed_at": ranking["ranking_timestamp"],
    "preferred_id": preferred_id,
    "rejected_id": rank2_id,
}

# Record the outcome of this comparison on the ranked samples
samples_to_rank = update_pairwise_comparison_history(samples_to_rank, final_rankings)

In [None]:
# Show the comparison history now stored for the first sample of this round
sample_1_id, samples_to_rank[sample_1_id]["pairwise_comparison_history"]

## Analyze Preferences

Based on the ranking selected, an LLM will now try to identify the differences between the preferred and the rejected voice sample.

In [None]:
# Patterns held by the learner before this round's ranking is passed to it
learner.patterns

In [None]:
# Comparisons held by the learner before this round's ranking is passed to it
learner.comparisons

In [None]:
# Pass this round's ranking to the learner, then keep its patterns for the analyzer prompt
await learner.update(final_rankings, samples_to_rank)
ranking_patterns = learner.patterns

In [None]:
# Patterns after this round's update
ranking_patterns

## Update the Judge

With these updated preferences, we'll now refine the Round 1 judge prompt (rather than the generic "seed" prompt) to align it further with our personal preferences.

### Pass Round 1 judge prompt plus Round 2 preference analysis

In [None]:
# The judge prompt produced in Round 1, which this round starts from
pprint(round_1_judge_prompt)

In [None]:
# Preview the patterns text that will be inserted into the analyzer prompt
format_patterns(ranking_patterns)

In [37]:
# NOTE(review): this formats `judge_prompt_analyser_prompt` and assigns the result back to
# the same name. `.format` can only insert values while the text still contains the
# {current_judge_prompt} / {ranking_patterns} placeholders. If an earlier cell has already
# replaced the template with formatted text, this call returns that earlier prompt unchanged
# and the Round 1 judge prompt and new patterns never reach the analyzer - confirm by
# inspecting the resulting string.
judge_prompt_analyser_prompt = judge_prompt_analyser_prompt.format(
    current_judge_prompt=round_1_judge_prompt,  # <----- Round 1 judge prompt used this time
    ranking_patterns=format_patterns(ranking_patterns)
)

In [None]:
analyzer_result = await run_judge_prompt_analyser(
    judge_prompt_analyser_prompt=judge_prompt_analyser_prompt,
    analyzer_model=ANALYZER_MODEL
)

print("**Reasoning:**")
pprint(analyzer_result.reasoning)
print()
pprint("**Updated Judge Prompt:**")
pprint(analyzer_result.updated_judge_prompt)

round_2_judge_prompt = analyzer_result.updated_judge_prompt

## Run the updated Judge

In [None]:
new_judge_prompt = round_2_judge_prompt + judge_prompt_postfix
speech_samples_to_eval = []
for eval_id in ids_to_eval:
    speech_samples_to_eval.append(samples_to_eval[eval_id])

# Run new judge
result = await run_speech_judge(
    new_judge_prompt=new_judge_prompt,
    speech_samples_to_eval=speech_samples_to_eval,
    judge_model=JUDGE_MODEL
)
pprint(result.thinking)
pprint(result.ranking)
print()

# Round 3 - Rank and Update Judge

## Get next samples to rank

In [None]:
# Round 3 pair: the fifth and sixth ids in `ids_to_rank`
sample_1_id, sample_2_id = ids_to_rank[4], ids_to_rank[5]

# Register the pair as a Weave call; only the call object is kept
sample_1 = {"id": sample_1_id, **samples_to_rank[sample_1_id]}
sample_2 = {"id": sample_2_id, **samples_to_rank[sample_2_id]}
_, target_call = rank_audio.call(sample_1, sample_2)

# Build the ranker UI helper; each entry carries the sample id, its audio and its
# 1-based position in the pair.
ranker_entries = [
    {
        "id": entry_id,
        "audio": samples_to_rank[entry_id]["audio"],
        "original_input_order": input_order,
    }
    for input_order, entry_id in enumerate((sample_1_id, sample_2_id), start=1)
]

ranker = AudioRanker(
    ranker_entries,
    weave_client=weave_client,
    target_call=target_call,
    image_path="assets/gta-vi_guy_in_front_of_truck_cropped.jpg",
    mode="gradio",
)


## Run the ranker
First we'll run our ranker UI and rank the 2 voice samples selected

In [None]:
# Build the Gradio interface from the ranker and launch it so the pair can be ranked
interface = ranker.create_gradio_interface()
interface.launch()

In [None]:
%%capture
# Re-fetch the target call by id so that it reflects feedback logged after the call was created.
# %%capture silences this cell's output.
updated_target_call = weave_client.get_call(target_call.id)
# NOTE(review): presumably accessed here to load the feedback ahead of the next cell - confirm
updated_target_call.feedback

In [None]:
# Fail fast if the ranking UI has not logged any feedback on this call yet.
assert updated_target_call.feedback is not None, "No feedback found for target call"

# Start from None so the `ranking` left over from an earlier round can never be reused
# silently when this call carries no ranking payload.
ranking = None
for f in updated_target_call.feedback.feedbacks.items:
    if "output" in f.payload:
        ranking = f.payload["output"]
        print("Rankings retrieved from Weave")
        break
if ranking is None:
    raise ValueError(
        "No ranking payload found in the feedback for target call - "
        "submit a ranking in the UI and re-run the feedback download cell"
    )

# The preferred sample must be one of the two samples shown in this round.
preferred_id = ranking["preferred_sample_id"]
if preferred_id not in (sample_1_id, sample_2_id):
    raise ValueError(
        f"Preferred sample {preferred_id!r} is not one of the samples ranked in this round"
    )
rank2_id = sample_2_id if preferred_id == sample_1_id else sample_1_id

# Full sample records of the preferred / rejected samples
rank1 = samples_to_rank[preferred_id]
rank2 = samples_to_rank[rank2_id]

# Rank 1 is the preferred sample, rank 2 the rejected one; `original_input_order`
# records which position (1 or 2) the sample had in the pair.
res = [
    {
        "rank": 1,
        "id": preferred_id,
        "original_input_order": 1 if preferred_id == sample_1_id else 2,
    },
    {
        "rank": 2,
        "id": rank2_id,
        "original_input_order": 1 if rank2_id == sample_1_id else 2,
    },
]

final_rankings = {
    "rankings": res,
    "completed_at": ranking["ranking_timestamp"],
    "preferred_id": preferred_id,
    "rejected_id": rank2_id,
}

# Record the outcome of this comparison on the ranked samples
samples_to_rank = update_pairwise_comparison_history(samples_to_rank, final_rankings)

In [None]:
# Show the comparison history now stored for the first sample of this round
sample_1_id, samples_to_rank[sample_1_id]["pairwise_comparison_history"]

## Analyze Preferences

Based on the ranking selected, an LLM will now try to identify the differences between the preferred and the rejected voice sample.

In [None]:
# Patterns held by the learner before this round's ranking is passed to it
learner.patterns

In [None]:
# Comparisons held by the learner before this round's ranking is passed to it
learner.comparisons

In [None]:
# Pass this round's ranking to the learner, then keep its patterns for the analyzer prompt
await learner.update(final_rankings, samples_to_rank)
ranking_patterns = learner.patterns

In [None]:
# Patterns after this round's update
ranking_patterns

## Update the Judge

With these updated preferences, we'll now refine the Round 2 judge prompt (rather than the generic "seed" prompt) to align it further with our personal preferences.

### Pass Round 2 judge prompt plus Round 3 preference analysis

In [None]:
# The judge prompt produced in Round 2, which this round starts from
pprint(round_2_judge_prompt)

In [None]:
# Preview the patterns text that will be inserted into the analyzer prompt
format_patterns(ranking_patterns)

In [None]:
# NOTE(review): this formats `judge_prompt_analyser_prompt` and assigns the result back to
# the same name. `.format` can only insert values while the text still contains the
# {current_judge_prompt} / {ranking_patterns} placeholders. If an earlier cell has already
# replaced the template with formatted text, this call returns that earlier prompt unchanged
# and the Round 2 judge prompt and new patterns never reach the analyzer - confirm by
# inspecting the resulting string.
judge_prompt_analyser_prompt = judge_prompt_analyser_prompt.format(
    current_judge_prompt=round_2_judge_prompt,  # <----- Round 2 judge prompt used this time
    ranking_patterns=format_patterns(ranking_patterns)
)

In [None]:
analyzer_result = await run_judge_prompt_analyser(
    judge_prompt_analyser_prompt=judge_prompt_analyser_prompt,
    analyzer_model=ANALYZER_MODEL
)

print("**Reasoning:**")
pprint(analyzer_result.reasoning)
print()
pprint("**Updated Judge Prompt:**")
pprint(analyzer_result.updated_judge_prompt)

round_3_judge_prompt = analyzer_result.updated_judge_prompt

## Run the updated Judge

In [None]:
new_judge_prompt = round_3_judge_prompt + judge_prompt_postfix
speech_samples_to_eval = []
for eval_id in ids_to_eval:
    speech_samples_to_eval.append(samples_to_eval[eval_id])

# Run new judge
result = await run_speech_judge(
    new_judge_prompt=new_judge_prompt,
    speech_samples_to_eval=speech_samples_to_eval,
    judge_model=JUDGE_MODEL
)
pprint(result.thinking)
pprint(result.ranking)
print()

# Big Eval

In [None]:
new_judge_prompt = round_3_judge_prompt + judge_prompt_postfix
speech_samples_to_eval = []
for eval_id in full_ids_to_eval:
    speech_samples_to_eval.append(samples_to_eval[eval_id])

print(f"Running new judge on {len(speech_samples_to_eval)} samples")

# Run new judge
result = await run_speech_judge(
    new_judge_prompt=new_judge_prompt,
    speech_samples_to_eval=speech_samples_to_eval,
    judge_model=JUDGE_MODEL
)
pprint(result.thinking)
pprint(result.ranking)
print()