In [11]:
from oumi.core.analyze.llm_judge_analyzer import LLMJudgeAnalyzer

# List the preset prompt names returned by LLMJudgeAnalyzer.list_presets().
print(f"Available preset prompts: {LLMJudgeAnalyzer.list_presets()}")

Available preset prompts: ['instruction_quality', 'response_quality', 'conversation_coherence', 'safety', 'helpfulness', 'factuality']


In [12]:
import os
import sys


# IMPORTANT: Set these BEFORE importing torch or any ML libraries
# Disable all GPU/MPS backends to prevent crashes with IFD analyzer
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable CUDA
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"  # Disable MPS memory allocation
os.environ["DISABLE_MPS_COMPAT"] = "1"  # Additional MPS disable flag
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"  # Disable HuggingFace telemetry
os.environ["TRANSFORMERS_OFFLINE"] = "0"  # Allow model downloads

# Force CPU usage in PyTorch to avoid MPS crashes
import torch

# Forcefully disable MPS before anything else
torch.set_default_device("cpu")
if hasattr(torch.backends, "mps"):
    # Monkey-patch to prevent MPS usage
    original_is_available = torch.backends.mps.is_available
    torch.backends.mps.is_available = lambda: False
    print("MPS has been disabled - forcing CPU-only mode")
else:
    print("Using CPU for all computations")

print(f"PyTorch device: {torch.get_default_device()}")
print(f"PyTorch version: {torch.__version__}")

MPS has been disabled - forcing CPU-only mode
PyTorch device: cpu
PyTorch version: 2.6.0


In [13]:
from oumi.core.configs import AnalyzeConfig, SampleAnalyzerParams

# Option 1: Built-in helpfulness preset (NOT recommended for classification)
# The preset judges the response text on its own, without seeing the user query.
llm_helpfulness_params_preset = SampleAnalyzerParams(
    id="llm_judge",
    params=dict(
        prompt_preset="helpfulness",
        inference_config=dict(
            model_name="gpt-4o-mini",
            engine="openai",
            temperature=0.1,
            max_tokens=256,
        ),
        batch_size=10,
        max_text_length=4000,
        parse_json_response=True,
    ),
)

# Option 2: Custom "Classification Helpfulness" - Conversation-level (RECOMMENDED)
# The judge sees the whole conversation and rates whether the predicted intent
# ID fits the user's query.
llm_helpfulness_params_conversation = SampleAnalyzerParams(
    id="llm_judge",
    params=dict(
        prompt="""Evaluate how helpful this banking intent classification is (0-10).

Context: User asks a banking question, assistant classifies it into one of 77 intent categories (by ID number).

CONVERSATION FORMAT:
The conversation below contains three parts:
1. SYSTEM: Instructions and the full list of 77 intent categories with their IDs
2. USER: The banking query to classify
3. ASSISTANT: The classification result (a single numeric ID 0-76)

⚠️ IMPORTANT: The assistant's classification is at the VERY END of the conversation (after "ASSISTANT:").
Look for the last line starting with "ASSISTANT:" to find the classification ID.

Conversation to evaluate:
{text}

Evaluation criteria for helpfulness:
- Does the classification ID seem appropriate for the user's query?
- Would this classification lead to helpful downstream actions?
- Is it the most relevant intent among potential options?
- Does the assistant provide ONLY the ID (good format)?

Scoring guide:
- 10: Perfect classification, highly relevant to query, correct format
- 7-9: Good classification, reasonable fit for the query
- 4-6: Mediocre, somewhat relevant but not ideal
- 1-3: Poor classification, wrong intent for the query
- 0: No classification or completely wrong

Examples of good vs bad:
Good (score 9-10):
  USER: "My card was declined at the store"
  ASSISTANT: 25  ← (ID 25 = declined_card_payment, highly relevant)

Bad (score 1-3):
  USER: "My card was declined at the store"
  ASSISTANT: 0   ← (ID 0 = activate_my_card, wrong intent)

Respond with JSON:
- "score": 0-10 (10 = maximally helpful classification)
- "label": "very_helpful", "helpful", "somewhat_helpful", "not_helpful"
- "reasoning": brief explanation of why this classification is/isn't helpful for the user's query

JSON response:""",
        analyze_message_level=False,
        analyze_conversation_level=True,  # Judge with the full conversation as context
        inference_config=dict(
            model_name="gpt-4o-mini",
            engine="openai",
            temperature=0.1,
            max_tokens=256,
        ),
        batch_size=10,
        max_text_length=8000,
        parse_json_response=True,
    ),
)

# Option 3: Custom "Format Check" - Message-level for assistant only
# Only the output format is judged here, not whether the chosen intent is right.
llm_helpfulness_params_format = SampleAnalyzerParams(
    id="llm_judge",
    params=dict(
        prompt="""Evaluate this assistant's classification response format (0-10).

Context: The assistant must classify a banking query by responding with a single intent ID (0-76).

Assistant response to evaluate:
{text}

Evaluation criteria:
- Is it a single numeric ID (0-76)?
- Is the format correct (just the number, no extra text)?
- Is it a valid ID in the expected range?

Good examples: "59", "0", "42" (single numbers)
Bad examples: "activate_my_card" (label instead of ID), "I think it's 59" (extra text), "" (empty)

Respond with JSON:
- "score": 0-10 (10 = perfect format)
  * 10: Valid ID (0-76), correct format
  * 5-9: Valid ID but minor formatting issues
  * 1-4: Invalid format or out of range
  * 0: Empty or completely wrong
- "label": "excellent", "good", "needs_improvement", "poor"
- "reasoning": brief explanation

JSON response:""",
        filter_role="assistant",
        analyze_message_level=True,
        analyze_conversation_level=False,
        inference_config=dict(
            model_name="gpt-4o-mini",
            engine="openai",
            temperature=0.1,
            max_tokens=256,
        ),
        batch_size=10,
        max_text_length=4000,
        parse_json_response=True,
    ),
)


In [14]:
import os
from oumi.core.configs import AnalyzeConfig
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer

# Build the AnalyzeConfig for a small LLM-judge run over the Banking77 data.
config = AnalyzeConfig(
    # Dataset configuration.
    # Set BANKING77_DATA_PATH to point at the JSONL file on another machine;
    # the default keeps the original author's local path.
    dataset_path=os.environ.get(
        "BANKING77_DATA_PATH",
        "/Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl",
    ),
    # Analysis configuration
    sample_count=5,  # Analyze only 5 samples (LLM calls can be expensive!)
    analyzers=[
        llm_helpfulness_params_conversation,
    ],  # Just the conversation-level LLM judge
    # Output configuration
    output_path="./analysis_output/banking77_llm_judge",
    generate_report=False,  # Optional: generate HTML report
)

# Finalize and validate the config before handing it to the analyzer
config.finalize_and_validate()

# Run the analysis; later cells read the results from `analyzer`
analyzer = DatasetAnalyzer(config)
analyzer.analyze_dataset()


[2025-12-30 17:10:12,266][oumi][rank0][pid:72116][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: TextSftJsonLinesDataset)... dataset_name: 'custom'
[2025-12-30 17:10:12,267][oumi.utils.analysis_utils][rank0][pid:72116][MainThread][INFO]][analysis_utils.py:225] Loaded text dataset from: /Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl
[2025-12-30 17:10:12,267][oumi][rank0][pid:72116][MainThread][INFO]][dataset_analyzer.py:154] Loaded dataset from config: None
[2025-12-30 17:10:12,268][oumi][rank0][pid:72116][MainThread][INFO]][dataset_analyzer.py:304] Initialized sample analyzer: llm_judge
[2025-12-30 17:10:12,303][oumi][rank0][pid:72116][MainThread][INFO]][dataset_analyzer.py:347] Starting analysis of dataset: None
[2025-12-30 17:10:12,304][oumi][rank0][pid:72116][MainThread][INFO]][dataset_analyzer.py:348] Using 1 sample analyzers: ['llm_judge']
[2025-12-30 17:10:12,305][oumi][rank0][pid:72116][MainThrea

Converting Unknown Dataset to DataFrames: 100%|██████████| 5/5 [00:00<00:00, 2360.86item/s]


[2025-12-30 17:10:12,313][oumi][rank0][pid:72116][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|██████████| 5/5 [00:10<00:00,  2.02s/it]


[2025-12-30 17:10:22,430][oumi][rank0][pid:72116][MainThread][INFO]][llm_judge_analyzer.py:780] Skipping message-level analysis (analyze_message_level=False). Set analyze_message_level=True to enable.


In [15]:
# Fetch the schema and the merged analysis columns, then report how many
# entries each one has.
schema = analyzer.get_schema()
merged_columns = analyzer.analysis_df.columns

print(f"len(schema): {len(schema)}")
print(f"len(merged_columns): {len(merged_columns)}")

len(schema): 12
len(merged_columns): 12


In [16]:
# Collect the id of every analyzer configured in `config`, then display the list.
analyzer_names = [params.id for params in config.analyzers]
analyzer_names

['llm_judge']

# Conversation level

In [19]:
# Preview the first five rows of the conversation-level results.
analyzer.conversation_df.head(n=5)

Unnamed: 0,conversation_index,conversation_id,num_messages,conversation_text_content,conversation_text_content__llm_judge__score,conversation_text_content__llm_judge__label,conversation_text_content__llm_judge__reasoning,conversation_text_content__llm_judge__raw_response
0,0,0,3,SYSTEM: You are a banking intent classifier. C...,10.0,very_helpful,The classification ID 52 (request_refund) is p...,"```json\n{\n ""score"": 10,\n ""label"": ""very_h..."
1,1,1,3,SYSTEM: You are a banking intent classifier. C...,10.0,very_helpful,The classification ID 69 (verify_my_identity) ...,"```json\n{\n ""score"": 10,\n ""label"": ""very_h..."
2,2,2,3,SYSTEM: You are a banking intent classifier. C...,9.0,helpful,The classification ID 59 (top_up_failed) is hi...,"```json\n{\n ""score"": 9,\n ""label"": ""helpful..."
3,3,3,3,SYSTEM: You are a banking intent classifier. C...,4.0,somewhat_helpful,The classification ID 54 (Refund_not_showing_u...,"```json\n{\n ""score"": 4,\n ""label"": ""somewha..."
4,4,4,3,SYSTEM: You are a banking intent classifier. C...,9.0,helpful,The classification ID 57 (top_up_by_card_charg...,"```json\n{\n ""score"": 9,\n ""label"": ""helpful..."


In [None]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

# Print every metric one analyzer produced for a single conversation row.
conv_columns = analyzer.conversation_df.columns
# Positional row to inspect; .iloc raises IndexError if the frame has fewer
# than 4 rows.
row = analyzer.conversation_df.iloc[3]

# Choose the analyzer to inspect
analyzer_name = analyzer_names[0]

# Keep only the columns that belong to the chosen analyzer
filtered_cols = filter_analyzer_columns(conv_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")

    for col in filtered_cols:
        # The parsed column name supplies the metric name for this column
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        print(f"description: {schema[col]['description']}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


Analyzer: llm_judge
metric: score
description: LLM judge score (0-10, higher = better quality)
value: 4.0


metric: label
description: LLM judge label/category for the sample
value: somewhat_helpful


metric: reasoning
description: LLM judge reasoning/explanation
value: The classification ID 54 (Refund_not_showing_up) does not align with the user's query about adding funds and accepted payment methods. A more relevant classification would be related to top-up methods or payment options.


metric: raw_response
description: Raw LLM response before parsing
value: ```json
{
  "score": 4,
  "label": "somewhat_helpful",
  "reasoning": "The classification ID 54 (Refund_not_showing_up) does not align with the user's query about adding funds and accepted payment methods. A more relevant classification would be related to top-up methods or payment options."
}
```




# Message level

In [22]:
# analyzer.message_df

In [9]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

# Print every metric one analyzer produced for a single message row.
# NOTE: the run above uses llm_helpfulness_params_conversation, which sets
# analyze_message_level=False, so no message-level judge columns are produced
# and the "No columns found" branch is the expected result for that config.
msg_columns = analyzer.message_df.columns
# Positional row to inspect; .iloc raises IndexError if the frame has fewer
# than 3 rows.
row = analyzer.message_df.iloc[2]

# Choose the analyzer to inspect
analyzer_name = analyzer_names[0]

# Keep only the columns that belong to the chosen analyzer
filtered_cols = filter_analyzer_columns(msg_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")

    print("\nInput:")
    print(f"[{row['role']}]")

    for col in filtered_cols:
        # The parsed column name supplies the metric name for this column
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        print(f"description: {schema[col]['description']}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


No columns found for analyzer: llm_judge
