In [11]:
from oumi.core.analyze.llm_judge_analyzer import LLMJudgeAnalyzer

# Print the preset prompt names returned by LLMJudgeAnalyzer.list_presets().
print("Available preset prompts:", LLMJudgeAnalyzer.list_presets())

Available preset prompts: ['instruction_quality', 'response_quality', 'conversation_coherence', 'safety', 'helpfulness', 'factuality']


In [12]:
import os
import sys


# Force the session onto the CPU: set backend-related environment variables,
# import torch, make "cpu" the default device, and stub out the MPS
# availability check.
#
# IMPORTANT: Set these BEFORE importing torch or any ML libraries
# NOTE(review): an earlier cell already imports from oumi.core.analyze; if that
# transitively imports torch, these variables are set too late to influence it
# -- confirm, or move this cell to the top of the notebook.
# Disable all GPU/MPS backends to prevent crashes with IFD analyzer
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable CUDA
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
# NOTE(review): per the PyTorch MPS environment-variable docs, 0.0 appears to
# REMOVE the allocator's high-watermark limit rather than disable MPS memory
# allocation -- confirm this is the intended setting.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
# NOTE(review): could not find this among documented PyTorch variables --
# confirm it has any effect.
os.environ["DISABLE_MPS_COMPAT"] = "1"  # Additional MPS disable flag
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"  # Disable HuggingFace telemetry
os.environ["TRANSFORMERS_OFFLINE"] = "0"  # Allow model downloads

# Force CPU usage in PyTorch to avoid MPS crashes
import torch

# Make "cpu" the default device before anything else touches torch
torch.set_default_device("cpu")
if hasattr(torch.backends, "mps"):
    # Monkey-patch the availability check so callers that consult
    # torch.backends.mps.is_available() are told MPS is not present.
    # The real function is kept in `original_is_available`; note that re-running
    # this cell overwrites that name with the stub installed below.
    original_is_available = torch.backends.mps.is_available
    torch.backends.mps.is_available = lambda: False
    print("MPS has been disabled - forcing CPU-only mode")
else:
    print("Using CPU for all computations")

# Report the resulting default device and the torch version in use
print(f"PyTorch device: {torch.get_default_device()}")
print(f"PyTorch version: {torch.__version__}")

MPS has been disabled - forcing CPU-only mode
PyTorch device: cpu
PyTorch version: 2.6.0


In [13]:
from oumi.core.configs import AnalyzeConfig, SampleAnalyzerParams

# Option 1 -- the built-in "factuality" preset.
# NOT recommended for the classification task: it evaluates the response text
# alone, without seeing the user query.
llm_factuality_params_preset = SampleAnalyzerParams(
    id="llm_judge",
    params=dict(
        prompt_preset="factuality",
        # Judge model settings
        inference_config=dict(
            model_name="gpt-4o-mini",
            engine="openai",
            temperature=0.1,
            max_tokens=256,
        ),
        batch_size=10,
        max_text_length=4000,
        parse_json_response=True,
    ),
)

# Option 2: Custom "Classification Factuality" - Conversation-level (RECOMMENDED)
# Evaluates if the classification is factually correct and accurate.
# The judge is given the whole conversation (system + user + assistant), so it
# can check the predicted ID against both the category list and the user query.
llm_factuality_params_conversation = SampleAnalyzerParams(
    id="llm_judge",
    params={
        # Custom judge prompt. "{text}" is the placeholder for the content being
        # judged (presumably filled in by the analyzer -- confirm). The prompt
        # text is whitespace-sensitive runtime data; do not reformat it.
        "prompt": """Evaluate the factual accuracy of this banking intent classification (0-10).

Context: User asks a banking question, assistant must classify it into one of 77 predefined intent categories (by ID number).

CONVERSATION FORMAT:
The conversation below contains three parts:
1. SYSTEM: Instructions and the COMPLETE list of all 77 valid intent categories with their IDs (0-76)
2. USER: The banking query to classify
3. ASSISTANT: The classification result (should be a single numeric ID)

⚠️ IMPORTANT: The assistant's classification is at the VERY END of the conversation (after "ASSISTANT:").
Look for the last line starting with "ASSISTANT:" to find the classification ID.

Conversation to evaluate:
{text}

Evaluation criteria for factual accuracy:
- Is the classification ID a VALID intent ID (0-76) from the system's list?
- Does the ID number actually correspond to a real banking intent category?
- Is the classification ACCURATE for the user's query (matches intent meaning)?
- Are there any hallucinations (invalid IDs, made-up categories, extra explanations)?
- Is the format correct (just the ID number, no fabricated information)?

Scoring guide:
- 10: Valid ID, factually correct intent category, accurate classification
- 7-9: Valid ID, correct format, mostly accurate but could be better
- 4-6: Valid ID but questionable accuracy, or minor factual issues
- 1-3: Invalid ID, wrong category, or significant factual errors
- 0: Completely fabricated, hallucinated, or missing

Examples:
Factually correct (score 10):
  USER: "My card was declined at the store"
  ASSISTANT: 25  ← (Valid ID 25 = declined_card_payment, factually accurate)

Factually incorrect (score 0-3):
  USER: "My card was declined at the store"
  ASSISTANT: 99  ← (Invalid ID, only 0-76 exist - hallucination!)
  
  USER: "My card was declined at the store"  
  ASSISTANT: 0   ← (Valid ID but wrong category: activate_my_card ≠ declined_card_payment)

Respond with JSON:
- "score": 0-10 (10 = completely factually accurate)
- "label": "accurate", "mostly_accurate", "somewhat_inaccurate", "inaccurate"
- "reasoning": brief explanation of factual accuracy/errors

JSON response:""",
        # Judge whole conversations only; per-message judging is switched off
        "analyze_message_level": False,
        "analyze_conversation_level": True,  # See full context
        # Judge model settings
        "inference_config": {
            "model_name": "gpt-4o-mini",
            "engine": "openai",
            "temperature": 0.1,
            "max_tokens": 300,  # Slightly more for detailed reasoning
        },
        "batch_size": 10,
        "max_text_length": 8000,  # Large enough for full conversation
        # The prompt requests a JSON reply; presumably this tells the analyzer to
        # parse score/label/reasoning out of it -- confirm against the analyzer.
        "parse_json_response": True,
    },
)

# Option 3 -- custom "Format Validity Check", message-level, assistant only.
# Judges whether the assistant reply is a bare, valid intent ID with no
# hallucinated content.
llm_factuality_params_format = SampleAnalyzerParams(
    id="llm_judge",
    params=dict(
        # Prompt text is runtime data and is kept verbatim
        prompt="""Evaluate the factual validity of this classification response (0-10).

Context: The assistant must classify a banking query by responding with a single intent ID (0-76).

Assistant response to evaluate:
{text}

Evaluation criteria for factual validity:
- Is it a numeric ID in the valid range (0-76)?
- No hallucinated or fabricated information?
- No invalid IDs (77+, negative numbers, etc.)?
- No extra text, explanations, or made-up details?

Good examples (factually valid):
  "59" ← Valid ID in range
  "0"  ← Valid ID in range
  "42" ← Valid ID in range

Bad examples (factually invalid):
  "99"  ← Hallucinated ID (only 0-76 exist)
  "-5"  ← Invalid negative ID
  "activate_my_card" ← Hallucinated category name instead of ID
  "I think it's 59" ← Extra fabricated explanation
  ""    ← Missing/empty

Respond with JSON:
- "score": 0-10 (10 = factually valid response)
  * 10: Valid ID (0-76), no hallucinations
  * 5-9: Valid ID but minor issues
  * 1-4: Invalid ID or out of range (hallucination)
  * 0: Empty, fabricated, or completely wrong
- "label": "accurate", "mostly_accurate", "somewhat_inaccurate", "inaccurate"
- "reasoning": brief explanation of factual validity/errors

JSON response:""",
        # Only assistant messages are judged, one message at a time
        filter_role="assistant",
        analyze_message_level=True,
        analyze_conversation_level=False,
        # Judge model settings
        inference_config=dict(
            model_name="gpt-4o-mini",
            engine="openai",
            temperature=0.1,
            max_tokens=256,
        ),
        batch_size=10,
        max_text_length=4000,
        parse_json_response=True,
    ),
)


In [14]:
import os
from oumi.core.configs import AnalyzeConfig
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer

# Build the analysis configuration, validate it, and run the LLM judge over
# the dataset.

# Dataset location. Set the BANKING77_DATA_PATH environment variable to run
# this notebook on another machine; when it is unset the original authoring
# path is used, so existing behaviour is unchanged.
DATASET_PATH = os.environ.get(
    "BANKING77_DATA_PATH",
    "/Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl",
)

# Number of samples to analyze. Kept small on purpose: LLM calls can be
# expensive.
SAMPLE_COUNT = 5

config = AnalyzeConfig(
    # Dataset configuration
    dataset_path=DATASET_PATH,
    # Analysis configuration
    sample_count=SAMPLE_COUNT,
    analyzers=[
        llm_factuality_params_conversation,
    ],  # Just the LLM judge (conversation-level variant)
    # Output configuration
    output_path="./analysis_output/banking77_llm_judge",
    generate_report=False,  # Optional: generate HTML report
)

# Finalize and run
config.finalize_and_validate()

analyzer = DatasetAnalyzer(config)
analyzer.analyze_dataset()


[2025-12-30 17:19:01,568][oumi][rank0][pid:73805][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: TextSftJsonLinesDataset)... dataset_name: 'custom'
[2025-12-30 17:19:01,569][oumi.utils.analysis_utils][rank0][pid:73805][MainThread][INFO]][analysis_utils.py:225] Loaded text dataset from: /Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl
[2025-12-30 17:19:01,569][oumi][rank0][pid:73805][MainThread][INFO]][dataset_analyzer.py:154] Loaded dataset from config: None
[2025-12-30 17:19:01,570][oumi][rank0][pid:73805][MainThread][INFO]][dataset_analyzer.py:304] Initialized sample analyzer: llm_judge
[2025-12-30 17:19:01,598][oumi][rank0][pid:73805][MainThread][INFO]][dataset_analyzer.py:347] Starting analysis of dataset: None
[2025-12-30 17:19:01,599][oumi][rank0][pid:73805][MainThread][INFO]][dataset_analyzer.py:348] Using 1 sample analyzers: ['llm_judge']
[2025-12-30 17:19:01,599][oumi][rank0][pid:73805][MainThrea

Converting Unknown Dataset to DataFrames: 100%|██████████| 5/5 [00:00<00:00, 3170.30item/s]


[2025-12-30 17:19:01,605][oumi][rank0][pid:73805][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|██████████| 5/5 [00:08<00:00,  1.66s/it]


[2025-12-30 17:19:09,911][oumi][rank0][pid:73805][MainThread][INFO]][llm_judge_analyzer.py:780] Skipping message-level analysis (analyze_message_level=False). Set analyze_message_level=True to enable.


In [15]:
# Report how many entries the analyzer schema has next to how many columns
# analyzer.analysis_df has.
schema = analyzer.get_schema()
print("len(schema):", len(schema))
merged_columns = analyzer.analysis_df.columns
print("len(merged_columns):", len(merged_columns))

len(schema): 12
len(merged_columns): 12


In [16]:
# Collect the instance_id of every analyzer in the config; later cells use
# these ids to pick out an analyzer's result columns.
analyzer_names = [a.instance_id for a in config.analyzers]
analyzer_names

['llm_judge']

# Conversation level

In [17]:
# Preview the first rows of the conversation-level results.
analyzer.conversation_df.head()

Unnamed: 0,conversation_index,conversation_id,num_messages,conversation_text_content,conversation_text_content__llm_judge__score,conversation_text_content__llm_judge__label,conversation_text_content__llm_judge__reasoning,conversation_text_content__llm_judge__raw_response
0,0,0,3,SYSTEM: You are a banking intent classifier. C...,10.0,accurate,The classification ID 52 corresponds to 'reque...,"```json\n{\n ""score"": 10,\n ""label"": ""accura..."
1,1,1,3,SYSTEM: You are a banking intent classifier. C...,10.0,accurate,The classification ID 69 corresponds to 'verif...,"```json\n{\n ""score"": 10,\n ""label"": ""accura..."
2,2,2,3,SYSTEM: You are a banking intent classifier. C...,10.0,accurate,The classification ID 59 corresponds to 'top_u...,"{\n ""score"": 10,\n ""label"": ""accurate"",\n ""..."
3,3,3,3,SYSTEM: You are a banking intent classifier. C...,1.0,inaccurate,The classification ID 54 (Refund_not_showing_u...,"```json\n{\n ""score"": 1,\n ""label"": ""inaccur..."
4,4,4,3,SYSTEM: You are a banking intent classifier. C...,10.0,accurate,The classification ID 57 corresponds to 'top_u...,"```json\n{\n ""score"": 10,\n ""label"": ""accura..."


In [23]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

# Print every metric one analyzer produced for a single conversation, together
# with the metric's description from the schema.

# Position of the conversation to inspect. In the recorded run, row 3 is the
# only conversation the judge did not score 10.
CONVERSATION_ROW_INDEX = 3

conv_columns = analyzer.conversation_df.columns
row = analyzer.conversation_df.iloc[CONVERSATION_ROW_INDEX]

# Choose the analyzer whose metrics are shown
analyzer_name = analyzer_names[0]

# Keep only the columns that belong to the chosen analyzer
filtered_cols = filter_analyzer_columns(conv_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")
    for col in filtered_cols:
        # Split the column name into its parts to recover the metric name
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        print(f"description: {schema[col]['description']}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


Analyzer: llm_judge
metric: score
description: LLM judge score (0-10, higher = better quality)
value: 1.0


metric: label
description: LLM judge label/category for the sample
value: inaccurate


metric: reasoning
description: LLM judge reasoning/explanation
value: The classification ID 54 (Refund_not_showing_up) is invalid for the user's query about adding funds to their account. The correct intent category should relate to payment methods or funding options, which is not represented by ID 54.


metric: raw_response
description: Raw LLM response before parsing
value: ```json
{
  "score": 1,
  "label": "inaccurate",
  "reasoning": "The classification ID 54 (Refund_not_showing_up) is invalid for the user's query about adding funds to their account. The correct intent category should relate to payment methods or funding options, which is not represented by ID 54."
}
```




# Message level

In [19]:
# Preview the message-level frame instead of dumping every row; the row count
# grows with sample_count, so the display is bounded explicitly.
analyzer.message_df.head(10)

Unnamed: 0,conversation_index,conversation_id,message_index,message_id,role,text_content
0,0,0,0,msg_0,system,You are a banking intent classifier. Classify ...
1,0,0,1,msg_1,user,"If I bought something I didn't like, can I get..."
2,0,0,2,msg_2,assistant,52
3,1,1,0,msg_0,system,You are a banking intent classifier. Classify ...
4,1,1,1,msg_1,user,"If I'm getting my identity verified, what all ..."
5,1,1,2,msg_2,assistant,69
6,2,2,0,msg_0,system,You are a banking intent classifier. Classify ...
7,2,2,1,msg_1,user,My top-up was rejected by an app.
8,2,2,2,msg_2,assistant,59
9,3,3,0,msg_0,system,You are a banking intent classifier. Classify ...


In [20]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

# Print every metric one analyzer produced for a single message, together with
# the metric's description from the schema.

# Position of the message to inspect. In the recorded run, row 2 is the first
# assistant message in message_df.
MESSAGE_ROW_INDEX = 2

msg_columns = analyzer.message_df.columns
row = analyzer.message_df.iloc[MESSAGE_ROW_INDEX]

# Choose the analyzer whose metrics are shown
analyzer_name = analyzer_names[0]

# Keep only the columns that belong to the chosen analyzer
filtered_cols = filter_analyzer_columns(msg_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")

    print("\nInput:")
    print(f"[{row['role']}]")

    for col in filtered_cols:
        # Split the column name into its parts to recover the metric name
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        print(f"description: {schema[col]['description']}")
        print(f"value: {row[col]}")
        print("\n")
else:
    # Expected when the run used a config with analyze_message_level=False:
    # the judge then adds no message-level columns, so nothing matches here.
    print(f"No columns found for analyzer: {analyzer_name}")


No columns found for analyzer: llm_judge
