In [1]:
from oumi.core.analyze.llm_judge_analyzer import LLMJudgeAnalyzer

# Look up the judge's built-in prompt presets, then show them.
preset_names = LLMJudgeAnalyzer.list_presets()
print("Available preset prompts:", preset_names)

Available preset prompts: ['instruction_quality', 'response_quality', 'conversation_coherence', 'safety', 'helpfulness', 'factuality']


In [2]:
import os
import sys


# IMPORTANT: Set these BEFORE importing torch or any ML libraries
# Disable all GPU/MPS backends to prevent crashes with IFD analyzer
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable CUDA
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"  # Disable MPS memory allocation
os.environ["DISABLE_MPS_COMPAT"] = "1"  # Additional MPS disable flag
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"  # Disable HuggingFace telemetry
os.environ["TRANSFORMERS_OFFLINE"] = "0"  # Allow model downloads

# Force CPU usage in PyTorch to avoid MPS crashes
import torch

# Forcefully disable MPS before anything else
torch.set_default_device("cpu")
if hasattr(torch.backends, "mps"):
    # Monkey-patch to prevent MPS usage
    original_is_available = torch.backends.mps.is_available
    torch.backends.mps.is_available = lambda: False
    print("MPS has been disabled - forcing CPU-only mode")
else:
    print("Using CPU for all computations")

print(f"PyTorch device: {torch.get_default_device()}")
print(f"PyTorch version: {torch.__version__}")

MPS has been disabled - forcing CPU-only mode
PyTorch device: cpu
PyTorch version: 2.6.0


In [4]:
from oumi.core.configs import AnalyzeConfig, SampleAnalyzerParams

# Inference configuration for the judge model.
judge_inference_config = {
    "model_name": "gpt-4o-mini",  # or "gpt-4o", "claude-3-5-sonnet-20241022", etc.
    "engine": "openai",  # or "vllm", "native" for local models
    "temperature": 0.1,  # Low temperature for consistent judgments
    "max_tokens": 256,
    # For OpenAI, API key is read from OPENAI_API_KEY env var by default
    # For Anthropic, use ANTHROPIC_API_KEY env var
    # Or specify explicitly:
    # "api_key_env": "OPENAI_API_KEY",  # or "ANTHROPIC_API_KEY"
}

# Analyzer parameters, built up front so the constructor call stays short.
judge_params = {
    # Use a preset prompt (available: instruction_quality, response_quality,
    # conversation_coherence, safety, helpfulness, factuality)
    "prompt_preset": "safety",
    "inference_config": judge_inference_config,
    "batch_size": 10,  # Process 10 samples at a time
    "max_text_length": 4000,  # Truncate long texts
    "parse_json_response": True,  # Parse JSON from LLM response
}

llm_safety_params = SampleAnalyzerParams(id="llm_judge", params=judge_params)


In [5]:
import os
from oumi.core.configs import AnalyzeConfig
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer

# Dataset location. Defaults to the original local path; set BANKING77_TRAIN_PATH
# to run this notebook on another machine without editing the cell.
dataset_path = os.environ.get(
    "BANKING77_TRAIN_PATH",
    "/Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl",
)

# Create the full AnalyzeConfig
config = AnalyzeConfig(
    # Dataset configuration
    dataset_path=dataset_path,
    # Analysis configuration
    sample_count=5,  # Analyze only the first 5 samples (LLM calls can be expensive!)
    analyzers=[
        llm_safety_params,
    ],  # Just the LLM judge
    # Output configuration
    output_path="./analysis_output/banking77_llm_judge",
    generate_report=False,  # Optional: generate HTML report
)

# Finalize and run
config.finalize_and_validate()

analyzer = DatasetAnalyzer(config)
analyzer.analyze_dataset()


[2025-12-30 16:46:52,226][oumi][rank0][pid:69225][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: TextSftJsonLinesDataset)... dataset_name: 'custom'
[2025-12-30 16:46:52,227][oumi.utils.analysis_utils][rank0][pid:69225][MainThread][INFO]][analysis_utils.py:225] Loaded text dataset from: /Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl
[2025-12-30 16:46:52,228][oumi][rank0][pid:69225][MainThread][INFO]][dataset_analyzer.py:154] Loaded dataset from config: None
[2025-12-30 16:46:52,228][oumi][rank0][pid:69225][MainThread][INFO]][dataset_analyzer.py:304] Initialized sample analyzer: llm_judge
[2025-12-30 16:46:52,229][oumi][rank0][pid:69225][MainThread][INFO]][dataset_analyzer.py:347] Starting analysis of dataset: None
[2025-12-30 16:46:52,229][oumi][rank0][pid:69225][MainThread][INFO]][dataset_analyzer.py:348] Using 1 sample analyzers: ['llm_judge']
[2025-12-30 16:46:52,230][oumi][rank0][pid:69225][MainThrea

Converting Unknown Dataset to DataFrames: 100%|██████████| 5/5 [00:00<00:00, 1461.53item/s]


[2025-12-30 16:46:52,279][oumi][rank0][pid:69225][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|██████████| 5/5 [00:08<00:00,  1.62s/it]


[2025-12-30 16:47:00,406][oumi][rank0][pid:69225][MainThread][INFO]][llm_judge_analyzer.py:679] Batch of 10: 7 unique to evaluate, 3 duplicates, 0 from cache


 57%|█████▋    | 4/7 [00:06<00:05,  1.71s/it]



100%|██████████| 7/7 [00:12<00:00,  1.83s/it]


[2025-12-30 16:47:13,207][oumi][rank0][pid:69225][MainThread][INFO]][llm_judge_analyzer.py:679] Batch of 5: 4 unique to evaluate, 0 duplicates, 1 from cache


100%|██████████| 4/4 [00:06<00:00,  1.66s/it]


In [6]:
# Fetch the analyzer's schema and the columns of analyzer.analysis_df first,
# then report both sizes together so they are easy to compare.
schema = analyzer.get_schema()
merged_columns = analyzer.analysis_df.columns

print(f"len(schema): {len(schema)}")
print(f"len(merged_columns): {len(merged_columns)}")

len(schema): 16
len(merged_columns): 16


In [7]:
# Collect the instance id of every analyzer configured on `config`.
analyzer_names = []
for analyzer_params in config.analyzers:
    analyzer_names.append(analyzer_params.instance_id)
analyzer_names

['llm_judge']

# Conversation level

In [8]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

# Inspect the first row of the conversation-level results.
conv_columns = analyzer.conversation_df.columns
row = analyzer.conversation_df.iloc[0]

# Choose the analyzer to analyze
analyzer_name = analyzer_names[0]

# Keep only the columns that filter_analyzer_columns returns for this analyzer id.
filtered_cols = filter_analyzer_columns(conv_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")

    # For each column: metric name, its schema description, and this row's value.
    for col in filtered_cols:
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        print(f"description: {schema[col]['description']}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


Analyzer: llm_judge
metric: score
description: LLM judge score (0-10, higher = better quality)
value: 10.0


metric: label
description: LLM judge label/category for the sample
value: safe


metric: reasoning
description: LLM judge reasoning/explanation
value: The text contains no harmful or dangerous content, personal information exposure, inappropriate language, misleading claims, or potential policy violations. It is a straightforward instruction set for a banking intent classifier.


metric: raw_response
description: Raw LLM response before parsing
value: ```json
{
  "score": 10,
  "label": "safe",
  "reasoning": "The text contains no harmful or dangerous content, personal information exposure, inappropriate language, misleading claims, or potential policy violations. It is a straightforward instruction set for a banking intent classifier."
}
```




# Message level

In [11]:
# Preview only the first rows rather than dumping the whole message-level frame.
analyzer.message_df.head(10)

Unnamed: 0,conversation_index,conversation_id,message_index,message_id,role,text_content,text_content__llm_judge__score,text_content__llm_judge__label,text_content__llm_judge__reasoning,text_content__llm_judge__raw_response
0,0,0,0,msg_0,system,You are a banking intent classifier. Classify ...,10.0,safe,"The text contains no harmful content, personal...","{\n ""score"": 10,\n ""label"": ""safe"",\n ""reas..."
1,0,0,1,msg_1,user,"If I bought something I didn't like, can I get...",10.0,safe,The text poses a general question about refund...,"```json\n{\n ""score"": 10,\n ""label"": ""safe"",..."
2,0,0,2,msg_2,assistant,52,10.0,safe,The text '52' does not contain any harmful or ...,"```json\n{\n ""score"": 10,\n ""label"": ""safe"",..."
3,1,1,0,msg_0,system,You are a banking intent classifier. Classify ...,10.0,safe,"The text contains no harmful content, personal...","{\n ""score"": 10,\n ""label"": ""safe"",\n ""reas..."
4,1,1,1,msg_1,user,"If I'm getting my identity verified, what all ...",9.0,safe,The text is a general inquiry about identity v...,"```json\n{\n ""score"": 9,\n ""label"": ""safe"",\..."
5,1,1,2,msg_2,assistant,69,10.0,safe,The text '69' does not contain harmful or dang...,"```json\n{\n ""score"": 10,\n ""label"": ""safe"",..."
6,2,2,0,msg_0,system,You are a banking intent classifier. Classify ...,10.0,safe,"The text contains no harmful content, personal...","{\n ""score"": 10,\n ""label"": ""safe"",\n ""reas..."
7,2,2,1,msg_1,user,My top-up was rejected by an app.,9.0,safe,The text does not contain harmful or dangerous...,"```json\n{\n ""score"": 9,\n ""label"": ""safe"",\..."
8,2,2,2,msg_2,assistant,59,10.0,safe,The text '59' does not contain any harmful or ...,"```json\n{\n ""score"": 10,\n ""label"": ""safe"",..."
9,3,3,0,msg_0,system,You are a banking intent classifier. Classify ...,10.0,safe,"The text contains no harmful content, personal...","{\n ""score"": 10,\n ""label"": ""safe"",\n ""reas..."


In [12]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

# Inspect one row of the message-level results (position 4, i.e. the fifth row).
msg_columns = analyzer.message_df.columns
row = analyzer.message_df.iloc[4]

# Choose the analyzer to analyze
analyzer_name = analyzer_names[0]

# Keep only the columns that filter_analyzer_columns returns for this analyzer id.
filtered_cols = filter_analyzer_columns(msg_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")

    # Show which role produced the inspected message.
    print("\nInput:")
    print(f"[{row['role']}]")

    # For each column: metric name, its schema description, and this row's value.
    for col in filtered_cols:
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        print(f"description: {schema[col]['description']}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


Analyzer: llm_judge

Input:
[user]
metric: score
description: LLM judge score (0-10, higher = better quality)
value: 9.0


metric: label
description: LLM judge label/category for the sample
value: safe


metric: reasoning
description: LLM judge reasoning/explanation
value: The text is a general inquiry about identity verification and does not contain harmful content, personal information, inappropriate language, or misleading claims. However, it could be slightly concerning if it implies a context where identity verification is being misused, but overall, it is a safe question.


metric: raw_response
description: Raw LLM response before parsing
value: ```json
{
  "score": 9,
  "label": "safe",
  "reasoning": "The text is a general inquiry about identity verification and does not contain harmful content, personal information, inappropriate language, or misleading claims. However, it could be slightly concerning if it implies a context where identity verification is being misused, but ov