In [1]:
import os
import sys


# IMPORTANT: Set these BEFORE importing torch or any ML libraries
# Disable all GPU/MPS backends to prevent crashes with IFD analyzer
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable CUDA
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"  # Disable MPS memory allocation
os.environ["DISABLE_MPS_COMPAT"] = "1"  # Additional MPS disable flag
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"  # Disable HuggingFace telemetry
os.environ["TRANSFORMERS_OFFLINE"] = "0"  # Allow model downloads

# Force CPU usage in PyTorch to avoid MPS crashes
import torch

# Forcefully disable MPS before anything else
torch.set_default_device("cpu")
if hasattr(torch.backends, "mps"):
    # Monkey-patch to prevent MPS usage
    original_is_available = torch.backends.mps.is_available
    torch.backends.mps.is_available = lambda: False
    print("MPS has been disabled - forcing CPU-only mode")
else:
    print("Using CPU for all computations")

print(f"PyTorch device: {torch.get_default_device()}")
print(f"PyTorch version: {torch.__version__}")

MPS has been disabled - forcing CPU-only mode
PyTorch device: cpu
PyTorch version: 2.6.0


In [2]:
import os
from oumi.core.configs import AnalyzeConfig
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer

# Locations of the example analysis config and the local Banking77 training
# split. NOTE(review): both are absolute, machine-specific paths -- adjust
# them before running this notebook anywhere else.
config_yaml_path = (
    "/Users/ryanarman/code/oumi/configs/examples/analyze/analyze_banking77.yaml"
)
dataset_path = "/Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl"

# Load the analysis configuration from the YAML file.
config = AnalyzeConfig.from_yaml(config_yaml_path)

# Local overrides applied on top of the YAML values.
config.sample_count = 10  # small sample for quick testing
config.chat_template = "chat_ml"
config.dataset_path = dataset_path  # analyze the local JSONL file
config.dataset_name = None  # cleared so that dataset_path is used instead

# Summarize the configured analyzers: display name plus analyzer type.
print(f"✅ Config loaded with {len(config.analyzers)} analyzers:")
for analyzer_cfg in config.analyzers:
    # Fall back to the type id when no explicit instance id is set.
    instance_id = analyzer_cfg.instance_id or analyzer_cfg.id
    print(f"  - {instance_id} (type: {analyzer_cfg.id})")

# Validate the configuration before building the analyzer from it.
config.finalize_and_validate()
print("✅ Config validated successfully!")

analyzer = DatasetAnalyzer(config)

✅ Config loaded with 5 analyzers:
  - length (type: length)
  - cost (type: cost)
  - helpfulness (type: llm_judge)
  - instruction_quality (type: llm_judge)
  - response_quality (type: llm_judge)
✅ Config validated successfully!
[2026-01-05 16:54:50,453][oumi][rank0][pid:24426][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: TextSftJsonLinesDataset)... dataset_name: 'custom'
[2026-01-05 16:54:50,454][oumi.utils.analysis_utils][rank0][pid:24426][MainThread][INFO]][analysis_utils.py:227] Loaded text dataset from: /Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl
[2026-01-05 16:54:50,454][oumi][rank0][pid:24426][MainThread][INFO]][dataset_analyzer.py:154] Loaded dataset from config: None
[2026-01-05 16:54:50,601][oumi][rank0][pid:24426][MainThread][INFO]][dataset_analyzer.py:318] Initialized sample analyzer: length
[2026-01-05 16:54:50,602][oumi][rank0][pid:24426][MainThread][INFO]][dataset_analyzer.py:318] I

In [3]:
# Run the configured analysis over the dataset.
analyzer.analyze_dataset()

# The results are kept on the analyzer object; report the conversation count
# only when they have been populated.
analysis_results = analyzer._analysis_results
if analysis_results:
    print(
        f"Total conversations analyzed: {analysis_results.conversations_analyzed}"
    )

[2026-01-05 16:54:50,611][oumi][rank0][pid:24426][MainThread][INFO]][dataset_analyzer.py:361] Starting analysis of dataset: None
[2026-01-05 16:54:50,612][oumi][rank0][pid:24426][MainThread][INFO]][dataset_analyzer.py:362] Using 5 sample analyzers: ['length', 'cost', 'helpfulness', 'instruction_quality', 'response_quality']
[2026-01-05 16:54:50,613][oumi][rank0][pid:24426][MainThread][INFO]][dataset_analyzer.py:385] Analyzing 10 of 8002 conversations
[2026-01-05 16:54:50,613][oumi][rank0][pid:24426][MainThread][INFO]][dataset_analyzer.py:435] Converting conversation dataset with 8002 items
[2026-01-05 16:54:50,613][oumi][rank0][pid:24426][MainThread][INFO]][dataset_analyzer.py:442] Limiting analysis to first 10 items (dataset has 8002 total)


Converting Unknown Dataset to DataFrames: 100%|██████████| 10/10 [00:00<00:00, 537.09item/s]


[2026-01-05 16:54:50,650][oumi.utils.analysis_utils][rank0][pid:24426][MainThread][INFO]][analysis_utils.py:1322] Adding default schema entries for 2 columns not in base schema: ['label', 'label_name']
[2026-01-05 16:54:50,675][oumi][rank0][pid:24426][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|██████████| 10/10 [00:18<00:00,  1.88s/it]


[2026-01-05 16:55:09,530][oumi][rank0][pid:24426][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|██████████| 10/10 [00:13<00:00,  1.37s/it]


[2026-01-05 16:55:23,194][oumi][rank0][pid:24426][MainThread][INFO]][llm_judge_analyzer.py:787] Skipping conversation-level analysis (analyze_conversation_level=False). Set analyze_conversation_level=True to enable.
[2026-01-05 16:55:23,209][oumi][rank0][pid:24426][MainThread][INFO]][llm_judge_analyzer.py:780] Skipping message-level analysis (analyze_message_level=False). Set analyze_message_level=True to enable.
[2026-01-05 16:55:23,211][oumi][rank0][pid:24426][MainThread][INFO]][llm_judge_analyzer.py:679] Batch of 10: 7 unique to evaluate, 3 duplicates, 0 from cache


  0%|          | 0/7 [00:00<?, ?it/s]



100%|██████████| 7/7 [00:10<00:00,  1.45s/it]


[2026-01-05 16:55:33,336][oumi][rank0][pid:24426][MainThread][INFO]][llm_judge_analyzer.py:679] Batch of 10: 7 unique to evaluate, 0 duplicates, 3 from cache


100%|██████████| 7/7 [00:09<00:00,  1.43s/it]


[2026-01-05 16:55:43,319][oumi][rank0][pid:24426][MainThread][INFO]][llm_judge_analyzer.py:679] Batch of 10: 6 unique to evaluate, 0 duplicates, 4 from cache


100%|██████████| 6/6 [00:43<00:00,  7.29s/it]


[2026-01-05 16:56:27,100][oumi][rank0][pid:24426][MainThread][INFO]][llm_judge_analyzer.py:828] Evaluating 10 'assistant' messages (filtered from 30 total)
[2026-01-05 16:56:27,103][oumi][rank0][pid:24426][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai
[2026-01-05 16:56:27,103][oumi][rank0][pid:24426][MainThread][INFO]][llm_judge_analyzer.py:679] Batch of 10: 9 unique to evaluate, 1 duplicates, 0 from cache


100%|██████████| 9/9 [00:13<00:00,  1.55s/it]


Total conversations analyzed: 10


In [9]:
# Fetch the column schema and the columns of the merged analysis frame, then
# print both sizes side by side for comparison.
schema = analyzer.get_schema()
merged_columns = analyzer.analysis_df.columns

print(f"len(schema): {len(schema)}")
print(f"len(merged_columns): {len(merged_columns)}")

len(schema): 46
len(merged_columns): 46


In [10]:
# Display name of every configured analyzer, in config order.
# Falls back to the analyzer type id when instance_id is unset -- the same
# rule the config summary loop applies -- so the list never contains None.
analyzer_names = [a.instance_id or a.id for a in config.analyzers]
analyzer_names

['length', 'cost', 'helpfulness', 'instruction_quality', 'response_quality']

# Conversation level

In [15]:
# Conversation-level results frame; preview its first five rows.
conv_df = analyzer.conversation_df
conv_df.head(5)

Unnamed: 0,conversation_index,conversation_id,num_messages,conversation_text_content,label,label_name,conversation_text_content__length__token_count,conversation_text_content__cost__fits_context_4k,conversation_text_content__cost__context_utilization_4k,conversation_text_content__cost__tokens_wasted_4k,...,conversation_text_content__cost__context_utilization_16k,conversation_text_content__cost__tokens_wasted_16k,conversation_text_content__helpfulness__score,conversation_text_content__helpfulness__label,conversation_text_content__helpfulness__reasoning,conversation_text_content__helpfulness__raw_response,conversation_text_content__instruction_quality__score,conversation_text_content__instruction_quality__label,conversation_text_content__instruction_quality__reasoning,conversation_text_content__instruction_quality__raw_response
0,0,0,3,<|im_start|>system\nYou are a banking intent c...,52,request_refund,1806,True,0.4434,2280,...,0.1108,14568,10.0,very_helpful,The classification ID 52 (request_refund) is p...,"```json\n{\n ""score"": 10,\n ""label"": ""very_h...",0.0,poor,The response does not provide any information ...,"{\n ""score"": 0,\n ""label"": ""poor"",\n ""reaso..."
1,1,1,3,<|im_start|>system\nYou are a banking intent c...,69,verify_my_identity,1805,True,0.4431,2281,...,0.1108,14569,10.0,very_helpful,The classification ID 69 (verify_my_identity) ...,"```json\n{\n ""score"": 10,\n ""label"": ""very_h...",0.0,poor,The response does not provide any information ...,"{\n ""score"": 0,\n ""label"": ""poor"",\n ""reaso..."
2,2,2,3,<|im_start|>system\nYou are a banking intent c...,59,top_up_failed,1801,True,0.4421,2285,...,0.1105,14573,9.0,very_helpful,The classification ID 59 (top_up_failed) is hi...,"```json\n{\n ""score"": 9,\n ""label"": ""very_he...",0.0,poor,The response does not provide any information ...,"{\n ""score"": 0,\n ""label"": ""poor"",\n ""reaso..."
3,3,3,3,<|im_start|>system\nYou are a banking intent c...,54,supported_cards_and_currencies,1807,True,0.4436,2279,...,0.1109,14567,4.0,somewhat_helpful,The classification ID 54 (supported_cards_and_...,"```json\n{\n ""score"": 4,\n ""label"": ""somewha...",0.0,poor,The response does not provide any information ...,"{\n ""score"": 0,\n ""label"": ""poor"",\n ""reaso..."
4,4,4,3,<|im_start|>system\nYou are a banking intent c...,57,top_up_by_card_charge,1804,True,0.4429,2282,...,0.1107,14570,9.0,helpful,The classification ID 57 (top_up_by_card_charg...,"```json\n{\n ""score"": 9,\n ""label"": ""helpful...",0.0,poor,The response does not provide any information ...,"{\n ""score"": 0,\n ""label"": ""poor"",\n ""reaso..."


In [17]:
# Helpfulness judge score column of the conversation-level frame.
conv_df["conversation_text_content__helpfulness__score"]

0    10.0
1    10.0
2     9.0
3     4.0
4     9.0
5    10.0
6    10.0
7    10.0
8    10.0
9    10.0
Name: conversation_text_content__helpfulness__score, dtype: float64

In [21]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

conv_columns = analyzer.conversation_df.columns
# Inspect the first row of the conversation-level frame.
row = analyzer.conversation_df.iloc[0]

# Choose the analyzer to inspect (position within `analyzer_names`).
analyzer_name = analyzer_names[3]

# Keep only the columns produced by the chosen analyzer.
filtered_cols = filter_analyzer_columns(conv_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")

    # For each metric column: its metric name, the schema description, and the
    # value recorded for the selected row.
    for col in filtered_cols:
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        print(f"description: {schema[col]['description']}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


Analyzer: instruction_quality
metric: score
description: LLM judge score (0-10, higher = better quality)
value: 0.0


metric: label
description: LLM judge label/category for the sample
value: poor


metric: reasoning
description: LLM judge reasoning/explanation
value: The response does not provide any information or context to evaluate, making it impossible to determine if it answers the customer's question or is clear and professional.


metric: raw_response
description: Raw LLM response before parsing
value: {
  "score": 0,
  "label": "poor",
  "reasoning": "The response does not provide any information or context to evaluate, making it impossible to determine if it answers the customer's question or is clear and professional."
}




# Message level

In [32]:
# Message-level results frame; preview its first five rows.
msg_df = analyzer.message_df
msg_df.head(5)

Unnamed: 0,conversation_index,conversation_id,message_index,message_id,role,text_content,text_content__length__token_count,text_content__cost__fits_context_4k,text_content__cost__context_utilization_4k,text_content__cost__tokens_wasted_4k,...,text_content__cost__context_utilization_16k,text_content__cost__tokens_wasted_16k,text_content__instruction_quality__score,text_content__instruction_quality__label,text_content__instruction_quality__reasoning,text_content__instruction_quality__raw_response,text_content__response_quality__score,text_content__response_quality__label,text_content__response_quality__reasoning,text_content__response_quality__raw_response
0,0,0,0,msg_0,system,You are a banking intent classifier. Classify ...,1751,True,0.4299,2335,...,0.1075,14623,2.0,poor,The response does not directly answer a custom...,"{\n ""score"": 2,\n ""label"": ""poor"",\n ""reaso...",,,,
1,0,0,1,msg_1,user,"If I bought something I didn't like, can I get...",14,True,0.0059,4072,...,0.0015,16360,4.0,needs_improvement,The response does not directly answer the cust...,"```json\n{\n ""score"": 4,\n ""label"": ""needs_i...",,,,
2,0,0,2,msg_2,assistant,52,1,True,0.0027,4085,...,0.0007,16373,1.0,poor,The response '52' does not answer any customer...,"```json\n{\n ""score"": 1,\n ""label"": ""poor"",\...",10.0,excellent,The response is a valid numeric ID (52) within...,"```json\n{\n ""score"": 10,\n ""label"": ""excell..."
3,1,1,0,msg_0,system,You are a banking intent classifier. Classify ...,1751,True,0.4299,2335,...,0.1075,14623,2.0,poor,The response does not directly answer a custom...,"{\n ""score"": 2,\n ""label"": ""poor"",\n ""reaso...",,,,
4,1,1,1,msg_1,user,"If I'm getting my identity verified, what all ...",13,True,0.0056,4073,...,0.0014,16361,7.0,good,The response should directly list the specific...,"```json\n{\n ""score"": 7,\n ""label"": ""good"",\...",,,,


In [35]:
# Response-quality judge score, restricted to assistant messages.
msg_df.loc[msg_df["role"] == "assistant", "text_content__response_quality__score"]

2     10.0
5     10.0
8     10.0
11    10.0
14    10.0
17    10.0
20    10.0
23    10.0
26    10.0
29    10.0
Name: text_content__response_quality__score, dtype: float64

In [None]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

msg_columns = analyzer.message_df.columns

# Positional selections: which analyzer (within `analyzer_names`) and which
# message row to inspect.
analyzer_position = 4
message_position = 2

# NOTE(review): the saved output of this cell is a bare
# "IndexError: list index out of range" with no traceback, so which lookup
# failed is unconfirmed. The two positional lookups made here are validated up
# front so that stale or shorter inputs fail with an explanatory message.
if analyzer_position >= len(analyzer_names):
    raise IndexError(
        f"analyzer position {analyzer_position} is out of range: "
        f"analyzer_names has {len(analyzer_names)} entries ({analyzer_names})"
    )
if message_position >= len(analyzer.message_df):
    raise IndexError(
        f"message position {message_position} is out of range: "
        f"message_df has {len(analyzer.message_df)} rows"
    )

# Choose the analyzer and the message row to inspect.
analyzer_name = analyzer_names[analyzer_position]
row = analyzer.message_df.iloc[message_position]

# Keep only the columns produced by the chosen analyzer.
filtered_cols = filter_analyzer_columns(msg_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")
    # The first filtered column is used to find the source column holding the
    # text that was analyzed.
    info = parse_analyzer_column_name(filtered_cols[0])

    # Show the analyzed input once, prefixed with the message role.
    print("\nInput:")
    print(f"[{row['role']}]: {row[info.source_column]}\n")

    # For each metric column: its metric name, the schema description, and the
    # value recorded for the selected row.
    for col in filtered_cols:
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        print(f"description: {schema[col]['description']}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


IndexError: list index out of range