In [1]:
import os
import sys


# IMPORTANT: Set these BEFORE importing torch or any ML libraries
# Disable all GPU/MPS backends to prevent crashes with IFD analyzer
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable CUDA
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"  # Disable MPS memory allocation
os.environ["DISABLE_MPS_COMPAT"] = "1"  # Additional MPS disable flag
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"  # Disable HuggingFace telemetry
os.environ["TRANSFORMERS_OFFLINE"] = "0"  # Allow model downloads

# Force CPU usage in PyTorch to avoid MPS crashes
import torch

# Forcefully disable MPS before anything else
torch.set_default_device("cpu")
if hasattr(torch.backends, "mps"):
    # Monkey-patch to prevent MPS usage
    original_is_available = torch.backends.mps.is_available
    torch.backends.mps.is_available = lambda: False
    print("MPS has been disabled - forcing CPU-only mode")
else:
    print("Using CPU for all computations")

print(f"PyTorch device: {torch.get_default_device()}")
print(f"PyTorch version: {torch.__version__}")

MPS has been disabled - forcing CPU-only mode
PyTorch device: cpu
PyTorch version: 2.6.0


In [2]:
import os
from oumi.core.configs import AnalyzeConfig
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer

# Load config from YAML file.
# NOTE: the paths in this cell are absolute and machine-specific; adjust them
# to your own checkout before running.
config = AnalyzeConfig.from_yaml(
    "/Users/ryanarman/code/oumi/configs/examples/analyze/analyze_banking77.yaml"
)

# Override settings for this run
dataset_path = "/Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl"
config.dataset_path = dataset_path
config.dataset_name = None  # Clear dataset_name so it uses dataset_path instead
config.sample_count = 10
config.chat_template = "chat_ml"

# Set absolute output path (makes it easier to find the results!)
config.output_path = "/Users/ryanarman/code/oumi/analysis_output/banking77_final"

print(f"✅ Config loaded with {len(config.analyzers)} analyzers:")
# The loop variable is deliberately NOT called `analyzer`: that name is bound to
# the DatasetAnalyzer below, and reusing it would leave a config entry behind
# under that name if validation fails.
for analyzer_params in config.analyzers:
    instance_id = analyzer_params.instance_id or analyzer_params.id
    print(f"  - {instance_id} (type: {analyzer_params.id})")

print(f"📁 Output will be saved to: {config.output_path}")

# Validate the configuration
config.finalize_and_validate()
print("✅ Config validated successfully!")

analyzer = DatasetAnalyzer(config)

✅ Config loaded with 5 analyzers:
  - length (type: length)
  - cost (type: cost)
  - helpfulness (type: llm_judge)
  - instruction_quality (type: llm_judge)
  - response_quality (type: llm_judge)
📁 Output will be saved to: /Users/ryanarman/code/oumi/analysis_output/banking77_final
✅ Config validated successfully!
[2026-01-06 12:04:41,670][oumi][rank0][pid:53404][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: TextSftJsonLinesDataset)... dataset_name: 'custom'
[2026-01-06 12:04:41,670][oumi.utils.analysis_utils][rank0][pid:53404][MainThread][INFO]][analysis_utils.py:227] Loaded text dataset from: /Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl
[2026-01-06 12:04:41,671][oumi][rank0][pid:53404][MainThread][INFO]][dataset_analyzer.py:154] Loaded dataset from config: None
[2026-01-06 12:04:41,789][oumi][rank0][pid:53404][MainThread][INFO]][dataset_analyzer.py:318] Initialized sample analyzer: length
[2

In [3]:
# Run the analysis
analyzer.analyze_dataset()

# The results are stored in analyzer object
# NOTE(review): `_analysis_results` is a private attribute; prefer a public
# accessor if DatasetAnalyzer exposes one - TODO confirm.
# Nothing is printed when the attribute is falsy.
if analyzer._analysis_results:
    print(
        f"Total conversations analyzed: {analyzer._analysis_results.conversations_analyzed}"
    )

[2026-01-06 12:04:41,799][oumi][rank0][pid:53404][MainThread][INFO]][dataset_analyzer.py:361] Starting analysis of dataset: None
[2026-01-06 12:04:41,800][oumi][rank0][pid:53404][MainThread][INFO]][dataset_analyzer.py:362] Using 5 sample analyzers: ['length', 'cost', 'helpfulness', 'instruction_quality', 'response_quality']
[2026-01-06 12:04:41,800][oumi][rank0][pid:53404][MainThread][INFO]][dataset_analyzer.py:385] Analyzing 10 of 8002 conversations
[2026-01-06 12:04:41,802][oumi][rank0][pid:53404][MainThread][INFO]][dataset_analyzer.py:435] Converting conversation dataset with 8002 items
[2026-01-06 12:04:41,802][oumi][rank0][pid:53404][MainThread][INFO]][dataset_analyzer.py:442] Limiting analysis to first 10 items (dataset has 8002 total)


Converting Unknown Dataset to DataFrames: 100%|██████████| 10/10 [00:00<00:00, 574.91item/s]


[2026-01-06 12:04:41,837][oumi.utils.analysis_utils][rank0][pid:53404][MainThread][INFO]][analysis_utils.py:1322] Adding default schema entries for 2 columns not in base schema: ['label', 'label_name']
[2026-01-06 12:04:41,857][oumi][rank0][pid:53404][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|██████████| 10/10 [00:20<00:00,  2.09s/it]


[2026-01-06 12:05:02,805][oumi][rank0][pid:53404][MainThread][INFO]][llm_judge_analyzer.py:787] Skipping conversation-level analysis (analyze_conversation_level=False). Set analyze_conversation_level=True to enable.
[2026-01-06 12:05:02,807][oumi][rank0][pid:53404][MainThread][INFO]][llm_judge_analyzer.py:787] Skipping conversation-level analysis (analyze_conversation_level=False). Set analyze_conversation_level=True to enable.
[2026-01-06 12:05:02,825][oumi][rank0][pid:53404][MainThread][INFO]][llm_judge_analyzer.py:780] Skipping message-level analysis (analyze_message_level=False). Set analyze_message_level=True to enable.
[2026-01-06 12:05:02,827][oumi][rank0][pid:53404][MainThread][INFO]][llm_judge_analyzer.py:828] Evaluating 10 'system' messages (filtered from 30 total)
[2026-01-06 12:05:02,828][oumi][rank0][pid:53404][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai
[2026-01-06 12:05:02,828][oumi][rank0][pid:53404][MainTh

100%|██████████| 9/9 [00:16<00:00,  1.86s/it]


Total conversations analyzed: 10


In [4]:
# Preview the merged (conversation + message) results. Only the first rows are
# shown so the saved notebook does not embed the whole frame.
analyzer.analysis_df.head(10)

Unnamed: 0,conversation_index,conversation_id,num_messages,conversation_text_content,label,label_name,conversation_text_content__length__token_count,conversation_text_content__cost__fits_context_4k,conversation_text_content__cost__context_utilization_4k,conversation_text_content__cost__tokens_wasted_4k,...,text_content__cost__context_utilization_16k,text_content__cost__tokens_wasted_16k,text_content__instruction_quality__score,text_content__instruction_quality__label,text_content__instruction_quality__reasoning,text_content__instruction_quality__raw_response,text_content__response_quality__score,text_content__response_quality__label,text_content__response_quality__reasoning,text_content__response_quality__raw_response
0,0,0,3,<|im_start|>system\nYou are a banking intent c...,52,request_refund,1806,True,0.4434,2280,...,0.1075,14623,8.0,good,"The instruction is clear and specific, with a ...","```json\n{\n ""score"": 8,\n ""label"": ""good"",\...",,,,
1,0,0,3,<|im_start|>system\nYou are a banking intent c...,52,request_refund,1806,True,0.4434,2280,...,0.0015,16360,,,,,,,,
2,0,0,3,<|im_start|>system\nYou are a banking intent c...,52,request_refund,1806,True,0.4434,2280,...,0.0007,16373,,,,,10.0,excellent,The response is a valid numeric ID (52) within...,"```json\n{\n ""score"": 10,\n ""label"": ""excell..."
3,1,1,3,<|im_start|>system\nYou are a banking intent c...,69,verify_my_identity,1805,True,0.4431,2281,...,0.1075,14623,8.0,good,"The instruction is clear and specific, with a ...","```json\n{\n ""score"": 8,\n ""label"": ""good"",\...",,,,
4,1,1,3,<|im_start|>system\nYou are a banking intent c...,69,verify_my_identity,1805,True,0.4431,2281,...,0.0014,16361,,,,,,,,
5,1,1,3,<|im_start|>system\nYou are a banking intent c...,69,verify_my_identity,1805,True,0.4431,2281,...,0.0007,16373,,,,,10.0,excellent,The response is a valid numeric ID (69) within...,"```json\n{\n ""score"": 10,\n ""label"": ""excell..."
6,2,2,3,<|im_start|>system\nYou are a banking intent c...,59,top_up_failed,1801,True,0.4421,2285,...,0.1075,14623,8.0,good,"The instruction is clear and specific, with a ...","```json\n{\n ""score"": 8,\n ""label"": ""good"",\...",,,,
7,2,2,3,<|im_start|>system\nYou are a banking intent c...,59,top_up_failed,1801,True,0.4421,2285,...,0.0012,16365,,,,,,,,
8,2,2,3,<|im_start|>system\nYou are a banking intent c...,59,top_up_failed,1801,True,0.4421,2285,...,0.0007,16373,,,,,10.0,excellent,The response is a valid single numeric ID (59)...,"```json\n{\n ""score"": 10,\n ""label"": ""excell..."
9,3,3,3,<|im_start|>system\nYou are a banking intent c...,54,supported_cards_and_currencies,1807,True,0.4436,2279,...,0.1075,14623,8.0,good,"The instruction is clear and specific, with a ...","```json\n{\n ""score"": 8,\n ""label"": ""good"",\...",,,,


In [7]:
from pathlib import Path
import json
import traceback

def _save_frame(frame, path: Path, label: str) -> None:
    """Write `frame` to `path` as parquet and report it.

    Frames that are None or empty are skipped silently.
    """
    if frame is None or frame.empty:
        return
    frame.to_parquet(path)
    print(f"✅ Saved {label} to: {path}")


output_dir = Path(config.output_path)
output_dir.mkdir(parents=True, exist_ok=True)

# Save message-level, conversation-level and merged results
_save_frame(analyzer.message_df, output_dir / "messages_df.parquet", "message analysis")
_save_frame(
    analyzer.conversation_df,
    output_dir / "conversations_df.parquet",
    "conversation analysis",
)
_save_frame(analyzer.analysis_df, output_dir / "merged_df.parquet", "merged analysis")

# Save analysis summary. The encoding is explicit so the file does not depend on
# the platform default; `default=str` stringifies values json cannot serialise.
summary_path = output_dir / "analysis_summary.json"
with open(summary_path, "w", encoding="utf-8") as f:
    json.dump(analyzer.analysis_summary, f, indent=2, default=str)
print(f"✅ Saved analysis summary to: {summary_path}")

# Generate HTML report if configured
if config.generate_report:
    try:
        # Imported lazily: the ImportError branch below treats a failed import
        # as the optional reporting extra not being installed.
        from oumi.core.analyze.report_generator import HTMLReportGenerator

        report_gen = HTMLReportGenerator()
        report_path = report_gen.generate_report(
            analyzer,
            output_path=output_dir,
            title=config.report_title or "Banking77 Analysis Report",
        )
        print(f"✅ Generated HTML report at: {Path(report_path) / 'index.html'}")
    except ImportError as import_error:
        print("⚠️  Plotly not installed. Skipping HTML report generation.")
        print("   Install with: pip install 'oumi[analyze_advanced]'")
        # Show the real cause: the missing module is not necessarily Plotly.
        print(f"   Import error: {import_error}")
    except Exception as e:
        # Best-effort: a report failure must not hide the files saved above.
        print(f"⚠️  Failed to generate HTML report: {e}")
        print("\n🔍 FULL TRACEBACK:")
        print("=" * 70)
        traceback.print_exc()
        print("=" * 70)

print(f"\n📁 All results saved to: {output_dir.absolute()}")


✅ Saved message analysis to: /Users/ryanarman/code/oumi/analysis_output/banking77_final/messages_df.parquet
✅ Saved conversation analysis to: /Users/ryanarman/code/oumi/analysis_output/banking77_final/conversations_df.parquet
✅ Saved merged analysis to: /Users/ryanarman/code/oumi/analysis_output/banking77_final/merged_df.parquet
✅ Saved analysis summary to: /Users/ryanarman/code/oumi/analysis_output/banking77_final/analysis_summary.json
[2026-01-06 12:07:03,162][oumi][rank0][pid:53404][MainThread][INFO]][report_generator.py:203] Generated HTML report: /Users/ryanarman/code/oumi/analysis_output/banking77_final/index.html
[2026-01-06 12:07:03,163][oumi][rank0][pid:53404][MainThread][INFO]][report_generator.py:204] External data files written to: /Users/ryanarman/code/oumi/analysis_output/banking77_final/data
✅ Generated HTML report at: /Users/ryanarman/code/oumi/analysis_output/banking77_final/index.html

📁 All results saved to: /Users/ryanarman/code/oumi/analysis_output/ban

In [9]:
# Fetch the analyzer's column schema and compare its size with the number of
# columns in the merged results frame. Equal counts are only a sanity check;
# this cell does not verify that the names themselves match.
schema = analyzer.get_schema()
print(f"len(schema): {len(schema)}")
merged_columns = analyzer.analysis_df.columns
print(f"len(merged_columns): {len(merged_columns)}")

len(schema): 42
len(merged_columns): 42


In [11]:
# Display the full schema mapping (column name -> type / content_type / description).
schema

{'conversation_index': {'type': <ColumnType.INT: 'int'>,
  'content_type': <ContentType.IDENTIFIER: 'identifier'>,
  'description': 'Conversation index in dataset'},
 'conversation_id': {'type': <ColumnType.STRING: 'string'>,
  'content_type': <ContentType.IDENTIFIER: 'identifier'>,
  'description': 'Conversation identifier'},
 'num_messages': {'type': <ColumnType.INT: 'int'>,
  'content_type': <ContentType.NUMERIC: 'numeric'>,
  'description': 'Number of messages in conversation'},
 'conversation_text_content': {'type': <ColumnType.STRING: 'string'>,
  'content_type': <ContentType.TEXT: 'text'>,
  'description': 'Full conversation rendered as text'},
 'message_index': {'type': <ColumnType.INT: 'int'>,
  'content_type': <ContentType.IDENTIFIER: 'identifier'>,
  'description': 'Message index within conversation'},
 'message_id': {'type': <ColumnType.STRING: 'string'>,
  'content_type': <ContentType.IDENTIFIER: 'identifier'>,
  'description': 'Message identifier'},
 'role': {'type': <Col

In [None]:
# Names used to look up analyzer output columns. Fall back to the analyzer type
# id when no instance_id is set, matching how the config cell labels analyzers;
# otherwise an unset instance_id would put None in this list.
analyzer_names = [a.instance_id or a.id for a in config.analyzers]
analyzer_names

['length', 'cost', 'helpfulness', 'instruction_quality', 'response_quality']

# Conversation level

In [None]:
# Conversation-level results: one row per analyzed conversation.
conv_df = analyzer.conversation_df
# Preview the first five rows.
conv_df.head(n=5)

Unnamed: 0,conversation_index,conversation_id,num_messages,conversation_text_content,label,label_name,conversation_text_content__length__token_count,conversation_text_content__cost__fits_context_4k,conversation_text_content__cost__context_utilization_4k,conversation_text_content__cost__tokens_wasted_4k,conversation_text_content__cost__fits_context_8k,conversation_text_content__cost__context_utilization_8k,conversation_text_content__cost__tokens_wasted_8k,conversation_text_content__cost__fits_context_16k,conversation_text_content__cost__context_utilization_16k,conversation_text_content__cost__tokens_wasted_16k,conversation_text_content__helpfulness__score,conversation_text_content__helpfulness__label,conversation_text_content__helpfulness__reasoning,conversation_text_content__helpfulness__raw_response
0,0,0,3,<|im_start|>system\nYou are a banking intent c...,52,request_refund,1806,True,0.4434,2280,True,0.2217,6376,True,0.1108,14568,10.0,very_helpful,The classification ID 52 (request_refund) is p...,"{\n ""score"": 10,\n ""label"": ""very_helpful"",\..."
1,1,1,3,<|im_start|>system\nYou are a banking intent c...,69,verify_my_identity,1805,True,0.4431,2281,True,0.2216,6377,True,0.1108,14569,10.0,very_helpful,The classification ID 69 (verify_my_identity) ...,"```json\n{\n ""score"": 10,\n ""label"": ""very_h..."
2,2,2,3,<|im_start|>system\nYou are a banking intent c...,59,top_up_failed,1801,True,0.4421,2285,True,0.2211,6381,True,0.1105,14573,10.0,very_helpful,The classification ID 59 (top_up_failed) is pe...,"```json\n{\n ""score"": 10,\n ""label"": ""very_h..."
3,3,3,3,<|im_start|>system\nYou are a banking intent c...,54,supported_cards_and_currencies,1807,True,0.4436,2279,True,0.2218,6375,True,0.1109,14567,4.0,somewhat_helpful,The classification ID 54 (extra_charge_on_stat...,"```json\n{\n ""score"": 4,\n ""label"": ""somewha..."
4,4,4,3,<|im_start|>system\nYou are a banking intent c...,57,top_up_by_card_charge,1804,True,0.4429,2282,True,0.2214,6378,True,0.1107,14570,9.0,helpful,The classification ID 57 (top_up_by_card_charg...,"```json\n{\n ""score"": 9,\n ""label"": ""helpful..."


In [None]:
# Helpfulness judge score for every analyzed conversation.
conv_df["conversation_text_content__helpfulness__score"]

0    10.0
1    10.0
2    10.0
3     4.0
4     9.0
5    10.0
6    10.0
7    10.0
8    10.0
9    10.0
Name: conversation_text_content__helpfulness__score, dtype: float64

In [None]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

# Print every metric one analyzer produced for the first analyzed conversation.
conv_columns = analyzer.conversation_df.columns
row = analyzer.conversation_df.iloc[0]

# Choose the analyzer to inspect (index into `analyzer_names`)
analyzer_name = analyzer_names[0]

filtered_cols = filter_analyzer_columns(conv_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")

    for col in filtered_cols:
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        # A column missing from the schema should not abort the listing.
        description = schema.get(col, {}).get("description", "<not in schema>")
        print(f"description: {description}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


Analyzer: length
metric: token_count
description: Token count for conversation_text_content
value: 1806




# Message level

In [None]:
# Message-level results: one row per message of each analyzed conversation.
msg_df = analyzer.message_df
# Preview the first five rows.
msg_df.head(n=5)

Unnamed: 0,conversation_index,conversation_id,message_index,message_id,role,text_content,text_content__length__token_count,text_content__cost__fits_context_4k,text_content__cost__context_utilization_4k,text_content__cost__tokens_wasted_4k,...,text_content__cost__context_utilization_16k,text_content__cost__tokens_wasted_16k,text_content__instruction_quality__score,text_content__instruction_quality__label,text_content__instruction_quality__reasoning,text_content__instruction_quality__raw_response,text_content__response_quality__score,text_content__response_quality__label,text_content__response_quality__reasoning,text_content__response_quality__raw_response
0,0,0,0,msg_0,system,You are a banking intent classifier. Classify ...,1751,True,0.4299,2335,...,0.1075,14623,8.0,good,"The instruction is clear and specific, with a ...","```json\n{\n ""score"": 8,\n ""label"": ""good"",\...",,,,
1,0,0,1,msg_1,user,"If I bought something I didn't like, can I get...",14,True,0.0059,4072,...,0.0015,16360,,,,,,,,
2,0,0,2,msg_2,assistant,52,1,True,0.0027,4085,...,0.0007,16373,,,,,10.0,excellent,The response is a valid single numeric ID (52)...,"```json\n{\n ""score"": 10,\n ""label"": ""excell..."
3,1,1,0,msg_0,system,You are a banking intent classifier. Classify ...,1751,True,0.4299,2335,...,0.1075,14623,8.0,good,"The instruction is clear and specific, with a ...","```json\n{\n ""score"": 8,\n ""label"": ""good"",\...",,,,
4,1,1,1,msg_1,user,"If I'm getting my identity verified, what all ...",13,True,0.0056,4073,...,0.0014,16361,,,,,,,,


In [None]:
# Response-quality judge scores, restricted to assistant messages.
msg_df.loc[msg_df["role"] == "assistant", "text_content__response_quality__score"]

2     10.0
5     10.0
8     10.0
11    10.0
14    10.0
17    10.0
20    10.0
23    10.0
26    10.0
29    10.0
Name: text_content__response_quality__score, dtype: float64

In [None]:
# Instruction-quality score of the first system message.
msg_df.loc[msg_df["role"] == "system", "text_content__instruction_quality__score"].iloc[0]


8.0

In [None]:
# Instruction-quality label of the first system message.
msg_df.loc[msg_df["role"] == "system", "text_content__instruction_quality__label"].iloc[0]


'good'

In [None]:
# Judge's reasoning for the first system message's instruction-quality score.
msg_df.loc[msg_df["role"] == "system", "text_content__instruction_quality__reasoning"].iloc[0]


"The instruction is clear and specific, with a defined goal of classifying user queries into one of 77 banking intents. It uses action verbs and provides context about the task. However, it could benefit from slightly more detail on how to handle ambiguous queries or what to do if the query doesn't fit any of the provided intents."

In [None]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

# Print the input text and every metric one analyzer produced for the first
# message in the message-level results.
msg_columns = analyzer.message_df.columns
row = analyzer.message_df.iloc[0]

# Choose the analyzer to inspect (index into `analyzer_names`)
analyzer_name = analyzer_names[3]

filtered_cols = filter_analyzer_columns(msg_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")
    # The first matching column is parsed only to find which source column the
    # analyzer read its input from.
    info = parse_analyzer_column_name(filtered_cols[0])

    # Print the input once, prefixed with its role (it was previously echoed a
    # second time without the prefix).
    print("\nInput:")
    print(f"[{row['role']}]: {row[info.source_column]}\n")

    for col in filtered_cols:
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        # A column missing from the schema should not abort the listing.
        description = schema.get(col, {}).get("description", "<not in schema>")
        print(f"description: {description}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


Analyzer: instruction_quality

Input:
[system]: You are a banking intent classifier. Classify the user's query into one of  77 banking intents (output is a single integer ID).

IDs:

0: activate_my_card
1: age_limit
2: apple_pay_or_google_pay
3: atm_support
4: automatic_top_up
5: balance_not_updated_after_bank_transfer
6: balance_not_updated_after_cheque_or_cash_deposit
7: beneficiary_not_allowed
8: cancel_transfer
9: card_about_to_expire
10: card_acceptance
11: card_arrival
12: card_delivery_estimate
13: card_linking
14: card_not_working
15: card_payment_fee_charged
16: card_payment_not_recognised
17: card_payment_wrong_exchange_rate
18: card_swallowed
19: cash_withdrawal_charge
20: cash_withdrawal_not_recognised
21: change_pin
22: compromised_card
23: contactless_not_working
24: country_support
25: declined_card_payment
26: declined_cash_withdrawal
27: declined_transfer
28: direct_debit_payment_not_recognised
29: disposable_card_limits
30: edit_personal_details
31: exchange_charge
32