In [1]:
import os
import sys


# IMPORTANT: Set these BEFORE importing torch or any ML libraries
# Disable all GPU/MPS backends to prevent crashes with IFD analyzer
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable CUDA
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"  # Disable MPS memory allocation
os.environ["DISABLE_MPS_COMPAT"] = "1"  # Additional MPS disable flag
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"  # Disable HuggingFace telemetry
os.environ["TRANSFORMERS_OFFLINE"] = "0"  # Allow model downloads

# Force CPU usage in PyTorch to avoid MPS crashes
import torch

# Forcefully disable MPS before anything else
torch.set_default_device("cpu")
if hasattr(torch.backends, "mps"):
    # Monkey-patch to prevent MPS usage
    original_is_available = torch.backends.mps.is_available
    torch.backends.mps.is_available = lambda: False
    print("MPS has been disabled - forcing CPU-only mode")
else:
    print("Using CPU for all computations")

print(f"PyTorch device: {torch.get_default_device()}")
print(f"PyTorch version: {torch.__version__}")

MPS has been disabled - forcing CPU-only mode
PyTorch device: cpu
PyTorch version: 2.6.0


In [2]:
import os

from oumi.core.configs import AnalyzeConfig
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer

# NOTE(review): the paths in this cell are absolute and machine-specific;
# adjust them for your own checkout.
OUTPUT_PATH = "/Users/ryanarman/code/oumi/analysis_output/banking77_final_3"

# Load config from YAML file
config = AnalyzeConfig.from_yaml(
    "/Users/ryanarman/code/oumi/configs/examples/analyze/analyze_banking77.yaml"
)

# Override settings for this run
dataset_path = "/Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl"
config.dataset_path = dataset_path
config.dataset_name = None  # Clear dataset_name so it uses dataset_path instead
config.sample_count = 1000
config.chat_template = "chat_ml"

# Set absolute output path (makes it easier to find the results!)
config.output_path = OUTPUT_PATH

print(f"✅ Config loaded with {len(config.analyzers)} analyzers:")
# The loop variable is deliberately NOT called `analyzer`: that name is the
# DatasetAnalyzer used by later cells, and a leftover config entry under the
# same name would be confusing if the constructor below failed.
for analyzer_params in config.analyzers:
    # Fall back to the analyzer type id when no explicit instance_id is set.
    instance_id = analyzer_params.instance_id or analyzer_params.id
    print(f"  - {instance_id} (type: {analyzer_params.id})")

print(f"📁 Output will be saved to: {config.output_path}")

# Validate the configuration
config.finalize_and_validate()
print("✅ Config validated successfully!")

analyzer = DatasetAnalyzer(config)

‚úÖ Config loaded with 9 analyzers:
  - length (type: length)
  - token_stats (type: token_stats)
  - cost (type: cost)
  - embedding (type: embedding)
  - question_diversity (type: question_diversity)
  - repr_diversity (type: repr_diversity)
  - helpfulness (type: llm_judge)
  - instruction_quality (type: llm_judge)
  - response_quality (type: llm_judge)
üìÅ Output will be saved to: /Users/ryanarman/code/oumi/analysis_output/banking77_final_3
‚úÖ Config validated successfully!
[2026-01-08 16:22:32,537][oumi][rank0][pid:59207][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: TextSftJsonLinesDataset)... dataset_name: 'custom'
[2026-01-08 16:22:32,538][oumi.utils.analysis_utils][rank0][pid:59207][MainThread][INFO]][analysis_utils.py:227] Loaded text dataset from: /Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl
[2026-01-08 16:22:32,538][oumi][rank0][pid:59207][MainThread][INFO]][dataset_analyzer.py:154] Loa

In [None]:
# Run the analysis
analyzer.analyze_dataset()

# The results are stored on the analyzer object; report the count when present.
analysis_results = analyzer._analysis_results
if analysis_results:
    total_analyzed = analysis_results.conversations_analyzed
    print(f"Total conversations analyzed: {total_analyzed}")

[2026-01-08 16:22:35,298][oumi][rank0][pid:59207][MainThread][INFO]][dataset_analyzer.py:363] Starting analysis of dataset: None
[2026-01-08 16:22:35,300][oumi][rank0][pid:59207][MainThread][INFO]][dataset_analyzer.py:364] Using 9 sample analyzers: ['length', 'token_stats', 'cost', 'embedding', 'question_diversity', 'repr_diversity', 'helpfulness', 'instruction_quality', 'response_quality']
[2026-01-08 16:22:35,301][oumi][rank0][pid:59207][MainThread][INFO]][dataset_analyzer.py:387] Analyzing 1000 of 8002 conversations
[2026-01-08 16:22:35,302][oumi][rank0][pid:59207][MainThread][INFO]][dataset_analyzer.py:439] Converting conversation dataset with 8002 items
[2026-01-08 16:22:35,302][oumi][rank0][pid:59207][MainThread][INFO]][dataset_analyzer.py:446] Limiting analysis to first 1000 items (dataset has 8002 total)


Converting Unknown Dataset to DataFrames: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:01<00:00, 740.57item/s]


[2026-01-08 16:22:36,828][oumi.utils.analysis_utils][rank0][pid:59207][MainThread][INFO]][analysis_utils.py:1325] Adding default schema entries for 2 columns not in base schema: ['label', 'label_name']
[2026-01-08 16:22:37,474][oumi][rank0][pid:59207][MainThread][INFO]][embedding_analyzer.py:518] Computing embeddings for 1000 samples...
[2026-01-08 16:22:37,475][oumi][rank0][pid:59207][MainThread][INFO]][embedding_analyzer.py:196] Loading embedding model: all-MiniLM-L6-v2


Computing embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:07<00:00, 137.36it/s]


[2026-01-08 16:22:45,764][oumi][rank0][pid:59207][MainThread][INFO]][embedding_analyzer.py:537] Detecting semantic duplicates...
[2026-01-08 16:22:45,909][oumi][rank0][pid:59207][MainThread][INFO]][embedding_analyzer.py:624] Detecting fuzzy duplicates using MinHash LSH...
[2026-01-08 16:22:45,910][oumi][rank0][pid:59207][MainThread][INFO]][embedding_analyzer.py:350] Creating MinHash signatures for 1000 samples...


Creating MinHash signatures: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:06<00:00, 143.10it/s]


[2026-01-08 16:22:52,933][oumi][rank0][pid:59207][MainThread][INFO]][embedding_analyzer.py:369] Finding fuzzy duplicates using LSH...


Finding duplicates: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1000/1000 [00:00<00:00, 1032.07it/s]


[2026-01-08 16:22:53,907][oumi][rank0][pid:59207][MainThread][INFO]][question_diversity_analyzer.py:464] Computing embeddings for 1000 user questions...
[2026-01-08 16:22:53,908][oumi][rank0][pid:59207][MainThread][INFO]][question_diversity_analyzer.py:174] Loading embedding model: all-MiniLM-L6-v2


Computing embeddings:  38%|‚ñà‚ñà‚ñà‚ñä      | 384/1000 [00:02<00:04, 140.78it/s]

In [None]:
from pathlib import Path
import traceback
from oumi.utils.analysis_utils import save_analyzer_artifacts

# Write all analyzer artifacts (dataframes, schemas, summary) in parquet format
# to the output directory set on the config.
artifacts_dir = Path(config.output_path)
save_analyzer_artifacts(analyzer, artifacts_dir, output_format="parquet")


[2026-01-08 16:15:50,529][oumi.utils.analysis_utils][rank0][pid:57736][MainThread][INFO]][analysis_utils.py:1414] Saved message analysis to: /Users/ryanarman/code/oumi/analysis_output/banking77_final/messages_df.parquet
[2026-01-08 16:15:50,541][oumi.utils.analysis_utils][rank0][pid:57736][MainThread][INFO]][analysis_utils.py:1420] Saved conversation analysis to: /Users/ryanarman/code/oumi/analysis_output/banking77_final/conversations_df.parquet
[2026-01-08 16:15:50,570][oumi.utils.analysis_utils][rank0][pid:57736][MainThread][INFO]][analysis_utils.py:1426] Saved merged analysis to: /Users/ryanarman/code/oumi/analysis_output/banking77_final/merged_df.parquet
[2026-01-08 16:15:50,571][oumi.utils.analysis_utils][rank0][pid:57736][MainThread][INFO]][analysis_utils.py:1438] Saved message schema to: /Users/ryanarman/code/oumi/analysis_output/banking77_final/message_schema.json
[2026-01-08 16:15:50,573][oumi.utils.analysis_utils][rank0][pid:57736][MainThread][INFO]][analysis_utils.py:1445] S

# Load artifacts

In [None]:
# Imported in this cell so the except-handler below works on a fresh kernel,
# without depending on an import made by another cell.
import traceback

from oumi.utils.analysis_utils import (
    load_analyzer_artifacts,
    regenerate_recommendations,
)

# Alternative output directories (commented out):
# OUTPUT_PATH = "/Users/ryanarman/code/oumi/analysis_output/banking77_final"
# OUTPUT_PATH = "/Users/ryanarman/code/oumi/analysis_output/banking77_final_2"
OUTPUT_PATH = "/Users/ryanarman/code/oumi/analysis_output/banking77_final_3"

artifacts = load_analyzer_artifacts(OUTPUT_PATH)

# Regenerate recommendations with latest code (e.g., updated duplicate detection)
artifacts = regenerate_recommendations(artifacts, outlier_threshold=3.0)

# A bare expression in the middle of a cell is not displayed, so print the
# artifact names explicitly.
print(f"Loaded artifacts: {list(artifacts.keys())}")

# Generate the HTML report. The import lives inside the try-block so that a
# missing optional dependency is reported instead of aborting the cell.
try:
    from oumi.core.analyze.report_generator import HTMLReportGenerator

    report_gen = HTMLReportGenerator()
    report_path = report_gen.generate_report(
        artifacts=artifacts,
        output_path=OUTPUT_PATH,
        title="Banking77 Analysis Report",
    )
    print(f"✅ Generated HTML report at: {report_path / 'index.html'}")
except ImportError:
    print("⚠️  Plotly not installed. Skipping HTML report generation.")
    print("   Install with: pip install 'oumi[analyze_advanced]'")
except Exception as e:
    # Best-effort: report the failure with a full traceback but keep going.
    print(f"⚠️  Failed to generate HTML report: {e}")
    print("\n🔍 FULL TRACEBACK:")
    print("=" * 70)
    traceback.print_exc()
    print("=" * 70)

print(f"\n📁 All results saved to: {OUTPUT_PATH}")


[2026-01-08 16:19:36,660][oumi.utils.analysis_utils][rank0][pid:58531][MainThread][INFO]][analysis_utils.py:1539] Loaded message analysis from: /Users/ryanarman/code/oumi/analysis_output/banking77_final/messages_df
[2026-01-08 16:19:36,665][oumi.utils.analysis_utils][rank0][pid:58531][MainThread][INFO]][analysis_utils.py:1546] Loaded conversation analysis from: /Users/ryanarman/code/oumi/analysis_output/banking77_final/conversations_df
[2026-01-08 16:19:36,676][oumi.utils.analysis_utils][rank0][pid:58531][MainThread][INFO]][analysis_utils.py:1553] Loaded merged analysis from: /Users/ryanarman/code/oumi/analysis_output/banking77_final/merged_df
[2026-01-08 16:19:36,677][oumi.utils.analysis_utils][rank0][pid:58531][MainThread][INFO]][analysis_utils.py:1562] Loaded combined schemas from: /Users/ryanarman/code/oumi/analysis_output/banking77_final/schema.json
[2026-01-08 16:19:36,678][oumi.utils.analysis_utils][rank0][pid:58531][MainThread][INFO]][analysis_utils.py:1584] Loaded analysis sum

In [5]:
# Compare the number of schema entries with the number of columns in the
# analyzer's merged analysis frame.
schema = analyzer.get_schema()
merged_columns = analyzer.analysis_df.columns
schema_size, merged_size = len(schema), len(merged_columns)
print(f"len(schema): {schema_size}")
print(f"len(merged_columns): {merged_size}")

len(schema): 42
len(merged_columns): 42


In [6]:
# Display name of every configured analyzer. Fall back to the analyzer type id
# when no instance_id is set, so the list never contains None.
analyzer_names = [a.instance_id or a.id for a in config.analyzers]
analyzer_names

['length', 'cost', 'helpfulness', 'instruction_quality', 'response_quality']

# Conversation level

In [9]:
# Pull the conversation-level and message-level frames out of the loaded artifacts.
conv_df, msg_df = (artifacts[key] for key in ("conversations_df", "messages_df"))

In [None]:
# Show the cost analyzer's conversation_system_tokens metric for every message row.
msg_df.loc[:, "text_content__cost__conversation_system_tokens"]

0      1751
1      1751
2      1751
3      1751
4      1751
       ... 
295    1751
296    1751
297    1751
298    1751
299    1751
Name: text_content__cost__conversation_system_tokens, Length: 300, dtype: int64

In [11]:
# List every column available on the message-level frame.
msg_df.columns

Index(['conversation_index', 'conversation_id', 'message_index', 'message_id',
       'role', 'text_content', 'text_content__length__token_count',
       'text_content__cost__fits_context_4k',
       'text_content__cost__context_utilization_4k',
       'text_content__cost__tokens_wasted_4k',
       'text_content__cost__fits_context_8k',
       'text_content__cost__context_utilization_8k',
       'text_content__cost__tokens_wasted_8k',
       'text_content__cost__fits_context_16k',
       'text_content__cost__context_utilization_16k',
       'text_content__cost__tokens_wasted_16k',
       'text_content__cost__system_tokens', 'text_content__cost__input_tokens',
       'text_content__cost__output_tokens',
       'text_content__cost__system_fits_context_4k',
       'text_content__cost__system_context_utilization_4k',
       'text_content__cost__system_tokens_wasted_4k',
       'text_content__cost__input_fits_context_4k',
       'text_content__cost__input_context_utilization_4k',
       'te

In [7]:
# Rebind conv_df to the conversation-level frame held by the analyzer object
# (an earlier cell bound this name to the frame loaded from the saved
# artifacts), then preview the first rows.
conv_df = analyzer.conversation_df
conv_df.head()

Unnamed: 0,conversation_index,conversation_id,num_messages,conversation_text_content,label,label_name,conversation_text_content__length__token_count,conversation_text_content__cost__fits_context_4k,conversation_text_content__cost__context_utilization_4k,conversation_text_content__cost__tokens_wasted_4k,conversation_text_content__cost__fits_context_8k,conversation_text_content__cost__context_utilization_8k,conversation_text_content__cost__tokens_wasted_8k,conversation_text_content__cost__fits_context_16k,conversation_text_content__cost__context_utilization_16k,conversation_text_content__cost__tokens_wasted_16k,conversation_text_content__helpfulness__score,conversation_text_content__helpfulness__label,conversation_text_content__helpfulness__reasoning,conversation_text_content__helpfulness__raw_response
0,0,0,3,<|im_start|>system\nYou are a banking intent c...,52,request_refund,1806,True,0.4434,2280,True,0.2217,6376,True,0.1108,14568,10.0,very_helpful,The classification ID 52 (request_refund) is p...,"{\n ""score"": 10,\n ""label"": ""very_helpful"",\..."
1,1,1,3,<|im_start|>system\nYou are a banking intent c...,69,verify_my_identity,1805,True,0.4431,2281,True,0.2216,6377,True,0.1108,14569,10.0,very_helpful,The classification ID 69 (verify_my_identity) ...,"{\n ""score"": 10,\n ""label"": ""very_helpful"",\..."
2,2,2,3,<|im_start|>system\nYou are a banking intent c...,59,top_up_failed,1801,True,0.4421,2285,True,0.2211,6381,True,0.1105,14573,10.0,very_helpful,The classification ID 59 (top_up_failed) is pe...,"```json\n{\n ""score"": 10,\n ""label"": ""very_h..."
3,3,3,3,<|im_start|>system\nYou are a banking intent c...,54,supported_cards_and_currencies,1807,True,0.4436,2279,True,0.2218,6375,True,0.1109,14567,4.0,somewhat_helpful,The classification ID 54 (supported_cards_and_...,"```json\n{\n ""score"": 4,\n ""label"": ""somewha..."
4,4,4,3,<|im_start|>system\nYou are a banking intent c...,57,top_up_by_card_charge,1804,True,0.4429,2282,True,0.2214,6378,True,0.1107,14570,9.0,helpful,The classification ID 57 (top_up_by_card_charg...,"```json\n{\n ""score"": 9,\n ""label"": ""helpful..."


In [11]:
# Show the conversation-level row(s) whose conversation_index is 23.
conv_df.loc[conv_df["conversation_index"] == 23]

Unnamed: 0,conversation_index,conversation_id,num_messages,conversation_text_content,label,label_name,conversation_text_content__length__token_count,conversation_text_content__cost__fits_context_4k,conversation_text_content__cost__context_utilization_4k,conversation_text_content__cost__tokens_wasted_4k,conversation_text_content__cost__fits_context_8k,conversation_text_content__cost__context_utilization_8k,conversation_text_content__cost__tokens_wasted_8k,conversation_text_content__cost__fits_context_16k,conversation_text_content__cost__context_utilization_16k,conversation_text_content__cost__tokens_wasted_16k,conversation_text_content__helpfulness__score,conversation_text_content__helpfulness__label,conversation_text_content__helpfulness__reasoning,conversation_text_content__helpfulness__raw_response
23,23,23,3,<|im_start|>system\nYou are a banking intent c...,48,pending_transfer,1841,True,0.4519,2245,True,0.226,6341,True,0.113,14533,8.0,helpful,The classification ID 48 (pending_transfer) is...,"```json\n{\n ""score"": 8,\n ""label"": ""helpful..."


In [17]:
# Token count of the message at position 1 within conversation 23.
conversation_23_msgs = msg_df.loc[msg_df["conversation_index"] == 23]
conversation_23_msgs.iloc[1]["text_content__length__token_count"]


49

In [8]:
# Helpfulness score column of the conversation-level frame.
conv_df["conversation_text_content__helpfulness__score"]

0    10.0
1    10.0
2    10.0
3     4.0
4     9.0
5    10.0
6    10.0
7    10.0
8    10.0
9    10.0
Name: conversation_text_content__helpfulness__score, dtype: float64

In [9]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

conv_columns = analyzer.conversation_df.columns
row = analyzer.conversation_df.iloc[0]  # first conversation

# Choose the analyzer to inspect. It is selected by name rather than by
# position in `analyzer_names`, because that list's order and length depend on
# the loaded config.
analyzer_name = "length"

filtered_cols = filter_analyzer_columns(conv_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")

    # One entry per metric column: metric name, schema description, and the
    # value for the first conversation.
    for col in filtered_cols:
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        print(f"description: {schema[col]['description']}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


Analyzer: length
metric: token_count
description: Token count for conversation_text_content
value: 1806




# Message level

In [10]:
# Rebind msg_df to the message-level frame held by the analyzer object (an
# earlier cell bound this name to the frame loaded from the saved artifacts),
# then preview the first rows.
msg_df = analyzer.message_df
msg_df.head()

Unnamed: 0,conversation_index,conversation_id,message_index,message_id,role,text_content,text_content__length__token_count,text_content__cost__fits_context_4k,text_content__cost__context_utilization_4k,text_content__cost__tokens_wasted_4k,...,text_content__cost__context_utilization_16k,text_content__cost__tokens_wasted_16k,text_content__instruction_quality__score,text_content__instruction_quality__label,text_content__instruction_quality__reasoning,text_content__instruction_quality__raw_response,text_content__response_quality__score,text_content__response_quality__label,text_content__response_quality__reasoning,text_content__response_quality__raw_response
0,0,0,0,msg_0,system,You are a banking intent classifier. Classify ...,1751,True,0.4299,2335,...,0.1075,14623,8.0,good,"The instruction is clear and specific, with a ...","```json\n{\n ""score"": 8,\n ""label"": ""good"",\...",,,,
1,0,0,1,msg_1,user,"If I bought something I didn't like, can I get...",14,True,0.0059,4072,...,0.0015,16360,,,,,,,,
2,0,0,2,msg_2,assistant,52,1,True,0.0027,4085,...,0.0007,16373,,,,,10.0,excellent,The response is a valid single numeric ID (52)...,"```json\n{\n ""score"": 10,\n ""label"": ""excell..."
3,1,1,0,msg_0,system,You are a banking intent classifier. Classify ...,1751,True,0.4299,2335,...,0.1075,14623,8.0,good,"The instruction is clear and specific, with a ...","```json\n{\n ""score"": 8,\n ""label"": ""good"",\...",,,,
4,1,1,1,msg_1,user,"If I'm getting my identity verified, what all ...",13,True,0.0056,4073,...,0.0014,16361,,,,,,,,


In [11]:
# Response-quality score of every assistant message.
msg_df.loc[msg_df["role"] == "assistant", "text_content__response_quality__score"]

2     10.0
5     10.0
8     10.0
11    10.0
14    10.0
17    10.0
20    10.0
23    10.0
26    10.0
29    10.0
Name: text_content__response_quality__score, dtype: float64

In [12]:
# Instruction-quality score of the first system message.
msg_df.loc[msg_df["role"] == "system", "text_content__instruction_quality__score"].iloc[0]


8.0

In [13]:
# Instruction-quality label of the first system message.
msg_df.loc[msg_df["role"] == "system", "text_content__instruction_quality__label"].iloc[0]


'good'

In [14]:
# Instruction-quality reasoning text of the first system message.
msg_df.loc[msg_df["role"] == "system", "text_content__instruction_quality__reasoning"].iloc[0]


'The instruction is clear and specific, with a defined goal of classifying user queries into one of 77 banking intents. It uses action verbs and provides context through examples and intent descriptions. However, it could be improved by clarifying the range of IDs (0-76) instead of 0-77, as the highest ID listed is 76.'

In [15]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

msg_columns = analyzer.message_df.columns
row = analyzer.message_df.iloc[0]  # first message row

# Choose the analyzer to inspect. It is selected by name rather than by
# position in `analyzer_names`: that list's order and length depend on the
# loaded config, so a fixed index can silently point at a different analyzer.
analyzer_name = "instruction_quality"

filtered_cols = filter_analyzer_columns(msg_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")
    # The source column is taken from the first matching column.
    # NOTE(review): presumably all of this analyzer's columns share the same
    # source column - verify.
    info = parse_analyzer_column_name(filtered_cols[0])

    # Print the analyzed input once, tagged with the message role.
    print("\nInput:")
    print(f"[{row['role']}]: {row[info.source_column]}\n")

    # One entry per metric column: metric name, schema description, and the
    # value for this message.
    for col in filtered_cols:
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        print(f"description: {schema[col]['description']}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


Analyzer: instruction_quality

Input:
[system]: You are a banking intent classifier. Classify the user's query into one of  77 banking intents (output is a single integer ID).

IDs:

0: activate_my_card
1: age_limit
2: apple_pay_or_google_pay
3: atm_support
4: automatic_top_up
5: balance_not_updated_after_bank_transfer
6: balance_not_updated_after_cheque_or_cash_deposit
7: beneficiary_not_allowed
8: cancel_transfer
9: card_about_to_expire
10: card_acceptance
11: card_arrival
12: card_delivery_estimate
13: card_linking
14: card_not_working
15: card_payment_fee_charged
16: card_payment_not_recognised
17: card_payment_wrong_exchange_rate
18: card_swallowed
19: cash_withdrawal_charge
20: cash_withdrawal_not_recognised
21: change_pin
22: compromised_card
23: contactless_not_working
24: country_support
25: declined_card_payment
26: declined_cash_withdrawal
27: declined_transfer
28: direct_debit_payment_not_recognised
29: disposable_card_limits
30: edit_personal_details
31: exchange_charge
32