In [1]:
import os
import sys


# IMPORTANT: Set these BEFORE importing torch or any ML libraries
# Disable all GPU/MPS backends to prevent crashes with IFD analyzer
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable CUDA
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"  # Disable MPS memory allocation
os.environ["DISABLE_MPS_COMPAT"] = "1"  # Additional MPS disable flag
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"  # Disable HuggingFace telemetry
os.environ["TRANSFORMERS_OFFLINE"] = "0"  # Allow model downloads

# Force CPU usage in PyTorch to avoid MPS crashes
import torch

# Forcefully disable MPS before anything else
torch.set_default_device("cpu")
if hasattr(torch.backends, "mps"):
    # Monkey-patch to prevent MPS usage
    original_is_available = torch.backends.mps.is_available
    torch.backends.mps.is_available = lambda: False
    print("MPS has been disabled - forcing CPU-only mode")
else:
    print("Using CPU for all computations")

print(f"PyTorch device: {torch.get_default_device()}")
print(f"PyTorch version: {torch.__version__}")

MPS has been disabled - forcing CPU-only mode
PyTorch device: cpu
PyTorch version: 2.6.0


In [2]:
OUTPUT_PATH = "/Users/ryanarman/code/oumi/analysis_output/ultrachat"

In [3]:
import os
from oumi.core.configs import AnalyzeConfig
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer

# Load config from YAML file
config = AnalyzeConfig.from_yaml(
    "/Users/ryanarman/code/oumi/configs/examples/analyze/analyze_ultrachat.yaml"
)

# Override settings for this run
# dataset_path = "/Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl"
# config.dataset_path = dataset_path
# config.dataset_name = None  # Clear dataset_name so it uses dataset_path instead
config.sample_count = 10
config.chat_template = "chat_ml"

# Set absolute output path (makes it easier to find the results!)
config.output_path = OUTPUT_PATH

print(f"‚úÖ Config loaded with {len(config.analyzers)} analyzers:")
for analyzer in config.analyzers:
    instance_id = analyzer.instance_id or analyzer.id
    print(f"  - {instance_id} (type: {analyzer.id})")

print(f"üìÅ Output will be saved to: {config.output_path}")

# Validate the configuration
config.finalize_and_validate()
print("‚úÖ Config validated successfully!")

analyzer = DatasetAnalyzer(config)

‚úÖ Config loaded with 25 analyzers:
  - length (type: length)
  - diversity (type: diversity)
  - format (type: format)
  - quality (type: quality)
  - content_pattern (type: content_pattern)
  - embedding (type: embedding)
  - question_diversity (type: question_diversity)
  - repr_diversity (type: repr_diversity)
  - conversation_structure (type: conversation_structure)
  - response_completeness (type: response_completeness)
  - training_quality (type: training_quality)
  - task_category (type: task_category)
  - safety (type: safety)
  - difficulty (type: difficulty)
  - input_quality (type: input_quality)
  - instruct_reward (type: instruct_reward)
  - cost (type: cost)
  - helpfulness (type: llm_judge)
  - instruction_quality (type: llm_judge)
  - response_quality (type: llm_judge)
  - excessive_politeness (type: llm_judge)
  - roleplay_bleeding (type: llm_judge)
  - reasoning_leakage (type: llm_judge)
  - style_homogenization (type: llm_judge)
  - repetitive_turns (type: llm_judg

In [4]:
# Run the analysis
analyzer.analyze_dataset()

# The results are stored in analyzer object
if analyzer._analysis_results:
    print(
        f"Total conversations analyzed: {analyzer._analysis_results.conversations_analyzed}"
    )

[2026-01-07 14:56:41,216][oumi][rank0][pid:6650][MainThread][INFO]][dataset_analyzer.py:363] Starting analysis of dataset: HuggingFaceH4/ultrachat_200k
[2026-01-07 14:56:41,217][oumi][rank0][pid:6650][MainThread][INFO]][dataset_analyzer.py:364] Using 25 sample analyzers: ['length', 'diversity', 'format', 'quality', 'content_pattern', 'embedding', 'question_diversity', 'repr_diversity', 'conversation_structure', 'response_completeness', 'training_quality', 'task_category', 'safety', 'difficulty', 'input_quality', 'instruct_reward', 'cost', 'helpfulness', 'instruction_quality', 'response_quality', 'excessive_politeness', 'roleplay_bleeding', 'reasoning_leakage', 'style_homogenization', 'repetitive_turns']
[2026-01-07 14:56:41,218][oumi][rank0][pid:6650][MainThread][INFO]][dataset_analyzer.py:387] Analyzing 10 of 100 conversations
[2026-01-07 14:56:41,218][oumi][rank0][pid:6650][MainThread][INFO]][dataset_analyzer.py:439] Converting conversation dataset with 100 items
[2026-01-07 14:56:41

Converting HuggingFaceH4/ultrachat_200k to DataFrames: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 566.57item/s]


[2026-01-07 14:56:41,265][oumi][rank0][pid:6650][MainThread][INFO]][embedding_analyzer.py:518] Computing embeddings for 10 samples...
[2026-01-07 14:56:41,266][oumi][rank0][pid:6650][MainThread][INFO]][embedding_analyzer.py:196] Loading embedding model: all-MiniLM-L6-v2


Computing embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 93.55it/s]


[2026-01-07 14:56:42,847][oumi][rank0][pid:6650][MainThread][INFO]][embedding_analyzer.py:537] Detecting semantic duplicates...
[2026-01-07 14:56:42,849][oumi][rank0][pid:6650][MainThread][INFO]][embedding_analyzer.py:624] Detecting fuzzy duplicates using MinHash LSH...
[2026-01-07 14:56:42,849][oumi][rank0][pid:6650][MainThread][INFO]][embedding_analyzer.py:350] Creating MinHash signatures for 10 samples...


Creating MinHash signatures: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 156.22it/s]


[2026-01-07 14:56:42,923][oumi][rank0][pid:6650][MainThread][INFO]][embedding_analyzer.py:369] Finding fuzzy duplicates using LSH...


Finding duplicates: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 46294.75it/s]


[2026-01-07 14:56:42,928][oumi][rank0][pid:6650][MainThread][INFO]][question_diversity_analyzer.py:464] Computing embeddings for 10 user questions...
[2026-01-07 14:56:42,928][oumi][rank0][pid:6650][MainThread][INFO]][question_diversity_analyzer.py:174] Loading embedding model: all-MiniLM-L6-v2


Computing embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 101.58it/s]


[2026-01-07 14:56:44,576][oumi][rank0][pid:6650][MainThread][INFO]][question_diversity_analyzer.py:469] Clustering 10 questions using dbscan...
[2026-01-07 14:56:44,772][oumi][rank0][pid:6650][MainThread][INFO]][question_diversity_analyzer.py:482] Found 0 clusters, 10 unique/diverse questions (not similar to others)
[2026-01-07 14:56:44,776][oumi][rank0][pid:6650][MainThread][INFO]][repr_diversity_analyzer.py:363] Computing diversity scores for 10 samples in column 'conversation_text_content'...
[2026-01-07 14:56:44,779][oumi][rank0][pid:6650][MainThread][INFO]][repr_diversity_analyzer.py:165] Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


Computing embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:00<00:00, 93.28it/s]


[2026-01-07 14:56:46,579][oumi][rank0][pid:6650][MainThread][INFO]][repr_diversity_analyzer.py:230] Computing nearest neighbor distances for 10 samples (k=5)...
[2026-01-07 14:56:46,580][oumi][rank0][pid:6650][MainThread][INFO]][repr_diversity_analyzer.py:556] Column 'conversation_text_content': 0/10 samples (0.0%) are redundant
[2026-01-07 14:56:46,726][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:05<00:00,  1.83it/s]


[2026-01-07 14:56:52,198][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:787] Skipping conversation-level analysis (analyze_conversation_level=False). Set analyze_conversation_level=True to enable.
[2026-01-07 14:56:52,199][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:787] Skipping conversation-level analysis (analyze_conversation_level=False). Set analyze_conversation_level=True to enable.
[2026-01-07 14:56:52,199][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:787] Skipping conversation-level analysis (analyze_conversation_level=False). Set analyze_conversation_level=True to enable.
[2026-01-07 14:56:52,200][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.56it/s]


[2026-01-07 14:56:55,019][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:787] Skipping conversation-level analysis (analyze_conversation_level=False). Set analyze_conversation_level=True to enable.
[2026-01-07 14:56:55,021][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  3.49it/s]


[2026-01-07 14:56:57,892][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 10/10 [00:02<00:00,  4.00it/s]


[2026-01-07 14:57:00,447][oumi][rank0][pid:6650][MainThread][INFO]][embedding_analyzer.py:518] Computing embeddings for 78 samples...


Computing embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 78/78 [00:00<00:00, 183.64it/s]


[2026-01-07 14:57:00,875][oumi][rank0][pid:6650][MainThread][INFO]][embedding_analyzer.py:537] Detecting semantic duplicates...
[2026-01-07 14:57:00,877][oumi][rank0][pid:6650][MainThread][INFO]][embedding_analyzer.py:624] Detecting fuzzy duplicates using MinHash LSH...
[2026-01-07 14:57:00,885][oumi][rank0][pid:6650][MainThread][INFO]][embedding_analyzer.py:350] Creating MinHash signatures for 78 samples...


Creating MinHash signatures: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 78/78 [00:00<00:00, 525.00it/s]


[2026-01-07 14:57:01,073][oumi][rank0][pid:6650][MainThread][INFO]][embedding_analyzer.py:369] Finding fuzzy duplicates using LSH...


Finding duplicates: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 78/78 [00:00<00:00, 23423.48it/s]


[2026-01-07 14:57:01,085][oumi][rank0][pid:6650][MainThread][INFO]][question_diversity_analyzer.py:464] Computing embeddings for 39 user questions...


Computing embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 39/39 [00:00<00:00, 169.98it/s]


[2026-01-07 14:57:01,319][oumi][rank0][pid:6650][MainThread][INFO]][question_diversity_analyzer.py:469] Clustering 39 questions using dbscan...
[2026-01-07 14:57:01,323][oumi][rank0][pid:6650][MainThread][INFO]][question_diversity_analyzer.py:482] Found 0 clusters, 39 unique/diverse questions (not similar to others)
[2026-01-07 14:57:01,324][oumi][rank0][pid:6650][MainThread][INFO]][repr_diversity_analyzer.py:363] Computing diversity scores for 78 samples in column 'text_content'...


Computing embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 78/78 [00:00<00:00, 178.39it/s]


[2026-01-07 14:57:01,765][oumi][rank0][pid:6650][MainThread][INFO]][repr_diversity_analyzer.py:230] Computing nearest neighbor distances for 78 samples (k=5)...
[2026-01-07 14:57:01,770][oumi][rank0][pid:6650][MainThread][INFO]][repr_diversity_analyzer.py:556] Column 'text_content': 8/78 samples (10.3%) are redundant
[2026-01-07 14:57:01,846][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:780] Skipping message-level analysis (analyze_message_level=False). Set analyze_message_level=True to enable.
[2026-01-07 14:57:01,848][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:828] Evaluating 39 'user' messages (filtered from 78 total)
[2026-01-07 14:57:01,853][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 39/39 [00:07<00:00,  5.00it/s]


[2026-01-07 14:57:09,664][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:828] Evaluating 39 'assistant' messages (filtered from 78 total)
[2026-01-07 14:57:09,666][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 39/39 [00:04<00:00,  8.13it/s]


[2026-01-07 14:57:14,480][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:828] Evaluating 39 'assistant' messages (filtered from 78 total)
[2026-01-07 14:57:14,482][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 39/39 [00:02<00:00, 14.25it/s]


[2026-01-07 14:57:17,233][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:780] Skipping message-level analysis (analyze_message_level=False). Set analyze_message_level=True to enable.
[2026-01-07 14:57:17,235][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:828] Evaluating 39 'assistant' messages (filtered from 78 total)
[2026-01-07 14:57:17,236][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 39/39 [00:02<00:00, 15.15it/s]


[2026-01-07 14:57:19,825][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:780] Skipping message-level analysis (analyze_message_level=False). Set analyze_message_level=True to enable.
[2026-01-07 14:57:19,826][oumi][rank0][pid:6650][MainThread][INFO]][llm_judge_analyzer.py:780] Skipping message-level analysis (analyze_message_level=False). Set analyze_message_level=True to enable.
Total conversations analyzed: 10


In [18]:
from pathlib import Path
import traceback
from oumi.utils.analysis_utils import save_analyzer_artifacts

# Save all analyzer artifacts (dataframes, schemas, summary)
save_analyzer_artifacts(analyzer, Path(config.output_path), output_format="parquet")


[2026-01-07 14:57:25,303][oumi.utils.analysis_utils][rank0][pid:6650][MainThread][INFO]][analysis_utils.py:1411] Saved message analysis to: /Users/ryanarman/code/oumi/analysis_output/ultrachat/messages_df.parquet
[2026-01-07 14:57:25,310][oumi.utils.analysis_utils][rank0][pid:6650][MainThread][INFO]][analysis_utils.py:1417] Saved conversation analysis to: /Users/ryanarman/code/oumi/analysis_output/ultrachat/conversations_df.parquet
[2026-01-07 14:57:25,326][oumi.utils.analysis_utils][rank0][pid:6650][MainThread][INFO]][analysis_utils.py:1423] Saved merged analysis to: /Users/ryanarman/code/oumi/analysis_output/ultrachat/merged_df.parquet
[2026-01-07 14:57:25,328][oumi.utils.analysis_utils][rank0][pid:6650][MainThread][INFO]][analysis_utils.py:1435] Saved message schema to: /Users/ryanarman/code/oumi/analysis_output/ultrachat/message_schema.json
[2026-01-07 14:57:25,329][oumi.utils.analysis_utils][rank0][pid:6650][MainThread][INFO]][analysis_utils.py:1442] Saved conversation schema to: 

# Load artifacts

In [20]:
OUTPUT_PATH = "/Users/ryanarman/code/oumi/analysis_output/ultrachat"
from oumi.utils.analysis_utils import (
    load_analyzer_artifacts,
    regenerate_recommendations,
)

artifacts = load_analyzer_artifacts(OUTPUT_PATH)

# Regenerate recommendations with latest code (e.g., updated duplicate detection)
artifacts = regenerate_recommendations(artifacts, outlier_threshold=3.0)

artifacts.keys()


[2026-01-07 14:57:48,637][oumi.utils.analysis_utils][rank0][pid:6650][MainThread][INFO]][analysis_utils.py:1536] Loaded message analysis from: /Users/ryanarman/code/oumi/analysis_output/ultrachat/messages_df
[2026-01-07 14:57:48,644][oumi.utils.analysis_utils][rank0][pid:6650][MainThread][INFO]][analysis_utils.py:1543] Loaded conversation analysis from: /Users/ryanarman/code/oumi/analysis_output/ultrachat/conversations_df
[2026-01-07 14:57:48,655][oumi.utils.analysis_utils][rank0][pid:6650][MainThread][INFO]][analysis_utils.py:1550] Loaded merged analysis from: /Users/ryanarman/code/oumi/analysis_output/ultrachat/merged_df
[2026-01-07 14:57:48,656][oumi.utils.analysis_utils][rank0][pid:6650][MainThread][INFO]][analysis_utils.py:1559] Loaded combined schemas from: /Users/ryanarman/code/oumi/analysis_output/ultrachat/schema.json
[2026-01-07 14:57:48,658][oumi.utils.analysis_utils][rank0][pid:6650][MainThread][INFO]][analysis_utils.py:1581] Loaded analysis summary from: /Users/ryanarman/c

dict_keys(['messages_df', 'conversations_df', 'merged_df', 'schemas', 'analysis_summary'])

In [21]:
# Generate HTML report if configured


try:
    from oumi.core.analyze.report_generator import HTMLReportGenerator

    report_gen = HTMLReportGenerator()
    report_path = report_gen.generate_report(
        artifacts=artifacts,
        output_path=OUTPUT_PATH,
        title="Ultrachat Analysis Report",
    )
    print(f"‚úÖ Generated HTML report at: {report_path / 'index.html'}")
except ImportError:
    print("‚ö†Ô∏è  Plotly not installed. Skipping HTML report generation.")
    print("   Install with: pip install 'oumi[analyze_advanced]'")
except Exception as e:
    print(f"‚ö†Ô∏è  Failed to generate HTML report: {e}")
    print("\nüîç FULL TRACEBACK:")
    print("=" * 70)
    traceback.print_exc()
    print("=" * 70)

print(f"\nüìÅ All results saved to: {OUTPUT_PATH}")


[2026-01-07 14:57:50,179][oumi][rank0][pid:6650][MainThread][INFO]][report_generator.py:263] Generated HTML report: /Users/ryanarman/code/oumi/analysis_output/ultrachat/index.html
[2026-01-07 14:57:50,180][oumi][rank0][pid:6650][MainThread][INFO]][report_generator.py:264] External data files written to: /Users/ryanarman/code/oumi/analysis_output/ultrachat/data
‚úÖ Generated HTML report at: /Users/ryanarman/code/oumi/analysis_output/ultrachat/index.html

üìÅ All results saved to: /Users/ryanarman/code/oumi/analysis_output/ultrachat


In [22]:
schema = analyzer.get_schema()
print(f"len(schema): {len(schema)}")
merged_columns = analyzer.analysis_df.columns
print(f"len(merged_columns): {len(merged_columns)}")

len(schema): 206
len(merged_columns): 206


In [23]:
analyzer_names = [a.instance_id for a in config.analyzers]
analyzer_names

['length',
 'diversity',
 'format',
 'quality',
 'content_pattern',
 'embedding',
 'question_diversity',
 'repr_diversity',
 'conversation_structure',
 'response_completeness',
 'training_quality',
 'task_category',
 'safety',
 'difficulty',
 'input_quality',
 'instruct_reward',
 'cost',
 'helpfulness',
 'instruction_quality',
 'response_quality',
 'excessive_politeness',
 'roleplay_bleeding',
 'reasoning_leakage',
 'style_homogenization',
 'repetitive_turns']

# Conv level

In [24]:
conv_df = artifacts["conversations_df"]
msg_df = artifacts["messages_df"]

In [25]:
conv_df = analyzer.conversation_df
conv_df.head()

Unnamed: 0,conversation_index,conversation_id,num_messages,conversation_text_content,conversation_text_content__length__token_count,conversation_text_content__diversity__unique_words_ratio,conversation_text_content__format__has_markdown,conversation_text_content__format__has_json,conversation_text_content__format__has_code_blocks,conversation_text_content__format__code_block_count,...,conversation_text_content__roleplay_bleeding__reasoning,conversation_text_content__roleplay_bleeding__raw_response,conversation_text_content__style_homogenization__score,conversation_text_content__style_homogenization__label,conversation_text_content__style_homogenization__reasoning,conversation_text_content__style_homogenization__raw_response,conversation_text_content__repetitive_turns__score,conversation_text_content__repetitive_turns__label,conversation_text_content__repetitive_turns__reasoning,conversation_text_content__repetitive_turns__raw_response
0,0,0,8,<|im_start|>user\nThese instructions apply to ...,729,0.423372,True,False,False,0,...,"The assistant provided straightforward, factua...","```json\n{\n ""score"": 0,\n ""label"": ""none"",\...",7.0,moderate,The conversation contains some repetitive stru...,"```json\n{\n ""score"": 7,\n ""label"": ""moderat...",7.0,moderate,"The conversation contains moderate repetition,...","```json\n{\n ""score"": 7,\n ""label"": ""moderat..."
1,1,1,12,<|im_start|>user\nWhich famous landmarks shoul...,1541,0.439195,True,False,False,0,...,The assistant provides informative and engagin...,"```json\n{\n ""score"": 0,\n ""label"": ""none"",\...",6.0,moderate,The conversation contains some repetitive stru...,"```json\n{\n ""score"": 6,\n ""label"": ""moderat...",6.0,moderate,"The conversation exhibits some repetition, par...","```json\n{\n ""score"": 6,\n ""label"": ""moderat..."
2,2,2,8,<|im_start|>user\nWrite a comprehensive blog p...,3007,0.293608,True,False,False,0,...,The assistant provided a natural and informati...,"```json\n{\n ""score"": 0,\n ""label"": ""none"",\...",8.0,severe,The conversation exhibits a high degree of hom...,"```json\n{\n ""score"": 8,\n ""label"": ""severe""...",6.0,moderate,"The conversation exhibits some repetition, par...","```json\n{\n ""score"": 6,\n ""label"": ""moderat..."
3,3,3,4,"<|im_start|>user\nDe Le√≥n, previewing the spee...",425,0.531056,True,False,False,0,...,The assistant provided a straightforward and i...,"```json\n{\n ""score"": 0,\n ""label"": ""none"",\...",7.0,moderate,The conversation contains some generic phrasin...,"```json\n{\n ""score"": 7,\n ""label"": ""moderat...",4.0,minimal,While there is some repetition in discussing S...,"```json\n{\n ""score"": 4,\n ""label"": ""minimal..."
4,4,4,8,<|im_start|>user\nWrite an essay that evaluate...,2195,0.336683,True,False,False,0,...,The assistant provides natural responses and p...,"```json\n{\n ""score"": 0,\n ""label"": ""none"",\...",8.0,severe,The conversation exhibits a high degree of hom...,"```json\n{\n ""score"": 8,\n ""label"": ""severe""...",5.0,moderate,"The conversation exhibits some repetition, par...","```json\n{\n ""score"": 5,\n ""label"": ""moderat..."


In [26]:
conv_df.conversation_text_content__helpfulness__score

0    8.0
1    9.0
2    8.0
3    9.0
4    8.0
5    9.0
6    9.0
7    9.0
8    9.0
9    9.0
Name: conversation_text_content__helpfulness__score, dtype: float64

In [27]:
conv_df.conversation_text_content__helpfulness__reasoning

0    The conversation provides clear and relevant i...
1    The conversation provides detailed and relevan...
2    The conversation provides a comprehensive over...
3    The conversation provides relevant information...
4    The conversation provides a thorough evaluatio...
5    The conversation provides a detailed and relev...
6    The conversation provides a clear and structur...
7    The conversation provides clear, detailed, and...
8    The conversation provides detailed information...
9    The conversation provides detailed instruction...
Name: conversation_text_content__helpfulness__reasoning, dtype: object

In [14]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

conv_columns = analyzer.conversation_df.columns
row = analyzer.conversation_df.iloc[0]

# Choose the analzyer to analyze
analyzer_name = analyzer_names[0]


filtered_cols = filter_analyzer_columns(conv_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")
    info = parse_analyzer_column_name(filtered_cols[0])
    # print("\nInput:")
    # print(f"source_column: {info.source_column}")
    # print(f"{row[info.source_column]}\n")

    for col in filtered_cols:
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        # print(f"type: {schema[col]['type']}")
        # print(f"content_type: {schema[col]['content_type']}")
        print(f"description: {schema[col]['description']}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


Analyzer: length
metric: token_count
description: Token count for conversation_text_content
value: 729




# Message level

In [15]:
msg_df = analyzer.message_df
msg_df.head()

Unnamed: 0,conversation_index,conversation_id,message_index,message_id,role,text_content,text_content__length__token_count,text_content__diversity__unique_words_ratio,text_content__format__has_markdown,text_content__format__has_json,...,text_content__response_quality__reasoning,text_content__response_quality__raw_response,text_content__excessive_politeness__score,text_content__excessive_politeness__label,text_content__excessive_politeness__reasoning,text_content__excessive_politeness__raw_response,text_content__reasoning_leakage__score,text_content__reasoning_leakage__label,text_content__reasoning_leakage__reasoning,text_content__reasoning_leakage__raw_response
0,0,0,0,msg_0,user,These instructions apply to section-based them...,124,0.73913,False,False,...,,,,,,,,,,
1,0,0,1,msg_1,assistant,This feature only applies to Collection pages ...,22,0.95,False,False,...,The response is somewhat relevant but lacks co...,"```json\n{\n ""score"": 6,\n ""label"": ""needs_i...",2.0,minimal,"The response is direct and factual, with no si...","```json\n{\n ""score"": 2,\n ""label"": ""minimal...",2.0,minimal,The response is mostly direct and provides a c...,"```json\n{\n ""score"": 2,\n ""label"": ""minimal..."
2,0,0,2,msg_2,user,Can you guide me through the process of enabli...,23,0.954545,False,False,...,,,,,,,,,,
3,0,0,3,msg_3,assistant,"Sure, here are the steps to enable the seconda...",184,0.64,True,False,...,"The response is helpful and relevant, providin...","```json\n{\n ""score"": 9,\n ""label"": ""good"",\...",2.0,minimal,The response is direct and provides clear step...,"```json\n{\n ""score"": 2,\n ""label"": ""minimal...",5.0,moderate,The response provides a detailed step-by-step ...,"```json\n{\n ""score"": 5,\n ""label"": ""moderat..."
4,0,0,4,msg_4,user,Can you provide me with a link to the document...,14,1.0,False,False,...,,,,,,,,,,


In [16]:
msg_df[msg_df.role == "assistant"].text_content__response_quality__score

1     6.0
3     9.0
5     8.0
7     8.0
9     9.0
11    9.0
13    9.0
15    9.0
17    8.0
19    8.0
21    8.0
23    8.0
25    8.0
27    9.0
29    7.0
31    9.0
33    9.0
35    9.0
37    9.0
39    9.0
41    8.0
43    9.0
45    6.0
47    6.0
49    7.0
51    9.0
53    9.0
55    9.0
57    9.0
59    9.0
61    8.0
63    9.0
65    9.0
67    9.0
69    9.0
71    9.0
73    9.0
75    8.0
77    9.0
Name: text_content__response_quality__score, dtype: float64

In [17]:
msg_df[msg_df.role == "system"].text_content__instruction_quality__score.iloc[0]


IndexError: single positional indexer is out-of-bounds

In [None]:
msg_df[msg_df.role == "system"].text_content__instruction_quality__label.iloc[0]


'good'

In [None]:
msg_df[msg_df.role == "system"].text_content__instruction_quality__reasoning.iloc[0]


'The instruction is clear and specific, with a defined goal of classifying user queries into one of 77 banking intents. It uses action verbs and provides context through examples and intent descriptions. However, it could be improved by clarifying the range of IDs (0-76) instead of 0-77, as the highest ID listed is 76.'

In [None]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

msg_columns = analyzer.message_df.columns
row = analyzer.message_df.iloc[0]

# Choose the analzyer to analyze
analyzer_name = analyzer_names[3]


filtered_cols = filter_analyzer_columns(msg_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")
    info = parse_analyzer_column_name(filtered_cols[0])

    print("\nInput:")
    print(f"[{row['role']}]: {row[info.source_column]}\n")
    # print(f"source_column: {info.source_column}")
    print(f"{row[info.source_column]}\n")

    for col in filtered_cols:
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        # print(f"type: {schema[col]['type']}")
        # print(f"content_type: {schema[col]['content_type']}")
        print(f"description: {schema[col]['description']}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


Analyzer: instruction_quality

Input:
[system]: You are a banking intent classifier. Classify the user's query into one of  77 banking intents (output is a single integer ID).

IDs:

0: activate_my_card
1: age_limit
2: apple_pay_or_google_pay
3: atm_support
4: automatic_top_up
5: balance_not_updated_after_bank_transfer
6: balance_not_updated_after_cheque_or_cash_deposit
7: beneficiary_not_allowed
8: cancel_transfer
9: card_about_to_expire
10: card_acceptance
11: card_arrival
12: card_delivery_estimate
13: card_linking
14: card_not_working
15: card_payment_fee_charged
16: card_payment_not_recognised
17: card_payment_wrong_exchange_rate
18: card_swallowed
19: cash_withdrawal_charge
20: cash_withdrawal_not_recognised
21: change_pin
22: compromised_card
23: contactless_not_working
24: country_support
25: declined_card_payment
26: declined_cash_withdrawal
27: declined_transfer
28: direct_debit_payment_not_recognised
29: disposable_card_limits
30: edit_personal_details
31: exchange_charge
32