In [1]:
import os
import sys

# CRITICAL: Add local source to path FIRST to use local changes
sys.path.insert(0, "/Users/ryanarman/code/oumi/src")
print(f"Using local source code from: /Users/ryanarman/code/oumi/src")

# IMPORTANT: Set these BEFORE importing torch or any ML libraries
# Disable all GPU/MPS backends to prevent crashes with IFD analyzer
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable CUDA
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"  # Disable MPS memory allocation
os.environ["DISABLE_MPS_COMPAT"] = "1"  # Additional MPS disable flag
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"  # Disable HuggingFace telemetry
os.environ["TRANSFORMERS_OFFLINE"] = "0"  # Allow model downloads

# Force CPU usage in PyTorch to avoid MPS crashes
import torch

# Forcefully disable MPS before anything else
torch.set_default_device("cpu")
if hasattr(torch.backends, "mps"):
    # Monkey-patch to prevent MPS usage
    original_is_available = torch.backends.mps.is_available
    torch.backends.mps.is_available = lambda: False
    print("MPS has been disabled - forcing CPU-only mode")
else:
    print("Using CPU for all computations")

print(f"PyTorch device: {torch.get_default_device()}")
print(f"PyTorch version: {torch.__version__}")

Using local source code from: /Users/ryanarman/code/oumi/src
MPS has been disabled - forcing CPU-only mode
PyTorch device: cpu
PyTorch version: 2.6.0


In [2]:
import os
from oumi.core.configs import AnalyzeConfig
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer

# Locations of the analyzer YAML config and of the local JSONL dataset.
config_path = "/Users/ryanarman/code/oumi/configs/examples/analyze/analyze.yaml"
dataset_path = "/Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl"

# Build the config from YAML, then apply the notebook-specific overrides.
config = AnalyzeConfig.from_yaml(config_path=config_path)
config.sample_count = 10

# Point the config at the local file instead of a named dataset.
config.dataset_path = dataset_path
config.dataset_name = None  # cleared so that dataset_path is used instead
config.output_path = "./analysis_output/banking77"

# Analyzer ids to drop from the run. The list is empty, so every analyzer in
# the YAML config is kept. Swap in the commented line below to skip the heavy
# ones: IFD downloads a Qwen model and may cause MPS crashes, fasttext needs
# additional dependencies, and repr_diversity / question_diversity download
# embedding models.
problematic_analyzers = []
# problematic_analyzers = ["ifd", "fasttext", "repr_diversity", "question_diversity"]
config.analyzers = [
    spec for spec in config.analyzers if spec.id not in problematic_analyzers
]
print(
    f"Running {len(config.analyzers)} analyzers: "
    f"{[spec.id for spec in config.analyzers]}"
)

# Validate the finished configuration before constructing the analyzer.
config.finalize_and_validate()

analyzer = DatasetAnalyzer(config)

Running 18 analyzers: ['length', 'diversity', 'format', 'quality', 'training_quality', 'cost', 'content_pattern', 'question_diversity', 'fasttext', 'ifd', 'repr_diversity', 'task_category', 'safety', 'difficulty', 'instruct_reward', 'input_quality', 'conversation_structure', 'response_completeness']
[2025-12-29 17:10:11,863][oumi][rank0][pid:90380][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: TextSftJsonLinesDataset)... dataset_name: 'custom'
[2025-12-29 17:10:11,864][oumi.utils.analysis_utils][rank0][pid:90380][MainThread][INFO]][analysis_utils.py:225] Loaded text dataset from: /Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl
[2025-12-29 17:10:11,864][oumi][rank0][pid:90380][MainThread][INFO]][dataset_analyzer.py:154] Loaded dataset from config: None
[2025-12-29 17:10:11,987][oumi][rank0][pid:90380][MainThread][INFO]][dataset_analyzer.py:304] Initialized sample analyzer: length
[2025-12-29 17:10:11,988

In [3]:
# Execute every configured analyzer over the sampled conversations.
analyzer.analyze_dataset()

# The results are kept on the analyzer object; report the conversation count
# only when a (truthy) result object is present.
# NOTE(review): this reads a private attribute - prefer a public accessor if
# DatasetAnalyzer exposes one.
summary = analyzer._analysis_results
if summary:
    print(f"Total conversations analyzed: {summary.conversations_analyzed}")

[2025-12-29 17:10:12,024][oumi][rank0][pid:90380][MainThread][INFO]][dataset_analyzer.py:347] Starting analysis of dataset: None
[2025-12-29 17:10:12,025][oumi][rank0][pid:90380][MainThread][INFO]][dataset_analyzer.py:348] Using 18 sample analyzers: ['length', 'diversity', 'format', 'quality', 'training_quality', 'cost', 'content_pattern', 'question_diversity', 'fasttext', 'ifd', 'repr_diversity', 'task_category', 'safety', 'difficulty', 'instruct_reward', 'input_quality', 'conversation_structure', 'response_completeness']
[2025-12-29 17:10:12,025][oumi][rank0][pid:90380][MainThread][INFO]][dataset_analyzer.py:371] Analyzing 10 of 8002 conversations
[2025-12-29 17:10:12,026][oumi][rank0][pid:90380][MainThread][INFO]][dataset_analyzer.py:421] Converting conversation dataset with 8002 items
[2025-12-29 17:10:12,026][oumi][rank0][pid:90380][MainThread][INFO]][dataset_analyzer.py:428] Limiting analysis to first 10 items (dataset has 8002 total)


Converting Unknown Dataset to DataFrames: 100%|██████████| 10/10 [00:00<00:00, 2076.49item/s]


[2025-12-29 17:10:12,079][oumi][rank0][pid:90380][MainThread][INFO]][question_diversity_analyzer.py:463] Computing embeddings for 10 user questions...
[2025-12-29 17:10:12,080][oumi][rank0][pid:90380][MainThread][INFO]][question_diversity_analyzer.py:173] Loading embedding model: all-MiniLM-L6-v2


Computing embeddings: 100%|██████████| 10/10 [00:00<00:00, 75.83it/s]


[2025-12-29 17:10:13,238][oumi][rank0][pid:90380][MainThread][INFO]][question_diversity_analyzer.py:468] Clustering 10 questions using dbscan...
[2025-12-29 17:10:13,484][oumi][rank0][pid:90380][MainThread][INFO]][question_diversity_analyzer.py:486] Found 1 clusters
[2025-12-29 17:10:13,506][oumi][rank0][pid:90380][MainThread][INFO]][fasttext_analyzer.py:219] Initialized fast-langdetect for language detection
[2025-12-29 17:10:13,507][oumi][rank0][pid:90380][MainThread][INFO]][fasttext_analyzer.py:435] Analyzing language for column: conversation_text_content
[2025-12-29 17:10:13,597][oumi][rank0][pid:90380][MainThread][INFO]][ifd_analyzer.py:153] Loading model for IFD analysis: Qwen/Qwen3-0.6B
[2025-12-29 17:10:15,459][oumi][rank0][pid:90380][MainThread][INFO]][ifd_analyzer.py:192] Loaded Qwen/Qwen3-0.6B on cpu (dtype: torch.float32)
[2025-12-29 17:10:15,461][oumi][rank0][pid:90380][MainThread][INFO]][repr_diversity_analyzer.py:362] Computing diversity scores for 10 samples in column '

Computing embeddings: 100%|██████████| 10/10 [00:00<00:00, 103.07it/s]


[2025-12-29 17:10:16,504][oumi][rank0][pid:90380][MainThread][INFO]][repr_diversity_analyzer.py:229] Computing nearest neighbor distances for 10 samples (k=5)...
[2025-12-29 17:10:16,506][oumi][rank0][pid:90380][MainThread][INFO]][repr_diversity_analyzer.py:555] Column 'conversation_text_content': 10/10 samples (100.0%) are redundant
[2025-12-29 17:10:16,649][oumi][rank0][pid:90380][MainThread][INFO]][question_diversity_analyzer.py:463] Computing embeddings for 10 user questions...


Computing embeddings: 100%|██████████| 10/10 [00:00<00:00, 604.85it/s]


[2025-12-29 17:10:16,670][oumi][rank0][pid:90380][MainThread][INFO]][question_diversity_analyzer.py:468] Clustering 10 questions using dbscan...
[2025-12-29 17:10:16,673][oumi][rank0][pid:90380][MainThread][INFO]][question_diversity_analyzer.py:481] Found 0 clusters, 10 unique/diverse questions (not similar to others)
[2025-12-29 17:10:16,674][oumi][rank0][pid:90380][MainThread][INFO]][fasttext_analyzer.py:435] Analyzing language for column: text_content
[2025-12-29 17:10:16,681][oumi][rank0][pid:90380][MainThread][INFO]][ifd_analyzer.py:617] Detected conversation format. Computing IFD for assistant messages using preceding user messages as instructions.
[2025-12-29 17:10:19,506][oumi][rank0][pid:90380][MainThread][INFO]][ifd_analyzer.py:497] IFD analysis complete. Processed 10 assistant messages out of 10 total.
[2025-12-29 17:10:19,508][oumi][rank0][pid:90380][MainThread][INFO]][repr_diversity_analyzer.py:362] Computing diversity scores for 30 samples in column 'text_content'...


Computing embeddings: 100%|██████████| 30/30 [00:00<00:00, 136.86it/s]


[2025-12-29 17:10:19,729][oumi][rank0][pid:90380][MainThread][INFO]][repr_diversity_analyzer.py:229] Computing nearest neighbor distances for 30 samples (k=5)...
[2025-12-29 17:10:19,763][oumi][rank0][pid:90380][MainThread][INFO]][repr_diversity_analyzer.py:555] Column 'text_content': 19/30 samples (63.3%) are redundant


  sqr = _ensure_numeric((avg - values) ** 2)


Total conversations analyzed: 10


In [4]:
# Fetch the column schema and the merged analysis columns, then print both
# sizes side by side so they can be compared.
schema = analyzer.get_schema()
merged_columns = analyzer.analysis_df.columns

print(f"len(schema): {len(schema)}")
print(f"len(merged_columns): {len(merged_columns)}")

len(schema): 174
len(merged_columns): 174


In [5]:
# Preview the first five rows of the merged analysis frame.
analyzer.analysis_df.head(n=5)

Unnamed: 0,conversation_index,conversation_id,num_messages,conversation_text_content,conversation_text_content_length_token_count,conversation_text_content_diversity_unique_words_ratio,conversation_text_content_format_has_markdown,conversation_text_content_format_has_json,conversation_text_content_format_has_code_blocks,conversation_text_content_format_code_block_count,...,conversation_structure_conversation_depth,conversation_structure_role_balance,conversation_structure_has_system_prompt,conversation_structure_avg_turn_length,conversation_structure_turn_length_variance,text_content_response_completeness_is_complete,text_content_response_completeness_score,text_content_response_completeness_ends_naturally,text_content_response_completeness_has_conclusion,text_content_response_completeness_truncation_type
0,0,0,3,SYSTEM: You are a banking intent classifier. C...,1775,0.516556,True,False,False,0,...,1,0.5,True,6.5,30.25,,,,,
1,0,0,3,SYSTEM: You are a banking intent classifier. C...,1775,0.516556,True,False,False,0,...,1,0.5,True,6.5,30.25,,,,,
2,0,0,3,SYSTEM: You are a banking intent classifier. C...,1775,0.516556,True,False,False,0,...,1,0.5,True,6.5,30.25,False,0.5,False,False,
3,1,1,3,SYSTEM: You are a banking intent classifier. C...,1774,0.511602,True,False,False,0,...,1,0.5,True,6.0,25.0,,,,,
4,1,1,3,SYSTEM: You are a banking intent classifier. C...,1774,0.511602,True,False,False,0,...,1,0.5,True,6.0,25.0,,,,,


# Conversation level

In [6]:
# Inspect one conversation-level metric: its schema entry and its value for
# the first conversation.
conv_columns = analyzer.conversation_df.columns
row = analyzer.conversation_df.iloc[0]
# Select the column by name rather than by position (previously index 51), so
# a change in the analyzer set or column order cannot silently pick a
# different column. A missing column now raises KeyError on the lookups below.
col = "conversation_text_content_repr_diversity_percentile"
print(f"col: {col}")
print(f"schema[col]: {schema[col]}")
print(f"row[col]: {row[col]}")

col: conversation_text_content_repr_diversity_percentile
schema[col]: {'type': <ColumnType.FLOAT: 'float'>, 'content_type': <ContentType.NUMERIC: 'numeric'>, 'description': 'Diversity percentile rank (0.0-100.0)'}
row[col]: 100.0


In [10]:
# Print the position of every conversation-level column whose name contains
# "question".
for position, column_name in enumerate(conv_columns):
    if "question" in column_name:
        print(position)

In [11]:
# Display the conversation-level column index.
conv_columns

Index(['conversation_index', 'conversation_id', 'num_messages',
       'conversation_text_content',
       'conversation_text_content_length_token_count',
       'conversation_text_content_diversity_unique_words_ratio',
       'conversation_text_content_format_has_markdown',
       'conversation_text_content_format_has_json',
       'conversation_text_content_format_has_code_blocks',
       'conversation_text_content_format_code_block_count',
       'conversation_text_content_format_code_block_languages',
       'conversation_text_content_format_has_urls',
       'conversation_text_content_format_has_emails',
       'conversation_text_content_format_format_complexity_score',
       'conversation_text_content_quality_has_pii',
       'conversation_text_content_quality_pii_types',
       'conversation_text_content_quality_pii_count',
       'conversation_text_content_quality_has_encoding_issues',
       'conversation_text_content_quality_repetition_ratio',
       'conversation_text_conte

# Message level

In [13]:
# Column index of the message-level frame.
msg_columns = analyzer.message_df.columns

# Print the position of every message-level column whose name contains
# "question".
for position, column_name in enumerate(msg_columns):
    if "question" in column_name:
        print(position)


In [14]:
# Display the message-level column index.
msg_columns

Index(['conversation_index', 'conversation_id', 'message_index', 'message_id',
       'role', 'text_content', 'text_content_length_token_count',
       'text_content_diversity_unique_words_ratio',
       'text_content_format_has_markdown', 'text_content_format_has_json',
       'text_content_format_has_code_blocks',
       'text_content_format_code_block_count',
       'text_content_format_code_block_languages',
       'text_content_format_has_urls', 'text_content_format_has_emails',
       'text_content_format_format_complexity_score',
       'text_content_quality_has_pii', 'text_content_quality_pii_types',
       'text_content_quality_pii_count',
       'text_content_quality_has_encoding_issues',
       'text_content_quality_repetition_ratio',
       'text_content_quality_has_high_repetition',
       'text_content_training_quality_response_completeness_score',
       'text_content_training_quality_has_proper_ending',
       'text_content_training_quality_has_structure',
       'text_

In [9]:
# Inspect one message-level metric: its schema entry and its value for the
# second message (positional row 1).
row = analyzer.message_df.iloc[1]
# Select the column by name rather than by position (previously index 56), so
# a change in the analyzer set or column order cannot silently pick a
# different column. A missing column now raises KeyError on the lookups below.
col = "text_content_repr_diversity_is_redundant"
print(f"col: {col}")
print(f"schema[col]: {schema[col]}")
print(f"row[col]: {row[col]}")

col: text_content_repr_diversity_is_redundant
schema[col]: {'type': <ColumnType.BOOL: 'bool'>, 'content_type': <ContentType.BOOLEAN: 'boolean'>, 'description': 'Whether sample is redundant (too similar to others)'}
row[col]: False
