In [1]:
import os

# IMPORTANT: Set these BEFORE importing torch or any ML libraries
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable CUDA
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# Force CPU usage in PyTorch to avoid MPS crashes
import torch

torch.set_default_device("cpu")
if torch.backends.mps.is_available():
    print("MPS is available but forcing CPU usage to avoid crashes")
else:
    print("Using CPU for all computations")

print(f"PyTorch device: {torch.get_default_device()}")


MPS is available but forcing CPU usage to avoid crashes
PyTorch device: cpu


In [2]:
import os
from oumi.core.configs import AnalyzeConfig
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer

# Path to the config file
config_path = "/Users/ryanarman/code/oumi/configs/examples/analyze/analyze.yaml"

# Path to your dataset file
dataset_path = "/Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl"

# Load the config from YAML
config = AnalyzeConfig.from_yaml(
    config_path=config_path,
)

config.sample_count = 10

# Override the dataset settings to use your local file
config.dataset_path = dataset_path
config.dataset_name = None  # Clear dataset_name so it uses dataset_path instead

# Optionally update output path
config.output_path = "./analysis_output/banking77"

# IMPORTANT: Disable analyzers that require large model downloads or have issues
# IFD requires downloading Qwen model and may cause MPS crashes
# fasttext requires additional dependencies
# repr_diversity and question_diversity download embedding models
problematic_analyzers = ["ifd", "fasttext", "repr_diversity", "question_diversity"]
config.analyzers = [a for a in config.analyzers if a.id not in problematic_analyzers]
print(f"Running {len(config.analyzers)} analyzers: {[a.id for a in config.analyzers]}")

# Validate the configuration
config.finalize_and_validate()

# Create the analyzer
analyzer = DatasetAnalyzer(config)

Running 14 analyzers: ['length', 'diversity', 'format', 'quality', 'training_quality', 'cost', 'content_pattern', 'task_category', 'safety', 'difficulty', 'instruct_reward', 'input_quality', 'conversation_structure', 'response_completeness']
[2025-12-29 15:19:24,215][oumi][rank0][pid:71971][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: TextSftJsonLinesDataset)... dataset_name: 'custom'
[2025-12-29 15:19:24,216][oumi.utils.analysis_utils][rank0][pid:71971][MainThread][INFO]][analysis_utils.py:225] Loaded text dataset from: /Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl
[2025-12-29 15:19:24,217][oumi][rank0][pid:71971][MainThread][INFO]][dataset_analyzer.py:154] Loaded dataset from config: None
[2025-12-29 15:19:24,334][oumi][rank0][pid:71971][MainThread][INFO]][dataset_analyzer.py:304] Initialized sample analyzer: length
[2025-12-29 15:19:24,334][oumi][rank0][pid:71971][MainThread][INFO]][dataset_analyz

In [3]:
# Run the analysis
analyzer.analyze_dataset()

# The results are stored in analyzer object
if analyzer._analysis_results:
    print(
        f"Total conversations analyzed: {analyzer._analysis_results.conversations_analyzed}"
    )

[2025-12-29 15:19:24,350][oumi][rank0][pid:71971][MainThread][INFO]][dataset_analyzer.py:347] Starting analysis of dataset: None
[2025-12-29 15:19:24,351][oumi][rank0][pid:71971][MainThread][INFO]][dataset_analyzer.py:348] Using 14 sample analyzers: ['length', 'diversity', 'format', 'quality', 'training_quality', 'cost', 'content_pattern', 'task_category', 'safety', 'difficulty', 'instruct_reward', 'input_quality', 'conversation_structure', 'response_completeness']
[2025-12-29 15:19:24,351][oumi][rank0][pid:71971][MainThread][INFO]][dataset_analyzer.py:371] Analyzing 10 of 8002 conversations
[2025-12-29 15:19:24,352][oumi][rank0][pid:71971][MainThread][INFO]][dataset_analyzer.py:421] Converting conversation dataset with 8002 items
[2025-12-29 15:19:24,353][oumi][rank0][pid:71971][MainThread][INFO]][dataset_analyzer.py:428] Limiting analysis to first 10 items (dataset has 8002 total)


Converting Unknown Dataset to DataFrames: 100%|██████████| 10/10 [00:00<00:00, 1466.49item/s]


Total conversations analyzed: 10


In [4]:
config.analyzers

[SampleAnalyzerParams(id='length', params={'token_count': True}),
 SampleAnalyzerParams(id='diversity', params={'unique_words_ratio': True, 'case_sensitive': False}),
 SampleAnalyzerParams(id='format', params={'detect_markdown': True, 'detect_json': True, 'detect_code_blocks': True, 'detect_urls': True, 'detect_emails': True, 'compute_complexity': True}),
 SampleAnalyzerParams(id='quality', params={'detect_pii': True, 'detect_emails': True, 'detect_phones': True, 'detect_ssn': True, 'detect_credit_cards': True, 'detect_api_keys': True, 'detect_encoding_issues': True, 'detect_repetition': True, 'repetition_ngram_size': 3, 'repetition_threshold': 0.3}),
 SampleAnalyzerParams(id='training_quality', params={'compute_response_completeness': True, 'min_response_words': 5}),
 SampleAnalyzerParams(id='cost', params={'target_context_windows': [4096, 8192, 16384, 32768], 'compute_packing_efficiency': True, 'packing_overhead_tokens': 10}),
 SampleAnalyzerParams(id='content_pattern', params={'dete

In [5]:
schema = analyzer.get_schema()
conv_columns = analyzer.conversation_df.columns
merged_columns = analyzer.analysis_df.columns

In [6]:
schema

{'conversation_index': {'type': <ColumnType.INT: 'int'>,
  'content_type': <ContentType.IDENTIFIER: 'identifier'>,
  'description': 'Conversation index in dataset'},
 'conversation_id': {'type': <ColumnType.STRING: 'string'>,
  'content_type': <ContentType.IDENTIFIER: 'identifier'>,
  'description': 'Conversation identifier'},
 'num_messages': {'type': <ColumnType.INT: 'int'>,
  'content_type': <ContentType.NUMERIC: 'numeric'>,
  'description': 'Number of messages in conversation'},
 'conversation_text_content': {'type': <ColumnType.STRING: 'string'>,
  'content_type': <ContentType.TEXT: 'text'>,
  'description': 'Full conversation rendered as text'},
 'message_index': {'type': <ColumnType.INT: 'int'>,
  'content_type': <ContentType.IDENTIFIER: 'identifier'>,
  'description': 'Message index within conversation'},
 'message_id': {'type': <ColumnType.STRING: 'string'>,
  'content_type': <ContentType.IDENTIFIER: 'identifier'>,
  'description': 'Message identifier'},
 'role': {'type': <Col

In [8]:
row = analyzer.conversation_df.iloc[0]
col = conv_columns[50]
print(f"col: {col}")
print(f"schema[col]: {schema[col]}")
print(f"row[col]: {row[col]}")

col: conversation_text_content_difficulty_score


KeyError: 'conversation_text_content_difficulty_score'