In [1]:
import os
import sys


# IMPORTANT: these environment variables must be set BEFORE torch (or any other
# ML library) is imported, which is why `import torch` only appears below them.
# Goal: disable all GPU/MPS backends to prevent crashes with the IFD analyzer.
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Turn off Hugging Face tokenizers parallelism
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Hide every CUDA device from this process
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"  # Let ops unsupported on MPS fall back to CPU
# NOTE(review): the PyTorch MPS docs describe 0.0 as *removing* the allocator's
# high-watermark limit rather than disabling MPS allocation -- confirm intent.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
# NOTE(review): this does not look like a documented PyTorch variable -- verify
# that something actually reads it.
os.environ["DISABLE_MPS_COMPAT"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"  # Disable HuggingFace telemetry
os.environ["TRANSFORMERS_OFFLINE"] = "0"  # Allow model downloads

# Force CPU usage in PyTorch to avoid MPS crashes
import torch

# Make CPU the default device for newly created tensors
torch.set_default_device("cpu")
if hasattr(torch.backends, "mps"):
    # Monkey-patch the availability check so code that asks
    # torch.backends.mps.is_available() is told MPS is absent.
    # The original function is kept in `original_is_available`; nothing in this
    # cell restores it.
    original_is_available = torch.backends.mps.is_available
    torch.backends.mps.is_available = lambda: False
    print("MPS has been disabled - forcing CPU-only mode")
else:
    # This torch build exposes no MPS backend module, so there is nothing to patch
    print("Using CPU for all computations")

# Echo the effective configuration so the cell output records it
print(f"PyTorch device: {torch.get_default_device()}")
print(f"PyTorch version: {torch.__version__}")


MPS has been disabled - forcing CPU-only mode
PyTorch device: cpu
PyTorch version: 2.6.0


In [3]:
# Download and convert OpenAssistant dataset to JSONL format
import json
from pathlib import Path
from datasets import load_dataset

# Where the converted JSONL copy of the dataset is cached on disk.
# NOTE(review): absolute, machine-specific path.
DATA_DIR = Path("/Users/ryanarman/code/oumi/data/openassistant")
DATA_DIR.mkdir(parents=True, exist_ok=True)
DATASET_PATH = DATA_DIR.joinpath("openassistant_oasst1.jsonl")

# Only download and convert when the cached JSONL file is not there yet
if not DATASET_PATH.exists():
    print("üì• Downloading OpenAssistant OASST1 dataset...")
    dataset = load_dataset("OpenAssistant/oasst1", split="train")

    # Report a size only when the loaded object is a `Dataset`;
    # any other return type is reported as "unknown".
    from datasets import Dataset

    dataset_len = len(dataset) if isinstance(dataset, Dataset) else "unknown"

    print(f"‚úÖ Downloaded dataset (size: {dataset_len})")
    print(f"üìù Converting to JSONL format...")

    def build_conversations_from_messages(all_messages):
        """Build one conversation per root message of an OASST1-style message tree.

        Each message is a mapping read through ``.get`` for the keys
        ``message_id``, ``parent_id``, ``role`` and ``text``.

        A message is a root when it has no ``parent_id`` or when its parent is
        not among the given messages. Starting from each root, the message and
        all of its descendants are emitted depth-first (pre-order), children in
        input order. Sibling branches are therefore concatenated into the same
        conversation rather than split into separate threads.

        Args:
            all_messages: Iterable of message mappings (may be a generator).

        Returns:
            A list of ``{"messages": [{"role": ..., "content": ...}, ...]}``
            dicts, one per root that produced at least one non-empty message.
            ``role`` is ``"user"`` for ``"prompter"`` messages and
            ``"assistant"`` for everything else.
        """
        # Materialize once: the input is scanned more than once below, which
        # would silently yield nothing for a one-shot iterator.
        all_messages = list(all_messages)

        # message_id -> message (a later duplicate id replaces the earlier one)
        msg_map = {}
        for msg in all_messages:
            msg_id = msg.get("message_id")
            if msg_id:
                msg_map[msg_id] = msg

        # parent_id -> children, built once so each lookup is O(1) instead of a
        # full scan of msg_map per visited message.
        children_of = {}
        for msg in msg_map.values():
            parent_id = msg.get("parent_id")
            if parent_id:
                children_of.setdefault(parent_id, []).append(msg)

        # Roots: no parent_id, or a parent that is not part of this message set
        roots = []
        for msg in all_messages:
            parent_id = msg.get("parent_id")
            if not parent_id or parent_id not in msg_map:
                roots.append(msg)

        conversations = []
        for root in roots:
            messages = []
            visited = set()  # object ids; guards against loops from duplicate ids
            stack = [root]  # explicit stack: deep chains cannot hit the recursion limit
            while stack:
                msg = stack.pop()
                if id(msg) in visited:
                    continue
                visited.add(id(msg))

                content = msg.get("text", "")
                if content:  # Only add non-empty messages
                    role = "user" if msg.get("role") == "prompter" else "assistant"
                    messages.append({"role": role, "content": content})

                # A message without an id cannot be anyone's parent. Pushing the
                # children reversed makes them pop in their original order.
                msg_id = msg.get("message_id")
                if msg_id:
                    stack.extend(reversed(children_of.get(msg_id, ())))

            if messages:  # Only add conversations with at least one message
                conversations.append({"messages": messages})

        return conversations

    def convert_example(example):
        """Turn one dataset row into a ``{"messages": [...]}`` record, or ``None``.

        The row is handled by the first shape that matches:

        1. ``"messages"`` holds a list: the list is passed through unchanged.
        2. ``"message_tree"`` is present: the tree is expanded with
           ``build_conversations_from_messages`` and the first resulting
           conversation is returned (``None`` if there is none).
        3. ``"text"`` is present: the row becomes a one-message conversation,
           with role ``"user"`` for ``"prompter"`` rows and ``"assistant"``
           otherwise.

        Rows matching none of these yield ``None``.
        """
        # Shape 1: already a conversation
        has_prebuilt = "messages" in example and isinstance(example["messages"], list)
        if has_prebuilt:
            return {"messages": example["messages"]}

        # Shape 2: whole message tree stored on the row
        if "message_tree" in example:
            threads = build_conversations_from_messages(example["message_tree"])
            if not threads:
                return None
            return threads[0]

        # Nothing usable on this row
        if "text" not in example:
            return None

        # Shape 3: a single message
        speaker = "assistant"
        if example.get("role") == "prompter":
            speaker = "user"
        return {"messages": [{"role": speaker, "content": example["text"]}]}

    # Convert and save
    # First, collect all messages to build conversation trees
    # OASST1 structure: each example might be a single message or a conversation
    print("üìä Inspecting dataset structure...")
    first_example = next(iter(dataset))
    print(f"   Example keys: {list(first_example.keys())[:10]}")  # Show first 10 keys

    count = 0
    with open(DATASET_PATH, "w") as f:
        for example in dataset:
            conversation = convert_example(example)
            if conversation and conversation.get("messages"):
                f.write(json.dumps(conversation) + "\n")
                count += 1

    print(f"‚úÖ Saved {count} conversations to {DATASET_PATH}")
else:
    print(f"‚úÖ Dataset already exists at {DATASET_PATH}")

print(f"üìÅ Dataset path: {DATASET_PATH}")


‚úÖ Dataset already exists at /Users/ryanarman/code/oumi/data/openassistant/openassistant_oasst1.jsonl
üìÅ Dataset path: /Users/ryanarman/code/oumi/data/openassistant/openassistant_oasst1.jsonl


In [4]:
# Directory assigned to config.output_path below; analysis artifacts are saved there.
# NOTE(review): absolute, machine-specific path.
OUTPUT_PATH = "/Users/ryanarman/code/oumi/analysis_output/openassistant"

import os
from oumi.core.configs import AnalyzeConfig
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer

# Load the analyzer configuration from its YAML file
config = AnalyzeConfig.from_yaml(
    "/Users/ryanarman/code/oumi/configs/examples/analyze/analyze_openassistant.yaml"
)

# Override settings for this run
dataset_path = str(DATASET_PATH)
config.dataset_path = dataset_path
config.dataset_name = None  # Clear dataset_name so it uses dataset_path instead
config.sample_count = 50  # Analyze only a small sample of the dataset
config.chat_template = "default"  # Simple template without special variables

# Set absolute output path (makes it easier to find the results!)
config.output_path = OUTPUT_PATH

print(f"‚úÖ Config loaded with {len(config.analyzers)} analyzers:")
# The loop variable is deliberately NOT called `analyzer`: that name is bound to
# the DatasetAnalyzer at the end of this cell. Reusing it here would leave
# `analyzer` pointing at a config entry if validation or construction fails.
for analyzer_cfg in config.analyzers:
    instance_id = analyzer_cfg.instance_id or analyzer_cfg.id
    print(f"  - {instance_id} (type: {analyzer_cfg.id})")

print(f"üìÅ Output will be saved to: {config.output_path}")
print(f"üìÇ Dataset path: {config.dataset_path}")

# Validate the configuration before building the analyzer
config.finalize_and_validate()
print("‚úÖ Config validated successfully!")

analyzer = DatasetAnalyzer(config)


‚úÖ Config loaded with 19 analyzers:
  - length (type: length)
  - token_stats (type: token_stats)
  - diversity (type: diversity)
  - format (type: format)
  - quality (type: quality)
  - content_pattern (type: content_pattern)
  - embedding (type: embedding)
  - question_diversity (type: question_diversity)
  - repr_diversity (type: repr_diversity)
  - conversation_structure (type: conversation_structure)
  - response_completeness (type: response_completeness)
  - training_quality (type: training_quality)
  - task_category (type: task_category)
  - safety (type: safety)
  - difficulty (type: difficulty)
  - input_quality (type: input_quality)
  - instruct_reward (type: instruct_reward)
  - cost (type: cost)
  - helpfulness (type: llm_judge)
üìÅ Output will be saved to: /Users/ryanarman/code/oumi/analysis_output/openassistant
üìÇ Dataset path: /Users/ryanarman/code/oumi/data/openassistant/openassistant_oasst1.jsonl
‚úÖ Config validated successfully!
[2026-01-08 16:52:18,502][oumi][r

In [5]:
# Execute the configured analyzers over the dataset
analyzer.analyze_dataset()

# Results are kept on the analyzer object; report the count only when present
analysis_results = analyzer._analysis_results
if analysis_results:
    analyzed_total = analysis_results.conversations_analyzed
    print(f"Total conversations analyzed: {analyzed_total}")


[2026-01-08 16:52:21,057][oumi][rank0][pid:62985][MainThread][INFO]][dataset_analyzer.py:363] Starting analysis of dataset: None
[2026-01-08 16:52:21,059][oumi][rank0][pid:62985][MainThread][INFO]][dataset_analyzer.py:364] Using 19 sample analyzers: ['length', 'token_stats', 'diversity', 'format', 'quality', 'content_pattern', 'embedding', 'question_diversity', 'repr_diversity', 'conversation_structure', 'response_completeness', 'training_quality', 'task_category', 'safety', 'difficulty', 'input_quality', 'instruct_reward', 'cost', 'helpfulness']
[2026-01-08 16:52:21,060][oumi][rank0][pid:62985][MainThread][INFO]][dataset_analyzer.py:387] Analyzing 50 of 84437 conversations
[2026-01-08 16:52:21,060][oumi][rank0][pid:62985][MainThread][INFO]][dataset_analyzer.py:439] Converting conversation dataset with 84437 items
[2026-01-08 16:52:21,061][oumi][rank0][pid:62985][MainThread][INFO]][dataset_analyzer.py:446] Limiting analysis to first 50 items (dataset has 84437 total)


Converting Unknown Dataset to DataFrames: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 678.11item/s]


[2026-01-08 16:52:21,172][oumi][rank0][pid:62985][MainThread][INFO]][embedding_analyzer.py:518] Computing embeddings for 50 samples...
[2026-01-08 16:52:21,173][oumi][rank0][pid:62985][MainThread][INFO]][embedding_analyzer.py:196] Loading embedding model: all-MiniLM-L6-v2


Computing embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 167.25it/s]


[2026-01-08 16:52:22,434][oumi][rank0][pid:62985][MainThread][INFO]][embedding_analyzer.py:537] Detecting semantic duplicates...
[2026-01-08 16:52:22,438][oumi][rank0][pid:62985][MainThread][INFO]][embedding_analyzer.py:624] Detecting fuzzy duplicates using MinHash LSH...
[2026-01-08 16:52:22,447][oumi][rank0][pid:62985][MainThread][INFO]][embedding_analyzer.py:350] Creating MinHash signatures for 50 samples...


Creating MinHash signatures: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 552.26it/s]


[2026-01-08 16:52:22,565][oumi][rank0][pid:62985][MainThread][INFO]][embedding_analyzer.py:369] Finding fuzzy duplicates using LSH...


Finding duplicates: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 20392.38it/s]


[2026-01-08 16:52:22,573][oumi][rank0][pid:62985][MainThread][INFO]][question_diversity_analyzer.py:464] Computing embeddings for 50 user questions...
[2026-01-08 16:52:22,574][oumi][rank0][pid:62985][MainThread][INFO]][question_diversity_analyzer.py:174] Loading embedding model: all-MiniLM-L6-v2


Computing embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 125.01it/s]


[2026-01-08 16:52:23,892][oumi][rank0][pid:62985][MainThread][INFO]][question_diversity_analyzer.py:469] Clustering 50 questions using dbscan...
[2026-01-08 16:52:24,167][oumi][rank0][pid:62985][MainThread][INFO]][question_diversity_analyzer.py:482] Found 0 clusters, 50 unique/diverse questions (not similar to others)
[2026-01-08 16:52:24,175][oumi][rank0][pid:62985][MainThread][INFO]][repr_diversity_analyzer.py:363] Computing diversity scores for 50 samples in column 'conversation_text_content'...
[2026-01-08 16:52:24,183][oumi][rank0][pid:62985][MainThread][INFO]][repr_diversity_analyzer.py:165] Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


Computing embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 166.73it/s]


[2026-01-08 16:52:25,531][oumi][rank0][pid:62985][MainThread][INFO]][repr_diversity_analyzer.py:230] Computing nearest neighbor distances for 50 samples (k=5)...
[2026-01-08 16:52:25,534][oumi][rank0][pid:62985][MainThread][INFO]][repr_diversity_analyzer.py:556] Column 'conversation_text_content': 11/50 samples (22.0%) are redundant
[2026-01-08 16:52:25,615][oumi][rank0][pid:62985][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:04<00:00, 11.15it/s]


[2026-01-08 16:52:30,158][oumi][rank0][pid:62985][MainThread][INFO]][embedding_analyzer.py:518] Computing embeddings for 50 samples...


Computing embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 188.34it/s]


[2026-01-08 16:52:30,427][oumi][rank0][pid:62985][MainThread][INFO]][embedding_analyzer.py:537] Detecting semantic duplicates...
[2026-01-08 16:52:30,429][oumi][rank0][pid:62985][MainThread][INFO]][embedding_analyzer.py:624] Detecting fuzzy duplicates using MinHash LSH...
[2026-01-08 16:52:30,433][oumi][rank0][pid:62985][MainThread][INFO]][embedding_analyzer.py:350] Creating MinHash signatures for 50 samples...


Creating MinHash signatures: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 651.13it/s]


[2026-01-08 16:52:30,543][oumi][rank0][pid:62985][MainThread][INFO]][embedding_analyzer.py:369] Finding fuzzy duplicates using LSH...


Finding duplicates: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 27084.49it/s]


[2026-01-08 16:52:30,551][oumi][rank0][pid:62985][MainThread][INFO]][question_diversity_analyzer.py:464] Computing embeddings for 20 user questions...


Computing embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [00:00<00:00, 120.18it/s]


[2026-01-08 16:52:30,723][oumi][rank0][pid:62985][MainThread][INFO]][question_diversity_analyzer.py:469] Clustering 20 questions using dbscan...
[2026-01-08 16:52:30,727][oumi][rank0][pid:62985][MainThread][INFO]][question_diversity_analyzer.py:482] Found 0 clusters, 20 unique/diverse questions (not similar to others)
[2026-01-08 16:52:30,729][oumi][rank0][pid:62985][MainThread][INFO]][repr_diversity_analyzer.py:363] Computing diversity scores for 50 samples in column 'text_content'...


Computing embeddings: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 50/50 [00:00<00:00, 168.05it/s]


[2026-01-08 16:52:31,030][oumi][rank0][pid:62985][MainThread][INFO]][repr_diversity_analyzer.py:230] Computing nearest neighbor distances for 50 samples (k=5)...
[2026-01-08 16:52:31,034][oumi][rank0][pid:62985][MainThread][INFO]][repr_diversity_analyzer.py:556] Column 'text_content': 11/50 samples (22.0%) are redundant
[2026-01-08 16:52:31,075][oumi][rank0][pid:62985][MainThread][INFO]][llm_judge_analyzer.py:780] Skipping message-level analysis (analyze_message_level=False). Set analyze_message_level=True to enable.


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Total conversations analyzed: 50


In [6]:
from pathlib import Path
import traceback
from oumi.utils.analysis_utils import save_analyzer_artifacts

# Persist the analyzer artifacts (dataframes, schemas, summary) as Parquet
# under the output directory configured above.
artifacts_dir = Path(config.output_path)
save_analyzer_artifacts(analyzer, artifacts_dir, output_format="parquet")


[2026-01-08 16:52:33,494][oumi.utils.analysis_utils][rank0][pid:62985][MainThread][INFO]][analysis_utils.py:1426] Saved message analysis to: /Users/ryanarman/code/oumi/analysis_output/openassistant/messages_df.parquet
[2026-01-08 16:52:33,501][oumi.utils.analysis_utils][rank0][pid:62985][MainThread][INFO]][analysis_utils.py:1432] Saved conversation analysis to: /Users/ryanarman/code/oumi/analysis_output/openassistant/conversations_df.parquet
[2026-01-08 16:52:33,512][oumi.utils.analysis_utils][rank0][pid:62985][MainThread][INFO]][analysis_utils.py:1438] Saved merged analysis to: /Users/ryanarman/code/oumi/analysis_output/openassistant/merged_df.parquet
[2026-01-08 16:52:33,514][oumi.utils.analysis_utils][rank0][pid:62985][MainThread][INFO]][analysis_utils.py:1450] Saved message schema to: /Users/ryanarman/code/oumi/analysis_output/openassistant/message_schema.json
[2026-01-08 16:52:33,515][oumi.utils.analysis_utils][rank0][pid:62985][MainThread][INFO]][analysis_utils.py:1457] Saved con

# Load artifacts


In [7]:
# Directory the saved artifacts are loaded from; also passed as the HTML report's output_path.
# NOTE(review): absolute, machine-specific path.
OUTPUT_PATH = "/Users/ryanarman/code/oumi/analysis_output/openassistant"

from oumi.utils.analysis_utils import (
    load_analyzer_artifacts,
    regenerate_recommendations,
)

# Imported here so the error handler below works even when no earlier cell has
# imported it (otherwise a report failure would surface as a NameError).
import traceback

# Load the artifacts previously written to OUTPUT_PATH
artifacts = load_analyzer_artifacts(OUTPUT_PATH)

# Regenerate recommendations with latest code (e.g., updated duplicate detection)
artifacts = regenerate_recommendations(artifacts, outlier_threshold=3.0)

# Generate the HTML report. This is best effort: failures are printed, not
# raised, so the rest of the notebook keeps running.
try:
    from oumi.core.analyze.report_generator import HTMLReportGenerator

    report_gen = HTMLReportGenerator()
    report_path = report_gen.generate_report(
        artifacts=artifacts,
        output_path=OUTPUT_PATH,
        title="OpenAssistant Analysis Report",
    )
    print(f"‚úÖ Generated HTML report at: {report_path / 'index.html'}")
except ImportError as e:
    print("‚ö†Ô∏è  Plotly not installed. Skipping HTML report generation.")
    print("   Install with: pip install 'oumi[analyze_advanced]'")
    # Show the real cause: the failing import is not necessarily Plotly
    print(f"   Import error: {e}")
except Exception as e:
    print(f"‚ö†Ô∏è  Failed to generate HTML report: {e}")
    print("\nüîç FULL TRACEBACK:")
    print("=" * 70)
    traceback.print_exc()
    print("=" * 70)

print(f"\nüìÅ All results saved to: {OUTPUT_PATH}")


[2026-01-08 16:52:38,542][oumi.utils.analysis_utils][rank0][pid:62985][MainThread][INFO]][analysis_utils.py:1551] Loaded message analysis from: /Users/ryanarman/code/oumi/analysis_output/openassistant/messages_df
[2026-01-08 16:52:38,548][oumi.utils.analysis_utils][rank0][pid:62985][MainThread][INFO]][analysis_utils.py:1558] Loaded conversation analysis from: /Users/ryanarman/code/oumi/analysis_output/openassistant/conversations_df
[2026-01-08 16:52:38,559][oumi.utils.analysis_utils][rank0][pid:62985][MainThread][INFO]][analysis_utils.py:1565] Loaded merged analysis from: /Users/ryanarman/code/oumi/analysis_output/openassistant/merged_df
[2026-01-08 16:52:38,560][oumi.utils.analysis_utils][rank0][pid:62985][MainThread][INFO]][analysis_utils.py:1574] Loaded combined schemas from: /Users/ryanarman/code/oumi/analysis_output/openassistant/schema.json
[2026-01-08 16:52:38,562][oumi.utils.analysis_utils][rank0][pid:62985][MainThread][INFO]][analysis_utils.py:1596] Loaded analysis summary fro

  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


[2026-01-08 16:52:38,648][oumi.utils.analysis_utils][rank0][pid:62985][MainThread][INFO]][analysis_utils.py:1654] Regenerated 10 recommendations from artifacts with latest code



Number of distinct clusters (3) found smaller than n_clusters (4). Possibly due to duplicate points in X.


Number of distinct clusters (3) found smaller than n_clusters (4). Possibly due to duplicate points in X.


Number of distinct clusters (3) found smaller than n_clusters (4). Possibly due to duplicate points in X.



[2026-01-08 16:52:39,415][oumi][rank0][pid:62985][MainThread][INFO]][report_generator.py:290] Generated HTML report: /Users/ryanarman/code/oumi/analysis_output/openassistant/index.html
[2026-01-08 16:52:39,416][oumi][rank0][pid:62985][MainThread][INFO]][report_generator.py:291] External data files written to: /Users/ryanarman/code/oumi/analysis_output/openassistant/data
‚úÖ Generated HTML report at: /Users/ryanarman/code/oumi/analysis_output/openassistant/index.html

üìÅ All results saved to: /Users/ryanarman/code/oumi/analysis_output/openassistant
