In [1]:
# Path to the config file
config_path = (
    "/Users/ryanarman/code/oumi/configs/examples/analyze/analyze_dedup_emb.yaml"
)

In [2]:
import os
import sys


# IMPORTANT: Set these BEFORE importing torch or any ML libraries
# Disable all GPU/MPS backends to prevent crashes with IFD analyzer
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable CUDA
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"  # Disable MPS memory allocation
os.environ["DISABLE_MPS_COMPAT"] = "1"  # Additional MPS disable flag
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"  # Disable HuggingFace telemetry
os.environ["TRANSFORMERS_OFFLINE"] = "0"  # Allow model downloads

# Force CPU usage in PyTorch to avoid MPS crashes
import torch

# Forcefully disable MPS before anything else
torch.set_default_device("cpu")
if hasattr(torch.backends, "mps"):
    # Monkey-patch to prevent MPS usage
    original_is_available = torch.backends.mps.is_available
    torch.backends.mps.is_available = lambda: False
    print("MPS has been disabled - forcing CPU-only mode")
else:
    print("Using CPU for all computations")

print(f"PyTorch device: {torch.get_default_device()}")
print(f"PyTorch version: {torch.__version__}")

MPS has been disabled - forcing CPU-only mode
PyTorch device: cpu
PyTorch version: 2.6.0


In [None]:
import os
from oumi.core.configs import AnalyzeConfig
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer

# Path to your dataset file
dataset_path = "/Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl"

# Load the config from YAML
config = AnalyzeConfig.from_yaml(
    config_path=config_path,
)

config.sample_count = 10

# Override the dataset settings to use your local file
config.dataset_path = dataset_path
config.dataset_name = None  # Clear dataset_name so it uses dataset_path instead

# Optionally update output path
config.output_path = "./analysis_output/banking77"

# IMPORTANT: Disable analyzers that require large model downloads or have issues
# IFD requires downloading Qwen model and may cause MPS crashes
# fasttext requires additional dependencies
# repr_diversity and question_diversity download embedding models
print(
    f"Running {len(config.analyzers)} analyzers: {[a.instance_id for a in config.analyzers]}"
)

# Validate the configuration
config.finalize_and_validate()

# Create the analyzer
analyzer = DatasetAnalyzer(config)

Running 1 analyzers: ['embedding']
[2025-12-30 14:32:05,085][oumi][rank0][pid:54978][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: TextSftJsonLinesDataset)... dataset_name: 'custom'
[2025-12-30 14:32:05,086][oumi.utils.analysis_utils][rank0][pid:54978][MainThread][INFO]][analysis_utils.py:225] Loaded text dataset from: /Users/ryanarman/code/scratch/ryan_hillclimbing_experiments/banking77/notebooks/data/banking77_train.jsonl
[2025-12-30 14:32:05,086][oumi][rank0][pid:54978][MainThread][INFO]][dataset_analyzer.py:154] Loaded dataset from config: None
[2025-12-30 14:32:05,095][oumi][rank0][pid:54978][MainThread][INFO]][dataset_analyzer.py:304] Initialized sample analyzer: embedding


In [4]:
# Run the analysis
analyzer.analyze_dataset()

# The results are stored in analyzer object
if analyzer._analysis_results:
    print(
        f"Total conversations analyzed: {analyzer._analysis_results.conversations_analyzed}"
    )

[2025-12-30 14:32:05,102][oumi][rank0][pid:54978][MainThread][INFO]][dataset_analyzer.py:347] Starting analysis of dataset: None
[2025-12-30 14:32:05,103][oumi][rank0][pid:54978][MainThread][INFO]][dataset_analyzer.py:348] Using 1 sample analyzers: ['embedding']
[2025-12-30 14:32:05,104][oumi][rank0][pid:54978][MainThread][INFO]][dataset_analyzer.py:371] Analyzing 10 of 8002 conversations
[2025-12-30 14:32:05,104][oumi][rank0][pid:54978][MainThread][INFO]][dataset_analyzer.py:421] Converting conversation dataset with 8002 items
[2025-12-30 14:32:05,105][oumi][rank0][pid:54978][MainThread][INFO]][dataset_analyzer.py:428] Limiting analysis to first 10 items (dataset has 8002 total)


Converting Unknown Dataset to DataFrames: 100%|██████████| 10/10 [00:00<00:00, 1551.84item/s]


[2025-12-30 14:32:05,137][oumi][rank0][pid:54978][MainThread][INFO]][embedding_analyzer.py:518] Computing embeddings for 10 samples...
[2025-12-30 14:32:05,138][oumi][rank0][pid:54978][MainThread][INFO]][embedding_analyzer.py:196] Loading embedding model: all-MiniLM-L6-v2


Computing embeddings: 100%|██████████| 10/10 [00:00<00:00, 88.94it/s]


[2025-12-30 14:32:06,215][oumi][rank0][pid:54978][MainThread][INFO]][embedding_analyzer.py:537] Detecting semantic duplicates...
[2025-12-30 14:32:06,218][oumi][rank0][pid:54978][MainThread][INFO]][embedding_analyzer.py:518] Computing embeddings for 30 samples...


Computing embeddings: 100%|██████████| 30/30 [00:00<00:00, 135.23it/s]


[2025-12-30 14:32:06,443][oumi][rank0][pid:54978][MainThread][INFO]][embedding_analyzer.py:537] Detecting semantic duplicates...
Total conversations analyzed: 10


In [5]:
schema = analyzer.get_schema()
print(f"len(schema): {len(schema)}")
merged_columns = analyzer.analysis_df.columns
print(f"len(merged_columns): {len(merged_columns)}")

len(schema): 14
len(merged_columns): 14


In [6]:
set(merged_columns) - set(schema.keys())

set()

In [None]:
analyzer_names = [a.instance_id for a in config.analyzers]
analyzer_names

['embedding']

# Conv level

In [8]:
analyzer_names

['embedding']

In [13]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

conv_columns = analyzer.conversation_df.columns
row = analyzer.conversation_df.iloc[0]

# Choose the analzyer to analyze
analyzer_name = analyzer_names[0]


filtered_cols = filter_analyzer_columns(conv_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")
    info = parse_analyzer_column_name(filtered_cols[0])
    # print("\nInput:")
    # print(f"source_column: {info.source_column}")
    # print(f"{row[info.source_column]}\n")

    for col in filtered_cols:
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        # print(f"type: {schema[col]['type']}")
        # print(f"content_type: {schema[col]['content_type']}")
        print(f"description: {schema[col]['description']}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


Analyzer: embedding
metric: duplicate_group
description: Semantic duplicate group ID (samples with same ID are duplicates)
value: 0


metric: has_semantic_duplicate
description: Whether sample has semantic duplicates
value: True


metric: embedding
description: Text embedding vector
value: [-0.010222597047686577, 0.03211353346705437, -0.0741022601723671, -0.032227084040641785, 0.01380691397935152, -0.036284253001213074, 0.16152727603912354, -0.022863080725073814, 0.01166739221662283, -0.041188184171915054, 0.0352775976061821, -0.06773829460144043, -0.0008225407218560576, -0.020127248018980026, -0.0380123034119606, -0.044397756457328796, 0.03937692940235138, -0.06034582108259201, -0.03606202080845833, 0.07922107726335526, -0.009423102252185345, 0.035254113376140594, -0.02838706411421299, 0.021017396822571754, 0.019764892756938934, -0.03351414576172829, -0.0021970532834529877, 0.01244708988815546, -0.04643503203988075, -0.028735056519508362, 0.01840903051197529, 0.1296456903219223, 0.041

# Message level

In [16]:
from oumi.core.analyze.column_utils import (
    filter_analyzer_columns,
    get_analyzer_columns_by_analyzer,
    parse_analyzer_column_name,
)

msg_columns = analyzer.message_df.columns
row = analyzer.message_df.iloc[2]

# Choose the analzyer to analyze
analyzer_name = analyzer_names[0]


filtered_cols = filter_analyzer_columns(msg_columns, analyzer_id=analyzer_name)
if filtered_cols:
    print(f"Analyzer: {analyzer_name}")
    info = parse_analyzer_column_name(filtered_cols[0])

    print("\nInput:")
    print(f"[{row['role']}]: {row[info.source_column]}\n")
    # print(f"source_column: {info.source_column}")
    print(f"{row[info.source_column]}\n")

    for col in filtered_cols:
        info = parse_analyzer_column_name(col)
        print(f"metric: {info.metric_name}")
        # print(f"type: {schema[col]['type']}")
        # print(f"content_type: {schema[col]['content_type']}")
        print(f"description: {schema[col]['description']}")
        print(f"value: {row[col]}")
        print("\n")
else:
    print(f"No columns found for analyzer: {analyzer_name}")


Analyzer: embedding

Input:
[assistant]: 52

52

metric: duplicate_group
description: Semantic duplicate group ID (samples with same ID are duplicates)
value: 2


metric: has_semantic_duplicate
description: Whether sample has semantic duplicates
value: True


metric: embedding
description: Text embedding vector
value: [-0.019899001345038414, 0.07813163846731186, -0.08902392536401749, 0.06082920730113983, -0.07830720394849777, 0.03836137801408768, 0.05867939069867134, -0.029619067907333374, -0.036152105778455734, 0.007074934430420399, -0.05634910985827446, -0.0476958304643631, 0.0012010467471554875, -0.08206155151128769, -0.03062894009053707, 0.018509017303586006, -0.0966188907623291, -0.02883944846689701, -0.013973702676594257, -0.02230471931397915, -0.05444377288222313, -0.006196385715156794, -0.00017845199909061193, -0.014118782244622707, 0.0009813549695536494, 0.03854510188102722, -0.0532224215567112, 0.04079664498567581, -0.07059436291456223, -0.03797011449933052, -0.01004101056605