In [1]:
import os
import sys


# IMPORTANT: Set these BEFORE importing torch or any ML libraries
# Disable all GPU/MPS backends to prevent crashes with IFD analyzer
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["CUDA_VISIBLE_DEVICES"] = ""  # Disable CUDA
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"  # Disable MPS memory allocation
os.environ["DISABLE_MPS_COMPAT"] = "1"  # Additional MPS disable flag
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"  # Disable HuggingFace telemetry
os.environ["TRANSFORMERS_OFFLINE"] = "0"  # Allow model downloads

# Force CPU usage in PyTorch to avoid MPS crashes
import torch

# Forcefully disable MPS before anything else
torch.set_default_device("cpu")
if hasattr(torch.backends, "mps"):
    # Monkey-patch to prevent MPS usage
    original_is_available = torch.backends.mps.is_available
    torch.backends.mps.is_available = lambda: False
    print("MPS has been disabled - forcing CPU-only mode")
else:
    print("Using CPU for all computations")

print(f"PyTorch device: {torch.get_default_device()}")
print(f"PyTorch version: {torch.__version__}")


MPS has been disabled - forcing CPU-only mode
PyTorch device: cpu
PyTorch version: 2.6.0


In [2]:
# Directory that receives every artifact produced by this run.
OUTPUT_PATH = "/Users/ryanarman/data/DMG/analysis_output"

import os
from oumi.core.configs import AnalyzeConfig
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer

# Load the base analysis configuration from its YAML file.
config = AnalyzeConfig.from_yaml(
    "/Users/ryanarman/code/oumi/configs/examples/analyze/analyze_dmg.yaml"
)

# Override settings for this run
dataset_path = "/Users/ryanarman/data/DMG/train.jsonl"
config.dataset_path = dataset_path
config.dataset_name = None  # Clear dataset_name so it uses dataset_path instead
config.sample_count = 1000  # Adjust as needed
config.chat_template = "default"

# Set absolute output path (makes it easier to find the results!)
config.output_path = OUTPUT_PATH

print(f"✅ Config loaded with {len(config.analyzers)} analyzers:")
# The loop variable is deliberately NOT called `analyzer`: that name is reserved for
# the DatasetAnalyzer instance created at the end of this cell.
for analyzer_params in config.analyzers:
    # Fall back to the analyzer type id when no explicit instance id is set.
    instance_id = analyzer_params.instance_id or analyzer_params.id
    print(f"  - {instance_id} (type: {analyzer_params.id})")

print(f"📁 Output will be saved to: {config.output_path}")

# Validate the configuration
config.finalize_and_validate()
print("✅ Config validated successfully!")

# Build the analyzer used by the following cells.
analyzer = DatasetAnalyzer(config)


✅ Config loaded with 10 analyzers:
  - length (type: length)
  - token_stats (type: token_stats)
  - cost (type: cost)
  - fasttext (type: fasttext)
  - embedding (type: embedding)
  - question_diversity (type: question_diversity)
  - repr_diversity (type: repr_diversity)
  - validation_quality (type: llm_judge)
  - instruction_quality (type: llm_judge)
  - response_quality (type: llm_judge)
📁 Output will be saved to: /Users/ryanarman/data/DMG/analysis_output
✅ Config validated successfully!
[2026-01-12 16:52:25,913][oumi][rank0][pid:7752][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: TextSftJsonLinesDataset)... dataset_name: 'custom'
[2026-01-12 16:52:25,914][oumi.utils.analysis_utils][rank0][pid:7752][MainThread][INFO]][analysis_utils.py:227] Loaded text dataset from: /Users/ryanarman/data/DMG/train.jsonl
[2026-01-12 16:52:25,914][oumi][rank0][pid:7752][MainThread][INFO]][dataset_analyzer.py:153] Loaded dataset from config: None
[2026-01-12 16:52:26,04

In [3]:
# Execute the configured analyzers over the dataset.
analyzer.analyze_dataset()

# Results are kept on the analyzer instance; report the count only if they are populated.
# NOTE(review): this reads a private attribute - prefer a public accessor if
# DatasetAnalyzer exposes one.
analysis_results = analyzer._analysis_results
if analysis_results:
    analyzed_count = analysis_results.conversations_analyzed
    print(f"Total conversations analyzed: {analyzed_count}")


[2026-01-12 16:52:26,070][oumi][rank0][pid:7752][MainThread][INFO]][dataset_analyzer.py:362] Starting analysis of dataset: None
[2026-01-12 16:52:26,071][oumi][rank0][pid:7752][MainThread][INFO]][dataset_analyzer.py:363] Using 10 sample analyzers: ['length', 'token_stats', 'cost', 'fasttext', 'embedding', 'question_diversity', 'repr_diversity', 'validation_quality', 'instruction_quality', 'response_quality']
[2026-01-12 16:52:26,071][oumi][rank0][pid:7752][MainThread][INFO]][dataset_analyzer.py:386] Analyzing 1000 of 5100 conversations
[2026-01-12 16:52:26,072][oumi][rank0][pid:7752][MainThread][INFO]][dataset_analyzer.py:438] Converting conversation dataset with 5100 items
[2026-01-12 16:52:26,072][oumi][rank0][pid:7752][MainThread][INFO]][dataset_analyzer.py:445] Limiting analysis to first 1000 items (dataset has 5100 total)


Converting Unknown Dataset to DataFrames: 100%|██████████| 1000/1000 [00:01<00:00, 940.28item/s]


[2026-01-12 16:52:27,196][oumi.utils.analysis_utils][rank0][pid:7752][MainThread][INFO]][analysis_utils.py:1337] Adding default schema entries for 1 columns not in base schema: ['trade']
[2026-01-12 16:52:28,087][oumi][rank0][pid:7752][MainThread][INFO]][fasttext_analyzer.py:220] Initialized fast-langdetect for language detection
[2026-01-12 16:52:28,088][oumi][rank0][pid:7752][MainThread][INFO]][fasttext_analyzer.py:458] Analyzing language for column: conversation_text_content
[2026-01-12 16:52:28,707][oumi][rank0][pid:7752][MainThread][INFO]][embedding_analyzer.py:518] Computing embeddings for 1000 samples...
[2026-01-12 16:52:28,708][oumi][rank0][pid:7752][MainThread][INFO]][embedding_analyzer.py:196] Loading embedding model: all-MiniLM-L6-v2


Computing embeddings: 100%|██████████| 1000/1000 [00:08<00:00, 121.49it/s]


[2026-01-12 16:52:37,946][oumi][rank0][pid:7752][MainThread][INFO]][embedding_analyzer.py:537] Detecting semantic duplicates...
[2026-01-12 16:52:38,086][oumi][rank0][pid:7752][MainThread][INFO]][embedding_analyzer.py:624] Detecting fuzzy duplicates using MinHash LSH...
[2026-01-12 16:52:38,090][oumi][rank0][pid:7752][MainThread][INFO]][embedding_analyzer.py:350] Creating MinHash signatures for 1000 samples...


Creating MinHash signatures: 100%|██████████| 1000/1000 [00:08<00:00, 112.16it/s]


[2026-01-12 16:52:47,028][oumi][rank0][pid:7752][MainThread][INFO]][embedding_analyzer.py:369] Finding fuzzy duplicates using LSH...


Finding duplicates: 100%|██████████| 1000/1000 [00:00<00:00, 1378.14it/s]


[2026-01-12 16:52:47,758][oumi][rank0][pid:7752][MainThread][INFO]][question_diversity_analyzer.py:464] Computing embeddings for 1000 user questions...
[2026-01-12 16:52:47,758][oumi][rank0][pid:7752][MainThread][INFO]][question_diversity_analyzer.py:174] Loading embedding model: all-MiniLM-L6-v2


Computing embeddings: 100%|██████████| 1000/1000 [00:07<00:00, 125.10it/s]


[2026-01-12 16:52:56,784][oumi][rank0][pid:7752][MainThread][INFO]][question_diversity_analyzer.py:469] Clustering 1000 questions using dbscan...
[2026-01-12 16:52:57,092][oumi][rank0][pid:7752][MainThread][INFO]][question_diversity_analyzer.py:487] Found 1 clusters
[2026-01-12 16:52:57,102][oumi][rank0][pid:7752][MainThread][INFO]][repr_diversity_analyzer.py:363] Computing diversity scores for 1000 samples in column 'conversation_text_content'...
[2026-01-12 16:52:57,103][oumi][rank0][pid:7752][MainThread][INFO]][repr_diversity_analyzer.py:165] Loading embedding model: sentence-transformers/all-MiniLM-L6-v2


Computing embeddings: 100%|██████████| 1000/1000 [00:08<00:00, 120.85it/s]


[2026-01-12 16:53:06,462][oumi][rank0][pid:7752][MainThread][INFO]][repr_diversity_analyzer.py:230] Computing nearest neighbor distances for 1000 samples (k=5)...
[2026-01-12 16:53:06,478][oumi][rank0][pid:7752][MainThread][INFO]][repr_diversity_analyzer.py:556] Column 'conversation_text_content': 1000/1000 samples (100.0%) are redundant
[2026-01-12 16:53:06,514][oumi][rank0][pid:7752][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai


 33%|███▎      | 334/1000 [00:08<00:06, 96.58it/s]



100%|██████████| 1000/1000 [00:17<00:00, 55.93it/s]


[2026-01-12 16:53:24,651][oumi][rank0][pid:7752][MainThread][INFO]][llm_judge_analyzer.py:787] Skipping conversation-level analysis (analyze_conversation_level=False). Set analyze_conversation_level=True to enable.
[2026-01-12 16:53:24,652][oumi][rank0][pid:7752][MainThread][INFO]][llm_judge_analyzer.py:787] Skipping conversation-level analysis (analyze_conversation_level=False). Set analyze_conversation_level=True to enable.
[2026-01-12 16:53:25,778][oumi][rank0][pid:7752][MainThread][INFO]][fasttext_analyzer.py:458] Analyzing language for column: text_content
[2026-01-12 16:53:26,522][oumi][rank0][pid:7752][MainThread][INFO]][embedding_analyzer.py:518] Computing embeddings for 3000 samples...


Computing embeddings: 100%|██████████| 3000/3000 [00:18<00:00, 162.32it/s]


[2026-01-12 16:53:45,008][oumi][rank0][pid:7752][MainThread][INFO]][embedding_analyzer.py:537] Detecting semantic duplicates...
[2026-01-12 16:53:45,187][oumi][rank0][pid:7752][MainThread][INFO]][embedding_analyzer.py:624] Detecting fuzzy duplicates using MinHash LSH...
[2026-01-12 16:53:45,189][oumi][rank0][pid:7752][MainThread][INFO]][embedding_analyzer.py:350] Creating MinHash signatures for 3000 samples...


Creating MinHash signatures: 100%|██████████| 3000/3000 [00:12<00:00, 249.46it/s]


[2026-01-12 16:53:57,249][oumi][rank0][pid:7752][MainThread][INFO]][embedding_analyzer.py:369] Finding fuzzy duplicates using LSH...


Finding duplicates: 100%|██████████| 3000/3000 [00:01<00:00, 2854.82it/s]


[2026-01-12 16:53:58,310][oumi][rank0][pid:7752][MainThread][INFO]][question_diversity_analyzer.py:464] Computing embeddings for 1000 user questions...


Computing embeddings: 100%|██████████| 1000/1000 [00:05<00:00, 184.07it/s]


[2026-01-12 16:54:03,748][oumi][rank0][pid:7752][MainThread][INFO]][question_diversity_analyzer.py:469] Clustering 1000 questions using dbscan...
[2026-01-12 16:54:03,755][oumi][rank0][pid:7752][MainThread][INFO]][question_diversity_analyzer.py:482] Found 77 clusters, 792 unique/diverse questions (not similar to others)
[2026-01-12 16:54:03,758][oumi][rank0][pid:7752][MainThread][INFO]][repr_diversity_analyzer.py:363] Computing diversity scores for 3000 samples in column 'text_content'...


Computing embeddings: 100%|██████████| 3000/3000 [00:18<00:00, 160.64it/s]


[2026-01-12 16:54:22,438][oumi][rank0][pid:7752][MainThread][INFO]][repr_diversity_analyzer.py:230] Computing nearest neighbor distances for 3000 samples (k=5)...
[2026-01-12 16:54:22,641][oumi][rank0][pid:7752][MainThread][INFO]][repr_diversity_analyzer.py:556] Column 'text_content': 1939/3000 samples (64.6%) are redundant
[2026-01-12 16:54:22,645][oumi][rank0][pid:7752][MainThread][INFO]][llm_judge_analyzer.py:780] Skipping message-level analysis (analyze_message_level=False). Set analyze_message_level=True to enable.
[2026-01-12 16:54:22,648][oumi][rank0][pid:7752][MainThread][INFO]][llm_judge_analyzer.py:828] Evaluating 1000 'system' messages (filtered from 3000 total)
[2026-01-12 16:54:22,656][oumi][rank0][pid:7752][MainThread][INFO]][llm_judge_analyzer.py:444] Initialized LLM Judge with model: gpt-4o-mini, engine: openai
[2026-01-12 16:54:22,661][oumi][rank0][pid:7752][MainThread][INFO]][llm_judge_analyzer.py:679] Batch of 1000: 1 unique to evaluate, 999 duplicates, 0 from cache


 46%|████▌     | 460/999 [00:07<00:05, 94.09it/s] 



100%|██████████| 999/999 [00:15<00:00, 64.35it/s] 


Total conversations analyzed: 1000


In [4]:
from pathlib import Path
import traceback
from oumi.utils.analysis_utils import save_analyzer_artifacts

# Persist the analyzer's artifacts (dataframes, schemas, summary) as Parquet
# under the configured output directory.
artifact_dir = Path(config.output_path)
save_analyzer_artifacts(analyzer, artifact_dir, output_format="parquet")


[2026-01-12 16:54:41,228][oumi.utils.analysis_utils][rank0][pid:7752][MainThread][INFO]][analysis_utils.py:1426] Saved message analysis to: /Users/ryanarman/data/DMG/analysis_output/messages_df.parquet
[2026-01-12 16:54:41,247][oumi.utils.analysis_utils][rank0][pid:7752][MainThread][INFO]][analysis_utils.py:1432] Saved conversation analysis to: /Users/ryanarman/data/DMG/analysis_output/conversations_df.parquet
[2026-01-12 16:54:41,296][oumi.utils.analysis_utils][rank0][pid:7752][MainThread][INFO]][analysis_utils.py:1438] Saved merged analysis to: /Users/ryanarman/data/DMG/analysis_output/merged_df.parquet
[2026-01-12 16:54:41,298][oumi.utils.analysis_utils][rank0][pid:7752][MainThread][INFO]][analysis_utils.py:1450] Saved message schema to: /Users/ryanarman/data/DMG/analysis_output/message_schema.json
[2026-01-12 16:54:41,299][oumi.utils.analysis_utils][rank0][pid:7752][MainThread][INFO]][analysis_utils.py:1457] Saved conversation schema to: /Users/ryanarman/data/DMG/analysis_output/co

# Load artifacts and generate report


In [5]:
# This cell is meant to be runnable on its own (it reloads everything from disk),
# so it re-declares the output directory and imports everything it uses itself.
OUTPUT_PATH = "/Users/ryanarman/data/DMG/analysis_output"

# Imported here rather than relying on an earlier cell: the error handler below
# needs it, and a missing import would surface as a NameError inside that handler.
import traceback

from oumi.utils.analysis_utils import (
    load_analyzer_artifacts,
    regenerate_recommendations,
)

# Read the previously saved artifacts back from OUTPUT_PATH.
artifacts = load_analyzer_artifacts(OUTPUT_PATH)

# Regenerate recommendations with latest code (e.g., updated duplicate detection)
artifacts = regenerate_recommendations(artifacts, outlier_threshold=3.0)

# A bare `artifacts.keys()` in the middle of a cell is silently discarded
# (only a cell's last expression is displayed), so print the keys explicitly.
print(f"Loaded artifacts: {list(artifacts.keys())}")


# Generate the HTML report (best-effort: failures are reported, not raised)


try:
    from oumi.core.analyze.report_generator import HTMLReportGenerator

    report_gen = HTMLReportGenerator()
    report_path = report_gen.generate_report(
        artifacts=artifacts,
        output_path=OUTPUT_PATH,
        title="DMG Invoice Validation Analysis Report",
    )
    print(f"✅ Generated HTML report at: {report_path / 'index.html'}")
except ImportError as import_error:
    print("⚠️  Plotly not installed. Skipping HTML report generation.")
    print("   Install with: pip install 'oumi[analyze_advanced]'")
    # Any ImportError raised inside the try lands here, so show the real cause too.
    print(f"   Import error: {import_error}")
except Exception as e:
    print(f"⚠️  Failed to generate HTML report: {e}")
    print("\n🔍 FULL TRACEBACK:")
    print("=" * 70)
    traceback.print_exc()
    print("=" * 70)

print(f"\n📁 All results saved to: {OUTPUT_PATH}")


[2026-01-12 16:54:41,317][oumi.utils.analysis_utils][rank0][pid:7752][MainThread][INFO]][analysis_utils.py:1551] Loaded message analysis from: /Users/ryanarman/data/DMG/analysis_output/messages_df
[2026-01-12 16:54:41,327][oumi.utils.analysis_utils][rank0][pid:7752][MainThread][INFO]][analysis_utils.py:1558] Loaded conversation analysis from: /Users/ryanarman/data/DMG/analysis_output/conversations_df
[2026-01-12 16:54:41,352][oumi.utils.analysis_utils][rank0][pid:7752][MainThread][INFO]][analysis_utils.py:1565] Loaded merged analysis from: /Users/ryanarman/data/DMG/analysis_output/merged_df
[2026-01-12 16:54:41,353][oumi.utils.analysis_utils][rank0][pid:7752][MainThread][INFO]][analysis_utils.py:1574] Loaded combined schemas from: /Users/ryanarman/data/DMG/analysis_output/schema.json
[2026-01-12 16:54:41,354][oumi.utils.analysis_utils][rank0][pid:7752][MainThread][INFO]][analysis_utils.py:1596] Loaded analysis summary from: /Users/ryanarman/data/DMG/analysis_output/analysis_summary.jso