In [None]:
# sys.path.insert(0, 'oumi/src')
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer
from oumi.core.configs import AnalyzeConfig, DatasetSource, SampleAnalyzerParams

# Test with HuggingFace ultrachat_200k dataset
config = AnalyzeConfig(
    dataset_source=DatasetSource.CONFIG,
    dataset_name="huggingfaceh4/ultrachat_200k",
    split="train_sft",
    subset=None,
    sample_count=1,  # Analyze only 1 sample
    analyzers=[
        SampleAnalyzerParams(
            id="length",
            params={
                "char_count": True,
                "word_count": True,
                "sentence_count": True,
                "token_count": True,
            },
        )
    ],
    tokenizer_config={"model_name": "microsoft/DialoGPT-small", "tokenizer_kwargs": {}},
)



In [2]:
print("Loading and analyzing HuggingFace ultrachat_200k dataset...")
analyzer = DatasetAnalyzer(config)
analyzer.analyze_dataset()

Loading and analyzing HuggingFace ultrachat_200k dataset...
[2025-09-23 17:13:54,119][oumi][rank0][pid:75142][MainThread][INFO]][models.py:526] Using the chat template 'gpt2', which is the default for model 'microsoft/DialoGPT-small'. 
[2025-09-23 17:13:54,120][oumi.utils.analysis_utils][rank0][pid:75142][MainThread][INFO]][analysis_utils.py:54] Built tokenizer for model: microsoft/DialoGPT-small
[2025-09-23 17:13:54,121][oumi][rank0][pid:75142][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: UltrachatH4Dataset)... dataset_name: 'huggingfaceh4/ultrachat_200k'
[2025-09-23 17:13:56,645][oumi][rank0][pid:75142][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
	Split: train_sft
	Version: 0.0.0
	Dataset size: 3047427114
	Download size: 1624049723
	Size: 4671476837 bytes
	Rows: 207865
	Columns: ['prompt', 'prompt_id', 'messages']
[2025-09-23 17:13:57,586][oumi][rank0][pid:75142][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (207865, 

Analyzing items in huggingfaceh4/ultrachat_200k: 100%|██████████| 1/1 [00:00<00:00, 115.01item/s]


In [5]:
print("✅ Analysis completed!")
print(f"  Dataset: {analyzer.dataset_name}")
print(f"  Data type: {analyzer.data_type}")
print(f"  Rows DF shape: {analyzer.rows_df.shape}")
print(f"  Items DF shape: {analyzer.items_df.shape}")
print(f"  Text fields: {analyzer.text_fields}")

✅ Analysis completed!
  Dataset: huggingfaceh4/ultrachat_200k
  Data type: conversation
  Rows DF shape: (8, 10)
  Items DF shape: (1, 8)
  Text fields: ['rendered_sample', 'content']


In [6]:
analyzer.dataset.conversation(0)

USER: These instructions apply to section-based themes (Responsive 6.0+, Retina 4.0+, Parallax 3.0+ Turbo 2.0+, Mobilia 5.0+). What theme version am I using?
On your Collections pages & Featured Collections sections, you can easily show the secondary image of a product on hover by enabling one of the theme's built-in settings!
Your Collection pages & Featured Collections sections will now display the secondary product image just by hovering over that product image thumbnail.
Does this feature apply to all sections of the theme or just specific ones as listed in the text material?
ASSISTANT: This feature only applies to Collection pages and Featured Collections sections of the section-based themes listed in the text material.
USER: Can you guide me through the process of enabling the secondary image hover feature on my Collection pages and Featured Collections sections?
ASSISTANT: Sure, here are the steps to enable the secondary image hover feature on your Collection pages and Featured 

In [7]:
analyzer.rows_df

Unnamed: 0,item_index,row_id,row_index,row_type,role,content,content_char_count,content_word_count,content_sentence_count,content_token_count
0,0,msg_0,0,user_message,user,These instructions apply to section-based them...,580,92,10,128
1,0,msg_1,1,assistant_message,assistant,This feature only applies to Collection pages ...,580,92,10,128
2,0,msg_2,2,user_message,user,Can you guide me through the process of enabli...,580,92,10,128
3,0,msg_3,3,assistant_message,assistant,"Sure, here are the steps to enable the seconda...",580,92,10,128
4,0,msg_4,4,user_message,user,Can you provide me with a link to the document...,580,92,10,128
5,0,msg_5,5,assistant_message,assistant,I don't have access to your store's theme info...,580,92,10,128
6,0,msg_6,6,user_message,user,Can you confirm if this feature also works for...,580,92,10,128
7,0,msg_7,7,assistant_message,assistant,The secondary image hover feature may or may n...,580,92,10,128


In [10]:
analyzer.items_df

Unnamed: 0,item_index,item_id,item_type,rendered_sample,sample_length_char_count,sample_length_word_count,sample_length_sentence_count,sample_length_token_count
0,0,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655


In [11]:
analyzer.analysis_df

Unnamed: 0,item_index,row_id,row_index,row_type,role,content,content_char_count,content_word_count,content_sentence_count,content_token_count,item_id,item_type,rendered_sample,sample_length_char_count,sample_length_word_count,sample_length_sentence_count,sample_length_token_count
0,0,msg_0,0,user_message,user,These instructions apply to section-based them...,580,92,10,128,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
1,0,msg_1,1,assistant_message,assistant,This feature only applies to Collection pages ...,580,92,10,128,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
2,0,msg_2,2,user_message,user,Can you guide me through the process of enabli...,580,92,10,128,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
3,0,msg_3,3,assistant_message,assistant,"Sure, here are the steps to enable the seconda...",580,92,10,128,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
4,0,msg_4,4,user_message,user,Can you provide me with a link to the document...,580,92,10,128,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
5,0,msg_5,5,assistant_message,assistant,I don't have access to your store's theme info...,580,92,10,128,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
6,0,msg_6,6,user_message,user,Can you confirm if this feature also works for...,580,92,10,128,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
7,0,msg_7,7,assistant_message,assistant,The secondary image hover feature may or may n...,580,92,10,128,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655


In [None]:
print("\n=== Summary Statistics ===")
print("User messages:")
user_df = analyzer.fields_df[analyzer.fields_df["field_name"] == "user_message"]
print(f"  Count: {len(user_df)}")
print(f"  Avg chars: {user_df['field_length_char_count'].mean():.1f}")
print(f"  Avg words: {user_df['field_length_word_count'].mean():.1f}")
print(f"  Avg tokens: {user_df['field_length_token_count'].mean():.1f}")

print("\nAssistant messages:")


=== Summary Statistics ===
User messages:
  Count: 0
  Avg chars: nan
  Avg words: nan
  Avg tokens: nan

Assistant messages:


In [None]:
assistant_df = analyzer.fields_df[
    analyzer.fields_df["field_name"] == "assistant_message"
]
print(f"  Count: {len(assistant_df)}")
print(f"  Avg chars: {assistant_df['field_length_char_count'].mean():.1f}")
print(f"  Avg words: {assistant_df['field_length_word_count'].mean():.1f}")
print(f"  Avg tokens: {assistant_df['field_length_token_count'].mean():.1f}")

  Count: 0
  Avg chars: nan
  Avg words: nan
  Avg tokens: nan


In [None]:
print("\nRendered conversations:")
rendered_df = analyzer.fields_df[analyzer.fields_df["field_name"] == "rendered_sample"]
print(f"  Count: {len(rendered_df)}")
print(f"  Avg chars: {rendered_df['field_length_char_count'].mean():.1f}")
print(f"  Avg words: {rendered_df['field_length_word_count'].mean():.1f}")
print(f"  Avg tokens: {rendered_df['field_length_token_count'].mean():.1f}")

print("\nSample-level totals:")
print(
    f"  Avg total chars: {analyzer.samples_df['sample_length_char_count'].mean():.1f}"
)
print(
    f"  Avg total words: {analyzer.samples_df['sample_length_word_count'].mean():.1f}"
)
print(
    f"  Avg total tokens: {analyzer.samples_df['sample_length_token_count'].mean():.1f}"
)


Rendered conversations:
  Count: 0
  Avg chars: nan
  Avg words: nan
  Avg tokens: nan

Sample-level totals:
  Avg total chars: 3094.0
  Avg total words: 514.0
  Avg total tokens: 655.0
