In [1]:
import sys

# sys.path.insert(0, 'oumi/src')
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer
from oumi.core.configs import AnalyzeConfig, SampleAnalyzerParams, DatasetSource

# Test with HuggingFace ultrachat_200k dataset
config = AnalyzeConfig(
    dataset_source=DatasetSource.CONFIG,
    dataset_name='huggingfaceh4/ultrachat_200k',
    split='train_sft',
    subset=None,
    sample_count=1,  # Analyze only 10 samples
    analyzers=[
        SampleAnalyzerParams(
            id='length', 
            params={
                'char_count': True, 
                'word_count': True, 
                'sentence_count': True, 
                'token_count': True
            }
        )
    ],
    tokenizer_config={
        'model_name': 'microsoft/DialoGPT-small', 
        'tokenizer_kwargs': {}
    }
)



In [2]:
print('Loading and analyzing HuggingFace ultrachat_200k dataset...')
analyzer = DatasetAnalyzer(config)
analyzer.analyze_dataset()

Loading and analyzing HuggingFace ultrachat_200k dataset...
[2025-09-19 15:53:51,983][oumi][rank0][pid:75076][MainThread][INFO]][models.py:526] Using the chat template 'gpt2', which is the default for model 'microsoft/DialoGPT-small'. 
[2025-09-19 15:53:51,984][oumi.utils.analysis_utils][rank0][pid:75076][MainThread][INFO]][analysis_utils.py:54] Built tokenizer for model: microsoft/DialoGPT-small
[2025-09-19 15:53:51,984][oumi][rank0][pid:75076][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: UltrachatH4Dataset)... dataset_name: 'huggingfaceh4/ultrachat_200k'
[2025-09-19 15:53:54,903][oumi][rank0][pid:75076][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
	Split: train_sft
	Version: 0.0.0
	Dataset size: 3047427114
	Download size: 1624049723
	Size: 4671476837 bytes
	Rows: 207865
	Columns: ['prompt', 'prompt_id', 'messages']
[2025-09-19 15:53:55,680][oumi][rank0][pid:75076][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (207865, 

Analyzing conversations in huggingfaceh4/ultrachat_200k:   0%|          | 0/1 [00:00<?, ?conv/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1308 > 1024). Running this sequence through the model will result in indexing errors
Analyzing conversations in huggingfaceh4/ultrachat_200k: 100%|██████████| 1/1 [00:00<00:00, 148.41conv/s]


In [3]:
print('✅ Analysis completed!')
print(f'  Dataset: {analyzer.dataset_name}')
print(f'  Data type: {analyzer.data_type}')
print(f'  Message DF shape: {analyzer.message_df.shape}')
print(f'  Sample DF shape: {analyzer._conversation_df.shape}')
print(f'  Text fields: {analyzer.text_fields}')

✅ Analysis completed!
  Dataset: huggingfaceh4/ultrachat_200k
  Data type: conversation
  Message DF shape: (9, 8)
  Sample DF shape: (1, 5)
  Text fields: ['user_message_0', 'assistant_message_1', 'user_message_2', 'assistant_message_3', 'user_message_4', 'assistant_message_5', 'user_message_6', 'assistant_message_7', 'rendered_sample']


In [4]:
analyzer.dataset.conversation(0)


USER: These instructions apply to section-based themes (Responsive 6.0+, Retina 4.0+, Parallax 3.0+ Turbo 2.0+, Mobilia 5.0+). What theme version am I using?
On your Collections pages & Featured Collections sections, you can easily show the secondary image of a product on hover by enabling one of the theme's built-in settings!
Your Collection pages & Featured Collections sections will now display the secondary product image just by hovering over that product image thumbnail.
Does this feature apply to all sections of the theme or just specific ones as listed in the text material?
ASSISTANT: This feature only applies to Collection pages and Featured Collections sections of the section-based themes listed in the text material.
USER: Can you guide me through the process of enabling the secondary image hover feature on my Collection pages and Featured Collections sections?
ASSISTANT: Sure, here are the steps to enable the secondary image hover feature on your Collection pages and Featured 

In [5]:
analyzer.message_df

Unnamed: 0,sample_index,field_name,field_index,text_content,field_length_char_count,field_length_word_count,field_length_sentence_count,field_length_token_count
0,0,user_message_0,0,These instructions apply to section-based them...,580,92,10,128
1,0,assistant_message_1,1,This feature only applies to Collection pages ...,136,20,1,23
2,0,user_message_2,2,Can you guide me through the process of enabli...,140,22,1,23
3,0,assistant_message_3,3,"Sure, here are the steps to enable the seconda...",909,150,15,200
4,0,user_message_4,4,Can you provide me with a link to the document...,65,13,1,14
5,0,assistant_message_5,5,I don't have access to your store's theme info...,370,63,3,78
6,0,user_message_6,6,Can you confirm if this feature also works for...,82,16,1,17
7,0,assistant_message_7,7,The secondary image hover feature may or may n...,805,138,11,164
8,0,rendered_sample,8,These instructions apply to section-based them...,3191,507,44,655


In [12]:
analyzer.conversation_df

Unnamed: 0,sample_index,sample_length_total_token_count,sample_length_total_char_count,sample_length_total_word_count,sample_length_total_sentence_count
0,0,655,6300,1023,87


In [7]:
analyzer.analysis_df

Unnamed: 0,sample_index,field_name,field_index,text_content,field_length_char_count,field_length_word_count,field_length_sentence_count,field_length_token_count,sample_length_total_token_count,sample_length_total_char_count,sample_length_total_word_count,sample_length_total_sentence_count
0,0,user_message_0,0,These instructions apply to section-based them...,580,92,10,128,655,6300,1023,87
1,0,assistant_message_1,1,This feature only applies to Collection pages ...,136,20,1,23,655,6300,1023,87
2,0,user_message_2,2,Can you guide me through the process of enabli...,140,22,1,23,655,6300,1023,87
3,0,assistant_message_3,3,"Sure, here are the steps to enable the seconda...",909,150,15,200,655,6300,1023,87
4,0,user_message_4,4,Can you provide me with a link to the document...,65,13,1,14,655,6300,1023,87
5,0,assistant_message_5,5,I don't have access to your store's theme info...,370,63,3,78,655,6300,1023,87
6,0,user_message_6,6,Can you confirm if this feature also works for...,82,16,1,17,655,6300,1023,87
7,0,assistant_message_7,7,The secondary image hover feature may or may n...,805,138,11,164,655,6300,1023,87
8,0,rendered_sample,8,These instructions apply to section-based them...,3191,507,44,655,655,6300,1023,87


In [8]:
print('\n=== Summary Statistics ===')
print('User messages:')
user_df = analyzer.message_df[analyzer.message_df['field_name'] == 'user_message']
print(f'  Count: {len(user_df)}')
print(f'  Avg chars: {user_df["field_length_char_count"].mean():.1f}')
print(f'  Avg words: {user_df["field_length_word_count"].mean():.1f}')
print(f'  Avg tokens: {user_df["field_length_token_count"].mean():.1f}')

print('\nAssistant messages:')


=== Summary Statistics ===
User messages:
  Count: 0
  Avg chars: nan
  Avg words: nan
  Avg tokens: nan

Assistant messages:


In [9]:
assistant_df = analyzer.message_df[analyzer.message_df['field_name'] == 'assistant_message']
print(f'  Count: {len(assistant_df)}')
print(f'  Avg chars: {assistant_df["field_length_char_count"].mean():.1f}')
print(f'  Avg words: {assistant_df["field_length_word_count"].mean():.1f}')
print(f'  Avg tokens: {assistant_df["field_length_token_count"].mean():.1f}')

  Count: 0
  Avg chars: nan
  Avg words: nan
  Avg tokens: nan


In [10]:
print('\nRendered conversations:')
rendered_df = analyzer.message_df[analyzer.message_df['field_name'] == 'rendered_sample']
print(f'  Count: {len(rendered_df)}')
print(f'  Avg chars: {rendered_df["field_length_char_count"].mean():.1f}')
print(f'  Avg words: {rendered_df["field_length_word_count"].mean():.1f}')
print(f'  Avg tokens: {rendered_df["field_length_token_count"].mean():.1f}')

print('\nSample-level totals:')
print(f'  Avg total chars: {analyzer._conversation_df["sample_length_total_char_count"].mean():.1f}')
print(f'  Avg total words: {analyzer._conversation_df["sample_length_total_word_count"].mean():.1f}')
print(f'  Avg total tokens: {analyzer._conversation_df["sample_length_total_token_count"].mean():.1f}')


Rendered conversations:
  Count: 1
  Avg chars: 3191.0
  Avg words: 507.0
  Avg tokens: 655.0

Sample-level totals:
  Avg total chars: 6300.0
  Avg total words: 1023.0
  Avg total tokens: 655.0
