In [1]:
import pandas as pd

from oumi.core.analyze.length_analyzer import LengthAnalyzer

# Toy fixture: three (id, request, response) rows, written row-wise so each
# question sits next to its answer.
_qa_rows = [
    (
        1,
        "What is machine learning?",
        "Machine learning is a branch of AI that enables systems to learn from data.",
    ),
    (
        2,
        "How do neural networks work?",
        "Neural networks are computing systems inspired by biological neural networks.",
    ),
    (
        3,
        "Explain gradient descent",
        "Gradient descent is an optimization algorithm used to minimize loss functions",
    ),
]

# Transpose the rows into the column-oriented dict that pd.DataFrame consumes.
_ids, _requests, _responses = (list(column) for column in zip(*_qa_rows))
test_data = {"id": _ids, "request": _requests, "response": _responses}
test_df = pd.DataFrame(test_data)

# Enable the character, word and sentence metrics.
length_analyzer = LengthAnalyzer(
    char_count=True,
    word_count=True,
    sentence_count=True,
)

# Column configuration: "id" is tagged as an identifier, the two prose columns
# as text.
# NOTE(review): these content types are plain strings; confirm the analyzer
# accepts the string "id" as a content type.
column_config = {"id": {"content_type": "id"}}
column_config.update(
    {name: {"content_type": "text"} for name in ("request", "response")}
)

# Run the analyzer over the frame and display the result as the cell output.
df_analyzed = length_analyzer.analyze(test_df, column_config=column_config)
df_analyzed



Unnamed: 0,id,request,response,request_char_count,request_word_count,request_sentence_count,response_char_count,response_word_count,response_sentence_count
0,1,What is machine learning?,Machine learning is a branch of AI that enable...,25,4,1,75,14,1
1,2,How do neural networks work?,Neural networks are computing systems inspired...,28,5,1,77,10,1
2,3,Explain gradient descent,Gradient descent is an optimization algorithm ...,24,3,1,77,11,1


In [2]:
# sys.path.insert(0, 'oumi/src')
from oumi.core.analyze.dataset_analyzer import DatasetAnalyzer
from oumi.core.configs import AnalyzeConfig, DatasetSource, SampleAnalyzerParams

# Length analyzer entry with all four metrics switched on.
length_params = SampleAnalyzerParams(
    id="length",
    params=dict.fromkeys(
        ("char_count", "word_count", "sentence_count", "token_count"),
        True,
    ),
)

# Analysis config for the HuggingFace ultrachat_200k dataset ("train_sft"
# split), restricted to a single sample so the notebook runs quickly.
config = AnalyzeConfig(
    dataset_source=DatasetSource.CONFIG,
    dataset_name="huggingfaceh4/ultrachat_200k",
    split="train_sft",
    subset=None,
    sample_count=1,
    analyzers=[length_params],
    tokenizer_config={
        "model_name": "microsoft/DialoGPT-small",
        "tokenizer_kwargs": {},
    },
)

In [3]:
# Announce the run first so the message appears ahead of the logger output
# emitted while the analyzer is built and the dataset is loaded.
print("Loading and analyzing HuggingFace ultrachat_200k dataset...")
# Build the analyzer from the `config` defined in the previous cell.
analyzer = DatasetAnalyzer(config)
# Run the analysis; later cells read their results from `analyzer`.
analyzer.analyze_dataset()

Loading and analyzing HuggingFace ultrachat_200k dataset...
[2025-09-24 19:52:20,606][oumi][rank0][pid:92009][MainThread][INFO]][models.py:526] Using the chat template 'gpt2', which is the default for model 'microsoft/DialoGPT-small'. 
[2025-09-24 19:52:20,608][oumi.utils.analysis_utils][rank0][pid:92009][MainThread][INFO]][analysis_utils.py:54] Built tokenizer for model: microsoft/DialoGPT-small
[2025-09-24 19:52:20,608][oumi][rank0][pid:92009][MainThread][INFO]][base_map_dataset.py:91] Creating map dataset (type: UltrachatH4Dataset)... dataset_name: 'huggingfaceh4/ultrachat_200k'
[2025-09-24 19:52:22,778][oumi][rank0][pid:92009][MainThread][INFO]][base_map_dataset.py:487] Dataset Info:
	Split: train_sft
	Version: 0.0.0
	Dataset size: 3047427114
	Download size: 1624049723
	Size: 4671476837 bytes
	Rows: 207865
	Columns: ['prompt', 'prompt_id', 'messages']
[2025-09-24 19:52:23,514][oumi][rank0][pid:92009][MainThread][INFO]][base_map_dataset.py:426] Loaded DataFrame with shape: (207865, 

Analyzing items in huggingfaceh4/ultrachat_200k: 100%|██████████| 1/1 [00:00<00:00, 132.40item/s]


In [6]:
# Summary of the finished run: dataset name plus the shapes of the two result
# frames, one item per output line.
print(
    "✅ Analysis completed!",
    f"  Dataset: {analyzer.dataset_name}",
    f"  Rows DF shape: {analyzer.rows_df.shape}",
    f"  Items DF shape: {analyzer.items_df.shape}",
    sep="\n",
)

✅ Analysis completed!
  Dataset: huggingfaceh4/ultrachat_200k
  Rows DF shape: (8, 8)
  Items DF shape: (1, 8)


In [7]:
# Display the conversation at index 0 of the loaded dataset as USER/ASSISTANT turns.
analyzer.dataset.conversation(0)

USER: These instructions apply to section-based themes (Responsive 6.0+, Retina 4.0+, Parallax 3.0+ Turbo 2.0+, Mobilia 5.0+). What theme version am I using?
On your Collections pages & Featured Collections sections, you can easily show the secondary image of a product on hover by enabling one of the theme's built-in settings!
Your Collection pages & Featured Collections sections will now display the secondary product image just by hovering over that product image thumbnail.
Does this feature apply to all sections of the theme or just specific ones as listed in the text material?
ASSISTANT: This feature only applies to Collection pages and Featured Collections sections of the section-based themes listed in the text material.
USER: Can you guide me through the process of enabling the secondary image hover feature on my Collection pages and Featured Collections sections?
ASSISTANT: Sure, here are the steps to enable the secondary image hover feature on your Collection pages and Featured 

In [8]:
# Inspect the analyzer's column schema: each column maps to a type, a content_type and a description.
analyzer.column_config

{'item_id': {'type': <ColumnType.STRING: 'string'>,
  'content_type': <ContentType.METADATA: 'metadata'>,
  'description': 'Conversation identifier'},
 'item_type': {'type': <ColumnType.STRING: 'string'>,
  'content_type': <ContentType.METADATA: 'metadata'>,
  'description': 'Type of item (conversation)'},
 'rendered_item': {'type': <ColumnType.STRING: 'string'>,
  'content_type': <ContentType.TEXT: 'text'>,
  'description': 'Rendered conversation for token counting and display'},
 'row_index': {'type': <ColumnType.INT: 'int'>,
  'content_type': <ContentType.METADATA: 'metadata'>,
  'description': 'Message index within conversation'},
 'role': {'type': <ColumnType.STRING: 'string'>,
  'content_type': <ContentType.METADATA: 'metadata'>,
  'description': 'Message role (user/assistant/system)'},
 'content': {'type': <ColumnType.STRING: 'string'>,
  'content_type': <ContentType.TEXT: 'text'>,
  'description': 'Message text content'},
 'timestamp': {'type': <ColumnType.TIMESTAMP: 'timestamp

In [9]:
# Message-level results: one row per message, with the length metrics computed on `content`.
analyzer.rows_df

Unnamed: 0,item_index,row_index,role,content,content_char_count,content_word_count,content_sentence_count,content_token_count
0,0,0,user,These instructions apply to section-based them...,580,92,10,128
1,0,1,assistant,This feature only applies to Collection pages ...,136,20,1,23
2,0,2,user,Can you guide me through the process of enabli...,140,22,1,23
3,0,3,assistant,"Sure, here are the steps to enable the seconda...",909,150,15,200
4,0,4,user,Can you provide me with a link to the document...,65,13,1,14
5,0,5,assistant,I don't have access to your store's theme info...,370,63,3,78
6,0,6,user,Can you confirm if this feature also works for...,82,16,1,17
7,0,7,assistant,The secondary image hover feature may or may n...,805,138,11,164


In [10]:
# Check the column dtypes of the message-level frame.
analyzer.rows_df.dtypes

item_index                         int64
row_index                          int64
role                      string[python]
content                   string[python]
content_char_count                 int64
content_word_count                 int64
content_sentence_count             int64
content_token_count                int64
dtype: object

In [11]:
# Conversation-level results: one row per item, with the length metrics computed on `rendered_item`.
analyzer.items_df

Unnamed: 0,item_index,item_id,item_type,rendered_item,rendered_item_char_count,rendered_item_word_count,rendered_item_sentence_count,rendered_item_token_count
0,0,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655


In [12]:
# Check the column dtypes of the conversation-level frame.
analyzer.items_df.dtypes

item_index                               int64
item_id                         string[python]
item_type                       string[python]
rendered_item                   string[python]
rendered_item_char_count                 int64
rendered_item_word_count                 int64
rendered_item_sentence_count             int64
rendered_item_token_count                int64
dtype: object

In [13]:
# Combined view: every message row alongside the columns of its parent conversation.
analyzer.analysis_df

Unnamed: 0,item_index,row_index,role,content,content_char_count,content_word_count,content_sentence_count,content_token_count,item_id,item_type,rendered_item,rendered_item_char_count,rendered_item_word_count,rendered_item_sentence_count,rendered_item_token_count
0,0,0,user,These instructions apply to section-based them...,580,92,10,128,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
1,0,1,assistant,This feature only applies to Collection pages ...,136,20,1,23,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
2,0,2,user,Can you guide me through the process of enabli...,140,22,1,23,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
3,0,3,assistant,"Sure, here are the steps to enable the seconda...",909,150,15,200,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
4,0,4,user,Can you provide me with a link to the document...,65,13,1,14,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
5,0,5,assistant,I don't have access to your store's theme info...,370,63,3,78,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
6,0,6,user,Can you confirm if this feature also works for...,82,16,1,17,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
7,0,7,assistant,The secondary image hover feature may or may n...,805,138,11,164,conv_0,conversation,These instructions apply to section-based them...,3191,507,44,655
