# 01 - Data Exploration: Generative Manim Training Data

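Explore the existing datasets and the synthetic data generated for the SFT (supervised fine-tuning) → DPO (direct preference optimization) → GRPO (group relative policy optimization) training pipeline.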

In [None]:
import json
import pandas as pd
from collections import Counter
from pathlib import Path

## Load Existing Datasets

In [None]:
# Load the Edoh dataset: one JSON object per line (JSONL).
# UTF-8 is requested explicitly because open() otherwise falls back to the
# platform's locale encoding; blank lines are skipped so a stray empty line
# does not abort the load with a JSONDecodeError.
with open('../datasets/edoh-dataset.jsonl', encoding='utf-8') as f:
    edoh = [json.loads(line) for line in f if line.strip()]

print(f'Edoh dataset: {len(edoh)} examples')
if edoh:
    # Preview the first 100 characters of the second message of the first
    # example; skipped when the file held no examples (edoh[0] would raise).
    print(f'Sample: {edoh[0]["messages"][1]["content"][:100]}')

In [None]:
# Load the physics dataset: one JSON object per line (JSONL).
# UTF-8 is requested explicitly because open() otherwise falls back to the
# platform's locale encoding; blank lines are skipped so a stray empty line
# does not abort the load with a JSONDecodeError.
with open('../datasets/physics-01.jsonl', encoding='utf-8') as f:
    physics = [json.loads(line) for line in f if line.strip()]

print(f'Physics dataset: {len(physics)} examples')
if physics:
    # Preview the first 100 characters of the second message of the first
    # example; skipped when the file held no examples (physics[0] would raise).
    print(f'Sample: {physics[0]["messages"][1]["content"][:100]}')

## Analyze Prompt Lengths & Code Lengths

In [None]:
def analyze_dataset(data, name):
    """Print and return character-length statistics for a dataset.

    For every example, ``example['messages'][1]['content']`` is treated as
    the prompt and ``example['messages'][2]['content']`` as the code. The
    min, max and mean character length of each are printed under a header
    carrying ``name``.

    Args:
        data: Sequence of example dicts, each with a ``'messages'`` list of
            at least three entries that have a ``'content'`` string.
        name: Label shown in the printed header.

    Returns:
        A ``(prompt_lens, code_lens)`` tuple of lists holding one character
        count per example. Both lists are empty when ``data`` is empty.
    """
    prompt_lens = [len(d['messages'][1]['content']) for d in data]
    code_lens = [len(d['messages'][2]['content']) for d in data]

    print(f'=== {name} ===')
    if not prompt_lens:
        # min()/max() raise on an empty sequence and the mean would divide
        # by zero, so report the empty dataset instead of crashing.
        print('  (no examples)')
        return prompt_lens, code_lens

    for label, lens in (('Prompts', prompt_lens), ('Code', code_lens)):
        print(f'  {label}: min={min(lens)}, max={max(lens)}, avg={sum(lens)/len(lens):.0f}')
    return prompt_lens, code_lens

# Summarise each dataset that actually contains examples. Both datasets are
# guarded the same way, and the length lists fall back to empty so these
# names are always bound even when a dataset is empty.
edoh_pl, edoh_cl = analyze_dataset(edoh, 'Edoh') if edoh else ([], [])
phys_pl, phys_cl = analyze_dataset(physics, 'Physics') if physics else ([], [])

## Load SFT Dataset (if generated)

In [None]:
# Load and summarise the SFT training split when the file is present;
# otherwise tell the user how to produce it.
sft_path = Path('./data/outputs/sft_train.jsonl')
if sft_path.exists():
    # Explicit UTF-8 avoids the platform-dependent default encoding; blank
    # lines are skipped so they do not raise a JSONDecodeError.
    with open(sft_path, encoding='utf-8') as f:
        sft = [json.loads(line) for line in f if line.strip()]
    print(f'SFT train: {len(sft)} examples')
    if sft:
        # Only compute statistics when there is at least one example.
        analyze_dataset(sft, 'SFT Train')
else:
    print('SFT dataset not yet generated. Run the data pipeline first.')

## Prompt Category Distribution

In [None]:
# Tally how many raw prompts fall into each category and print the counts
# in descending order, when the raw prompt file is present.
prompts_path = Path('./data/outputs/raw_prompts.jsonl')
if prompts_path.exists():
    # Explicit UTF-8 avoids the platform-dependent default encoding; blank
    # lines are skipped so they do not raise a JSONDecodeError.
    with open(prompts_path, encoding='utf-8') as f:
        cats = Counter(json.loads(line)['category'] for line in f if line.strip())

    # Materialise the (category, count) pairs as a list before handing them
    # to pandas, then order the table by count, largest first.
    df = pd.DataFrame(list(cats.items()), columns=['Category', 'Count']).sort_values('Count', ascending=False)
    print(f'Total prompts: {df["Count"].sum()}')
    print(df.to_string(index=False))
else:
    print('Prompts not yet generated.')