# Explore Converted Data

Inspect the pre-processed Wikipedia chunks and their embeddings stored in `data/converted/` as Elasticsearch bulk-format JSONL batch files (`elasticsearch_batch_*.jsonl`).

In [None]:
import json
import numpy as np
from pathlib import Path

In [None]:
# Locate the converted batch files (path is relative to the notebook's
# working directory) and keep them in sorted order.
data_dir = Path('../data/converted')
batch_files = list(data_dir.glob('elasticsearch_batch_*.jsonl'))
batch_files.sort()

batch_count = len(batch_files)
print(f"Found {batch_count} batch files")

In [None]:
# Load the first batch fully into memory and inspect its structure.
# Fail with a clear message instead of an opaque IndexError when the
# glob in the previous cell matched nothing.
if not batch_files:
    raise FileNotFoundError(
        "No batch files found (expected elasticsearch_batch_*.jsonl "
        "in the converted data directory)"
    )

# Read explicitly as UTF-8 rather than relying on the platform default
# encoding, which is not UTF-8 on every system.
with open(batch_files[0], 'r', encoding='utf-8') as f:
    lines = f.readlines()

print(f"Lines in first batch: {len(lines)}")
print("\nFormat: alternating index/document lines (bulk format)")

In [None]:
# Parse a sample document. The file alternates action line / document
# line, so index 1 (the second line) is the first actual document.
sample_doc = json.loads(lines[1])

print("Document fields:")
# Iterate items() so each value is fetched once alongside its key.
for key, value in sample_doc.items():
    if key == 'embedding':
        # Report only the vector length; the full vector is too long to print.
        print(f"  {key}: [{len(value)} dimensions]")
    else:
        # Truncate the repr to 80 characters so long fields stay readable.
        print(f"  {key}: {repr(value)[:80]}")

In [None]:
# Print the sample's headline fields, then the first 500 characters of its text.
headline_fields = (
    ("Title", 'title'),
    ("Chunk index", 'chunk_index'),
    ("Text length", 'text_length'),
)
for label, field in headline_fields:
    print(f"{label}: {sample_doc[field]}")

text_preview = sample_doc['text'][:500]
print(f"\nText preview:\n{text_preview}...")

In [None]:
# Summarize the sample document's embedding vector.
embedding = np.array(sample_doc['embedding'])

print(f"Embedding dimension: {len(embedding)}")
print(f"\nVector stats:")

# Each statistic is computed lazily, right before it is printed,
# in the order Min, Max, Mean, Norm.
stat_functions = (
    ("Min", embedding.min),
    ("Max", embedding.max),
    ("Mean", embedding.mean),
    ("Norm", lambda: np.linalg.norm(embedding)),
)
for label, compute in stat_functions:
    print(f"  {label}: {compute():.4f}")

In [None]:
# Count total documents across all batches.
total_docs = 0
for batch_path in batch_files:
    # Binary mode: only newlines are counted, so there is no need to decode
    # the text (and no dependence on the platform default encoding).
    with open(batch_path, 'rb') as batch_file:
        # Each doc has 2 lines (index action + document)
        total_docs += sum(1 for _ in batch_file) // 2

print(f"Total documents: {total_docs:,}")

In [None]:
# Add up the on-disk size of every batch file and report it in GB
# (computed as bytes / 1024**3).
total_size = 0
for batch_path in batch_files:
    total_size += batch_path.stat().st_size

size_gb = total_size / (1024**3)
print(f"Total storage: {size_gb:.2f} GB")