# Explore Converted Data

Inspect the pre-processed Wikipedia chunks and their embeddings, stored in `data/converted/` as bulk-format JSONL batch files (`elasticsearch_batch_*.jsonl`).

In [1]:
import json
import numpy as np
from pathlib import Path

In [2]:
# Locate the converted batch files (path is relative to the notebook's working directory)
data_dir = Path('../data/converted')
batch_pattern = 'elasticsearch_batch_*.jsonl'

# Sort so that "first batch" means the same file on every run
batch_files = sorted(data_dir.glob(batch_pattern))
n_batches = len(batch_files)

print(f"Found {n_batches} batch files")

Found 664 batch files


In [3]:
# Load first batch and inspect structure.
# Fail early with a clear message instead of an opaque IndexError when the
# glob matched nothing (e.g. the notebook was started from another directory).
if not batch_files:
    raise FileNotFoundError(
        f"No elasticsearch_batch_*.jsonl files found in {data_dir.resolve()}"
    )

# Decode explicitly rather than relying on the locale-dependent default
# encoding; assumes the batches were written as UTF-8 (the usual JSON encoding).
with open(batch_files[0], 'r', encoding='utf-8') as f:
    lines = f.readlines()

print(f"Lines in first batch: {len(lines)}")
print("\nFormat: alternating index/document lines (bulk format)")

Lines in first batch: 2000

Format: alternating index/document lines (bulk format)


In [4]:
# Lines alternate between an index action and a document, so index 1 holds the first document
sample_doc = json.loads(lines[1])

print("Document fields:")
for field, value in sample_doc.items():
    # Summarise the embedding by its length; show a truncated repr for every other field
    shown = f"[{len(value)} dimensions]" if field == 'embedding' else repr(value)[:80]
    print(f"  {field}: {shown}")

Document fields:
  id: 'A & B High Performance Firearms_0'
  title: 'A & B High Performance Firearms'
  text: 'A & B High Performance Firearms was a competition pistol manufacturer. Products
  chunk_index: 0
  text_length: 324
  embedding: [384 dimensions]


In [5]:
# Print the sample document's metadata followed by at most the first 500 characters of its text
text_preview = sample_doc['text'][:500]
report = [
    f"Title: {sample_doc['title']}",
    f"Chunk index: {sample_doc['chunk_index']}",
    f"Text length: {sample_doc['text_length']}",
    f"\nText preview:\n{text_preview}...",
]
print("\n".join(report))

Title: A & B High Performance Firearms
Chunk index: 0
Text length: 324

Text preview:
A & B High Performance Firearms was a competition pistol manufacturer. Products included the "Limited Class" and "Open Class" semi-automatic pistols, both available in .40 S&W; and .45 ACP. A & B sold directly to consumers. Category:Defunct firearms manufacturers Category:Defunct manufacturing companies based in California...


In [6]:
# Summary statistics for the sample document's embedding vector
embedding = np.array(sample_doc['embedding'])

# Label -> value, in the order the statistics are reported
vector_stats = {
    "Min": embedding.min(),
    "Max": embedding.max(),
    "Mean": embedding.mean(),
    "Norm": np.linalg.norm(embedding),
}

print(f"Embedding dimension: {len(embedding)}")
print("\nVector stats:")
for label, value in vector_stats.items():
    print(f"  {label}: {value:.4f}")

Embedding dimension: 384

Vector stats:
  Min: -0.1419
  Max: 0.1547
  Mean: -0.0002
  Norm: 1.0000


In [7]:
# Count total documents across all batches.
# Files are opened in binary mode: only line terminators are counted, so the
# text never needs decoding and the count cannot fail on (or depend on) the
# platform's default text encoding.
total_docs = 0
for batch_path in batch_files:
    with open(batch_path, 'rb') as batch_file:
        # Each doc has 2 lines (index action + document)
        total_docs += sum(1 for _ in batch_file) // 2

print(f"Total documents: {total_docs:,}")

Total documents: 663,576


In [8]:
# Add up the on-disk size of every batch file and report it in units of 1024**3 bytes
bytes_per_unit = 1024**3
total_size = 0
for batch_path in batch_files:
    total_size += batch_path.stat().st_size

print(f"Total storage: {total_size / bytes_per_unit:.2f} GB")

Total storage: 5.94 GB
