# Sanity Statistics

**Purpose:** Compute basic statistics and sanity checks on the EVA lines dataset.

This notebook validates data quality and provides an overview of the corpus.

## Prerequisites

- EVA lines dataset built (`python -m builders.build_eva_lines`)
- Dataset available in `output/eva_lines.parquet`

In [None]:
import statistics
import sys
from pathlib import Path

# Put the parent of the working directory on sys.path (once) so project
# packages can be imported. Assumes the notebook is started from a folder
# directly below the project root - TODO confirm for other launch setups.
project_root = Path.cwd().parent
_root_entry = str(project_root)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

print(f"Project root: {project_root}")

In [None]:
import json

from datasets import load_dataset

# Resolve the parquet file under <project_root>/output and load it as the
# "train" split; stop early with a hint when the file does not exist.
data_file = project_root.joinpath("output", "eva_lines.parquet")
if data_file.exists():
    ds = load_dataset("parquet", data_files=str(data_file), split="train")
else:
    raise FileNotFoundError(f"Dataset not found at {data_file}. Run build_eva_lines first.")

print(f"Loaded dataset with {len(ds)} records")
print(f"\nColumns: {ds.column_names}")

## 1. EVA Corpus Overview

In [None]:
# Corpus-wide totals. total_lines, unique_pages, total_chars and total_words
# are read again by the build-report verification and summary export cells.
total_lines = len(ds)
unique_pages = len(set(ds['page_id']))
total_chars = sum(ds['char_count'])
total_words = sum(ds['word_count'])

# An empty dataset would otherwise surface as a ZeroDivisionError halfway
# through the printed report; fail up front with an actionable message.
# (total_lines > 0 also guarantees unique_pages > 0.)
if total_lines == 0:
    raise ValueError("Dataset is empty. Rebuild it with build_eva_lines before running the sanity checks.")

banner = "=" * 50
print(banner)
print("EVA CORPUS STATISTICS")
print(banner)
print(f"Total pages: {unique_pages}")
print(f"Total lines: {total_lines}")
print(f"Total characters: {total_chars:,}")
print(f"Total words: {total_words:,}")
print(f"Average lines/page: {total_lines / unique_pages:.1f}")
print(f"Average chars/line: {total_chars / total_lines:.1f}")
print(f"Average words/line: {total_words / total_lines:.1f}")

In [None]:
# Lines per page distribution
from collections import Counter

# One Counter entry per page_id; the value is the number of lines on that page.
page_line_counts = Counter(ds['page_id'])
line_counts = list(page_line_counts.values())

print("\nLines per page distribution:")
print(f"  Min: {min(line_counts)}")
print(f"  Max: {max(line_counts)}")
print(f"  Mean: {sum(line_counts) / len(line_counts):.1f}")
# statistics.median averages the two middle values when the number of pages
# is even; indexing sorted(...)[n // 2] would report the upper-middle value.
print(f"  Median: {statistics.median(line_counts)}")

# Show pages with most/fewest lines
most_lines = page_line_counts.most_common(5)
print("\nPages with most lines:")
for page, count in most_lines:
    print(f"  {page}: {count} lines")

# most_common() is sorted by descending count, so the tail holds the
# smallest pages; reverse it to list the very smallest first.
fewest_lines = page_line_counts.most_common()[-5:]
print("\nPages with fewest lines:")
for page, count in reversed(fewest_lines):
    print(f"  {page}: {count} lines")

## 2. Character Frequency Analysis

In [None]:
from collections import Counter

# Tally every character of every line, skipping spaces and the '.' / ','
# separators.
char_counter = Counter()
for text in ds['text']:
    clean = text.replace(' ', '').replace('.', '').replace(',', '')
    char_counter.update(clean)

# The grand total is identical for every row of the table, so compute it
# once instead of re-summing the whole counter on each loop iteration.
total_char_count = sum(char_counter.values())

print("Top 20 most frequent characters:")
print("-" * 30)
for char, count in char_counter.most_common(20):
    pct = count / total_char_count * 100
    print(f"  '{char}': {count:>6} ({pct:>5.2f}%)")

In [None]:
# Bar chart of the 20 most frequent characters, written to
# <project_root>/reports/figures. Skipped when matplotlib cannot be imported.
try:
    import matplotlib.pyplot as plt

    # Split the (character, count) pairs into parallel x / y sequences.
    top_chars = char_counter.most_common(20)
    chars, counts = [], []
    for symbol, frequency in top_chars:
        chars.append(symbol)
        counts.append(frequency)

    plt.figure(figsize=(12, 5))
    plt.bar(chars, counts, color='steelblue')
    plt.title('Top 20 EVA Character Frequencies')
    plt.xlabel('EVA Character')
    plt.ylabel('Frequency')
    plt.tight_layout()

    # Create the figure directory on demand, then save before showing.
    output_dir = project_root / "reports" / "figures"
    output_dir.mkdir(parents=True, exist_ok=True)
    figure_path = output_dir / "char_frequency.png"
    plt.savefig(figure_path, dpi=150)
    plt.show()
    print(f"Saved to: {figure_path}")
except ImportError:
    print("matplotlib not available - skipping visualization")

## 3. Word Length Distribution

In [None]:
# Tokenise every line: '.' and ',' are turned into spaces, then the text is
# split on whitespace. str.split() with no argument never yields empty
# strings, so no extra filtering is needed.
all_words = [
    token
    for text in ds['text']
    for token in text.replace('.', ' ').replace(',', ' ').split()
]
word_lengths = [len(token) for token in all_words]

length_counter = Counter(word_lengths)
token_total = len(word_lengths)

print("Word length distribution:")
print("-" * 30)
# Keys are unique, so sorting the items orders the rows by word length.
for length, count in sorted(length_counter.items()):
    pct = count / token_total * 100
    bar = '█' * int(pct / 2)  # one block per two percentage points
    print(f"  {length:>2} chars: {count:>5} ({pct:>5.1f}%) {bar}")

In [None]:
# Summary statistics over the per-word lengths collected in the previous cell.
print("\nWord length statistics:")
print(f"  Total words: {len(word_lengths):,}")
print(f"  Min length: {min(word_lengths)}")
print(f"  Max length: {max(word_lengths)}")
print(f"  Mean length: {sum(word_lengths) / len(word_lengths):.2f}")
# statistics.median averages the two middle values for an even number of
# words; indexing sorted(...)[n // 2] would report the upper-middle value.
print(f"  Median length: {statistics.median(word_lengths)}")

## 4. Most Common Words

In [None]:
# Frequency table over all tokens; also read by the vocabulary statistics
# and summary export cells.
word_counter = Counter(all_words)
token_total = len(all_words)

print("Top 30 most frequent words:")
print("-" * 40)
for rank, (word, count) in enumerate(word_counter.most_common(30), start=1):
    pct = count / token_total * 100
    print(f"  {rank:>2}. {word:<15} {count:>5} ({pct:>4.2f}%)")

In [None]:
# Type/token figures for the vocabulary.
unique_words = len(word_counter)

# Histogram of occurrence counts: maps "seen n times" to the number of word
# types with that count. Missing keys read as 0.
occurrence_histogram = Counter(word_counter.values())
hapax_legomena = occurrence_histogram[1]
dis_legomena = occurrence_histogram[2]

print("\nVocabulary statistics:")
print(f"  Unique words (types): {unique_words:,}")
print(f"  Total words (tokens): {len(all_words):,}")
print(f"  Type-token ratio: {unique_words / len(all_words):.4f}")
print(f"  Hapax legomena (appear once): {hapax_legomena} ({hapax_legomena/unique_words*100:.1f}%)")
print(f"  Dis legomena (appear twice): {dis_legomena} ({dis_legomena/unique_words*100:.1f}%)")

## 5. Section Analysis

In [None]:
# Aggregate line, page, character and word totals per manuscript section.
# Records whose section is missing or empty are grouped under 'unknown'.
section_stats = {}

for record in ds:
    section = record['section'] or 'unknown'
    bucket = section_stats.setdefault(
        section, {'lines': 0, 'pages': set(), 'chars': 0, 'words': 0}
    )
    bucket['lines'] += 1
    bucket['pages'].add(record['page_id'])  # set: each page is counted once
    bucket['chars'] += record['char_count']
    bucket['words'] += record['word_count']

rule = "-" * 70
print("Statistics by manuscript section:")
print(rule)
print(f"{'Section':<20} {'Pages':>8} {'Lines':>8} {'Chars':>10} {'Words':>10}")
print(rule)

# Section names are unique keys, so sorting the items orders rows by name.
for section, stats in sorted(section_stats.items()):
    print(f"{section:<20} {len(stats['pages']):>8} {stats['lines']:>8} {stats['chars']:>10,} {stats['words']:>10,}")

In [None]:
# Horizontal bar chart of page counts per section, written to
# <project_root>/reports/figures. Skipped when matplotlib cannot be imported.
try:
    import matplotlib.pyplot as plt

    sections = sorted(section_stats.keys())
    page_counts = [len(section_stats[s]['pages']) for s in sections]

    plt.figure(figsize=(10, 6))
    plt.barh(sections, page_counts, color='forestgreen')
    plt.xlabel('Number of Pages')
    plt.ylabel('Section')
    plt.title('Pages per Manuscript Section')
    plt.tight_layout()

    # Resolve (and create) the figure directory here instead of relying on
    # the character-frequency plot cell having assigned output_dir first;
    # otherwise this cell raises NameError when that cell was skipped or
    # failed before reaching the assignment.
    output_dir = project_root / "reports" / "figures"
    output_dir.mkdir(parents=True, exist_ok=True)
    figure_path = output_dir / "pages_per_section.png"

    plt.savefig(figure_path, dpi=150)
    plt.show()
    print(f"Saved to: {figure_path}")
except ImportError:
    print("matplotlib not available - skipping visualization")

## 6. Currier Language Distribution

In [None]:
# Count lines and distinct pages per Currier language label. Records with
# a missing or empty label are grouped under 'unknown'.
lang_stats = {}

for record in ds:
    lang = record['currier_language'] or 'unknown'
    bucket = lang_stats.setdefault(lang, {'lines': 0, 'pages': set()})
    bucket['lines'] += 1
    bucket['pages'].add(record['page_id'])

print("Currier language distribution:")
print("-" * 40)
# Labels are unique keys, so sorting the items orders rows by label.
for lang, stats in sorted(lang_stats.items()):
    print(f"  Language {lang}: {len(stats['pages'])} pages, {stats['lines']} lines")

## 7. Line Type Distribution

In [None]:
# Frequency of each line_type value, most common first; the counter is
# also written to the exported summary.
line_type_counter = Counter(ds['line_type'])
record_total = len(ds)

print("Line type distribution:")
print("-" * 40)
for line_type, occurrences in line_type_counter.most_common():
    share = occurrences / record_total * 100
    print(f"  {line_type}: {occurrences} ({share:.1f}%)")

## 8. Data Quality Checks

In [None]:
# Run the data quality checks. Each failed check appends a human-readable
# message to `issues`; informational counts are only printed.
#
# Every check reads a single column (ds['...']) rather than iterating the
# dataset row by row, so the other columns are not materialised per record.
issues = []

# Empty lines: text that is missing or whitespace-only. `text or ''` keeps
# a null text from crashing .strip() and counts it as empty instead.
empty_lines = sum(1 for text in ds['text'] if not (text or '').strip())
if empty_lines > 0:
    issues.append(f"Empty lines: {empty_lines}")
else:
    print("✓ No empty lines")

# Duplicate line IDs: total rows minus distinct IDs.
line_ids = list(ds['line_id'])
duplicate_ids = len(line_ids) - len(set(line_ids))
if duplicate_ids > 0:
    issues.append(f"Duplicate line IDs: {duplicate_ids}")
else:
    print("✓ No duplicate line IDs")

record_total = len(ds)

# Informational: lines flagged as containing uncertain readings.
uncertain_lines = sum(1 for flag in ds['has_uncertain'] if flag)
print(f"ℹ Lines with uncertain readings (?): {uncertain_lines} ({uncertain_lines/record_total*100:.1f}%)")

# Informational: lines flagged as containing illegible markers.
illegible_lines = sum(1 for flag in ds['has_illegible'] if flag)
print(f"ℹ Lines with illegible markers (!): {illegible_lines} ({illegible_lines/record_total*100:.1f}%)")

# Lines whose recorded character count is zero.
zero_char_lines = sum(1 for n_chars in ds['char_count'] if n_chars == 0)
if zero_char_lines > 0:
    issues.append(f"Lines with zero characters: {zero_char_lines}")
else:
    print("✓ All lines have characters")

# Summary
print("\n" + "=" * 40)
if issues:
    print("⚠ DATA QUALITY ISSUES FOUND:")
    for issue in issues:
        print(f"  - {issue}")
else:
    print("✓ ALL DATA QUALITY CHECKS PASSED")

## 9. Build Report Verification

In [None]:
# Load the builder's JSON report (if present) and cross-check its line and
# page totals against the dataset loaded in this notebook.
report_file = project_root / "output" / "eva_lines_build_report.json"
if report_file.exists():
    # Read as UTF-8 explicitly rather than with the platform default encoding.
    with open(report_file, encoding="utf-8") as f:
        report = json.load(f)

    # A key that is present but null (or not a string) would break slicing;
    # coerce to str and fall back to 'unknown' before truncating.
    source_hash = str(report.get('source_hash') or 'unknown')

    print("Build Report:")
    print("-" * 40)
    print(f"  Source: {report.get('source', 'unknown')}")
    print(f"  Source hash: {source_hash[:16]}...")
    print(f"  Build time: {report.get('build_time', 'unknown')}")
    print(f"  Total pages: {report.get('total_pages', 'unknown')}")
    print(f"  Total lines: {report.get('total_lines', 'unknown')}")

    # Verify against actual data
    print("\nVerification:")
    if report.get('total_lines') == len(ds):
        print("  ✓ Line count matches")
    else:
        print(f"  ✗ Line count mismatch: report={report.get('total_lines')}, actual={len(ds)}")

    if report.get('total_pages') == unique_pages:
        print("  ✓ Page count matches")
    else:
        print(f"  ✗ Page count mismatch: report={report.get('total_pages')}, actual={unique_pages}")
else:
    print("⚠ Build report not found")

## 10. Summary Statistics Export

In [None]:
# Collect the figures computed in the cells above into one JSON-serialisable
# dict and write it to <project_root>/reports/sanity_statistics.json.
summary = {
    "corpus_statistics": {
        "total_pages": unique_pages,
        "total_lines": total_lines,
        "total_characters": total_chars,
        "total_words": total_words,
        "unique_words": unique_words,
        "hapax_legomena": hapax_legomena,
        "type_token_ratio": unique_words / len(all_words),
        "avg_line_length_chars": total_chars / total_lines,
        "avg_word_length": sum(word_lengths) / len(word_lengths),
    },
    "section_statistics": {
        section: {
            # The page sets are not JSON-serialisable; export their sizes.
            "pages": len(stats['pages']),
            "lines": stats['lines'],
            "chars": stats['chars'],
            "words": stats['words'],
        }
        for section, stats in section_stats.items()
    },
    "currier_language": {
        lang: {
            "pages": len(stats['pages']),
            "lines": stats['lines'],
        }
        for lang, stats in lang_stats.items()
    },
    "line_types": dict(line_type_counter),
    "top_20_words": [{'word': w, 'count': c} for w, c in word_counter.most_common(20)],
    "top_20_characters": [{'char': c, 'count': n} for c, n in char_counter.most_common(20)],
    "data_quality": {
        "empty_lines": empty_lines,
        "duplicate_line_ids": duplicate_ids,
        # Exported so a "Lines with zero characters" entry in `issues` has
        # its count alongside the other checks.
        "zero_char_lines": zero_char_lines,
        "uncertain_lines": uncertain_lines,
        "illegible_lines": illegible_lines,
        "issues": issues,
        "all_checks_passed": len(issues) == 0,
    }
}

output_file = project_root / "reports" / "sanity_statistics.json"
# parents=True matches how the figures directory is created and keeps the
# export working when intermediate directories are missing.
output_file.parent.mkdir(parents=True, exist_ok=True)
with open(output_file, 'w', encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

print(f"Summary statistics saved to: {output_file}")

In [None]:
# Closing banner with the headline numbers and the overall quality verdict.
closing_rule = "=" * 60
quality_verdict = '⚠ Issues found' if issues else '✓ All checks passed'

print("\n" + closing_rule)
print("SANITY STATISTICS COMPLETE")
print(closing_rule)
print(f"\nDataset: {len(ds):,} lines across {unique_pages} pages")
print(f"Vocabulary: {unique_words:,} unique words from {len(all_words):,} total")
print(f"Quality: {quality_verdict}")