# VCAT: Load and Explore Voynich EVA Data

This notebook demonstrates how to load the Voynich EVA transcription dataset from Hugging Face and explore it with pandas: basic statistics, filter examples, and a character frequency analysis.

In [None]:
# Install the `datasets` library if it is not already available.
# Uncomment the next line to install it into this kernel's environment:
# %pip install datasets

In [None]:
from datasets import load_dataset

# Fetch the "lines" configuration of the dataset from Hugging Face
ds = load_dataset(path="Ched-ai/voynich-eva", name="lines")
# Show what was loaded
print(ds)

In [None]:
# Pull the first record of the "train" split and show it
first_record = ds["train"][0]
print(first_record)

In [None]:
# Materialise the "train" split as a pandas DataFrame for easier exploration
train_split = ds["train"]
df = train_split.to_pandas()

print(f"Total records: {len(df)}")
# Trailing expression so the notebook renders the head of the frame
df.head()

## Basic Statistics

In [None]:
# Tally rows per value of the "currier_language" column
currier_counts = df["currier_language"].value_counts()
print("Lines by Currier Language:")
print(currier_counts)

In [None]:
# Tally rows per value of the "section" column
section_counts = df["section"].value_counts()
print("\nLines by Section:")
print(section_counts)

In [None]:
# Tally rows per value of the "line_type" column
line_type_counts = df["line_type"].value_counts()
print("\nLines by Type:")
print(line_type_counts)

## Filter Examples

In [None]:
# Keep only the rows whose section is "herbal"
is_herbal = df["section"] == "herbal"
herbal = df[is_herbal]
print(f"Herbal section: {len(herbal)} lines")

In [None]:
# Keep only the rows flagged as containing uncertain readings
# NOTE(review): assumes "has_uncertain" is a boolean column with no missing values - confirm
uncertain_mask = df["has_uncertain"]
uncertain = df[uncertain_mask]
print(f"Lines with uncertain readings: {len(uncertain)}")

In [None]:
# Compare Language A vs Language B
# Build both subsets with one shared filter expression so the two
# comparisons cannot drift apart when the cell is edited.
language_subsets = {
    code: df[df["currier_language"] == code]
    for code in ("A", "B")
}
# Keep the original per-language names available for other cells.
lang_a = language_subsets["A"]
lang_b = language_subsets["B"]

# One report line per language: row count and mean of "word_count".
for code, subset in language_subsets.items():
    print(f"Language {code}: {len(subset)} lines, avg words: {subset['word_count'].mean():.1f}")

## Character Frequency Analysis

In [None]:
from collections import Counter

# Concatenate every "text_clean" value into one string; missing values count as empty text
joined_text = "".join(df["text_clean"].fillna(""))
# Strip the "." and "," separators in a single pass so they are not counted
all_text = joined_text.translate(str.maketrans("", "", ".,"))

char_counts = Counter(all_text)
total_chars = len(all_text)

print("Top 15 characters:")
for char, count in char_counts.most_common(15):
    # Share of this character among all counted characters, as a percentage
    pct = count / total_chars * 100
    print(f"  {char}: {count:,} ({pct:.1f}%)")

## Local Loading (from Parquet file)

If you have the local parquet file from the repository:

In [None]:
# Alternative: load from a local parquet file.
# Uncomment the lines below to run; the path is relative, so it is
# resolved against the current working directory.
# from datasets import load_dataset
# ds_local = load_dataset("parquet", data_files="output/eva_lines.parquet")
# print(ds_local)