In [1]:
# Cell 1: Setup
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

from data_loader import PoetryLoader
from feature_extractor import PoetryFeatureExtractor
from analyzer import PoetryAnalyzer
from visualizer import PoetryVisualizer

print("✓ Modules imported successfully")

# Cell 2: Load Data
analyzer = PoetryAnalyzer()
viz = PoetryVisualizer(output_dir='../output')

# Load Tang poems (MVP: 500 poems)
print("Loading Tang Dynasty poems...")
tang_raw = analyzer.loader.load_tang_poems(max_poems=500)
print(f"Loaded {len(tang_raw)} Tang poems")

# Display sample
tang_raw.head()

# Cell 3: Extract Features
print("Extracting features from corpus...")
tang_processed = analyzer.process_corpus(tang_raw)
print("✓ Feature extraction complete")

# Show feature columns
tang_processed.columns.tolist()

# Cell 4: Basic Statistics
print("CORPUS STATISTICS")
print("="*50)
print(f"Total poems: {len(tang_processed)}")
print(f"Average lines per poem: {tang_processed['line_count'].mean():.2f}")
print(f"Average line length: {tang_processed['avg_line_length'].mean():.2f}")
print(f"Poems with uniform line length: {tang_processed['is_uniform_length'].sum()} ({tang_processed['is_uniform_length'].sum()/len(tang_processed)*100:.1f}%)")

# Cell 5: Form Distribution
form_dist = tang_processed['poem_form'].value_counts()
print("\nPOEM FORM DISTRIBUTION")
print("="*50)
for form, count in form_dist.items():
    print(f"{form:15s}: {count:4d} ({count/len(tang_processed)*100:5.1f}%)")

viz.plot_form_distribution(tang_processed, title='Tang Poetry Form Distribution')

# Cell 6: Line Length Analysis
viz.plot_line_length_distribution(tang_processed, title='Tang Poetry Line Length Distribution')

# Cell 7: End Character Analysis
print("ANALYZING END-LINE CHARACTERS")
print("="*50)

all_end_chars = []
for chars in tang_processed['end_characters']:
    all_end_chars.extend(chars)

end_char_counter = Counter(all_end_chars)
print(f"Total unique end characters: {len(end_char_counter)}")
print(f"\nTop 20 most common:")
for char, count in end_char_counter.most_common(20):
    print(f"  {char}: {count}")

viz.plot_end_character_frequency(end_char_counter, top_n=20)

# Cell 8: Save Results
# Save processed data
tang_processed.to_csv('../data/tang_processed_mvp.csv', index=False)
print("✓ Saved processed data to ../data/tang_processed_mvp.csv")

# Save summary statistics
summary = {
    'total_poems': len(tang_processed),
    'avg_lines': tang_processed['line_count'].mean(),
    'avg_line_length': tang_processed['avg_line_length'].mean(),
    'form_distribution': tang_processed['poem_form'].value_counts().to_dict(),
    'top_end_chars': dict(end_char_counter.most_common(20))
}

import json
with open('../output/tang_summary_stats.json', 'w', encoding='utf-8') as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)

print("✓ Saved summary statistics to ../output/tang_summary_stats.json")


✓ Modules imported successfully
Loading Tang Dynasty poems...
Loaded 0 Tang poems
Extracting features from corpus...
✓ Feature extraction complete
CORPUS STATISTICS
Total poems: 0


KeyError: 'line_count'