In [1]:
import gutenbergpy

In [None]:
# Download and parse the Gutenberg catalog, then filter by publication year
from gutenbergpy.gutenbergcache import GutenbergCache
from gutenbergpy.textget import get_text_by_id
import pandas as pd
import os
import gzip
import shutil
import re

# Download the catalog if not already present
# NOTE(review): create() is called unconditionally on every run of this cell;
# whether it skips the download when a cache already exists is decided inside
# gutenbergpy — confirm against the installed version.
GutenbergCache.create()  # This will download and parse the catalog if needed

# Load the catalog as a DataFrame
# NOTE(review): this assumes the cache object exposes a `records` attribute
# holding one entry per book (with keys such as 'id', 'title', 'author', which
# the cells below select) — verify that the attribute exists before relying on it.
cache = GutenbergCache.get_cache()
df = pd.DataFrame(cache.records)

# Extract publication year from the metadata (if available)
def extract_year(record):
    """Return the first standalone four-digit number found in a record's metadata.

    The fields are scanned in a fixed priority order and the first match wins.
    This is a heuristic: any isolated four-digit number (for example one that
    appears in a title) is taken to be a year.

    Args:
        record: A mapping-like object with a ``get(key, default)`` method,
            such as a ``dict`` or a pandas ``Series`` row.

    Returns:
        The matched number as an ``int``, or ``None`` when no scanned field
        is a non-empty string containing a standalone four-digit number.
    """
    # Fields are checked in this order; earlier fields take precedence.
    fields = ('subject', 'title', 'language', 'rights', 'bookshelf', 'author', 'release_date')
    for field in fields:
        value = record.get(field, None)
        # Skip missing values, empty strings and non-string entries (e.g. NaN).
        if not value or not isinstance(value, str):
            continue
        # The lookarounds reject digit runs longer than four characters, so an
        # identifier such as "123456" is no longer misread as the year 1234.
        match = re.search(r'(?<!\d)(\d{4})(?!\d)', value)
        if match:
            return int(match.group(1))
    return None

# Annotate every catalog row with the value returned by extract_year
# (None when no year could be found).
df['year'] = df.apply(extract_year, axis=1)

# Keep books whose year lies in the inclusive 1850-1900 window (example range)
in_range = df['year'].between(1850, 1900)
filtered = df[in_range]

# Preview the first 10 matches, restricted to the identifying columns
preview_columns = ['id', 'title', 'author', 'year']
filtered_books = filtered[preview_columns].head(10)
filtered_books