In [1]:
!pip install webdriver-manager



In [2]:
def get_birth_year(soup):
    """
    Read the birth year from a parsed author page.

    Looks for a header cell such as
        <td class="header"> 生年： </td>
    and parses the first four characters of the <td> that follows it.

    Returns the year as an int, or None when the header cell or the value
    cell is missing, or the value does not start with digits.

    NOTE(review): a later cell in this notebook defines get_birth_year again;
    once that cell has run, this definition is shadowed.
    """
    def _is_birth_label(cell_text):
        # The cell text may be None; accept any text mentioning "生年".
        return cell_text and "生年" in cell_text

    # Locate the header cell labelled "生年：".
    label_cell = soup.find("td", class_="header", string=_is_birth_label)
    # The next sibling <td> holds the actual date.
    value_cell = label_cell.find_next_sibling("td") if label_cell else None
    if not value_cell:
        return None

    # Keep only the first four characters (the year part of the date).
    leading = value_cell.text.strip()[:4]
    if leading.isdigit():
        return int(leading)
    return None

In [2]:

import os
import re
import time
from urllib.parse import urljoin, urldefrag

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

INDEX_URL      = "https://www.aozora.gr.jp/index_pages/person_all.html"
BASE_FOLDER    = "https://www.aozora.gr.jp/index_pages/"
DOWNLOAD_DELAY = 1
RESUME_FILE    = "last_card.txt"
START_AUTHOR_URL = "https://www.aozora.gr.jp/index_pages/person311.html"

# Era 
def get_era(year: int) -> str:
    """Return the era label for a Gregorian year.

    Each entry below is (exclusive upper bound, label) in ascending order;
    the first bound greater than `year` wins. Years from 2019 onward fall
    through to "Reiwa".
    """
    upper_bounds = (
        (1868, "Pre-Meiji"),
        (1912, "Meiji"),
        (1926, "Taisho"),
        (1989, "Showa"),
        (2019, "Heisei"),
    )
    for bound, label in upper_bounds:
        if year < bound:
            return label
    return "Reiwa"

# Create one output directory per era label (relative to the current working
# directory); already existing directories are left as they are.
for era in ("Meiji", "Taisho", "Showa", "Heisei", "Reiwa", "Pre-Meiji"):
    os.makedirs(era, exist_ok=True)

# Inclusive code-point bounds used to classify characters.
KANJI_RANGE = (ord("\u4e00"), ord("\u9fff"))  # U+4E00..U+9FFF
KANA_RANGE = (ord("\u3040"), ord("\u30ff"))   # U+3040..U+30FF

def is_kanji_only(text: str) -> bool:
    """Tell whether `text` has at least one KANJI_RANGE character and no KANA_RANGE character.

    Characters outside both ranges (spaces, Latin letters, punctuation) are
    ignored, so they neither qualify nor disqualify the text.
    """
    kanji_lo, kanji_hi = KANJI_RANGE
    kana_lo, kana_hi = KANA_RANGE

    seen_kanji = False
    for ch in text:
        code_point = ord(ch)
        if kana_lo <= code_point <= kana_hi:
            # A single kana character is enough to reject the text.
            return False
        if kanji_lo <= code_point <= kanji_hi:
            seen_kanji = True
    return seen_kanji

BIRTH_REGEX = re.compile(r"(\d{4})")

def get_birth_year(soup: BeautifulSoup) -> int | None:
    birth_td = soup.find("td", class_="header", string="生年：")
    if not birth_td:
        return None
    value_td = birth_td.find_next_sibling("td")
    if not value_td:
        return None

    m = BIRTH_REGEX.search(value_td.text.strip())
    return int(m.group(1)) if m else None


def _load_resume_card() -> int | None:
    """Return the card number stored in RESUME_FILE, or None when there is none.

    A missing file means a fresh run. An empty or non-numeric file (for
    example after an interrupted write) is reported and treated like a
    missing file instead of aborting the run with ValueError.
    """
    if not os.path.exists(RESUME_FILE):
        return None
    with open(RESUME_FILE, "r", encoding="utf-8") as f:
        raw = f.read().strip()
    try:
        card = int(raw)
    except ValueError:
        print(f"⚠️  Ignoring unreadable resume marker {raw!r} in {RESUME_FILE}")
        return None
    print(f"Resuming after card number: {card}")
    return card


def _collect_authors(driver) -> list[tuple[str, str]]:
    """Load INDEX_URL with `driver` and return (name, person-page URL) pairs.

    Only anchors whose text passes is_kanji_only are kept. URLs are resolved
    against BASE_FOLDER and stripped of any #fragment.
    """
    driver.get(INDEX_URL)
    index_soup = BeautifulSoup(driver.page_source, "html.parser")

    authors: list[tuple[str, str]] = []
    for a in index_soup.select("ol > li > a"):
        href = a.get("href")
        if not href:
            # An anchor without a target cannot lead to a person page.
            continue
        name = a.get_text(strip=True)
        if not is_kanji_only(name):
            continue
        full_url, _ = urldefrag(urljoin(BASE_FOLDER, href))
        authors.append((name, full_url))
    return authors


def main() -> None:
    """Crawl the author index and download each work's zip into an era folder.

    Steps:
      1. Start headless Chrome and collect authors with kanji-only names.
      2. Skip authors until START_AUTHOR_URL is reached (when it is set).
      3. For each remaining author: skip pages containing "著作権存続", read
         the birth year, map it to an era, and visit every linked card page.
      4. Skip cards until the number stored in RESUME_FILE is reached (when
         the file exists), then download each card's .zip to
         "<era>/<title>_<birth_year>.zip" unless that file already exists.
      5. After every successful download, store the card number in
         RESUME_FILE and wait DOWNLOAD_DELAY seconds.

    The browser is always shut down, even when the crawl is interrupted or
    fails with an unexpected error.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    options.page_load_strategy = 'eager'

    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    dl_count = 0
    try:
        driver.set_page_load_timeout(30)

        authors = _collect_authors(driver)
        print(f"Found {len(authors)} authors with kanji-only names")

        last_downloaded_card = _load_resume_card()
        skipping_cards = last_downloaded_card is not None
        skipping_authors = START_AUTHOR_URL is not None

        for name, person_url in authors:
            if skipping_authors:
                if person_url == START_AUTHOR_URL:
                    skipping_authors = False
                else:
                    continue

            print(f"\n👤 {name} → {person_url}")
            try:
                driver.get(person_url)
            except Exception as e:
                print(f"  ❌ Failed to load {person_url}: {e}")
                continue

            person_soup = BeautifulSoup(driver.page_source, 'html.parser')

            if "著作権存続" in person_soup.text:
                print("  ⚠️  Skipped (著作権存続)")
                continue

            birth_year = get_birth_year(person_soup)
            if not birth_year:
                print("  ⚠️  Birth year not found")
                continue
            era = get_era(birth_year)
            print(f"  📅 {birth_year} → era: {era}")

            card_links = person_soup.select("a[href^='../cards/']")
            print(f"  📚 Works found: {len(card_links)}")

            for link in card_links:
                card_href = link['href']
                m = re.search(r'card(\d+)\.html', card_href)
                # Reset on every link so a non-matching href can never reuse
                # the previous card's number (or hit an unbound name).
                card_num = int(m.group(1)) if m else None
                if card_num is not None and skipping_cards:
                    if card_num == last_downloaded_card:
                        # The resume card itself is processed again; the
                        # existing-file check below keeps it from being
                        # downloaded twice.
                        skipping_cards = False
                    else:
                        continue

                card_url = urljoin(BASE_FOLDER, card_href)
                try:
                    driver.get(card_url)
                except Exception as e:
                    print(f"    ❌ Failed to load card: {e}")
                    continue

                card_soup = BeautifulSoup(driver.page_source, 'html.parser')

                zip_a = card_soup.find("a", href=re.compile(r"\.zip$"))
                if not zip_a:
                    continue

                zip_url = urljoin(card_url, zip_a['href'])
                # NOTE(review): the run output recorded in this notebook shows
                # names like "図書カード：No.3955" here, i.e. the <h1> text
                # looks like the card number rather than the work title —
                # confirm that this is the intended file name.
                title_tag = card_soup.find("h1")
                title = title_tag.get_text(strip=True) if title_tag else "untitled"
                safe_title = re.sub(r"[\\/:*?\"<>| ]+", "_", title)[:40]
                filename = f"{era}/{safe_title}_{birth_year}.zip"

                if os.path.exists(filename):
                    continue

                print(f"  ⬇️  {safe_title}")
                try:
                    resp = requests.get(zip_url, timeout=30)
                    resp.raise_for_status()
                    with open(filename, "wb") as f:
                        f.write(resp.content)
                    dl_count += 1
                    # Update last downloaded card (only when its number is known).
                    if card_num is not None:
                        with open(RESUME_FILE, "w", encoding="utf-8") as f:
                            f.write(str(card_num))
                    time.sleep(DOWNLOAD_DELAY)
                except Exception as e:
                    print(f"    ❌  Download failed: {e}")

        # Make silent "nothing happened" runs visible.
        if skipping_authors:
            print(f"\n⚠️  Start author never reached: {START_AUTHOR_URL}")
        elif skipping_cards:
            print(f"\n⚠️  Resume card {last_downloaded_card} never found; all cards were skipped")
    finally:
        # Always release the browser and its driver process.
        driver.quit()

    print(f"\n✅ Finished — downloaded {dl_count} file(s)")


# Entry point: start the crawl when this code runs as the main module.
if __name__ == "__main__":
    main()


🔎 Found 1031 authors with kanji-only names
🔄 Resuming after card number: 3854

👤 宮本 百合子 → https://www.aozora.gr.jp/index_pages/person311.html
  📅 1899 → era: Meiji
  📚 Works found: 1190
  ⬇️  図書カード：No.3955
  ⬇️  図書カード：No.4193
  ⬇️  図書カード：No.4014
  ⬇️  図書カード：No.4017
  ⬇️  図書カード：No.3272
  ⬇️  図書カード：No.3150
  ⬇️  図書カード：No.3489
  ⬇️  図書カード：No.3937
  ⬇️  図書カード：No.2992
  ⬇️  図書カード：No.2731
  ⬇️  図書カード：No.3700
  ⬇️  図書カード：No.2737
  ⬇️  図書カード：No.1977
  ⬇️  図書カード：No.2916
  ⬇️  図書カード：No.7935
  ⬇️  図書カード：No.2956
  ⬇️  図書カード：No.3927
  ⬇️  図書カード：No.2035
  ⬇️  図書カード：No.15980
  ⬇️  図書カード：No.16026
  ⬇️  図書カード：No.3987
  ⬇️  図書カード：No.2021
  ⬇️  図書カード：No.15985
  ⬇️  図書カード：No.4183
  ⬇️  図書カード：No.3966
  ⬇️  図書カード：No.3469
  ⬇️  図書カード：No.3834
  ⬇️  図書カード：No.3928
  ⬇️  図書カード：No.3951
  ⬇️  図書カード：No.2791
  ⬇️  図書カード：No.3095
  ⬇️  図書カード：No.7908
  ⬇️  図書カード：No.4172
  ⬇️  図書カード：No.16014
  ⬇️  図書カード：No.3010
  ⬇️  図書カード：No.2824
  ⬇️  図書カード：No.2885
  ⬇️  図書カード：No.4163
  ⬇️  図書カード：No.4164
  ⬇️  図書カード：No.4235
  ⬇️  図書カー