In [1]:
import os
import pickle
import re
import zipfile
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [2]:
#=======================================================
# Collect an author's works
#=======================================================
def collect_books(author_url, books_count):
  """Download works linked from an author index page and extract their text files.

  The author page is scraped for the author name and for the per-book pages
  listed inside its <ol> lists. Each book page is then scraped for the link
  that follows a "テキストファイル(ルビあり)" / "テキストファイル(ルビなし)" cell,
  the linked zip archives are downloaded, and their `.txt` members extracted.

  Args:
    author_url: URL of the author's index page.
    books_count: Maximum number of archives to download. Book pages without
      a text-file link are skipped and do not count toward this limit.

  Returns:
    A tuple `(author_name, book_files)`: the author name as written on the
    page, and the list of `.txt` member names extracted during this call,
    in download order.

  Raises:
    ValueError: If no author name can be found on the author page.
    requests.RequestException: If any HTTP request fails, times out, or
      returns an error status.

  Side effects:
    Creates `zip_<author_name>/` (downloaded archives) and
    `txt_<author_name>/` (extracted text files) in the working directory.
  """
  # Without a timeout, requests.get can block indefinitely on a stalled server.
  request_timeout = 30

  author_name = None
  book_files = []

  #-----------------------------------------------------
  # Author page: read the author name
  #-----------------------------------------------------
  response = requests.get(author_url, timeout=request_timeout)
  response.raise_for_status()
  author_soup = BeautifulSoup(response.content, 'html.parser')

  for tr in author_soup.find_all('tr'):
    header_td = tr.find('td', class_='header')
    if header_td and '作家名' in header_td.get_text():
      next_td = header_td.find_next_sibling('td')
      if next_td:
        font_tag = next_td.find('font', size='+2')
        if font_tag:
          author_name = font_tag.get_text()
          break

  # Fail before any download instead of hitting a TypeError when the
  # directory names are built from author_name further down.
  if author_name is None:
    raise ValueError('author name not found on page: ' + str(author_url))

  #-----------------------------------------------------
  # Author page: collect the book page URLs
  #-----------------------------------------------------
  book_urls = []

  for ol in author_soup.find_all('ol'):
    # Search inside this <ol> only; searching the whole document here would
    # re-add every <li> of the page once per <ol>.
    for li in ol.find_all('li'):
      a_tag = li.find('a', href=True)
      if a_tag:
        href = a_tag['href']
        if href.startswith('../'):
          book_url = urljoin(author_url, href)
          # Nested lists would yield the same <li> more than once.
          if book_url not in book_urls:
            book_urls.append(book_url)

  #-----------------------------------------------------
  # Book pages: collect the zip file URLs
  #-----------------------------------------------------
  file_urls = []

  for book_url in book_urls:
    # Check the limit first so no page is fetched once enough were found.
    if len(file_urls) >= books_count:
      break

    response = requests.get(book_url, timeout=request_timeout)
    response.raise_for_status()
    book_soup = BeautifulSoup(response.content, 'html.parser')

    # Reset for every book so a page without a text-file link cannot
    # silently reuse the link found on a previous page.
    zip_href = None
    for td in book_soup.find_all('td'):
      cell_text = td.get_text()
      if (("テキストファイル(ルビあり)" in cell_text) or
          ("テキストファイル(ルビなし)" in cell_text)):
        a_tag = td.find_next('a', href=True)
        if a_tag:
          zip_href = a_tag['href']
          break

    if zip_href is None:
      continue

    # Resolve the link against the book page URL ('./files/x.zip' becomes
    # '<book page directory>/files/x.zip').
    file_urls.append(urljoin(book_url, zip_href))

  #-----------------------------------------------------
  # Download and save the zip files
  #-----------------------------------------------------
  zip_dir = 'zip_' + author_name
  os.makedirs(zip_dir, exist_ok=True)

  zip_paths = []
  for file_url in file_urls:
    response = requests.get(file_url, timeout=request_timeout)
    response.raise_for_status()
    zip_path = os.path.join(zip_dir, file_url.split('/')[-1])

    with open(zip_path, 'wb') as zip_file:
      zip_file.write(response.content)
    zip_paths.append(zip_path)

  #-----------------------------------------------------
  # Unzip, keeping only the text files
  #-----------------------------------------------------
  txt_dir = 'txt_' + author_name
  os.makedirs(txt_dir, exist_ok=True)

  # Walk the archives downloaded by this call rather than listing zip_dir,
  # so leftovers from earlier runs are not extracted and the result keeps
  # the download order.
  # NOTE(review): two archives holding a member with the same name extract
  # to the same path, so the later one overwrites the earlier - confirm
  # whether that matters for downstream cells.
  for zip_path in zip_paths:
    if not zip_path.endswith('.zip'):
      continue

    with zipfile.ZipFile(zip_path, 'r') as archive:
      for member in archive.infolist():
        if member.filename.endswith('.txt'):
          archive.extract(member, txt_dir)
          book_files.append(member.filename)

  return author_name, book_files

In [3]:
#=======================================================
# Main (download the first 10 books of each author)
#=======================================================
url_list = [
  'https://www.aozora.gr.jp/index_pages/person81.html',       # Kenji Miyazawa
  'https://www.aozora.gr.jp/index_pages/person106.html',      # Hakushu Kitahara
  'https://www.aozora.gr.jp/index_pages/person879.html',      # Ryunosuke Akutagawa
  'https://www.aozora.gr.jp/index_pages/person1235.html',     # Franz Kafka
  'https://www.aozora.gr.jp/index_pages/person94.html'        # Edgar Allan Poe
]

author_list = []
file_list = []

# The loop variable must not be called `list`: at notebook scope that would
# rebind the builtin for every cell executed afterwards.
for author_url in url_list:
  author, book_files = collect_books(author_url, 10)          # collect 10 books
  author_list.append(author)                                  # keep the author name
  file_list.append(book_files)                                # keep the file names

print(author_list)
print(file_list)

# Persist both lists so they can be reloaded without downloading again.
with open("book_data.pkl", "wb") as f:          # wb = write-binary
  pickle.dump(
    {"author_list": author_list, "file_list": file_list},
    f
  )

['宮沢 賢治', '北原 白秋', '芥川 竜之介', 'カフカ フランツ', 'ポー エドガー・アラン']
[['aobikaru_tenkonohateni.txt', 'aoyagikyoyuo_okuru.txt', 'akita_kaido.txt', 'akutaukaberu_asanomizu.txt', 'akegata.txt', 'asanitsuiteno.txt', 'amenimo_makezu.txt', 'arito_kinoko.txt', 'aru_nogakuseino_nisshi.txt', 'igirisu_kaigan.txt'], ['02aino_shishuno_hajimeni.txt', 'asakusa_aika.txt', 'unasaka.txt', 'otsukisama_ikutsu.txt', 'omoide.txt', 'kaihyoto_kumo.txt', 'kage.txt', 'kazami.txt', 'kansono_aki.txt', 'kansono_toki.txt'], ['aidokushono_insho.txt', 'aki.txt', 'akutagawa_ryunosuke_kashu.txt', 'agunino_kami.txt', 'agunino_kami.txt', 'akuma.txt', 'asakusa_koen.txt', 'anikino_yona_kokoromochi.txt', 'anokorono_jibun.txt', 'ababababa.txt'], ['ieno_arujitoshite_kininarukoto.txt', 'kachono_shinpai.txt', 'kafu.txt', 'koteino_shisha.txt', 'saishono_kuno.txt', 'shokeino_hanashi.txt', 'shiro.txt', 'shinpan.txt', 'danjiki_geinin.txt', 'tsumi_kutsu_kibo_oyobi.txt'], ['usher_keno_fukumetsu.txt', 'asshakeno_hokai.txt', 'william_wilson.txt', 