In [10]:
import requests as req
from bs4 import BeautifulSoup as bs
import os
import re


# Directory that receives the generated .txt files; crawl_gutenberg reads this
# module-level name when it calls get_p_content_and_save.
folderPath = 'project_gutenberg'
# Project Gutenberg listing page for Chinese-language books.
# NOTE(review): appears unused in the code shown here (the __main__ block
# defines its own base_url with the same value) — confirm before removing.
url = 'https://www.gutenberg.org/browse/languages/zh'

def get_p_content_and_save(url, folderPath, project_gutenberg, timeout=30):
    """Download a page and save its long paragraphs to a UTF-8 text file.

    The page at ``url`` is fetched and parsed with BeautifulSoup's ``lxml``
    parser. The whitespace-stripped text of every ``<p>`` element longer than
    100 characters is written, one paragraph per line, to
    ``<folderPath>/<project_gutenberg>.txt``. An existing file with the same
    name is overwritten.

    Args:
        url (str): Address of the HTML page to download.
        folderPath (str): Output directory; created if it does not exist.
        project_gutenberg (str): Base name of the output file, without the
            ``.txt`` extension. Characters that are not allowed in file names
            on common filesystems are replaced with ``_``.
        timeout (float): Value passed to ``requests`` as the request timeout,
            in seconds, so a stalled server cannot block the caller forever.

    Returns:
        None. Any failure is reported on stdout instead of being raised, so a
        caller looping over many pages keeps going after a bad one.
    """
    # The base name may be free-form text such as a book title. Path separators
    # and other reserved characters would either make open() fail or place the
    # file outside folderPath, so neutralise them before building the path.
    safe_name = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', project_gutenberg)
    file_name = f"{safe_name}.txt"

    try:
        response = req.get(url, timeout=timeout)
        response.raise_for_status()
        soup = bs(response.text, 'lxml')

        # Extract each paragraph's text once and keep only the long ones;
        # short <p> elements are skipped by the 100-character threshold.
        texts = (p.get_text(strip=True) for p in soup.find_all('p'))
        lines = [text + "\n" for text in texts if len(text) > 100]

        # exist_ok avoids the check-then-create race of a separate exists() test.
        os.makedirs(folderPath, exist_ok=True)

        with open(os.path.join(folderPath, file_name), "w", encoding="utf-8") as f:
            f.writelines(lines)
        print(f"Saved content from {url} to {file_name}")

    except Exception as e:
        # Deliberately broad: this is a best-effort batch step, so network,
        # parsing and file-system errors are all reported and skipped.
        print(f"Error fetching {url}: {e}")



def crawl_gutenberg(base_url, timeout=30):
    """Crawl a Gutenberg book-listing page and save each Chinese-titled book.

    The listing page is fetched and every ``li.pgdbetext > a[href]`` link is
    examined. A link is processed only when its text starts with a character
    in the range U+4E00-U+9FFF and its ``href`` contains ``/ebooks/<digits>``.
    For each such link, the book's HTML page URL is built from the numeric id
    and passed to ``get_p_content_and_save`` together with the module-level
    ``folderPath`` and a file stem of the form ``book_<id>_<title>``.

    Args:
        base_url (str): URL of the Gutenberg Chinese book-listing page.
        timeout (float): Value passed to ``requests`` as the timeout, in
            seconds, for the listing-page request, so a stalled server cannot
            hang the crawl before it starts.

    Returns:
        None. A failure while fetching or parsing the listing page is reported
        on stdout instead of being raised.
    """
    try:
        response = req.get(base_url, timeout=timeout)
        response.raise_for_status()
        soup = bs(response.text, 'lxml')

        for link in soup.select('li.pgdbetext > a[href]'):
            book_name = link.get_text(strip=True)  # link text is the book title
            # Keep only titles whose first character is in U+4E00-U+9FFF.
            # An empty title cannot match, so no separate emptiness check.
            if not re.match(r'^[\u4E00-\u9FFF]', book_name):
                continue

            # The numeric book id is embedded in the link target.
            match = re.search(r'/ebooks/(\d+)', link.get('href', ''))
            if not match:
                continue

            book_id = match.group(1)
            # Build the URL of the book's HTML page from its id.
            book_url = f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}-images.html"
            get_p_content_and_save(book_url, folderPath, f"book_{book_id}_{book_name}")

    except Exception as e:
        # Deliberately broad: report the failure and return rather than crash.
        print(f"Error crawling {base_url}: {e}")

if __name__ == "__main__":
    # Script entry point: crawl the Gutenberg listing page of Chinese-language books.
    base_url = 'https://www.gutenberg.org/browse/languages/zh'
    crawl_gutenberg(base_url)





Saved content from https://www.gutenberg.org/cache/epub/25328/pg25328-images.html to book_25328_豆棚閒話.txt
Saved content from https://www.gutenberg.org/cache/epub/24225/pg24225-images.html to book_24225_戲中戲.txt
Saved content from https://www.gutenberg.org/cache/epub/24185/pg24185-images.html to book_24185_比目魚.txt
Saved content from https://www.gutenberg.org/cache/epub/27119/pg27119-images.html to book_27119_比目魚.txt
Saved content from https://www.gutenberg.org/cache/epub/12479/pg12479-images.html to book_12479_三字經.txt
Saved content from https://www.gutenberg.org/cache/epub/25146/pg25146-images.html to book_25146_山水情.txt
Saved content from https://www.gutenberg.org/cache/epub/25288/pg25288-images.html to book_25288_山海經.txt
Saved content from https://www.gutenberg.org/cache/epub/23825/pg23825-images.html to book_23825_施公案.txt
Saved content from https://www.gutenberg.org/cache/epub/25393/pg25393-images.html to book_25393_施公案.txt
Saved content from https://www.gutenberg.org/cache/epub/25501/p