In [176]:
#!pip install chardet
#!pip install beautifulsoup4

In [22]:
import os
import json
import chardet
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [23]:
# 发送 HTTP GET请求获取网页内容
# 包含 timeout 和 重试机制
# 重点处理了中文字符集的问题
def geturlcontent(url):
    
    #print(f"download: {url}")
    
    retry_times = 3 #设置重试次数
    retry_backoff_factor = 0.5 #设置重试间隔时间
    timeout = 5 #设置超时时间
    
    session = requests.Session()
    retry = Retry(total = retry_times, backoff_factor = retry_backoff_factor,
                  status_forcelist = [443, 500, 502, 503, 504],
                  allowed_methods = ["HEAD", "GET", "OPTIONS"])
    adapter = HTTPAdapter(max_retries = retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    try:
        response = session.get(url, timeout = timeout)
        #response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"download {url} error: {e}.")
        return None
        
    if response.status_code == 200:
        #print(f"text: {response.text}")
        #print(f"status code: {response.status_code}")
        
        #print(f"encoding: {response.encoding}")
        #print(f"apparent encoding: {response.apparent_encoding}")
        chardet_encoding = chardet.detect(response.content);
        #print(f"chardet encoding: {chardet_encoding}"
        
        # 调整编码，避免requests编码识别问题。
        # encoding对中文的编码识别，会识别为：ISO-8859-1。
        # apparent_encoding也会识别错误，会出现：mac_greek。
        # chardet能识别为GB2312，还是会判断错误，会把GB18030判断为GB2312，后果就是，部分中文有错误。
        #response.encoding = response.apparent_encoding
        #response.encoding = chardet_encoding['encoding']
        #response.encoding = 'GB18030'
        if response.apparent_encoding == "GB18030" or chardet_encoding['encoding'] == "GB2312":
            response.encoding = 'GB18030'
        else:
            response.encoding = chardet_encoding['encoding']
        #print(f"encoding: {response.encoding}")
        #print(f"text: {response.text}")
        
        return response;
    else:
        print(f"download {url} error: status_code {response.status_code}.")
        
    return None

In [24]:
# 将 url 的内容解码为书籍的一个章节
# 章节按照 dict 对象进行结构化，以便转换为 json 格式
# 章节的解码具有针对性，不同的 site 章节的格式不一样，需要重新进行编写
def decodecontentaschapter(url, type='guwen'):

    # Decode为JSON文件
    response = geturlcontent(url)
    if response is not None:
        chapter = dict()
        
        # 解析网页内容
        soup = BeautifulSoup(response.text, "html.parser")

        # 提取目标内容
        h1 = soup.find("h1")
        title = h1.get_text(strip = True)
        if title is not None:
            chapter["title"] = title
        else:
            chapter["title"] = ""
        chapter["source"] = url
        
        div = soup.find("div", class_ = "contson")
        ps = div.find_all("p");
        if len(ps) > 0:
            paragraphs = []
            for p in ps:
                c = p.get_text(strip = True)
                c = c.replace("\u3000", "");
                if len(c):
                    paragraph = {}
                    paragraph["type"] = "原文"
                    paragraph["content"] = c
                    paragraphs.append(paragraph)
            if len(paragraphs):
                chapter["paragraphs"] = paragraphs
        else:
            paragraphs = []
            c = div.get_text(strip = True)
            c = c.replace("\u3000", "");
            if len(c):
                paragraph = {}
                paragraph["type"] = "原文"
                paragraph["content"] = c
                paragraphs.append(paragraph)
            if len(paragraphs):
                chapter["paragraphs"] = paragraphs
        
        return chapter;
    
    return None

In [25]:
# 从 gushiwen.cn 网站爬取书籍的章节 urls
def getchapterurlsfromgushiwen(url, type='guwen'):

    # Decode为JSON文件
    response = geturlcontent(url)
    if response is not None:
        book = dict()
        
        # 解析网页内容
        soup = BeautifulSoup(response.text, "html.parser")

        # decode title
        h1 = soup.find("h1")
        title = h1.get_text(strip = True)
        if title is not None:
            book["title"] = title
        else:
            book["title"] = ""
        book["source"] = url

        # decode description
        if type == 'guwen':
            div = soup.find("div", class_="sonspic", id = "sonsyuanwen")
            div = div.find("div", class_="cont")
            p = div.find_all("p");
            description = p[0].get_text(strip = True)
            if description is not None:
                book["description"] = description
            else:
                book["description"] = ""
        else:
            book["description"] = ""
        
        # decode volumes    
        volumes = []
        #div = soup.find("div", class_="sons")
        volumes_class = None
        if type == 'guwen':
            volumes_class = "bookcont"
        else:
            volumes_class = "typecont"
            
        div_volumes = div.find_all("div", class_=volumes_class);
        for div_volume in div_volumes:
            volume = {}
            div_volume_title = div_volume.find(class_="bookMl")
            title = None
            if div_volume_title is not None:
                title = div_volume_title.get_text(strip = True)
            if title is not None:
                volume["title"] = title
            else:
                volume["title"] = ""

            chapters = []
            linkss = div_volume.find_all("a");
            for link in linkss:
                chapter = {}
                chapter["title"] = link.get_text(strip = True)
                if link.has_attr("href"):
                    chapter["source"] = urljoin(url, link.attrs["href"])
                else:
                    chapter["source"] = ""
                chapters.append(chapter)
            volume["chapters"] = chapters
            volumes.append(volume)
        book["volumes"] = volumes
            
        return book
    
    return None

In [26]:
# download book from gushiwen.cn
def downloadbookfromgushiwen(outputpath, url):

    type = None
    if url.find('so.gushiwen.cn/gushi/') != -1:
        type = 'gushi'
    elif url.find('so.gushiwen.cn/guwen/') != -1:
        type = 'guwen'
    else:
        type = None
    if type is None:
        print(f"不支持{url}的下载...")
        return
        
    book = getchapterurlsfromgushiwen(url, type = type)
    if book is None:
        return
    
    print(f"下载《{book['title']}》...")
    
    for volume in book['volumes']:
        for chapter in volume['chapters']:
            if len(volume['title']):
                print(f"  下载章节《{volume['title']}·{chapter['title']}》...")
            else:
                print(f"  下载章节《{book['title']}·{chapter['title']}》...")
            if chapter["source"] is not None and len(chapter["source"]):
                chapter_content = decodecontentaschapter(chapter["source"], type = type)
                if chapter_content is not None:
                    chapter["paragraphs"] = chapter_content["paragraphs"]    

    outputfile = os.path.join(outputpath, f"{book['title']}.json" )
    with open(outputfile, "w", encoding="utf-8") as file:
        json.dump(book, file, ensure_ascii = False)
            
    print(f"保存{outputfile}文件中...")

In [30]:
url = "https://so.gushiwen.cn/guwen/book_46653FD803893E4F83C45E31E115AC99.aspx"
url = "https://so.gushiwen.cn/gushi/shijing.aspx"
url = "https://so.gushiwen.cn/gushi/chuci.aspx"
downloadbookfromgushiwen("/Users/sunyafu/zebra/YIJING/Books/", url)

下载《楚辞》...
  下载章节《楚辞·离骚》...
  下载章节《楚辞·九歌》...
  下载章节《楚辞·天问》...
  下载章节《楚辞·九章》...
  下载章节《楚辞·远游》...
  下载章节《楚辞·卜居》...
  下载章节《楚辞·渔父》...
  下载章节《楚辞·九辩》...
  下载章节《楚辞·招魂》...
  下载章节《楚辞·大招》...
  下载章节《楚辞·惜誓》...
  下载章节《楚辞·招隐士》...
  下载章节《楚辞·七谏》...
  下载章节《楚辞·哀时命》...
  下载章节《楚辞·九怀》...
  下载章节《楚辞·九叹》...
  下载章节《楚辞·九思》...
保存/Users/sunyafu/zebra/YIJING/Books/楚辞.json文件中...


In [203]:
# download books from www.gushiwen.cn
def downloadbooksfromgushiwen(outputpath, booksjson):
    for book in booksjson:
        downloadbookfromgushiwen(outputpath, book["url"])

In [204]:
books_from_gushiwen = [
    {
        "name": "诗经",
        "url": "https://so.gushiwen.cn/gushi/shijing.aspx"
    },
    {
        "name": "楚辞",
        "url": "https://so.gushiwen.cn/gushi/chuci.aspx"
    },
    {
        "name": "尔雅",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F1ED72C2149EB00A0.aspx"
    },
    {
        "name": "周礼",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F89215E11979D4F9F.aspx"
    },
    {
        "name": "仪礼",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F5BD7D6F638166051.aspx"
    },
    {
        "name": "礼记",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F4229823F35D89DC5.aspx"
    },
    {
        "name": "大戴礼记",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F59554702CBAB8327.aspx"
    },
    {
        "name": "尚书",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F93E5C6C3F12F0689.aspx"
    },
    {
        "name": "逸周书",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F83C45E31E115AC99.aspx"
    },
    {
        "name": "左传",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F699E8628DEAEE3C0.aspx"
    },
    {
        "name": "公羊传",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F1C34B0BCE27385A0.aspx"
    },
    {
        "name": "谷梁传",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F1AEF8EC5EB7CD7B8.aspx"
    },
    {
        "name": "国语",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F8019EA9A29165384.aspx"
    },
    {
        "name": "战国策",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F90B30EC6FDD22553.aspx"
    },
    {
        "name": "竹书纪年",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F03DA4E2FE601D843.aspx"
    },
    {
        "name": "穆天子传",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F4E9CADD37301AA08.aspx"
    },
    {
        "name": "山海经",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F6FCB61241D8108D0.aspx"
    },
    {
        "name": "易传",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F68FD6588DE2463D0.aspx"
    },
    {
        "name": "黄帝内经",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4FF001D427402FA6E0.aspx"
    },
    {
        "name": "难经",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4FAA70382D08D37221.aspx"
    },
    {
        "name": "老子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F33D126D4A6B656E2.aspx"
    },
    {
        "name": "论语",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F75696240258265D2.aspx"
    },
    {
        "name": "孟子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F594CDFAF65570422.aspx"
    },
    {
        "name": "荀子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F2B41025A041B0688.aspx"
    },
    {
        "name": "孝经",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F640E24BD63B0F779.aspx"
    },
    {
        "name": "墨子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F8BCB0149097D7698.aspx"
    },
    {
        "name": "庄子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F92BED6BFD51EA958.aspx"
    },
    {
        "name": "文子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4FE6B070719E9F54EA.aspx"
    },
    {
        "name": "列子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4FEC967FA5CC60AB13.aspx"
    },
    {
        "name": "黄帝四经",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F453323AC4B6FC428.aspx"
    },
    {
        "name": "韩非子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F405ACAB307224041.aspx"
    },
    {
        "name": "慎子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F3DA9F7500F3E98AC.aspx"
    },
    {
        "name": "申子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4FD6362CACB50DF82B.aspx"
    },
    {
        "name": "商君书",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4FE8F6FCA5ABE50B95.aspx"
    },
    {
        "name": "邓析子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F59F52FE1E114DCA0.aspx"
    },
    {
        "name": "尹文子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F53192E055F6344BD.aspx"
    },
    {
        "name": "公孙龙子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F4FD707ED18AD70DA.aspx"
    },
    {
        "name": "鬼谷子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F0940F1E23EB202EB.aspx"
    },
    {
        "name": "吕氏春秋",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F8B249E400F0AA90B.aspx"
    },
    {
        "name": "管子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4FFFD969D27F952D9D.aspx"
    },
    {
        "name": "晏子春秋",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4FE985BA1120632F1B.aspx"
    },
    {
        "name": "孙子兵法",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F5B7146187204196D.aspx"
    },
    {
        "name": "孙膑兵法",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4FB66BB352A3FCC20D.aspx"
    },
    {
        "name": "吴子兵法",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F01AE666A034072DA.aspx"
    },
    {
        "name": "太公兵法",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4FC5B107509595B915.aspx"
    },
    {
        "name": "司马法",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4FC47A2346B3A2170E.aspx"
    },
    {
        "name": "尉缭子",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F5C22677EEA3861CB.aspx"
    },
    {
        "name": "孝经",
        "url": "https://so.gushiwen.cn/guwen/book_46653FD803893E4F640E24BD63B0F779.aspx"
    }
]

In [205]:
outputpath="/Users/sunyafu/zebra/YIJING/Books/"

In [206]:
downloadbooksfromgushiwen(outputpath, books_from_gushiwen)

下载《尔雅》...
  下载章节《尔雅·释诂》...
  下载章节《尔雅·释言》...
  下载章节《尔雅·释训》...
  下载章节《尔雅·释亲》...
  下载章节《尔雅·释宫》...
  下载章节《尔雅·释器》...
  下载章节《尔雅·释乐》...
  下载章节《尔雅·释天》...
  下载章节《尔雅·释地》...
  下载章节《尔雅·释丘》...
  下载章节《尔雅·释山》...
  下载章节《尔雅·释水》...
  下载章节《尔雅·释草》...
  下载章节《尔雅·释木》...
  下载章节《尔雅·释虫》...
  下载章节《尔雅·释鱼》...
  下载章节《尔雅·释鸟》...
  下载章节《尔雅·释兽》...
  下载章节《尔雅·释畜》...
保存/Users/sunyafu/zebra/YIJING/Books/尔雅.json文件中...
下载《周礼》...
  下载章节《天官冢宰·叙官》...
  下载章节《天官冢宰·大宰》...
  下载章节《天官冢宰·小宰》...
  下载章节《天官冢宰·宰夫》...
  下载章节《天官冢宰·宫正/外饔》...
  下载章节《天官冢宰·亨人/兽医》...
  下载章节《天官冢宰·酒正/掌次》...
  下载章节《天官冢宰·大府/职币》...
  下载章节《天官冢宰·司裘/内树》...
  下载章节《天官冢宰·九嫔/女史》...
  下载章节《天官冢宰·典妇功/夏采》...
  下载章节《地官司徒·叙官》...
  下载章节《地官司徒·大司徒》...
  下载章节《地官司徒·小司徒》...
  下载章节《地官司徒·乡师/比长》...
  下载章节《地官司徒·封人/均人》...
  下载章节《地官司徒·师氏/媒氏》...
  下载章节《地官司徒·司市/掌节》...
  下载章节《地官司徒·遂人/土均》...
  下载章节《地官司徒·草人/羽人》...
  下载章节《地官司徒·掌葛/槁人》...
  下载章节《春官宗伯·叙官》...
  下载章节《春官宗伯·大宗伯》...
  下载章节《春官宗伯·小宗伯》...
  下载章节《春官宗伯·肆师》...
  下载章节《春官宗伯·郁人/典瑞》...
  下载章节《春官宗伯·典命/职丧》...
  下载章节《春官宗伯·大司乐/小师》...
  下载章节《春官宗伯·瞽蒙/司干》