53 changes: 53 additions & 0 deletions runoob2pdf/README.md
@@ -0,0 +1,53 @@
# Python crawler: convert the tutorials on the runoob site into PDF e-books

### Requirements
Python 3.4 or later; Python 2.x is not supported.


### Tools

requests and beautifulsoup are the two workhorses of web scraping: requests handles the network requests and beautifulsoup handles the HTML parsing. With these two in hand the job is straightforward, and a full crawler framework like scrapy would be overkill for a small script like this. Since we are converting HTML to PDF, we also need a library for that step: wkhtmltopdf is an excellent cross-platform HTML-to-PDF converter, and pdfkit is its Python wrapper. click is a command-line argument library, used here to pass parameters on the command line.

First, install the dependencies below:

```shell
pip install requests
pip install beautifulsoup4
pip install pdfkit
pip install click
```
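As a quick sanity check that requests and beautifulsoup are installed, here is a minimal sketch of the kind of menu parsing the script performs; the HTML fragment below is made up for illustration, mimicking the `class="design"` menu the script looks for:

```python
from bs4 import BeautifulSoup

# A made-up fragment mimicking a runoob-style tutorial menu
html = """
<div class="design">
  <a href="python-intro.html">Intro</a>
  <a href="/python3/python3-setup.html">Setup</a>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
menu = soup.find(class_="design")
hrefs = [a.get("href") for a in menu.find_all("a")]
print(hrefs)  # ['python-intro.html', '/python3/python3-setup.html']
```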

### Installing wkhtmltopdf
On Windows, download a stable wkhtmltopdf build from [http://wkhtmltopdf.org/downloads.html](http://wkhtmltopdf.org/downloads.html) and install it, then add the executable's directory to the system $PATH variable; otherwise pdfkit cannot find wkhtmltopdf and fails with "No wkhtmltopdf executable found". On Ubuntu and CentOS it can be installed directly from the command line:

```shell
$ sudo apt-get install wkhtmltopdf  # ubuntu
$ sudo yum install wkhtmltopdf      # centos
```
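On any platform, you can verify that pdfkit will be able to find the binary before running the script; on Unix-like systems a quick check is:

```shell
# Should print the path of the wkhtmltopdf binary; if it prints nothing,
# pdfkit will fail with "No wkhtmltopdf executable found"
which wkhtmltopdf
```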

### Run
```shell
python runoob2pdf.py
```

### Usage
After running `python runoob2pdf.py`, you will be prompted for:
1. The index page URL of a tutorial on the runoob site; this is the URL behind the corresponding entry in the site's top menu (see the screenshots).
2. A file name for the generated PDF.
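Since the script uses click, both values can also be passed as options up front instead of answering the prompts; the tutorial URL below is a hypothetical example:

```shell
$ python runoob2pdf.py --url https://www.runoob.com/python3/python3-tutorial.html --file python3-tutorial
```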

### Screenshots
![image](./runoob2pdf.jpg)
![image](./runoob2pdf_1.jpg)
![image](./runoob2pdf_2.jpg)
![image](./runoob2pdf_3.jpg)

### Acknowledgements
Thanks to liuzhijun, author of "Python crawler: convert Liao Xuefeng's tutorials into a PDF e-book"; the code in this project is adapted from his.

### Contact me
>Author: jadentseng
>WeChat: cheney2010


Empty file added runoob2pdf/__init__.py
Binary file added runoob2pdf/runoob2pdf.jpg
156 changes: 156 additions & 0 deletions runoob2pdf/runoob2pdf.py
@@ -0,0 +1,156 @@
# -*- coding: utf-8 -*-
import os
import re  # used only by the commented-out img-src rewrite below
import time
from urllib.parse import urlparse

import click
import pdfkit
import requests
from bs4 import BeautifulSoup

__author__ = 'jaden.tseng@foxmail.com'

html_template = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
</head>
<body>
{content}
</body>
</html>

"""


def get_url_list(url):
    """
    Collect the URLs of all pages listed in the tutorial's menu.
    :param url: index page of the tutorial
    :return: list of page URLs
    """
    last_position = find_last(url, "/") + 1
    tutorial_url_head = url[0:last_position]
    domain = get_domain(url) + "/"

    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    menu_tag = soup.find(class_="design")
    urls = []
    for a in menu_tag.find_all("a"):
        href = str(a.get('href'))
        # A relative href (no slash) is resolved against the tutorial
        # directory; anything else is resolved against the site root.
        if href.find('/') == -1:
            page_url = tutorial_url_head + href
        else:
            page_url = domain + href
        urls.append(page_url)
    return urls


def parse_url_to_html(url, name):
    """
    Fetch a page, extract the article body and save it as an HTML file.
    :param url: page to fetch
    :param name: file name to save the HTML under
    :return: the saved file name, or None on error
    """
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Article body
        body = soup.find_all(class_="article-intro")
        # Title
        # title = soup.find_all('h1')[1].get_text()

        # Prepend the title to the body, centered
        # center_tag = soup.new_tag("center")
        # title_tag = soup.new_tag('h1')
        # title_tag.string = title
        # center_tag.insert(1, title_tag)
        # body.insert(1, center_tag)

        # str(body) renders the one-element result set as "[<div ...>]";
        # strip the surrounding brackets to get the raw markup.
        h = str(body)
        html = h[1:-1]
        # Rewrite relative img src attributes in the body to absolute URLs
        # pattern = "(<img .*?src=\")(.*?)(\")"
        #
        # def func(m):
        #     if not m.group(3).startswith("http"):
        #         rtn = m.group(1) + domain + m.group(2) + m.group(3)
        #         return rtn
        #     else:
        #         return m.group(1) + m.group(2) + m.group(3)
        #
        # html = re.compile(pattern).sub(func, html)
        html = html_template.format(content=html)
        html = html.encode("utf-8")
        with open(name, 'wb') as f:
            f.write(html)
        return name

    except Exception as e:
        # logging.error("parse error: %s" % e, exc_info=True)
        print(e)


def save_pdf(htmls, file_name):
    """
    Convert the list of HTML files into a single PDF.
    :param htmls: list of HTML file names
    :param file_name: output PDF file name
    :return:
    """
    options = {
        'page-size': 'Letter',
        'margin-top': '0.75in',
        'margin-right': '0.75in',
        'margin-bottom': '0.75in',
        'margin-left': '0.75in',
        'encoding': "UTF-8",
        'custom-header': [
            ('Accept-Encoding', 'gzip')
        ],
        'cookie': [
            ('cookie-name1', 'cookie-value1'),
            ('cookie-name2', 'cookie-value2'),
        ],
        'outline-depth': 10,
    }
    pdfkit.from_file(htmls, file_name, options=options)


def find_last(string, char):
    """Return the index of the last occurrence of char in string, or -1."""
    last_position = -1
    while True:
        position = string.find(char, last_position + 1)
        if position == -1:
            return last_position
        last_position = position


def get_domain(url):
    r = urlparse(url)
    return r.scheme + "://" + r.netloc


@click.command()
@click.option('--url', prompt='Enter the index page URL of the runoob tutorial to crawl',
              help='index page URL of a tutorial on the runoob site')
@click.option('--file', prompt='Enter a name for the output PDF file',
              help='name only; the .pdf suffix is added automatically')
def main(url, file):
    start = time.time()
    urls = get_url_list(url)
    file_name = u"%s.pdf" % file
    htmls = [parse_url_to_html(page_url, str(index) + ".html")
             for index, page_url in enumerate(urls)]
    print(htmls)
    save_pdf(htmls, file_name)

    for html in htmls:
        os.remove(html)

    total_time = time.time() - start
    print(u"Total time: %f seconds" % total_time)


if __name__ == '__main__':
    main()
Binary file added runoob2pdf/runoob2pdf_1.jpg
Binary file added runoob2pdf/runoob2pdf_2.jpg
Binary file added runoob2pdf/runoob2pdf_3.jpg