*补充：P36 中关于豆瓣 tag 的 URL 编码*

In [1]:
from lxml import html
import requests, urllib.parse

# Browser-like User-Agent header sent with the request below.
my_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36 Edg/83.0.478.56',
}

# Download the Douban book tag index. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() stops on an HTTP
# error instead of silently parsing an error page.
tag_page = requests.get('https://book.douban.com/tag/', headers=my_headers, timeout=10)
tag_page.raise_for_status()
tag_tree = html.fromstring(tag_page.content)
tags = tag_tree.xpath("//tbody/tr/td/a")

# Tag names as shown on the page. Anchors without leading text are skipped:
# their .text is None, so calling .strip() on it would raise AttributeError.
tag_list = [tag.text.strip() for tag in tags if tag.text is not None]

# Percent-encoded form of every tag name, in the same order as tag_list.
tag_url_encoding_list = [urllib.parse.quote(text) for text in tag_list]

print(tag_list)
print(tag_url_encoding_list)

['小说', '外国文学', '文学', '经典', '中国文学', '随笔', '日本文学', '散文', '村上春树', '诗歌', '童话', '名著', '儿童文学', '古典文学', '余华', '王小波', '杂文', '当代文学', '张爱玲', '外国名著', '钱钟书', '鲁迅', '诗词', '茨威格', '米兰·昆德拉', '杜拉斯', '港台', '漫画', '推理', '绘本', '东野圭吾', '青春', '悬疑', '科幻', '言情', '推理小说', '奇幻', '武侠', '日本漫画', '耽美', '韩寒', '网络小说', '科幻小说', '三毛', '亦舒', '阿加莎·克里斯蒂', '金庸', '安妮宝贝', '穿越', '郭敬明', '轻小说', '魔幻', '青春文学', '几米', 'J.K.罗琳', '幾米', '张小娴', '古龙', '校园', '高木直子', '沧月', '余秋雨', '落落', '历史', '心理学', '哲学', '社会学', '传记', '文化', '艺术', '社会', '政治', '设计', '宗教', '政治学', '建筑', '电影', '数学', '中国历史', '回忆录', '思想', '国学', '人物传记', '艺术史', '人文', '音乐', '绘画', '戏剧', '西方哲学', '近代史', '二战', '军事', '佛教', '考古', '自由主义', '美术', '爱情', '成长', '生活', '旅行', '心理', '女性', '励志', '摄影', '教育', '职场', '美食', '游记', '灵修', '健康', '情感', '人际关系', '两性', '养生', '手工', '家居', '自助游', '经济学', '管理', '经济', '商业', '金融', '投资', '营销', '理财', '创业', '股票', '广告', '企业史', '策划', '科普', '互联网', '科学', '编程', '交互设计', '算法', '用户体验', '科技', 'web', '交互', '通信', 'UE', '神经网络', 'UCD', '程序']
['%E5%B0%8F%E8%AF%B4', '%E5%A4%96%E5%9B%BD%E6%

In [2]:
import urllib.parse
# Print the built-in documentation for urllib.parse.quote (signature and escaping rules).
help(urllib.parse.quote)

Help on function quote in module urllib.parse:

quote(string, safe='/', encoding=None, errors=None)
    quote('abc def') -> 'abc%20def'
    
    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted. The
    quote function offers a cautious (not minimal) way to quote a
    string for most of these parts.
    
    RFC 3986 Uniform Resource Identifier (URI): Generic Syntax lists
    the following (un)reserved characters.
    
    unreserved    = ALPHA / DIGIT / "-" / "." / "_" / "~"
    reserved      = gen-delims / sub-delims
    gen-delims    = ":" / "/" / "?" / "#" / "[" / "]" / "@"
    sub-delims    = "!" / "$" / "&" / "'" / "(" / ")"
                  / "*" / "+" / "," / ";" / "="
    
    Each of the reserved characters is reserved in some component of a URL,
    but not necessarily in all of them.
    
    The quote function %-escapes all characters that are neither in the
    unreserved chars ("always safe") nor

*官方文档中关于 ThreadPoolExecutor 的例子*

In [3]:
import concurrent.futures
import urllib.request

# Pages fetched by the thread-pool demo below.
URLS = [
    'http://www.foxnews.com/',
    'http://www.cnn.com/',
    'http://europe.wsj.com/',
    'http://www.bbc.co.uk/',
    'http://some-made-up-domain.com/',
]

def load_url(url, timeout):
    """Fetch a single page and return its contents.

    Args:
        url: Address handed to urllib.request.urlopen.
        timeout: Forwarded unchanged as urlopen's ``timeout`` argument.

    Returns:
        The value of the response's ``read()`` call (the whole body).

    Raises:
        Any exception raised by ``urlopen`` or ``read()`` propagates to the caller.
    """
    response = urllib.request.urlopen(url, timeout=timeout)
    try:
        return response.read()
    finally:
        # Close the response whether or not the read succeeded.
        response.close()

# Run the downloads on a pool of up to five worker threads; the with statement
# ensures the threads are cleaned up promptly once the block is left.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
    # Submit one load_url call per URL (timeout argument 60) and remember
    # which URL each future belongs to.
    jobs = {}
    for address in URLS:
        jobs[pool.submit(load_url, address, 60)] = address

    # Handle each future as it finishes, reporting either the error or the size.
    for finished in concurrent.futures.as_completed(jobs):
        address = jobs[finished]
        try:
            body = finished.result()
        except Exception as error:
            print('%r generated an exception: %s' % (address, error))
            continue
        print('%r page is %d bytes' % (address, len(body)))

'http://www.foxnews.com/' page is 331286 bytes
'http://some-made-up-domain.com/' page is 64668 bytes
'http://www.bbc.co.uk/' page is 274945 bytes
'http://europe.wsj.com/' generated an exception: HTTP Error 404: Not Found
'http://www.cnn.com/' page is 1142131 bytes


In [4]:
import concurrent.futures
# Print the built-in documentation for the ThreadPoolExecutor class.
help(concurrent.futures.ThreadPoolExecutor)
# Visual separator between the two help listings.
print('**********************************************************************')
# Print the built-in documentation for the as_completed function.
help(concurrent.futures.as_completed)

Help on class ThreadPoolExecutor in module concurrent.futures.thread:

class ThreadPoolExecutor(concurrent.futures._base.Executor)
 |  ThreadPoolExecutor(max_workers=None, thread_name_prefix='', initializer=None, initargs=())
 |  
 |  This is an abstract base class for concrete asynchronous executors.
 |  
 |  Method resolution order:
 |      ThreadPoolExecutor
 |      concurrent.futures._base.Executor
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, max_workers=None, thread_name_prefix='', initializer=None, initargs=())
 |      Initializes a new ThreadPoolExecutor instance.
 |      
 |      Args:
 |          max_workers: The maximum number of threads that can be used to
 |              execute the given calls.
 |          thread_name_prefix: An optional name prefix to give our threads.
 |          initializer: A callable used to initialize worker threads.
 |          initargs: A tuple of arguments to pass to the initializer.
 |  
 |  shutdown(self, wait=T

*字符串格式化的另一种写法*

In [5]:
# Two ways of interpolating the same variable into a string.
var = 'world'
# str.format() fills the {} placeholder with its argument.
print('Hello {}!'.format(var))
print(f'Hello {var}!') # f-string literal; requires Python 3.6 or newer

Hello world!
Hello world!
