# urllib

**入门: urlopen**

In [73]:
import urllib.request as req
response = req.urlopen('https://www.python.org/')
response.read().decode('utf-8')[:200]

'<!doctype html>\n<!--[if lt IE 7]>   <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9">   <![endif]-->\n<!--[if IE 7]>      <html class="no-js ie7 lt-ie8 lt-ie9">          <![endif]-->\n<!--[if IE 8]>      <h'

In [13]:
type(response)

http.client.HTTPResponse

In [14]:
response.code

200

In [15]:
response.status

200

In [16]:
response.getheaders()[:3]

[('Server', 'nginx'),
 ('Content-Type', 'text/html; charset=utf-8'),
 ('X-Frame-Options', 'SAMEORIGIN')]

In [17]:
response.getheader('Server')

'nginx'

In [20]:
import urllib.parse as par

In [21]:
# arg: data

# par.urlencode将参数字典转化为字符串
data = bytes(par.urlencode({'word':'hello'}), encoding='utf-8')
# 附带data的post请求(有data参数只能是post请求)
response = req.urlopen('http://httpbin.org/post', data=data)
response.read()

b'{\n  "args": {}, \n  "data": "", \n  "files": {}, \n  "form": {\n    "word": "hello"\n  }, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Connection": "close", \n    "Content-Length": "10", \n    "Content-Type": "application/x-www-form-urlencoded", \n    "Host": "httpbin.org", \n    "User-Agent": "Python-urllib/3.6"\n  }, \n  "json": null, \n  "origin": "3.120.235.86", \n  "url": "http://httpbin.org/post"\n}\n'

In [22]:
data

b'word=hello'

In [25]:
# arg: timeout
response = req.urlopen('http://httpbin.org/get', timeout=1)
response.read()

b'{\n  "args": {}, \n  "headers": {\n    "Accept-Encoding": "identity", \n    "Connection": "close", \n    "Host": "httpbin.org", \n    "User-Agent": "Python-urllib/3.6"\n  }, \n  "origin": "3.120.235.86", \n  "url": "http://httpbin.org/get"\n}\n'

In [32]:
# when timeout jump
import socket
import urllib.error

In [46]:
try:
    response = req.urlopen('http://httpbin.org/get', timeout=0.01)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print('time out')
# timeout

time out


**请求: request**

In [47]:
# complex request
url = 'http://httpbin.org/post'
headers = {
    # firefox's User-Agent
    'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11',
    # 域
    'Host': 'httpbin.org'
}

In [49]:
dict = {
    'name': 'Germey'
}
data = bytes(par.urlencode(dict), encoding='utf8')

In [52]:
myreq = req.Request(url=url, data=data, headers=headers, method='POST')
response = req.urlopen(myreq)
result = response.read().decode('utf-8')

In [53]:
import json
json.loads(result)
# problem: null changed to None

{'args': {},
 'data': '',
 'files': {},
 'form': {'name': 'Germey'},
 'headers': {'Accept-Encoding': 'identity',
  'Connection': 'close',
  'Content-Length': '11',
  'Content-Type': 'application/x-www-form-urlencoded',
  'Host': 'httpbin.org',
  'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11'},
 'json': None,
 'origin': '3.120.235.86',
 'url': 'http://httpbin.org/post'}

In [57]:
# use add_header method
myreq = req.Request(url=url, data=data, method='POST')
myreq.add_header('User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
json.loads(req.urlopen(myreq).read().decode('utf-8'))

{'args': {},
 'data': '',
 'files': {},
 'form': {'name': 'Germey'},
 'headers': {'Accept-Encoding': 'identity',
  'Connection': 'close',
  'Content-Length': '11',
  'Content-Type': 'application/x-www-form-urlencoded',
  'Host': 'httpbin.org',
  'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'},
 'json': None,
 'origin': '3.120.235.86',
 'url': 'http://httpbin.org/post'}

**urllib.request.BaseHandler**

In [61]:
# 处理cookies

# 获取cookie
import http.cookiejar

cookie = http.cookiejar.CookieJar()
handler = req.HTTPCookieProcessor(cookie)
opener = req.build_opener(handler)
response = opener.open('http://www.baidu.com')

In [65]:
cookie
# <CookieJar[]> 内有多个Cookie(), 拥有name,value等属性

<CookieJar[Cookie(version=0, name='BAIDUID', value='998BE1D22DA99B5C5DA6E02755BECF8B:FG=1', port=None, port_specified=False, domain='.baidu.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=3693420275, discard=False, comment=None, comment_url=None, rest={}, rfc2109=False), Cookie(version=0, name='BIDUPSID', value='998BE1D22DA99B5C5DA6E02755BECF8B', port=None, port_specified=False, domain='.baidu.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=3693420275, discard=False, comment=None, comment_url=None, rest={}, rfc2109=False), Cookie(version=0, name='H_PS_PSSID', value='1440_27216_21086_28205_28132_26350_27751_28140_22158', port=None, port_specified=False, domain='.baidu.com', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={}, rfc2109=False), Cookie(version=0, name

In [67]:
for item in cookie:
    print(item.name+"                "+item.value)

BAIDUID                998BE1D22DA99B5C5DA6E02755BECF8B:FG=1
BIDUPSID                998BE1D22DA99B5C5DA6E02755BECF8B
H_PS_PSSID                1440_27216_21086_28205_28132_26350_27751_28140_22158
PSTM                1545936627
delPer                0
BDSVRTM                0
BD_HOME                0


In [72]:
# 保存为文本
cookie = http.cookiejar.MozillaCookieJar('relatedfiles/cookietest')
handler = req.HTTPCookieProcessor(cookie)
opener = req.build_opener(handler)
response = opener.open('http://www.baidu.com')

cookie.save(ignore_discard=True, ignore_expires=True)

In [76]:
# LWP格式: pass

In [75]:
# 使用cookie
cookie.load('relatedfiles/cookietest', ignore_discard=True, ignore_expires=True)
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
response.read().decode('utf-8')[:300]

'<!DOCTYPE html>\n<!--STATUS OK-->\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\t\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\t\r\n        \r\n\t\t\t        \r\n\t\r\n\t\t\t        \r\n\t\r\n\t\t\t        \r\n\t\r\n\t\t\t        \r\n\t\t\t    \r\n\r\n\t\r\n'

**处理异常**

In [77]:
from urllib import error, request

In [80]:
# urlerror
try:
    res = request.urlopen('https://example.com/notexist.html')
except error.URLError as e:
    print(e.reason)

Not Found


In [82]:
# httperror (urlerror's son-class) 
try:
    res = request.urlopen('https://example.com/notexist.html')
except error.HTTPError as e:
    print(e.reason, e.code, e.headers, sep='\n')

Not Found
404
Accept-Ranges: bytes
Cache-Control: max-age=604800
Content-Type: text/html; charset=UTF-8
Date: Thu, 27 Dec 2018 20:23:48 GMT
Expires: Thu, 03 Jan 2019 20:23:48 GMT
Last-Modified: Thu, 27 Dec 2018 20:19:00 GMT
Server: ECS (dca/2495)
Vary: Accept-Encoding
X-Cache: 404-HIT
Content-Length: 1270
Connection: close




In [84]:
# better code
try:
    res = request.urlopen('https://example.com/index.html')
except error.HTTPError  as e:
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('succed')

succed


In [85]:
# 返回一个对象而不是字符串
try:
    response = request.urlopen('https://www.baidu.com', timeout=0.01)
except error.URLError as e:
    print(type(e.reason))
    if isinstance(e.reason, socket.timeout):
        print('TIME OUT')

<class 'socket.timeout'>
TIME OUT


**解析链接**

In [89]:
# url的识别和分段: urlparse()
from urllib import parse
parse.urlparse('https://tianchi.aliyun.com/user/myGitlab.htm?spm=5176.100150.0.0.791f2009FtTxRl')
# scheme: 协议
# netloc: 域名
# path
# params
# query: 用于get类型的url
# frament: #
# 标准的链接格式
# scheme://netloc/path;params?query#frament

ParseResult(scheme='https', netloc='tianchi.aliyun.com', path='/user/myGitlab.htm', params='', query='spm=5176.100150.0.0.791f2009FtTxRl', fragment='')

In [92]:
# urlunparse()
parse.urlunparse(['https','tianchi.aliyun.com','user/myGitlab.htm','','spm=5176.100150.0.0.791f2009FtTxRl',''])

'https://tianchi.aliyun.com/user/myGitlab.htm?spm=5176.100150.0.0.791f2009FtTxRl'

In [95]:
# urlsplit & urlunsplit: params包含在path中
parse.urlsplit('https://tianchi.aliyun.com/user/myGitlab.htm?spm=5176.100150.0.0.791f2009FtTxRl')

SplitResult(scheme='https', netloc='tianchi.aliyun.com', path='/user/myGitlab.htm', query='spm=5176.100150.0.0.791f2009FtTxRl', fragment='')

In [96]:
parse.urlunsplit(['https','tianchi.aliyun.com','user/myGitlab.htm','spm=5176.100150.0.0.791f2009FtTxRl',''])

'https://tianchi.aliyun.com/user/myGitlab.htm?spm=5176.100150.0.0.791f2009FtTxRl'

In [99]:
# urljoin
# par2 补充 par1, par1可选scheme,netloc和path
parse.urljoin('http://www.baidu.com', 'FAQ.html')

'http://www.baidu.com/FAQ.html'

In [101]:
# par2包含的部分会覆盖par1
parse.urljoin('http://www.baidu.com', 'https://cuiqingcai.com/FAQ.html')

'https://cuiqingcai.com/FAQ.html'

In [107]:
# urlencode: dict=>get par
params = {
    'name': 'germey',
    'age': 22
}
base_url = 'http://www.baidu.com?'
url = base_url + parse.urlencode(params)
url

'http://www.baidu.com?name=germey&age=22'

In [112]:
# parse_qs urlencode逆向
parse.parse_qs(parse.urlsplit(url).query)

{'name': ['germey'], 'age': ['22']}

In [113]:
# parse_qsl =>list
parse.parse_qsl(parse.urlsplit(url).query)

[('name', 'germey'), ('age', '22')]

In [115]:
# unicode => urlcode
# quote()
url = 'https"//www.baidu.com/s?wd=' + parse.quote('中文')
url

'https"//www.baidu.com/s?wd=%E4%B8%AD%E6%96%87'

In [121]:
# urlcode => unicode
# unquote()
a = parse.urlsplit('https://www.google.com/search?ei=uUYlXNvuJIWQmgX884CgCw&q=python3%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB%E5%BC%80%E5%8F%91%E5%AE%9E%E6%88%98&oq=python3%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB%E5%BC%80%E5%8F%91%E5%AE%9E%E6%88%98&gs_l=psy-ab.3..0i71l8.0.0..1784...0.0..0.0.0.......0......gws-wiz.e4qUIr993lc')
b = parse.unquote(a.query)
parse.parse_qs(b)

{'ei': ['uUYlXNvuJIWQmgX884CgCw'],
 'q': ['python3网络爬虫开发实战'],
 'oq': ['python3网络爬虫开发实战'],
 'gs_l': ['psy-ab.3..0i71l8.0.0..1784...0.0..0.0.0.......0......gws-wiz.e4qUIr993lc']}

# 分析Robots协议

In [130]:
from urllib.robotparser import RobotFileParser

# 也可直接使用 rp = RobotFileParser('http://www.jianshu.com/robots.txt')
rp = RobotFileParser()
rp.set_url('http://www.jianshu.com/robots.txt')
# read: 执行读取操作
rp.read()
# 理论上这个是输出True的
print(rp.can_fetch('*', 'https://www.jianshu.com/p/c65e78dc3d1d'))
print(rp.can_fetch('*', "http://www.jianshu.com/search?q=python&page=1&type=collections"))

False
False


http://www.jianshu.com/robots.txt 内容如下

```
User-agent: *
Disallow: /search
Disallow: /convos/
Disallow: /notes/
Disallow: /admin/
Disallow: /adm/
Disallow: /p/0826cf4692f9
Disallow: /p/d8b31d20a867
Disallow: /collections/*/recommended_authors
Disallow: /trial/*
Disallow: /keyword_notes
Disallow: /stats-2017/*

User-agent: trendkite-akashic-crawler
Request-rate: 1/2 # load 1 page per 2 seconds
Crawl-delay: 60

User-agent: YisouSpider
Request-rate: 1/10 # load 1 page per 10 seconds
Crawl-delay: 60
```