# Urllib库
- request
    - urlopen 访问
    - urlretrieve 下载
    - Request 增加User-Agent (headers, data, etc.)
    - ProxyHandler处理器（代理设置）
        1. handler = request.ProxyHandler({"http/https": "IP:Port"})
        2. opener = request.build_opener(handler)
        3. url = "" 或者 req = request.Request(url, headers, data, etc.)
        4. resp = opener.open(url/req)
    - Cookies
        1. 把返回的cookies手动添加到headers中
        2. HTTPCookieProcessor (from http.cookiejar import CookieJar)
            1. cookiejar = CookieJar()  
               handler = request.HTTPCookieProcessor(cookiejar)  
               opener = request.build_opener(handler)  
            2. 使用opener发送登录的请求  
               opener.open(req) & req = request.Request(url, etc.)  
            3. 访问接下来的网页  
               req = request.Request(url, headers=headers)  
               resp = opener.open(req)
        3. MozillaCookieJar  
            cookiejar = MozillaCookieJar('cookie.txt')  
            handler = request.HTTPCookieProcessor(cookiejar)  
            opener = request.build_opener(handler)
           
- parse
    - urlencode 编码
    - parse_qs 解码
    - urlparse & urlsplit 分析

## urlopen函数

In [18]:
from urllib import request
resp = request.urlopen('https://www.baidu.com')
print(resp.read())



In [19]:
resp.getcode() # HTTP status code; 200 means the request succeeded and the server returned data normally

200

## urlretrieve函数

In [20]:
request.urlretrieve("https://www.baidu.com", "baidu.html")

('baidu.html', <http.client.HTTPMessage at 0x10f3d6b00>)

## urlencode函数

In [21]:
from urllib import parse
data = {'name': '爬虫基础', 'greet': 'hello world', "age": 100}
qs = parse.urlencode(data)
print(qs)

name=%E7%88%AC%E8%99%AB%E5%9F%BA%E7%A1%80&greet=hello+world&age=100


In [22]:
url = "https://www.baidu.com/s"
params = {'wd':"刘德华"}
qs = parse.urlencode(params)
url = url + "?" + qs
print(url)

resp = request.urlopen(url)
print(resp.read())

https://www.baidu.com/s?wd=%E5%88%98%E5%BE%B7%E5%8D%8E


## parse_qs函数

In [23]:
data = {'name': '爬虫基础', 'greet': 'hello world', "age": 100}
qs = parse.urlencode(data)
print(qs)

result = parse.parse_qs(qs)
print(result)

name=%E7%88%AC%E8%99%AB%E5%9F%BA%E7%A1%80&greet=hello+world&age=100
{'name': ['爬虫基础'], 'greet': ['hello world'], 'age': ['100']}


## urlparse和urlsplit

In [24]:
from urllib import parse
# urlparse
url = 'https://www.baidu.com/s?wd=python&username=abc#1'
result = parse.urlparse(url)
print(result)

ParseResult(scheme='https', netloc='www.baidu.com', path='/s', params='', query='wd=python&username=abc', fragment='1')


In [25]:
# Print every component of the urlparse() result, one "name value" pair per line.
for part in ("scheme", "netloc", "path", "params", "query", "fragment"):
    print(part, getattr(result, part))

scheme https
netloc www.baidu.com
path /s
params 
query wd=python&username=abc
fragment 1


In [26]:
# urlsplit
result = parse.urlsplit(url)
print(result)

SplitResult(scheme='https', netloc='www.baidu.com', path='/s', query='wd=python&username=abc', fragment='1')


In [27]:
# Print every component of the urlsplit() result, one "name value" pair per line.
# "params" is deliberately absent from the list: the urlsplit() result has no such field.
for part in ("scheme", "netloc", "path", "query", "fragment"):
    print(part, getattr(result, part))

scheme https
netloc www.baidu.com
path /s
query wd=python&username=abc
fragment 1


In [28]:
# 例如 url = 'https://www.baidu.com/s;hello?wd=python&username=abc#1'
# 这里路径中分号后面的 hello 就是 params，实际中并不常见
# 除了 urlsplit 的结果里没有单独的 params 字段这一点，urlparse 和 urlsplit 是一样的

## request

In [29]:
from urllib import request
url = 'https://www.lagou.com/jobs/list_?labelWords=&fromSearch=true&suginput='

resp = request.urlopen(url)
print(resp.read())
# Lagou has anti-scraping checks: this request is sent without any custom
# headers, so it is recognised as a bot straight away and the body printed
# below is only a "page loading" placeholder, not the job list.

b'<html><head><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"><meta name="renderer" content="webkit"><meta http-equiv="Content-Type" content="text/html; charset=utf-8"></head><script type="text/javascript" src="https://www.lagou.com/utrack/trackMid.js?version=1.0.0.3&t=1535835896"></script><body><input type="hidden" id="KEY" value="rsagIwk3yl2hnrkI98FuQACf9eerWodYa0dPJ"/><script type="text/javascript">kfGNYOsx();</script>\xe9\xa1\xb5\xe9\x9d\xa2\xe5\x8a\xa0\xe8\xbd\xbd\xe4\xb8\xad...<script type="text/javascript" src="https://www.lagou.com/upload/oss.js"></script></body></html>\n'


In [30]:
# Browser-like User-Agent and Referer headers for the same Lagou url.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
          'Referer': 'https://www.lagou.com/jobs/list_?city=%E4%B8%8A%E6%B5%B7&cl=false&fromSearch=true&labelWords=&suginput='}
req = request.Request(url, headers=headers) # only builds the Request object; nothing has been sent yet
resp = request.urlopen(req) # pass the prepared Request here instead of the bare url
print(resp.read())

b'<!DOCTYPE html>\n<html>\n<head>\n    <!-- meta -->\n    <meta charset="UTF-8">\n<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1" />\n<meta name="renderer" content="webkit">\n<meta property="qc:admins" content="23635710066417756375" />\n<meta name="baidu-site-verification" content="QIQ6KC1oZ6" />\n\n<meta content="\xe6\x89\xbe\xe5\xb7\xa5\xe4\xbd\x9c,\xe6\x8b\x9b\xe8\x81\x98\xe7\xbd\x91,\xe6\x8b\x9b\xe8\x81\x98\xe4\xbf\xa1\xe6\x81\xaf,\xe4\xba\x92\xe8\x81\x94\xe7\xbd\x91\xe6\x8b\x9b\xe8\x81\x98" name="keywords">\n\n<meta content="\xe6\x89\xbe\xe5\xb7\xa5\xe4\xbd\x9c,\xe6\x8b\x9b\xe8\x81\x98\xe7\xbd\x91,\xe6\xb1\x82\xe8\x81\x8c\xe7\xbd\x91,\xe4\xba\x92\xe8\x81\x94\xe7\xbd\x91\xe6\x8b\x9b\xe8\x81\x98,\xe6\x8b\x89\xe5\x8b\xbe\xe7\xbd\x91\xe6\x98\xaf\xe4\xba\x92\xe8\x81\x94\xe7\xbd\x91\xe9\xa2\x86\xe5\x9f\x9f\xe5\x9e\x82\xe7\x9b\xb4\xe6\x8b\x9b\xe8\x81\x98\xe7\xbd\x91\xe7\xab\x99,\xe4\xba\x92\xe8\x81\x94\xe7\xbd\x91\xe8\x81\x8c\xe4\xb8\x9a\xe6\x9c\xba\xe4\xbc\x9a\xe5\xb0\xbd\

In [31]:
url = 'https://www.lagou.com/jobs/positionAjax.json?px=new&city=%E4%B8%8A%E6%B5%B7&needAddtionalResult=false'
# Form fields sent in the POST body.
data = {
    'first': 'true',
    'pn': 1,
    'kd': ''
}
# urlencode() turns the dict into a form string and encode('utf-8') turns that
# string into bytes, which is what is passed as the request body.
req = request.Request(url, headers=headers, data=parse.urlencode(data).encode('utf-8'), method='POST')
resp = request.urlopen(req)
# The body is read as bytes and decoded to str before printing.
print(resp.read().decode('utf-8'))

{"requestId":null,"msg":null,"resubmitToken":null,"success":true,"content":{"pageNo":1,"pageSize":15,"hrInfoMap":{"4184829":{"userId":3534413,"receiveEmail":null,"phone":null,"positionName":"部门助理","realName":"胡心烽","portrait":"i/image2/M00/46/AB/CgotOVrV4iOADc-IAATEPU2Phro117.jpg","userLevel":"G1","canTalk":true},"5079701":{"userId":11480326,"receiveEmail":null,"phone":null,"positionName":"HRBP","realName":"Melody","portrait":"i/image2/M01/81/6A/CgotOVt_wfeADFCXAAENQXjFeC8141.jpg","userLevel":"G1","canTalk":true},"5079730":{"userId":9596340,"receiveEmail":null,"phone":null,"positionName":"解决方案VP","realName":"周涛","portrait":"i/image2/M00/31/AA/CgoB5lpAQc2Af7F9AAE2vr6BaEg741.png","userLevel":"G1","canTalk":true},"4702703":{"userId":9739001,"receiveEmail":null,"phone":null,"positionName":"人事经理","realName":"刘毅","portrait":"i/image3/M00/0C/35/Cgq2xlpmvaqAIy8PAAAiXgrmYzA738.png","userLevel":"G1","canTalk":true},"5079698":{"userId":10206667,"receiveEmail":null,"phone":null,"positionName":"人事部"

## ProxyHandler处理器（代理设置）

In [32]:
# Request sent directly, without a proxy.
resp = request.urlopen('http://httpbin.org/get')
print(resp.read().decode("utf-8"))


# The "origin" field of the response is the IP the request came from, i.e. the current machine's IP.
# http://httpbin.org/get echoes the request back, which makes it handy for inspecting what was sent.

{
  "args": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Connection": "close", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.6"
  }, 
  "origin": "108.64.57.86", 
  "url": "http://httpbin.org/get"
}



In [33]:
# Same request, this time sent through a proxy.
url = "http://httpbin.org/get"
# A Request object works as well: req = request.Request("http://httpbin.org/get"), with data, headers, etc. added if needed.

# 1. Build a handler by passing the proxy mapping to ProxyHandler
handler = request.ProxyHandler({"http": "124.42.68.152:90"}) # scheme -> "IP:Port"
# 2. Build an opener from the handler created above
opener = request.build_opener(handler)
# 3. Send the request through the opener
resp = opener.open(url) # req can be passed here in place of url
print(resp.read().decode('utf-8'))


# The "origin" field of the response is now the proxy IP, "124.42.68.152".

{
  "args": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Connection": "close", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.6"
  }, 
  "origin": "124.42.68.152", 
  "url": "http://httpbin.org/get"
}



## Cookies

In [34]:
import chardet
import os

# Approach 1: copy the Cookie header of an already logged-in browser session
# and send it by hand.
# The cookie value is a live credential, so it is read from the EEE_COOKIE
# environment variable (format: "PHPSESSID=...; ucinetid_auth=...") instead
# of being pasted into the notebook. A missing variable raises KeyError.
my_url = 'https://eee.uci.edu/myeee/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Cookie': os.environ['EEE_COOKIE'],
}
req = request.Request(url=my_url, headers=headers)
# The response is closed as soon as its body has been read.
with request.urlopen(req) as resp:
    page = resp.read()
# resp.read() returns bytes, while write() on a text-mode file needs str:
#   bytes -> decode -> str
#   str -> encode -> bytes
# encoding='utf-8' makes the file UTF-8 regardless of the platform default.
with open('eee.html', 'w', encoding='utf-8') as fp:
    fp.write(page.decode('utf-8'))

In [35]:
# 另一种方法，走登入界面

## http.cookiejar模块
- CookieJar 存到内存中
- FileCookieJar 存到文件中
- **MozillaCookieJar** 与Mozilla浏览器cookies.txt兼容
- LWPCookieJar 与libwww-perl标准的Set-Cookie3文件格式兼容

### Cookiejar

In [36]:
from http.cookiejar import CookieJar

import os

# 1. Log in
# 1.1 Create a CookieJar object
cookiejar = CookieJar()
# 1.2 Create an HTTPCookieProcessor handler around the cookiejar
handler = request.HTTPCookieProcessor(cookiejar)
# 1.3 Build an opener from that handler
opener = request.build_opener(handler)

# 1.4 Send the login request through the opener
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}

# The form field names (ucinetid, password, ...) can be found in the browser's
# Inspect -> Network tab; other sites use different names such as
# email / password, so check each site individually.
# The account id and password are read from the UCINETID and UCI_PASSWORD
# environment variables so that no credentials are stored in the notebook.
# A missing variable raises KeyError.
data = {
    'ucinetid': os.environ['UCINETID'],
    'password': os.environ['UCI_PASSWORD'],
    'login_button': "Login"
}
login_url = 'https://login.uci.edu/ucinetid/webauth?return_url=https%3A%2F%2Feee.uci.edu%2Flogout%2F'
req = request.Request(login_url, data=parse.urlencode(data).encode('utf-8'), headers=headers)
# The login response body is not needed, so the response is closed at once.
# The request goes through opener (not request.urlopen) so that the cookies
# it receives are stored in cookiejar automatically.
with opener.open(req):
    pass


# 2. Visit the personal page
my_url = 'https://eee.uci.edu/myeee/'
# Do not build a new opener for this request: reuse the one above, because it
# already holds the cookies needed to be treated as logged in.
req = request.Request(my_url, headers=headers)
with opener.open(req) as resp:
    page = resp.read()
# encoding='utf-8' makes the file UTF-8 regardless of the platform default.
with open('eee.html', 'w', encoding='utf-8') as fp:
    fp.write(page.decode('utf-8'))

In [37]:
# The process above can be organised into three steps.
# headers is pulled out to module level because the code that follows needs it more than once.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
}

def get_opener():
    """Return a new opener whose requests share one in-memory CookieJar.

    The opener is built from an HTTPCookieProcessor wrapping a fresh
    CookieJar, so every call returns an independent opener with an empty jar.
    """
    cookie_handler = request.HTTPCookieProcessor(CookieJar())
    return request.build_opener(cookie_handler)

def login_eee(opener, ucinetid=None, password=None):
    """Submit the UCI WebAuth login form through ``opener``.

    The request is sent with ``opener`` so that a cookie-aware opener can
    keep whatever cookies the login response sets.

    Args:
        opener: object with an ``open(request)`` method, e.g. the result of
            ``urllib.request.build_opener``.
        ucinetid: account id; when None it is read from the ``UCINETID``
            environment variable.
        password: account password; when None it is read from the
            ``UCI_PASSWORD`` environment variable.

    Raises:
        ValueError: if either credential is missing or empty.
    """
    import os

    # Credentials are never written into the source; the environment is the
    # fallback when the caller does not pass them explicitly.
    if ucinetid is None:
        ucinetid = os.environ.get('UCINETID')
    if password is None:
        password = os.environ.get('UCI_PASSWORD')
    if not ucinetid or not password:
        raise ValueError(
            'login_eee needs credentials: pass ucinetid/password or set '
            'the UCINETID and UCI_PASSWORD environment variables'
        )

    data = {
        'ucinetid': ucinetid,
        'password': password,
        'login_button': "Login"
    }
    login_url = 'https://login.uci.edu/ucinetid/webauth?return_url=https%3A%2F%2Feee.uci.edu%2Flogout%2F'
    req = request.Request(login_url, data=parse.urlencode(data).encode('utf-8'), headers=headers)
    # The response body is not needed; close the response immediately.
    with opener.open(req):
        pass
    
def visit_profile(opener):
    """Fetch the MyEEE page through ``opener`` and save it as ``eee.html``.

    Pass the same opener that was used for logging in rather than a newly
    built one: that opener is the one carrying the login cookies.

    Args:
        opener: object with an ``open(request)`` method whose result supports
            ``read()`` and the ``with`` statement.
    """
    my_url = 'https://eee.uci.edu/myeee/'
    req = request.Request(my_url, headers=headers)
    # Close the response as soon as the body has been read.
    with opener.open(req) as resp:
        page = resp.read()
    # The body is decoded as UTF-8, so the file is written as UTF-8 too
    # instead of relying on the platform's default encoding.
    with open('eee.html', 'w', encoding='utf-8') as fp:
        fp.write(page.decode('utf-8'))

if __name__ == '__main__':
    # One opener is shared by both steps: log in first, then request the
    # profile page through the very same opener.
    opener = get_opener()
    login_eee(opener)
    visit_profile(opener)

### MozillaCookieJar

In [45]:
from urllib import request
from http.cookiejar import MozillaCookieJar

# Cookie jar tied to the file 'cookie.txt'.
cookiejar = MozillaCookieJar('cookie.txt')
handler = request.HTTPCookieProcessor(cookiejar)
opener = request.build_opener(handler)

# httpbin.org/cookies/set?course=abc is used for testing: it sets the cookie
# course=abc. Only the cookie matters here, so the response is closed at once.
resp = opener.open('http://httpbin.org/cookies/set?course=abc')
resp.close()

# With no filename argument, save() writes to the 'cookie.txt' given above.
# The course=abc cookie expires as soon as the page is closed, so by default
# it would not be written to the file; ignore_discard=True saves it anyway,
# which makes the cookie available for later runs.
cookiejar.save(ignore_discard=True)

In [46]:
# cookiejar.load() reads back cookies that were written to the file earlier.
# The cookie in cookie.txt was saved with ignore_discard=True because it is a
# discard-on-close cookie; load() skips such cookies unless it is given the
# same flag, so ignore_discard=True is required here as well.
cookiejar = MozillaCookieJar('cookie.txt')
cookiejar.load(ignore_discard=True)
handler = request.HTTPCookieProcessor(cookiejar)
opener = request.build_opener(handler)

# Request /cookies rather than /cookies/set?course=abc: the latter would set
# the cookie again and hide whether load() actually restored anything.
resp = opener.open('http://httpbin.org/cookies')
resp.close()

# Whatever is printed here was read from cookie.txt.
for cookie in cookiejar:
    print(cookie)

<Cookie course=abc for httpbin.org/>


**这样的话不用每次爬的时候都去登入了**  
**可以直接读取已经保存下来的cookie信息**