# requests

In [2]:
# Basic GET request: fetch httpbin's /get endpoint and print the raw response body.
import requests

r = requests.get('http://httpbin.org/get')
print(r.text)
# Caveat: decoding this text with the json module (json.load / json.loads) maps JSON
# literals to their Python equivalents, e.g. null => None.

{
  "args": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.21.0"
  }, 
  "origin": "3.120.235.86", 
  "url": "http://httpbin.org/get"
}



In [3]:
# GET request with query parameters supplied as a dict via `params`.
data = {
    'name': 'germey',
    'age': 22
}
r = requests.get("http://httpbin.org/get", params=data)
print(r.text)
# The "url" field in the output shows that `data` was appended to the URL as query parameters.
# r.text is a JSON-formatted string.

{
  "args": {
    "age": "22", 
    "name": "germey"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.21.0"
  }, 
  "origin": "3.120.235.86", 
  "url": "http://httpbin.org/get?name=germey&age=22"
}



In [4]:
# r.text is a plain Python str (see output), not a parsed object.
type(r.text)

str

In [5]:
# Parse the JSON text of the response into a Python dict.
import json
json.loads(r.text)

{'args': {'age': '22', 'name': 'germey'},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate',
  'Connection': 'close',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.21.0'},
 'origin': '3.120.235.86',
 'url': 'http://httpbin.org/get?name=germey&age=22'}

In [6]:
# Alternatively, call r.json() to get the parsed dict directly
# (the output matches json.loads(r.text)).
r.json()

{'args': {'age': '22', 'name': 'germey'},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate',
  'Connection': 'close',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.21.0'},
 'origin': '3.120.235.86',
 'url': 'http://httpbin.org/get?name=germey&age=22'}

## 抓取网页

In [9]:
import re

# Send a browser-like User-Agent header with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
}
r = requests.get("https://www.zhihu.com/explore", headers=headers)
# Regular expression capturing the link text of each question in the explore feed;
# re.S lets "." match newlines, so one match may span several lines of HTML.
pattern = re.compile('explore-feed.*?question_link.*?>(.*?)</a>', re.S)
# Collect every captured title from the page body.
pattern.findall(r.text)

['\n「百亿保健帝国」权健是一家怎样的公司？其产品和销售体系是否合法合规？可能带来哪些影响？\n',
 '\n为什么「分子 CT 扫描技术」能入选 Science 2018 年度十大突破？\n',
 '\n如果六小龄童是《三体》中的面壁者，故事情节会怎样发展？\n',
 '\n初等数学很难学，不学初等数学可以直接学高等数学吗？\n',
 '\n你知道哪些反常识的知识？\n',
 '\n狗真的会在危险时刻保护人吗？\n',
 '\n为什么在朱一龙眼里看不到欲望?\n',
 '\n怎样看待Bighit于2019年推出新男团？\n',
 '\n为什么赛璐璐时期TV动画的OP和正片的上色差别很大？\n',
 '\n如何评价 MSRA 视觉组最新提出的 Deformable ConvNets V2？\n']

In [10]:
# POST the dict as the request body; httpbin echoes it back under "form"
# with a form-urlencoded Content-Type (see output).
data = {'name': 'germey', 'age': '22'}
r = requests.post("http://httpbin.org/post", data=data)
print(r.text)

{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "age": "22", 
    "name": "germey"
  }, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Content-Length": "18", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.21.0"
  }, 
  "json": null, 
  "origin": "3.120.235.86", 
  "url": "http://httpbin.org/post"
}



In [14]:
# HTTP status code of the response currently bound to `r`.
r.status_code

200

In [21]:
# requests.codes exposes status codes by name; `ok` evaluates to 200 (see output).
requests.codes.ok

200

In [28]:
# Another name in requests.codes that also evaluates to 200 (see output).
requests.codes.all_good

200

In [32]:
# `not_found` evaluates to 404 (see output).
requests.codes.not_found

404

In [15]:
# Cookie jar attached to the response; it is empty in the recorded output.
r.cookies

<RequestsCookieJar[]>

In [16]:
# History list of the response; it is empty in the recorded output.
r.history

[]

In [17]:
# Response headers sent by the server, displayed as a dict-like mapping.
r.headers

{'Connection': 'keep-alive', 'Server': 'gunicorn/19.9.0', 'Date': 'Fri, 28 Dec 2018 10:42:04 GMT', 'Content-Type': 'application/json', 'Content-Length': '458', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Credentials': 'true', 'Via': '1.1 vegur'}

## 高级用法

In [34]:
# Upload a local file via the `files` argument (sent as multipart/form-data, see output).
# The file is opened in a with-block so the handle is closed once the request has been
# sent, instead of staying open for the lifetime of the kernel.
# Note: httpbin echoes the uploaded content back under "files", so the printed output
# contains the file's contents -- clear the output before sharing if the file is sensitive.
with open('relatedfiles/cookietest', 'rb') as upload:
    file = {'file': upload}
    r = requests.post('http://httpbin.org/post', files=file)
print(r.text)

{
  "args": {}, 
  "data": "", 
  "files": {
    "file": "# Netscape HTTP Cookie File\n# http://curl.haxx.se/rfc/cookie_spec.html\n# This is a generated file!  Do not edit.\n\n.baidu.com\tTRUE\t/\tFALSE\t3693420707\tBAIDUID\tB4315AA1FBF099CD5087CDF2674E4DAD:FG=1\n.baidu.com\tTRUE\t/\tFALSE\t3693420707\tBIDUPSID\tB4315AA1FBF099CD5087CDF2674E4DAD\n.baidu.com\tTRUE\t/\tFALSE\t\tH_PS_PSSID\t1427_21110_28206_28132_27750_27245_27508\n.baidu.com\tTRUE\t/\tFALSE\t3693420707\tPSTM\t1545937060\n.baidu.com\tTRUE\t/\tFALSE\t\tdelPer\t0\nwww.baidu.com\tFALSE\t/\tFALSE\t\tBDSVRTM\t0\nwww.baidu.com\tFALSE\t/\tFALSE\t\tBD_HOME\t0\n"
  }, 
  "form": {}, 
  "headers": {
    "Accept": "*/*", 
    "Accept-Encoding": "gzip, deflate", 
    "Connection": "close", 
    "Content-Length": "658", 
    "Content-Type": "multipart/form-data; boundary=ab5c9b7d1c0dfa125e07c40114097537", 
    "Host": "httpbin.org", 
    "User-Agent": "python-requests/2.21.0"
  }, 
  "json": null, 
  "origin": "3.120.235.86", 
  "url":

In [36]:
# Fetch a page and print the cookies the server set on the response.
r = requests.get('https://www.baidu.com')
print(r.cookies)

<RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>


In [37]:
# Session persistence
# requests.Session()

# Does NOT work: two independent requests.get calls. The cookie set by the first
# request is not sent with the second, so "cookies" in the output is empty.
requests.get('http://httpbin.org/cookies/set/number/123456789')
r = requests.get('http://httpbin.org/cookies')
print(r.text)

{
  "cookies": {}
}



In [38]:
# Works: both requests go through the same Session, so the cookie set by the first
# call is sent with the second one (see "number" in the output).
# Using the Session as a context manager closes it once the requests are done,
# instead of leaving it open for the lifetime of the kernel.
with requests.Session() as s:
    s.get('http://httpbin.org/cookies/set/number/123456789')
    r = s.get('http://httpbin.org/cookies')
print(r.text)

{
  "cookies": {
    "number": "123456789"
  }
}



In [42]:
# SSL: request an HTTPS URL and show only the status code of the response.
requests.get('https://www.12306.cn').status_code

200

In [None]:
# Proxy settings
# Timeout settings
# Authentication
# ...