In [None]:
# default_exp utils

# 进阶的爬虫
> 1. 更复杂的请求
    * 无headers
    * 有headers
    * 有cookies
2. JSON 存储数据
3. 自动识别列表
4. 自动识别下一页
5. 自动识别列表上的内容

* [Requests文档](https://requests.readthedocs.io/en/master/)
* [BeautifulSoup文档](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)


In [None]:
# export
import requests,json,re
from bs4 import BeautifulSoup,Tag
from collections import Counter

## 更复杂的请求
> 以豆瓣网为例

一开始没有header的请求会被拒绝，因为服务器会把这次的请求当成是机器人，status_code是418

In [None]:
# hide
# Search term interpolated into the Douban search URL below.
search_query = '1234'
url = f'https://www.douban.com/search?q={search_query}'
# Request sent without any custom headers; the recorded output shows a 418 response with an empty body.
res = requests.get(url)
res,res.text

(<Response [418]>, '')

加了header之后，服务器就把它当成了Mozilla浏览器，就有response的数据了

In [None]:
# hide
# Same request, now with a browser-like User-Agent; the recorded output shows a 200 response.
headers = {'user-agent':'Mozilla/5.0'}
res = requests.get(url,headers=headers)
res

<Response [200]>

想要发布一个动态就需要登录状态，而cookie就是登录状态的载体，cookie就是用帐号密码登录后获取的凭证

In [None]:
# hide
# Target URL and form fields for the POST request issued a few cells below.
# NOTE(review): 'ck' equals the `ck` entry of the cookie string used later — presumably a
# per-session token that must match the logged-in session; confirm before reusing.
url = 'https://www.douban.com/'
data = {'comment':'hello world','ck':'zV8Z','privacy_and_reply_limit':'P,'}

In [None]:
# export
def format_cookie_str(cookie_str):
    """Convert a cookie header string copied from Chrome into a dict.

    Args:
        cookie_str: Text of the form ``'k1=v1; k2=v2'``. Pairs are separated by
            ``;`` with or without surrounding whitespace.

    Returns:
        dict mapping each cookie name to its value. Only the first ``=`` in a
        pair separates name from value, so values may themselves contain ``=``.
        Empty segments (empty input, trailing ``;``) are skipped, and a segment
        without ``=`` is stored with an empty-string value instead of raising.
    """
    cookies = {}
    for item in cookie_str.split(';'):
        item = item.strip()
        if not item:
            # Produced by an empty string or a trailing ';' - nothing to record.
            continue
        # partition never raises, unlike unpacking the result of split('=', 1).
        key, _, value = item.partition('=')
        cookies[key] = value
    return cookies

In [None]:
cookie_str = 'bid=1i8YWHFPDwI; gr_user_id=5b798ccf-0dc3-41f7-9358-ab221ae5c248; __utmc=30149280; __utmz=30149280.1582040380.5.5.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); ll="118124"; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1582185091%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.682236232.1580713449.1582121697.1582185095.7; ap_v=0,6.0; viewed="19672873_30243169_4233221"; gr_cs1_6bb1b2b8-0a3e-4e02-9e3c-4f359d514576=user_id%3A0; __utmt_douban=1; dbcl2="140014301:Td6zJ+yn5sA"; ck=zV8Z; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03=7a596b44-fe53-45f0-90fa-2c24b2faa365; gr_cs1_7a596b44-fe53-45f0-90fa-2c24b2faa365=user_id%3A1; gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_7a596b44-fe53-45f0-90fa-2c24b2faa365=true; push_noty_num=0; push_doumail_num=0; __utmv=30149280.14001; __utmt=1; ps=y; _pk_id.100001.8cb4=7bc8021c269d7e50.1580713448.6.1582185748.1582121848.; __utmb=30149280.18.10.1582185095'
# NOTE(security): cookie_str above is a hardcoded browser session cookie. Prefer loading it from an
# environment variable or an untracked file, and clear this cell's output before sharing the notebook.
cookies = format_cookie_str(cookie_str)
cookies

{'bid': '1i8YWHFPDwI',
 'gr_user_id': '5b798ccf-0dc3-41f7-9358-ab221ae5c248',
 '__utmc': '30149280',
 '__utmz': '30149280.1582040380.5.5.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)',
 'll': '"118124"',
 '_pk_ref.100001.8cb4': '%5B%22%22%2C%22%22%2C1582185091%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D',
 '_pk_ses.100001.8cb4': '*',
 '__utma': '30149280.682236232.1580713449.1582121697.1582185095.7',
 'ap_v': '0,6.0',
 'viewed': '"19672873_30243169_4233221"',
 'gr_cs1_6bb1b2b8-0a3e-4e02-9e3c-4f359d514576': 'user_id%3A0',
 '__utmt_douban': '1',
 'dbcl2': '"140014301:Td6zJ+yn5sA"',
 'ck': 'zV8Z',
 'gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03': '7a596b44-fe53-45f0-90fa-2c24b2faa365',
 'gr_cs1_7a596b44-fe53-45f0-90fa-2c24b2faa365': 'user_id%3A1',
 'gr_session_id_22c937bbd8ebd703f2d8e9445f7dfd03_7a596b44-fe53-45f0-90fa-2c24b2faa365': 'true',
 'push_noty_num': '0',
 'push_doumail_num': '0',
 '__utmv': '30149280.14001',
 '__utmt': '1',
 'ps': 'y',
 '_pk_id.100001.8cb

In [None]:
# hide
# POST the form fields together with the browser-like headers and the parsed cookies.
res = requests.post(url,headers=headers,data=data,cookies=cookies)
res.text

'<!DOCTYPE html>\n<html lang="zh-cmn-Hans" class="">\n<head>\n    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">\n    <meta name="renderer" content="webkit">\n    <meta name="referrer" content="always">\n    <meta name="google-site-verification" content="ok0wCgT20tBBgo9_zat2iAcimtN4Ftf5ccsh092Xeyw" />\n    <title>豆瓣</title>\n    \n    \n<meta content="提供图书、电影、音乐唱片的推荐、评论和价格比较，以及城市独特的文化生活。" name="description"/>\n\n    <link href="https://img3.doubanio.com/f/shire/db7c8ab218cded634c1180ff7b56d3ca44393bc2/css/frontpage/_init_.css" rel="stylesheet" type="text/css">\n    <script>_head_start = new Date();</script>\n    <script src="https://img3.doubanio.com/f/shire/ebac5abada5de811d65dac57b2e62a89c7ddf55a/js/core/_init_.js" data-cfg-corelib="https://img3.doubanio.com/f/shire/72ced6df41d4d158420cebdd254f9562942464e3/js/jquery.min.js"></script>\n    \n    <style type="text/css"></style>\n    <link rel="stylesheet" href="https://img3.doubanio.com/misc/mixed_static/2008baff28

In [None]:
# hide
# Show the five most recently published statuses.
# NOTE(review): no parser is passed to BeautifulSoup, so bs4 chooses one itself — TODO pin one (e.g. 'html.parser').
soup = BeautifulSoup(res.text)
items = soup.find_all(class_='new-status')
for item in items[:5]:
    print(item.find('p').text)

hello world
hello world
hello world
hello world
hello world


In [None]:
# hide
# Inspect the Python type and the HTML attributes of the first matched element.
type(items[0]),items[0].attrs

(bs4.element.Tag,
 {'class': ['new-status', 'status-wrapper', 'saying'],
  'data-sid': '2849308445',
  'data-uid': '140014301'})

顺便看看tag的其他属性，可以通过它们向上、向下、向左右查找其他的tag
* item.parent
* item.parents

* item.contents
* item.children

* item.next_sibling
* item.previous_sibling

## JSON 存储
> 这个格式本质上就是个dict，在MongoDB和Redis中也是这样存储，所以这里就开始学习下

比如这次要把谁在什么时间，说了什么话记下来

In [None]:
# hide
# Probe the four fields stored later: status id, author name, creation time and text.
# NOTE(review): the first line reads items[5] while the others read items[0] / items[4] — confirm this is intended.
print(items[5]['data-sid'])
print(items[0].find(class_='text').a.text)
print(items[0].find(class_='created_at')['title'])
print(items[4].find(class_='bd').contents[1].text.replace('\n',''))

2849283969
Hawk
2020-03-04 19:37:35
hello world


In [None]:
# hide
# Build a dict keyed by status id (`data-sid`); each value holds 'name', 'created_at' and 'content'.
contents = {}
for item in items:
    _id = item['data-sid']
    name = item.find(class_='text').a.text    
    created_at = item.find(class_='created_at')['title']
    # Second entry of the 'bd' element's contents, with newlines removed.
    content = item.find(class_='bd').contents[1].text.replace('\n','')  
    contents[_id] = {}
    contents[_id]['name'] = name
    contents[_id]['created_at'] = created_at  
    contents[_id]['content'] = content
contents

{'2849308445': {'name': 'Hawk',
  'created_at': '2020-03-04 19:37:35',
  'content': 'hello world'},
 '2849286231': {'name': 'Hawk',
  'created_at': '2020-03-04 19:23:46',
  'content': 'hello world'},
 '2849285799': {'name': 'Hawk',
  'created_at': '2020-03-04 19:23:26',
  'content': 'hello world'},
 '2849285223': {'name': 'Hawk',
  'created_at': '2020-03-04 19:23:05',
  'content': 'hello world'},
 '2849284578': {'name': 'Hawk',
  'created_at': '2020-03-04 19:22:42',
  'content': 'hello world'},
 '2849283969': {'name': 'Hawk',
  'created_at': '2020-03-04 19:22:19',
  'content': 'hello world'},
 '2849283502': {'name': 'Hawk',
  'created_at': '2020-03-04 19:21:59',
  'content': 'hello world'},
 '2849282899': {'name': 'Hawk',
  'created_at': '2020-03-04 19:21:34',
  'content': 'hello world'},
 '2849282248': {'name': 'Hawk',
  'created_at': '2020-03-04 19:21:10',
  'content': 'hello world'},
 '2849281687': {'name': 'Hawk',
  'created_at': '2020-03-04 19:20:51',
  'content': 'hello world'},


### 保存和读取json

In [None]:
# hide
# Serialize `contents` to a JSON file.
# NOTE(review): assumes the ./data directory already exists — open(..., 'w') does not create it.
with open('./data/01_douban.json', 'w') as f:
    json.dump(contents,f)

In [None]:
# hide
# Read the JSON file back into a dict and look up one record by its id.
# NOTE(review): '2726525573' is not among the ids shown in the output above — presumably the
# file on disk came from a different run; confirm, otherwise this lookup raises KeyError.
with open('./data/01_douban.json', 'r') as f:
    contents = json.loads(f.read())
contents['2726525573']

{'name': 'Malorie',
 'created_at': '2019-12-14 10:18:01',
 'content': "21 身份危机 |Model MinoritySomeone like me can be a real nightmare, completely aware But I'd rather be a real nightmare than die unaware So save me your prayers 一直很想写这篇日记，却因为太贴近真实的自我..."}

## 自动识别列表
> 思路就是看谁有最多的相同类的children，而且嵌套和样式最丰富的

根据传入的soup，分析哪些是重复的tag
然后选择一个筛选规则，再批量爬数据

先选择一大堆有特征的网页，然后批量测试，是否都能得到想要的结果

In [None]:
# Fetch a sample page with a browser-like User-Agent and parse it for the list-detection experiments.
url_sample = 'https://a.jd.com/?cateId=142'
res_sample = requests.get(url_sample,headers={'user-agent':'Mozilla/5.0'})
# Alternative sample page, kept commented out:
# res_sample = requests.get('https://book.douban.com/tag/%E5%A4%96%E5%9B%BD%E6%96%87%E5%AD%A6',headers={'user-agent':'Mozilla/5.0'})
# NOTE(review): no parser is passed to BeautifulSoup, so bs4 chooses one itself — TODO pin one.
soup_sample = BeautifulSoup(res_sample.text)

In [None]:
# Display the raw HTML text of the sample response.
res_sample.text

'<!DOCTYPE html>\n<html class="root61">\n<head>\n    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n    <title>\n 领券中心    </title>\n    <META HTTP-EQUIV="Pragma" CONTENT="no-cache">\n    <META HTTP-EQUIV="Cache-Control" CONTENT="no-cache">\n    <META HTTP-EQUIV="Expires" CONTENT="0">\n    <meta name="Keywords" content="优惠券,领券,京东JD.COM">\n    <meta name="description" content="京东JD.COM领券中心">\n    <link rel="icon" href="//www.jd.com/favicon.ico" mce_href="//www.jd.com/favicon.ico" type="image/x-icon">\n\n    \n    <!--,user/myjd/lib/1.0.0/widget/global/global.css-->\n    <link type="text/css" rel="stylesheet" href="//misc.360buyimg.com/??jdf/1.0.0/unit/ui-base/5.0.0/ui-base.css,jdf/1.0.0/unit/shortcut/5.0.0/shortcut.css,jdf/1.0.0/unit/global-header/5.0.0/global-header.css,jdf/1.0.0/unit/myjd/5.0.0/myjd.css,jdf/1.0.0/unit/nav/5.0.0/nav.css,jdf/1.0.0/unit/shoppingcart/5.0.0/shoppingcart.css,jdf/1.0.0/unit/global-footer/5.0.0/global-footer.css,jdf/1.0.0/unit/service/5.0

In [None]:
#export
def get_child_list(soup):
    """Return the direct children of `soup` that are `Tag` instances, in iteration order.

    Children of any other type are left out of the returned list.
    """
    tags = []
    for child in soup.children:
        if isinstance(child, Tag):
            tags.append(child)
    return tags

In [None]:
# Direct child tags of the sample page's <body>.
cl = get_child_list(soup_sample.body)

In [None]:
#export
def is_item_list(list):
    """Print a tag-name histogram for `list` when it has more than one element.

    Args:
        list: Sequence of objects exposing a `.name` attribute. (The parameter
            name shadows the builtin `list` inside this function.)

    Returns:
        None. With more than one element, a `Counter` of the `.name` values is
        printed; with zero or one element nothing is printed.
    """
    if len(list) <= 1:
        return
    tag_names = (node.name for node in list)
    print(Counter(tag_names))

In [None]:
# Most frequent tag name among the direct children, together with its count.
Counter([c.name for c in cl]).most_common(1)[0]

('script', 8)

In [None]:
# Print the tag-name histogram of the direct children.
is_item_list(cl)

Counter({'script': 8, 'div': 3, 'link': 2})


In [None]:
# Attributes of the second direct child tag.
# NOTE(review): the recorded output shows a doubanio.com URL although url_sample points at jd.com —
# presumably the output is from a run using the commented-out Douban sample; confirm.
cl[1].attrs

{'href': '//img3.doubanio.com/dae/accounts/resources/f5f3d66/shire/bundle.css',
 'rel': ['stylesheet'],
 'type': 'text/css'}

In [None]:
def get_child_list(soup):
    """Walk the tree under `soup` and print nodes that look like item lists.

    NOTE: this definition replaces the exported one-line `get_child_list`
    defined earlier in the notebook for the rest of the session, and it
    returns None rather than a list.

    A node is printed when it has at least 5 `Tag` children, its text is longer
    than 100 characters, and its most common child tag name is one of
    'li' / 'div' / 'tr', occurs at least 5 times and makes up more than half of
    the children. The printed fields are: tag name, its count, text length and
    the count-to-children ratio.
    """
    tag_children = [node for node in soup.children if isinstance(node, Tag)]
    # Descend first, so matches deeper in the tree are printed before this node.
    for node in tag_children:
        get_child_list(node)

    total = len(tag_children)
    if total < 5 or len(soup.text) <= 100:
        return

    # Tag-name statistics.
    top_name, top_count = Counter(node.name for node in tag_children).most_common(1)[0]
    share = top_count / total
    if top_count >= 5 and share > 0.5 and top_name in ['li','div','tr']:
        print(top_name,top_count,len(soup.text),share)
    # TODO: class statistics and id statistics are not implemented yet.

# Run the recursive scan on the sample page's <body>.
get_child_list(soup_sample.body)

li 15 109 1.0


In [None]:
def show_list_info(list):
    'Show the tags, attrs and parent tag of list. NOTE: not implemented yet - the body is only this docstring, so calling it returns None.'

In [None]:
#export
def find_candidate_content_list(soup,result=None):
    """Recursively collect nodes under `soup` that look like a content list.

    A node qualifies when it has more than 5 direct `Tag` children, all of
    those children share one tag name, and the node's text is longer than 500
    characters. Each match is printed and recorded.

    Args:
        soup: Node to inspect; must expose `.children`, `.text`, `.name` and `.attrs`.
        result: Optional list to append matches to. When omitted, a fresh list
            is created for this call. (The previous `result=[]` default was one
            shared list, so candidates from earlier calls leaked into later ones.)

    Returns:
        The `result` list; each entry is `[child_count, child_tags]`.
    """
    if result is None:
        result = []
    childrens = [i for i in soup.children if isinstance(i, Tag)]
    if childrens:
        # More than 5 rows, every row the same tag, and more than 500 characters of text.
        same_tag = len({c.name for c in childrens}) == 1
        if len(childrens)>5 and same_tag and len(soup.text)>500:
            print('推测：',len(childrens),len(soup.text),soup.name,soup.attrs,childrens[0].name,childrens[0].attrs,)
            result.append([len(childrens),childrens])
        # Keep searching further down the tree, accumulating into the same list.
        for c in childrens:
            find_candidate_content_list(c,result)
    return result

def find_content_list(candidates):
    """Pick the candidate with the largest item count.

    Args:
        candidates: Iterable of entries where index 0 is the item count and
            index 1 is the list of items.

    Returns:
        The items of the first entry with the highest count, or an empty list
        when there are no candidates or no count exceeds 0. The winning count
        is printed.
    """
    best_count, best_items = 0, []
    for candidate in candidates:
        count = candidate[0]
        if int(count) <= best_count:
            # Not strictly larger, so the earlier candidate wins ties.
            continue
        best_count, best_items = count, candidate[1]
    print('选择：',best_count)
    return best_items

def get_content_list(url,headers = {'user-agent':'Mozilla/5.0'},cookies={}):
    """Download `url` and return the items of its most likely content list.

    Args:
        url: Page to fetch.
        headers: Request headers; defaults to a browser-like User-Agent.
        cookies: Cookies to send with the request.

    Returns:
        On HTTP 200: the child tags of the candidate list with the most items
        (an empty list if no candidate is found or the page has no <body>).
        On any other status code: the `requests` response object itself, so
        callers must check the type of the result.
    """
    res = requests.get(url,headers=headers,cookies=cookies)
    if res.status_code != 200:
        return res
    body = BeautifulSoup(res.text).body
    if body is None:
        # No <body> in the parsed document: nothing to scan.
        candidates = []
    else:
        # Pass a fresh list explicitly so candidates found on previously
        # fetched pages can never be mixed into this page's result.
        candidates = find_candidate_content_list(body, [])
    return find_content_list(candidates)

In [None]:
# Detect the content list on a Douban book tag page.
cs = get_content_list('https://book.douban.com/tag/%E5%A4%96%E5%9B%BD%E6%96%87%E5%AD%A6')

推测： 20 4741 ul {'class': ['subject-list']} li {'class': ['subject-item']}
选择： 20


In [None]:
# List every descendant tag of the first list item: tag name, attributes and text with all whitespace removed.
for c in cs[0].find_all(True):
    print(c.name,c.attrs,''.join(c.text.split()))

div {'class': ['pic']} 
a {'class': ['nbg'], 'href': 'https://book.douban.com/subject/1770782/', 'onclick': "moreurl(this,{i:'0',query:'',subject_id:'1770782',from:'book_subject_search'})"} 
img {'class': [], 'src': 'https://img3.doubanio.com/view/subject/s/public/s1727290.jpg', 'width': '90'} 
div {'class': ['info']} 追风筝的人[美]卡勒德·胡赛尼/李继宏/上海人民出版社/2006-5/29.00元8.9(605191人评价)12岁的阿富汗富家少爷阿米尔与仆人哈桑情同手足。然而，在一场风筝比赛后，发生了一件悲惨不堪的事，阿米尔为自己的懦弱感到自责和痛苦，逼走了哈桑，不久，自己也跟...在豆瓣购买去看电子版
h2 {'class': []} 追风筝的人
a {'href': 'https://book.douban.com/subject/1770782/', 'title': '追风筝的人', 'onclick': "moreurl(this,{i:'0',query:'',subject_id:'1770782',from:'book_subject_search'})"} 追风筝的人
div {'class': ['pub']} [美]卡勒德·胡赛尼/李继宏/上海人民出版社/2006-5/29.00元
div {'class': ['star', 'clearfix']} 8.9(605191人评价)
span {'class': ['allstar45']} 
span {'class': ['rating_nums']} 8.9
span {'class': ['pl']} (605191人评价)
p {} 12岁的阿富汗富家少爷阿米尔与仆人哈桑情同手足。然而，在一场风筝比赛后，发生了一件悲惨不堪的事，阿米尔为自己的懦弱感到自责和痛苦，逼走了哈桑，不久，自己也跟...
div {'class': ['ft']} 在豆瓣购买去看电子版
div {

## 自动识别下一页
> 目前只能识别静态页面
#### 将来会考虑
* 增加滚动加载页面
* 增加JavaScript加载的下一页

In [None]:
#export
def is_next_page(tag):
    """Tell whether `tag` is a "next page" link.

    Args:
        tag: Object exposing `.name` and `.text`.

    Returns:
        True when `tag` is an `<a>` whose text contains '下一页' or '后页',
        otherwise False. (Previously an `<a>` without matching text fell
        through and returned None instead of False.)
    """
    if tag.name != 'a':
        return False
    return any(text in tag.text for text in ['下一页','后页'])
    
def get_next_page_url(url,headers = {'user-agent':'Mozilla/5.0'},cookies={},absolute=False):
    """Download `url` and return the href of its "next page" link.

    Args:
        url: Page to fetch.
        headers: Request headers; defaults to a browser-like User-Agent.
        cookies: Cookies to send with the request.
        absolute: When True, resolve the href against `url` so the result can
            be requested directly. Defaults to False, which returns the href
            exactly as written in the page (it may be relative).

    Returns:
        On HTTP 200: the href string, or None when no "next page" link is found
        or the matched link carries no href attribute.
        On any other status code: the `requests` response object itself.
    """
    res = requests.get(url,headers=headers,cookies=cookies)
    if res.status_code != 200:
        return res
    soup = BeautifulSoup(res.text)
    next_page = soup.find(is_next_page)
    # An <a> without an href used to raise KeyError; treat it as "no next page".
    href = next_page.attrs.get('href') if next_page else None
    if href is None:
        print('没有下一页')
        return None
    if absolute:
        from urllib.parse import urljoin
        return urljoin(url, href)
    return href

In [None]:
# Look up the next-page link of a Douban book tag page; the recorded output is a site-relative path.
url = 'https://book.douban.com/tag/%E5%A4%96%E5%9B%BD%E6%96%87%E5%AD%A6'
get_next_page_url(url)

'/tag/外国文学?start=20&type=T'

## 自动识别列表内容
> 思路就是样式一样的，是一类的内容

几种特殊类型：
1. img link
2. a link
3. text
4. video #TBD
5. 动态内容 #TBD

In [None]:
#export
def get_childrens(soup):
    """Return the entries of `soup.contents` that are `Tag` instances, in their original order."""
    return list(filter(lambda node: isinstance(node, Tag), soup.contents))

def get_prefix(soup):
    """Build a short label for a tag.

    Returns:
        '<name>.<first class>' when the tag has a non-empty 'class' attribute,
        else '<name>.<id>' when it has an 'id' attribute (also joined with a
        dot), else just the tag name.
    """
    attrs = soup.attrs
    if 'class' in attrs and len(attrs['class']) > 0:
        return '.'.join((soup.name, attrs['class'][0]))
    if 'id' in attrs:
        return '.'.join((soup.name, attrs['id']))
    return soup.name

def get_item_data(soup,db=None):
    """Convert one list item into a nested dict that mirrors its tag structure.

    Every child tag becomes a key (built by `get_prefix`) whose value is the
    dict for that child. Leaf tags store their payload: 'href' (and 'title'
    when present) for `<a>`, 'src' for `<img>`, and stripped 'text' for
    anything else.

    Known limitation: siblings that produce the same prefix overwrite each
    other, so only the last one is kept.

    Args:
        soup: Tag to convert.
        db: Dict to fill. When omitted, a fresh dict is created for this call.
            (The previous `db={}` default was one shared dict, so every
            top-level call returned the same object and items were merged.)

    Returns:
        The filled `db` dict.
    """
    if db is None:
        db = {}
    childrens = get_childrens(soup)
    if childrens:
        for c in childrens:
            prefix = get_prefix(c)
            db[prefix] = {}
            get_item_data(c, db[prefix])
    elif soup.name == 'a':
        # Attributes are optional in real pages; skip missing ones instead of raising KeyError.
        if 'href' in soup.attrs:
            db['href'] = soup['href']
        if 'title' in soup.attrs:
            db['title'] = soup['title']
    elif soup.name == 'img':
        if 'src' in soup.attrs:
            db['src'] = soup['src']
    else:
        db['text'] = soup.text.strip()
    return db

def find_item_id(soup):
    """Guess an item's id from its links.

    Scans the `<a>` tags under `soup` in order and, for the first one that has
    both 'title' and 'href' attributes and whose href contains digits, returns
    the first run of digits in that href.

    Returns:
        The digit string, or None when no link qualifies. (Previously a
        qualifying link whose href had no digits raised AttributeError.)
    """
    for a in soup.find_all('a'):
        if 'title' in a.attrs and 'href' in a.attrs:
            match = re.search(r'\d+', a['href'])
            if match:
                return match.group(0)
    return None


In [None]:
# Detect the item list on a Douban book tag page, then extract the nested data of its first item.
childrens = get_content_list('https://book.douban.com/tag/%E5%A4%96%E5%9B%BD%E6%96%87%E5%AD%A6')
db = get_item_data(childrens[0])
db

推测： 20 4741 ul {'class': ['subject-list']} li {'class': ['subject-item']}
选择： 20


{'div.pic': {'a.nbg': {'img': {'src': 'https://img3.doubanio.com/view/subject/s/public/s1727290.jpg'}}},
 'div.info': {'h2': {'a': {'href': 'https://book.douban.com/subject/1770782/',
    'title': '追风筝的人'}},
  'div.pub': {'text': '[美] 卡勒德·胡赛尼 / 李继宏 / 上海人民出版社 / 2006-5 / 29.00元'},
  'div.star': {'span.allstar45': {'text': ''},
   'span.rating_nums': {'text': '8.9'},
   'span.pl': {'text': '(605191人评价)'}},
  'p': {'text': '12岁的阿富汗富家少爷阿米尔与仆人哈桑情同手足。然而，在一场风筝比赛后，发生了一件悲惨不堪的事，阿米尔为自己的懦弱感到自责和痛苦，逼走了哈桑，不久，自己也跟...'},
  'div.ft': {'div.collect-info': {'text': ''},
   'div.cart-actions': {'span.market-info': {'a': {'href': 'https://book.douban.com/subject/1770782/?channel=subject_list&platform=web'}}},
   'div.ebook-link': {'a': {'href': 'https://read.douban.com/ebook/1162265/?dcs=tag-buylink&dcm=douban&dct=1770782'}}}}}