In [1]:
import re
from urllib import request

from bs4 import BeautifulSoup

In [2]:
# Fetch the Douban Movie front page that the rest of the notebook parses.
url = 'https://movie.douban.com/'
# Read the body inside a context manager so the connection is always closed,
# and keep the raw bytes so the parsing cell can be re-run without re-fetching
# (a response stream can only be read once).
with request.urlopen(url) as response:
    html_page = response.read()

In [3]:
# Parse the downloaded page with the lxml HTML parser. Other parsers can be
# selected by name: 'html.parser' (built in) for ordinary HTML documents, or
# 'lxml-xml' when the markup should be parsed as XML.
soup = BeautifulSoup(html_page, 'lxml')
soup

<!DOCTYPE html>
<html class="" lang="zh-cmn-Hans">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="webkit" name="renderer"/>
<meta content="always" name="referrer"/>
<title>
        豆瓣电影
</title>
<meta content="cZdR4xxR7RxmM4zE" name="baidu-site-verification"/>
<meta content="no-cache" http-equiv="Pragma"/>
<meta content="Sun, 6 Mar 2005 01:00:00 GMT" http-equiv="Expires"/>
<meta content="format=xhtml; url=http://m.douban.com/movie/" http-equiv="mobile-agent"/>
<meta content="13753521351564752166375" property="qc:admins"/>
<meta content="电影、经典电影、热映、电视剧、美剧、影评、电影院、电影票、排行、推荐" name="keywords"/>
<meta content="豆瓣电影提供最新的电影介绍及评论包括上映影片的影讯查询及购票服务。你可以记录想看、在看和看过的电影电视剧，顺便打分、写影评。根据你的口味，豆瓣电影会推荐好电影给你。" name="description"/>
<link href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png" rel="apple-touch-icon"/>
<link href="https://img3.doubanio.com/f/shire/420c6a4b676c73bc6af48dfcdd18b662f5c223d7/css/dou

In [4]:
# bs4 exposes four kinds of objects: Tag, NavigableString, BeautifulSoup and Comment.
# Attribute-style access (soup.a) returns the first <a> tag in the document.
tag = soup.a
tag

<a class="nav-login" href="https://www.douban.com/accounts/login?source=movie" rel="nofollow">登录</a>

In [5]:
# every tag has a name, and assigning to .name renames the tag
print(tag.name)
tag.name = "p"  # renames the element inside the parsed tree: it is now rendered as <p>
print(tag)

a
<p class="nav-login" href="https://www.douban.com/accounts/login?source=movie" rel="nofollow">登录</p>


In [6]:
# read a single attribute with dictionary-style indexing
# ('class' comes back as a list of values)
tag['class']

['nav-login']

In [7]:
# .attrs exposes all attributes as a dictionary, so they can be modified
# the same way a dictionary is modified
tag.attrs

{'class': ['nav-login'],
 'href': 'https://www.douban.com/accounts/login?source=movie',
 'rel': ['nofollow']}

In [8]:
# attributes can be removed with `del`, just like dictionary keys
del tag['rel']
tag.attrs

{'class': ['nav-login'],
 'href': 'https://www.douban.com/accounts/login?source=movie'}

In [9]:
# NavigableString: the text contained inside the tag
print(tag.string)
type(tag.string)

登录


bs4.element.NavigableString

In [10]:
# modification: replace_with() swaps the tag's text for a new string in place
tag.string.replace_with("Log in")
tag

<p class="nav-login" href="https://www.douban.com/accounts/login?source=movie">Log in</p>

In [11]:
# BeautifulSoup: the object for the document as a whole; its .name is the
# special value '[document]'
soup.name

'[document]'

In [12]:
# Navigating the tree: attribute-style access by tag name works on Tag objects
# as well as on the BeautifulSoup object itself
soup.head

<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="webkit" name="renderer"/>
<meta content="always" name="referrer"/>
<title>
        豆瓣电影
</title>
<meta content="cZdR4xxR7RxmM4zE" name="baidu-site-verification"/>
<meta content="no-cache" http-equiv="Pragma"/>
<meta content="Sun, 6 Mar 2005 01:00:00 GMT" http-equiv="Expires"/>
<meta content="format=xhtml; url=http://m.douban.com/movie/" http-equiv="mobile-agent"/>
<meta content="13753521351564752166375" property="qc:admins"/>
<meta content="电影、经典电影、热映、电视剧、美剧、影评、电影院、电影票、排行、推荐" name="keywords"/>
<meta content="豆瓣电影提供最新的电影介绍及评论包括上映影片的影讯查询及购票服务。你可以记录想看、在看和看过的电影电视剧，顺便打分、写影评。根据你的口味，豆瓣电影会推荐好电影给你。" name="description"/>
<link href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png" rel="apple-touch-icon"/>
<link href="https://img3.doubanio.com/f/shire/420c6a4b676c73bc6af48dfcdd18b662f5c223d7/css/douban.css" rel="stylesheet" type="text/css"/>
<link h

In [13]:
# the document's <title> tag
soup.title

<title>
        豆瓣电影
</title>

In [14]:
# attribute access returns only the first <a> tag
print(soup.a)
# find_all returns a list of every <a> tag
print(soup.find_all('a'))

<a class="nav-register" href="https://www.douban.com/accounts/register?source=movie" rel="nofollow">注册</a>
[<a class="nav-register" href="https://www.douban.com/accounts/register?source=movie" rel="nofollow">注册</a>, <a class="lnk-doubanapp" href="https://www.douban.com/doubanapp/app?channel=top-nav">下载豆瓣客户端</a>, <a href="https://www.douban.com/doubanapp/redirect?channel=top-nav&amp;direct_dl=1&amp;download=iOS">iPhone</a>, <a class="download-android" href="https://www.douban.com/doubanapp/redirect?channel=top-nav&amp;direct_dl=1&amp;download=Android">Android</a>, <a class="tip-link" href="https://www.douban.com/doubanapp/app?channel=qipao">豆瓣 5.0 全新发布</a>, <a class="tip-close" href="javascript: void 0;">×</a>, <a data-moreurl-dict='{"from":"top-nav-click-main","uid":"0"}' href="https://www.douban.com" target="_blank">豆瓣</a>, <a data-moreurl-dict='{"from":"top-nav-click-book","uid":"0"}' href="https://book.douban.com" target="_blank">读书</a>, <a data-moreurl-dict='{"from":"top-nav-click-

In [15]:
# .contents returns the tag's direct children (tags and strings) as a list
soup.body.contents

['\n',
 <script type="text/javascript">var _body_start = new Date();</script>,
 '\n',
 <link href="//img3.doubanio.com/dae/accounts/resources/321e246/shire/bundle.css" rel="stylesheet" type="text/css"/>,
 '\n',
 <div class="global-nav" id="db-global-nav">
 <div class="bd">
 <div class="top-nav-info">
 <p class="nav-login" href="https://www.douban.com/accounts/login?source=movie">Log in</p>
 <a class="nav-register" href="https://www.douban.com/accounts/register?source=movie" rel="nofollow">注册</a>
 </div>
 <div class="top-nav-doubanapp">
 <a class="lnk-doubanapp" href="https://www.douban.com/doubanapp/app?channel=top-nav">下载豆瓣客户端</a>
 <div class="more-items" id="top-nav-appintro">
 <p class="appintro-title">豆瓣</p>
 <p class="slogan">我们的精神角落</p>
 <p class="qrcode">扫码直接下载</p>
 <div class="download">
 <a href="https://www.douban.com/doubanapp/redirect?channel=top-nav&amp;direct_dl=1&amp;download=iOS">iPhone</a>
 <span>·</span>
 <a class="download-android" href="https://www.douban.com/douban

In [16]:
# .children iterates over the same direct children without building a list
for child in soup.body.children:
    print(child)



<script type="text/javascript">var _body_start = new Date();</script>


<link href="//img3.doubanio.com/dae/accounts/resources/321e246/shire/bundle.css" rel="stylesheet" type="text/css"/>


<div class="global-nav" id="db-global-nav">
<div class="bd">
<div class="top-nav-info">
<p class="nav-login" href="https://www.douban.com/accounts/login?source=movie">Log in</p>
<a class="nav-register" href="https://www.douban.com/accounts/register?source=movie" rel="nofollow">注册</a>
</div>
<div class="top-nav-doubanapp">
<a class="lnk-doubanapp" href="https://www.douban.com/doubanapp/app?channel=top-nav">下载豆瓣客户端</a>
<div class="more-items" id="top-nav-appintro">
<p class="appintro-title">豆瓣</p>
<p class="slogan">我们的精神角落</p>
<p class="qrcode">扫码直接下载</p>
<div class="download">
<a href="https://www.douban.com/doubanapp/redirect?channel=top-nav&amp;direct_dl=1&amp;download=iOS">iPhone</a>
<span>·</span>
<a class="download-android" href="https://www.douban.com/doubanapp/redirect?channel=top-nav&amp;di

In [17]:
# Compare the number of direct children with the number of nodes at any depth:
# .children stops at the first level, .descendants walks every nested node.
print(
    len([*soup.children]),
    len([*soup.descendants]),
    sep='\n',
)

3
2974


In [18]:
# .strings yields every text node in the document; .stripped_strings does the
# same with surrounding whitespace removed
for string in soup.stripped_strings:
    print(string)

豆瓣电影
var _head_start = new Date();
Do.global('https://img3.doubanio.com/f/shire/0e82fba9c90a7daffc62dd2f80b23ff31c5e4739/js/separation/prettyfield.js');
        Do.global('https://img3.doubanio.com/f/shire/aef14a3cb852be78af0d48b0534beeea33035c0f/js/core/moreurl.js');
var _vwo_code = (function() {
    var account_id = 249272,
      settings_tolerance = 0,
      library_tolerance = 2500,
      use_existing_jquery = false,
      // DO NOT EDIT BELOW THIS LINE
      f=false,d=document;return{use_existing_jquery:function(){return use_existing_jquery;},library_tolerance:function(){return library_tolerance;},finish:function(){if(!f){f=true;var a=d.getElementById('_vis_opt_path_hides');if(a)a.parentNode.removeChild(a);}},finished:function(){return f;},load:function(a){var b=d.createElement('script');b.src=a;b.type='text/javascript';b.innerText;b.onerror=function(){_vwo_code.finish();};d.getElementsByTagName('head')[0].appendChild(b);},init:function(){settings_timer=setTimeout('_vwo_code.finis

In [19]:
# .parent moves one level up the tree: the <title> tag sits inside <head>
title_tag = soup.title
print(title_tag, title_tag.parent, sep='\n')

<title>
        豆瓣电影
</title>
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<meta content="webkit" name="renderer"/>
<meta content="always" name="referrer"/>
<title>
        豆瓣电影
</title>
<meta content="cZdR4xxR7RxmM4zE" name="baidu-site-verification"/>
<meta content="no-cache" http-equiv="Pragma"/>
<meta content="Sun, 6 Mar 2005 01:00:00 GMT" http-equiv="Expires"/>
<meta content="format=xhtml; url=http://m.douban.com/movie/" http-equiv="mobile-agent"/>
<meta content="13753521351564752166375" property="qc:admins"/>
<meta content="电影、经典电影、热映、电视剧、美剧、影评、电影院、电影票、排行、推荐" name="keywords"/>
<meta content="豆瓣电影提供最新的电影介绍及评论包括上映影片的影讯查询及购票服务。你可以记录想看、在看和看过的电影电视剧，顺便打分、写影评。根据你的口味，豆瓣电影会推荐好电影给你。" name="description"/>
<link href="https://img3.doubanio.com/f/movie/d59b2715fdea4968a450ee5f6c95c7d7a2030065/pics/movie/apple-touch-icon.png" rel="apple-touch-icon"/>
<link href="https://img3.doubanio.com/f/shire/420c6a4b676c73bc6af48dfcdd18b662f5c223d7/css/douban.css" rel="stylesh

In [20]:
# .parents walks upward from the first <a> tag through every ancestor, ending at
# the BeautifulSoup object itself (named '[document]'). The iterator stops on its
# own at the root and never yields None, so no None check is needed.
for parent in soup.a.parents:
    print(parent.name)

div
div
div
body
html
[document]


In [21]:
# .next_sibling returns the next node that shares this node's parent; that node
# can be a string (here a newline) rather than a tag
soup.div.next_sibling

'\n'

In [22]:
# chain .next_sibling to step past the whitespace string and reach the next tag
soup.div.next_sibling.next_sibling

<script>
  ;window._GLOBAL_NAV = {
    DOUBAN_URL: "https://www.douban.com",
    N_NEW_NOTIS: 0,
    N_NEW_DOUMAIL: 0
  };
</script>

In [23]:
# .next_element returns whatever was parsed immediately after this node, which
# is not necessarily a sibling
soup.div.next_element

'\n'

In [24]:
# .next_siblings iterates over all of the following siblings
# (.previous_siblings does the same in the other direction)
for e in soup.div.next_siblings:
    print(e)



<script>
  ;window._GLOBAL_NAV = {
    DOUBAN_URL: "https://www.douban.com",
    N_NEW_NOTIS: 0,
    N_NEW_DOUMAIL: 0
  };
</script>


<script defer="defer" src="//img3.doubanio.com/dae/accounts/resources/321e246/shire/bundle.js"></script>


<link href="//img3.doubanio.com/dae/accounts/resources/321e246/movie/bundle.css" rel="stylesheet" type="text/css"/>


<div class="nav" id="db-nav-movie">
<div class="nav-wrap">
<div class="nav-primary">
<div class="nav-logo">
<a href="https://movie.douban.com">豆瓣电影</a>
</div>
<div class="nav-search">
<form action="https://movie.douban.com/subject_search" method="get">
<fieldset>
<legend>搜索：</legend>
<label for="inp-query">
</label>
<div class="inp"><input id="inp-query" maxlength="60" name="search_text" placeholder="搜索电影、电视剧、综艺、影人" size="22" value=""/></div>
<div class="inp-btn"><input type="submit" value="搜索"/></div>
<input name="cat" type="hidden" value="1002"/>
</fieldset>
</form>
</div>
</div>
</div>
<div class="nav-secondary">
<div class="na

In [26]:
# Searching the tree
# string filter: match tags whose name is exactly 'a'
soup.find_all('a')

[<a class="nav-register" href="https://www.douban.com/accounts/register?source=movie" rel="nofollow">注册</a>,
 <a class="lnk-doubanapp" href="https://www.douban.com/doubanapp/app?channel=top-nav">下载豆瓣客户端</a>,
 <a href="https://www.douban.com/doubanapp/redirect?channel=top-nav&amp;direct_dl=1&amp;download=iOS">iPhone</a>,
 <a class="download-android" href="https://www.douban.com/doubanapp/redirect?channel=top-nav&amp;direct_dl=1&amp;download=Android">Android</a>,
 <a class="tip-link" href="https://www.douban.com/doubanapp/app?channel=qipao">豆瓣 5.0 全新发布</a>,
 <a class="tip-close" href="javascript: void 0;">×</a>,
 <a data-moreurl-dict='{"from":"top-nav-click-main","uid":"0"}' href="https://www.douban.com" target="_blank">豆瓣</a>,
 <a data-moreurl-dict='{"from":"top-nav-click-book","uid":"0"}' href="https://book.douban.com" target="_blank">读书</a>,
 <a data-moreurl-dict='{"from":"top-nav-click-movie","uid":"0"}' href="https://movie.douban.com">电影</a>,
 <a data-moreurl-dict='{"from":"top-nav-

In [29]:
# regex filter: match every tag whose name starts with "b"
# (the loop variable is deliberately not called `tag`, so the `tag` object
# created in an earlier cell is not overwritten)
for matched_tag in soup.find_all(re.compile('^b')):
    print(matched_tag.name)

body
br


In [31]:
# list filter: match tags whose name equals any item in the list
soup.find_all(['a','div'])

[<div class="global-nav" id="db-global-nav">
 <div class="bd">
 <div class="top-nav-info">
 <p class="nav-login" href="https://www.douban.com/accounts/login?source=movie">Log in</p>
 <a class="nav-register" href="https://www.douban.com/accounts/register?source=movie" rel="nofollow">注册</a>
 </div>
 <div class="top-nav-doubanapp">
 <a class="lnk-doubanapp" href="https://www.douban.com/doubanapp/app?channel=top-nav">下载豆瓣客户端</a>
 <div class="more-items" id="top-nav-appintro">
 <p class="appintro-title">豆瓣</p>
 <p class="slogan">我们的精神角落</p>
 <p class="qrcode">扫码直接下载</p>
 <div class="download">
 <a href="https://www.douban.com/doubanapp/redirect?channel=top-nav&amp;direct_dl=1&amp;download=iOS">iPhone</a>
 <span>·</span>
 <a class="download-android" href="https://www.douban.com/doubanapp/redirect?channel=top-nav&amp;direct_dl=1&amp;download=Android">Android</a>
 </div>
 <div id="doubanapp-tip">
 <a class="tip-link" href="https://www.douban.com/doubanapp/app?channel=qipao">豆瓣 5.0 全新发布</a>
 <a

In [41]:
# A function can be used as a filter: it receives each tag and returns True
# for the ones that should be kept.
def has_id_but_no_class(tag):
    """Return True when ``tag`` has an ``id`` attribute and no ``class`` attribute."""
    if not tag.has_attr('id'):
        return False
    return not tag.has_attr('class')
# apply the function filter to the tags nested inside the first <div> only
soup.div.find_all(has_id_but_no_class)

[<div id="doubanapp-tip">
 <a class="tip-link" href="https://www.douban.com/doubanapp/app?channel=qipao">豆瓣 5.0 全新发布</a>
 <a class="tip-close" href="javascript: void 0;">×</a>
 </div>]

In [43]:
# keyword arguments of find_all filter on attribute values:
# here, tags whose id is 'top-nav-appintro'
soup.find_all(id='top-nav-appintro')

[<div class="more-items" id="top-nav-appintro">
 <p class="appintro-title">豆瓣</p>
 <p class="slogan">我们的精神角落</p>
 <p class="qrcode">扫码直接下载</p>
 <div class="download">
 <a href="https://www.douban.com/doubanapp/redirect?channel=top-nav&amp;direct_dl=1&amp;download=iOS">iPhone</a>
 <span>·</span>
 <a class="download-android" href="https://www.douban.com/doubanapp/redirect?channel=top-nav&amp;direct_dl=1&amp;download=Android">Android</a>
 </div>
 <div id="doubanapp-tip">
 <a class="tip-link" href="https://www.douban.com/doubanapp/app?channel=qipao">豆瓣 5.0 全新发布</a>
 <a class="tip-close" href="javascript: void 0;">×</a>
 </div>
 </div>]

In [44]:
# passing True matches every tag that has the attribute, whatever its value
soup.find_all(id=True)

[<div class="global-nav" id="db-global-nav">
 <div class="bd">
 <div class="top-nav-info">
 <p class="nav-login" href="https://www.douban.com/accounts/login?source=movie">Log in</p>
 <a class="nav-register" href="https://www.douban.com/accounts/register?source=movie" rel="nofollow">注册</a>
 </div>
 <div class="top-nav-doubanapp">
 <a class="lnk-doubanapp" href="https://www.douban.com/doubanapp/app?channel=top-nav">下载豆瓣客户端</a>
 <div class="more-items" id="top-nav-appintro">
 <p class="appintro-title">豆瓣</p>
 <p class="slogan">我们的精神角落</p>
 <p class="qrcode">扫码直接下载</p>
 <div class="download">
 <a href="https://www.douban.com/doubanapp/redirect?channel=top-nav&amp;direct_dl=1&amp;download=iOS">iPhone</a>
 <span>·</span>
 <a class="download-android" href="https://www.douban.com/doubanapp/redirect?channel=top-nav&amp;direct_dl=1&amp;download=Android">Android</a>
 </div>
 <div id="doubanapp-tip">
 <a class="tip-link" href="https://www.douban.com/doubanapp/app?channel=qipao">豆瓣 5.0 全新发布</a>
 <a

In [47]:
# several keyword filters must all match: tags that have an id AND an href
# containing 'douban'
soup.find_all(href=re.compile('douban'), id=True)

[]

In [50]:
# attribute names that cannot be written as Python keyword arguments (such as
# data-* attributes) are passed through the attrs dictionary instead
soup.find_all(href=re.compile('douban'), attrs={'data-moreurl-dict':'{"from":"top-nav-click-market","uid":"0"}'})

[<a data-moreurl-dict='{"from":"top-nav-click-market","uid":"0"}' href="https://market.douban.com/?utm_campaign=douban_top_nav&amp;utm_source=douban&amp;utm_medium=pc_web" target="_blank">市集</a>]