In [1]:
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')

In [2]:
#find() and find_all() are most used searching techniques within tree
soup.find_all('b') #pass tag name as argument
# find_all returns every matching tag in a ResultSet (a list-like container)

[<b>The Dormouse's story</b>]

In [4]:
type(soup.find_all('b'))

bs4.element.ResultSet

In [3]:
soup.find_all('a')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [5]:
soup.find('a')
#find returns 1st tag mentioned

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

### regular expression

In [12]:
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
    # finds all the tags whose names starts with the letter “b”

body
b


In [13]:
for tag in soup.find_all(re.compile("t")):
    print(tag.name)
#last code gave list which started with b, this one returns the list of tag names which contains t

html
title


## Functions
- If none of matches work, a function can be defined which takes element as argument.
- Function returns true if there is match, else false

In [27]:
def has_class_but_no_id(tag):
    """Filter for find_all(): accept tags that define "class" but not "id".

    Args:
        tag: an element exposing ``has_attr(name)``.

    Returns:
        True when the tag has a ``class`` attribute and no ``id`` attribute,
        False otherwise.
    """
    # A tag without a class can never match, so the id check is skipped.
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')

#fetching 'p' tags without mentioning <p> tag
#here function returns tag which has class attribute and not id attribute
#this function picks up only p tags as it can have class and not id attributes
# doesn't pick up <a> and html tags

In [28]:
#Passing this function into find_all()
soup.find_all(has_class_but_no_id)
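# Only the <p> tags themselves are matched: each has a class but no id.
# The <a> tags visible in the output are children of the second <p>; they have an id,
# so they are not matches of their own.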

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [29]:
import re
def not_lacie(href):
    """Filter for an href attribute value: accept links that do not mention "lacie".

    Args:
        href: the attribute value, or None when the tag has no href.

    Returns:
        The falsy ``href`` itself (None or an empty string) when there is
        nothing to inspect; otherwise True if "lacie" does not occur in
        ``href`` and False if it does.
    """
    if not href:
        # Missing or empty href: hand the falsy value back unchanged.
        return href
    return re.search("lacie", href) is None

# re.compile() turns a regular-expression pattern into a pattern object; its search() method
# returns a match object if the pattern occurs anywhere in the string, otherwise None.

# we have 3 href(3 sisters), discard Lacie and print other 2 sister's href

In [30]:
soup.find_all(href=not_lacie)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [31]:
from bs4 import NavigableString
def surrounded_by_strings(tag):
    """Filter for find_all(): accept tags that sit directly between two strings.

    Args:
        tag: an element exposing ``next_element`` and ``previous_element``.

    Returns:
        True when both the element parsed right after ``tag`` and the one
        parsed right before it are NavigableString objects, False otherwise.
    """
    # Check the following element first; the preceding one is only inspected
    # when the first check passes.
    if not isinstance(tag.next_element, NavigableString):
        return False
    return isinstance(tag.previous_element, NavigableString)

for tag in soup.find_all(surrounded_by_strings):
    print(tag.name)
    
    #this function returns true if tag is surrounded by string objects

body
p
a
a
a
p


### find_all()
- I can pass these attributes: name, attrs, recursive, string, limit, **kwargs 

In [32]:
soup.find_all("title")
# returns title tags

[<title>The Dormouse's story</title>]

In [33]:
# this finds all <p> tags whose CSS class is "title" (a string passed as the second argument filters by class)
soup.find_all("p", "title")

[<p class="title"><b>The Dormouse's story</b></p>]

In [34]:
soup.find_all("a")
# returns all a tags

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [35]:
soup.find_all(id="link2")
# passing attribute with the value of attr.

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [36]:
# find string(navigable string) which has sisters
soup.find(string=re.compile("sisters"))

'Once upon a time there were three little sisters; and their names were\n'

In [39]:
# there is no word in navigablestring called hello
print(soup.find(string=re.compile("hello")))

None


### Argument: name

In [40]:
soup.find_all("title")

[<title>The Dormouse's story</title>]

### argument: keyword

In [41]:
soup.find_all(id='link2')

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [42]:
# here, href part will be looked for word elsie
soup.find_all(href=re.compile("elsie"))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [43]:
soup.find_all(re.compile("elsie")) # a regex passed as the first argument is matched against tag names, not tag contents
# no tag name contains "elsie", so nothing is returned

[]

In [44]:
# a regex passed as the first argument is matched against tag names, so "title" matches
# the <title> tag (not the <p class="title"> tag, whose name is "p")
soup.find_all(re.compile("title"))

[<title>The Dormouse's story</title>]

In [45]:
soup.find_all(id=True)
# I can also pass boolean value True or False
# this returns where id has some values

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [51]:
#soup.find_all(id=False)

In [52]:
# passing multiple attributes and filtering out result
soup.find_all(href=re.compile("elsie"), id='link1')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [53]:
soup.find_all(href=re.compile("elsie"), id='link1', class="sister")
# SyntaxError: class is a reserved word in Python, so it cannot be used as a keyword argument; use class_="sister" instead

SyntaxError: invalid syntax (1536493987.py, line 1)

In [54]:
soup.find_all(class = 'sister') # this doesnt work as class is reserved keyword


SyntaxError: invalid syntax (3061413822.py, line 1)

In [58]:
soup.find_all(class_ = 'sister')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [55]:
name_soup = BeautifulSoup('<input name="email"/>', 'html.parser')
name_soup.find_all(name="email")

[]

In [56]:
name_soup.find_all(attrs={"name": "email"})

[<input name="email"/>]

### Searching by CSS class

In [57]:
soup.find_all("a", class_="sister")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [59]:
soup.find_all(class_=re.compile("itl")) #using regex fetch where class name has itl: title 

[<p class="title"><b>The Dormouse's story</b></p>]

In [62]:
soup.find_all(class_=re.compile("sis"))

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [63]:
# fetch using number of character/ length of classname
def has_six_characters(css_class):
    """Filter for a CSS class value: accept classes that are exactly six characters long.

    Args:
        css_class: the class name, or None when the tag has no class.

    Returns:
        True if ``css_class`` is not None and its length is 6, else False.
    """
    if css_class is None:
        return False
    return len(css_class) == 6

In [64]:
soup.find_all(class_=has_six_characters)

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [65]:
def has_eight_characters(css_class):
    """Filter for a CSS class value: accept classes that are exactly eight characters long.

    Args:
        css_class: the class name, or None when the tag has no class.

    Returns:
        True if ``css_class`` is not None and its length is 8, else False.
    """
    if css_class is None:
        return False
    return len(css_class) == 8
soup.find_all(class_=has_eight_characters)

#there is no class name which has 8 characters

[]

In [66]:
# a class attribute can hold several values; searching for any one of them matches the tag
css_soup = BeautifulSoup('<p class="body strikeout"></p>', 'html.parser')
css_soup.find_all("p", class_="strikeout")

[<p class="body strikeout"></p>]

In [67]:
css_soup.find_all("p", class_="body")

[<p class="body strikeout"></p>]

In [68]:
css_soup.find_all("p", class_="body strikeout")

[<p class="body strikeout"></p>]

In [69]:
# a multi-class string is compared with the exact class attribute value, so the order of the classes matters
css_soup.find_all("p", class_="strikeout body")
# returns nothing: the attribute value is "body strikeout", not "strikeout body"

[]

In [70]:
css_soup.select("p.strikeout.body") #matches two or more css classes

[<p class="body strikeout"></p>]

In [71]:
soup.find_all("a", attrs={"class": "sister"})

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

### string argument

In [72]:
soup.find_all(string="Elsie")
# returns a ResultSet (list-like) of the matching strings

['Elsie']

In [77]:
type(soup.find_all(string="Elsie"))


bs4.element.ResultSet

In [73]:
soup.find_all(string=["Tillie", "Elsie", "Lacie"])

['Elsie', 'Lacie', 'Tillie']

In [78]:
soup.find_all(string=re.compile("Dormouse"))
# using regex returns string/text which contains word Dormouse

["The Dormouse's story", "The Dormouse's story"]

In [79]:
def is_the_only_string_within_a_tag(s):
    """Filter for a string: accept it when it equals its parent's ``.string``.

    Args:
        s: a string object exposing ``parent``.

    Returns:
        The result of comparing ``s`` with ``s.parent.string``.
    """
    parent_string = s.parent.string
    return s == parent_string

In [80]:
soup.find_all(string=is_the_only_string_within_a_tag)

["The Dormouse's story",
 "The Dormouse's story",
 'Elsie',
 'Lacie',
 'Tillie',
 '...']

In [75]:
soup.find_all("a", string="Elsie")
#returns anchor tag which has string Elsie

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [76]:
soup.find_all("a", text="Elsie")
#early version of bs4 has text keyword instead of string

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

### limit argument

In [81]:
soup.find_all("a", limit=2)
# a document can be huge, and find_all() returns every matching tag - possibly thousands.
# in that case the limit argument stops the search after the given number of results

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

### recursive argument
- find_all() examines all descendants of a tag: its children, its children's children, and so on. In a large document that can be a lot of elements.
- To consider only direct children, pass recursive=False (this also works with find(), to fetch the first matching direct child).

In [82]:
soup.html.find_all("title")

[<title>The Dormouse's story</title>]

In [83]:
soup.html.find_all("title", recursive=False)

[]

In [84]:
# The <title> tag is beneath the <html> tag, but it’s not directly beneath the <html> tag: 
#the <head> tag is in the way. Beautiful Soup finds the <title> tag when it’s allowed to look 
#at all descendants of the <html> tag, but when recursive=False restricts it to the <html> tag’s 
#immediate children, it finds nothing.

In [86]:
soup.find_all("a")
#this fetches all anchor tags. the next example does the same with a different syntax

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [87]:
soup("a") #calling the soup object directly is a shortcut for soup.find_all("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [88]:
#applying same logic
soup.title.find_all(string=True)

["The Dormouse's story"]

In [89]:
soup.title(string=True)

["The Dormouse's story"]

In [90]:
soup.find_all('title', limit=1)

[<title>The Dormouse's story</title>]

In [91]:
soup.find('title')
#find return single result

<title>The Dormouse's story</title>

In [92]:
print(soup.find("puppies")) #there is no tag called puppies

None


In [93]:
soup.head.title #this fetches title tag

<title>The Dormouse's story</title>

In [94]:
# I can also fetch title tag using 2 function together. find(head) and find(title)
soup.find("head").find("title")

<title>The Dormouse's story</title>

### find_parents() and find_parent()
- syntax:  find_parents(name, attrs, string, limit, **kwargs) 
- these methods work their way up the tree, from an element to its ancestors

In [95]:
a_string = soup.find(string="Lacie")
a_string #find lacie and assign it to a_string

'Lacie'

In [99]:
type(a_string)

bs4.element.NavigableString

In [96]:
a_string.find_parents("a") #find parents of Lacie. as Lacie is string, it's immediate parent is 
#<a> tag. it returns set. I can iterate over .parents

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [101]:
type(a_string.find_parents("a"))

bs4.element.ResultSet

In [100]:
a_string.find_parent("a")

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

In [102]:
type(a_string.find_parent("a")) #this return single parent tag

bs4.element.Tag

In [97]:
a_string.find_parent("p")

<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

In [98]:
a_string.find_parents("p", class_="title") #passing attribute along with tag 

[]

In [104]:
print(a_string.find_parent("p", class_="title"))

None


### find_next_siblings() and find_next_sibling()

In [105]:
first_link = soup.a
first_link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [106]:
first_link.find_next_siblings("a")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [107]:
first_link.find_next_sibling("a")

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

In [108]:
first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_next_sibling("p")

<p class="story">...</p>

### find_previous_siblings() and find_previous_sibling()

In [109]:
last_link = soup.find("a", id="link3")
last_link

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

In [110]:

last_link.find_previous_sibling("a")

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

In [111]:

last_link.find_previous_siblings("a")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [112]:

first_story_paragraph = soup.find("p", "story")
first_story_paragraph.find_previous_sibling("p")

<p class="title"><b>The Dormouse's story</b></p>

### find_all_next() and find_next()

In [113]:
first_link = soup.a
first_link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [114]:
first_link.find_all_next(string=True)

['Elsie',
 ',\n',
 'Lacie',
 ' and\n',
 'Tillie',
 ';\nand they lived at the bottom of a well.',
 '\n',
 '...',
 '\n']

In [116]:
first_link.find_next("p")

<p class="story">...</p>

### find_all_previous() and find_previous()

In [117]:
first_link = soup.a
first_link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [118]:
first_link.find_all_previous("p")

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="title"><b>The Dormouse's story</b></p>]

In [119]:

first_link.find_previous("title")

<title>The Dormouse's story</title>

## CSS selectors

In [120]:
soup.select("title")
#fetching using name of tag 

[<title>The Dormouse's story</title>]

In [153]:
soup.select("html > head")
# NOTE(review): the recorded output below is empty, and this cell's execution count (153) is later
# than the cell that rebinds `soup` to the XML document - presumably it was run against that XML soup.
# Against the HTML soup this selector should match the <head> tag; confirm with Restart & Run All.

[]

In [123]:
soup.select("p:nth-of-type(3)")

[<p class="story">...</p>]

In [122]:
soup.select("body a") # find tags beneath other tags

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [124]:
soup.select("html head title")

[<title>The Dormouse's story</title>]

In [125]:
soup.select("head > title") 

[<title>The Dormouse's story</title>]

In [126]:
soup.select("p > a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [127]:

soup.select("p > a:nth-of-type(2)")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [128]:
soup.select("p > #link1") #in p tag where link1

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [129]:
soup.select("body > a")


[]

In [130]:
soup.select("#link1 ~ .sister") # find siblings of tag. except link1

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [131]:
soup.select("#link1 + .sister") # "+" is the adjacent-sibling combinator: the .sister element immediately after #link1

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [132]:
soup.select(".sister") # css class

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [133]:

soup.select("[class~=sister]") 

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [134]:
soup.select("#link1") #find tags by id

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [135]:
soup.select("a#link2")#find tags by id

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [136]:
soup.select("#link1,#link2") #selectors  from list of selectors

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [137]:
soup.select('a[href]')
#fetch a tag which has href

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [138]:
# find tag using attribute value
soup.select('a[href="http://example.com/elsie"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [139]:
soup.select('a[href^="http://example.com/"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [140]:
soup.select('a[href$="tillie"]')

[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [141]:
soup.select('a[href*=".com/el"]')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [142]:
#select_one() to fetch first match
soup.select_one(".sister")

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [143]:

from bs4 import BeautifulSoup
xml = """<tag xmlns:ns1="http://namespace1/" xmlns:ns2="http://namespace2/">
 <ns1:child>I'm in namespace 1</ns1:child>
 <ns2:child>I'm in namespace 2</ns2:child>
</tag> """
soup = BeautifulSoup(xml, "xml")

In [144]:
soup.select("child")

[<ns1:child>I'm in namespace 1</ns1:child>,
 <ns2:child>I'm in namespace 2</ns2:child>]

In [145]:
soup.select("ns1|child")

[<ns1:child>I'm in namespace 1</ns1:child>]

In [146]:
namespaces = dict(first="http://namespace1/", second="http://namespace2/")
soup.select("second|child", namespaces=namespaces)

[<ns2:child>I'm in namespace 2</ns2:child>]