In [9]:
# import BeautifulSoup
from bs4 import BeautifulSoup

In [10]:
# Instantiate the BeautifulSoup class
soup = BeautifulSoup("<html>data</html>", "html.parser") # Here we use an HTML parser

In [11]:
# Sample HTML document that the following cells parse and query
HTML_DOC = """
<html>
    <head><title>The Dormouse's story</title></head>
    <body>
        <p class="title"><b>The Dormouse's story</b></p>

        <p class="story">
            Once upon a time there were three little sisters; and their neames were
            <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
            and they lived at the bottom of a well.
        </p>

        <p class="story">...</p>
    </body>
</html>
"""

Let's create a new instance of BeautifulSoup. Here we won't need to import this class again since we did it above.

In [12]:
# Parse HTML_DOC into a new BeautifulSoup instance; this rebinds `soup`,
# replacing the small demo instance created earlier
soup = BeautifulSoup(HTML_DOC, "html.parser")

# Find HTML content using HTML tag name
You can find the content of an HTML page by the name of its HTML tag:

In [13]:
# Access the <head> tag of the parsed HTML_DOC by using its tag name as an attribute
soup.head

<head><title>The Dormouse's story</title></head>

In [14]:
# Access the <title> tag of the parsed HTML_DOC by using its tag name as an attribute
soup.title

<title>The Dormouse's story</title>

# Find an element's parents
You can walk up through an element's parents by looping over its .parents generator:

In [15]:
# Find the first <a> tag within HTML_DOC and keep it in `link` for the next cells
link = soup.a
link

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [16]:
# link.parents is a generator (more info => https://wiki.python.org/moin/Generators)
## we can use Python's built-in function list() to see what is inside it
list(link.parents)

[<p class="story">
             Once upon a time there were three little sisters; and their neames were
             <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
             <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
             <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
             and they lived at the bottom of a well.
         </p>,
 <body>
 <p class="title"><b>The Dormouse's story</b></p>
 <p class="story">
             Once upon a time there were three little sisters; and their neames were
             <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
             <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
             <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
             and they lived at the bottom of a well.
         </p>
 <p class="story">...</p>
 </body>,
 <html>
 <head><title>The Dormouse's story<

In [17]:
# Loop through all parents of link, from the closest one outwards
# enumerate() supplies the index number of each iteration
# .parents only yields actual ancestors (the last one being the document itself,
# named "[document]"), so no `None` check is needed
for i, parent in enumerate(link.parents):
    print("Parent {} is: {}".format(i, parent.name)) # parent.name gives only the element name

Parent 0 is: p
Parent 1 is: body
Parent 2 is: html
Parent 3 is: [document]


# Find a sibling's element
You can select an element's next siblings with the .next_siblings attribute:

In [18]:
# Walk through everything that follows the first <a> tag inside its parent:
# the other <a> tags as well as the text between them
for node in soup.a.next_siblings:
    print(node)

,
            
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
 and
            
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
;
            and they lived at the bottom of a well.
        


Conversely, you can select an element's previous siblings with the .previous_siblings attribute:

In [19]:
# Locate the element with id="link3" using .find()
# Then print everything that precedes it inside its parent, nearest first
for node in soup.find(id="link3").previous_siblings:
    print(node)


 and
            
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
,
            
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

            Once upon a time there were three little sisters; and their neames were
            


# Find all items that meet a specific condition
There is a very handy method in BeautifulSoup, .find_all(), which fetches all the elements of an HTML page that meet certain criteria.
These are probably the functions we will use most in our day-to-day work. For example:

In [20]:
# Select all <title> elements (the result is a list)
soup.find_all("title")

[<title>The Dormouse's story</title>]

In [21]:
# Select all <p> elements with the class "title"
## the second positional argument is matched against the CSS class
soup.find_all("p", "title")

[<p class="title"><b>The Dormouse's story</b></p>]

In [22]:
# Select every <a> tag in the document
soup.find_all("a")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [23]:
# Select all elements with id="link2"
soup.find_all(id="link2")

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [24]:
# Select the first string that contains "sisters"
## .find() stops at the first match; the pattern is a compiled regular expression
## Here we use the re package, which handles regular expressions
## More info here > https://docs.python.org/3/library/re.html
import re

soup.find(string=re.compile("sisters"))


'\n            Once upon a time there were three little sisters; and their neames were\n            '

# Find elements using CSS
Finally, content can be found via CSS selectors, by using the .select() method:

In [25]:
# Select all <a> tags with class of "sister"
soup.select("a.sister") # the dot is the CSS selector for classes

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [26]:
# Select all <a> tags with id="link1"
## the hash sign (#) is the CSS selector for ids
soup.select("a#link1")

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

For deeply nested elements, you can specify a path, just like you would do in plain CSS:

In [28]:
# Select all <a> with id="link1" that are contained within <p> with class="story"
## NB: for HTML_DOC this returns the same result as soup.select("a#link1"),
## because that link sits inside a <p class="story">
soup.select("p.story a#link1") # p.story matches <p> tags whose class is "story"

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

# Extract text
You can use .get_text() to extract the encapsulated text within an HTML tag:

In [29]:
# Access the first item that soup.select("a#link1") outputs
## Use get_text() to get only the string part
soup.select("a#link1")[0].get_text()

'Elsie'

In [30]:
# Collect the text of every <a> tag of class="sister" with a list comprehension
[anchor.get_text() for anchor in soup.select("a.sister")]

['Elsie', 'Lacie', 'Tillie']

# Extract an attribute
Sometimes, it is extremely useful to extract the value of an attribute of an HTML tag. For example, you might want to extract all the URLs of a given webpage. You can use the .get() method to extract a given attribute from an element:

In [31]:
# Extract the href attribute from the <a> tag with id="link1"
soup.select("a#link1")[0].get('href')

'http://example.com/elsie'