# Beautiful Soup


In [8]:
import requests
from bs4 import BeautifulSoup


# Get a simple HTML page


In [9]:
# Fetch the demo page from the local dev server.
# - timeout: without one, requests waits indefinitely if the server never answers,
#   which would hang the whole notebook on Run All.
# - raise_for_status(): fail loudly on 4xx/5xx instead of parsing an error page.
response = requests.get('http://localhost:5000/', timeout=10)
response.raise_for_status()
response.text


'<!doctype html><html lang="en"><head><meta charset="utf-8"/><link rel="icon" href="/favicon.ico"/><meta name="viewport" content="width=device-width,initial-scale=1"/><meta name="theme-color" content="#000000"/><meta name="description" content="Web site created using create-react-app"/><link rel="apple-touch-icon" href="/logo192.png"/><link rel="manifest" href="/manifest.json"/><title>React App</title><script defer="defer" src="/static/js/main.940cc95d.js"></script><link href="/static/css/main.073c9b0a.css" rel="stylesheet"></head><body><noscript>You need to enable JavaScript to run this app.</noscript><div id="root"></div></body></html>'

In [10]:
# Build a parse tree from the response body using the 'html.parser' parser.
soup = BeautifulSoup(response.text, 'html.parser')


In [11]:
# prettify() returns the tree as an indented string; print() renders its
# newlines instead of showing them escaped in a string repr.
print(soup.prettify())


<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <link href="/favicon.ico" rel="icon"/>
  <meta content="width=device-width,initial-scale=1" name="viewport"/>
  <meta content="#000000" name="theme-color"/>
  <meta content="Web site created using create-react-app" name="description"/>
  <link href="/logo192.png" rel="apple-touch-icon"/>
  <link href="/manifest.json" rel="manifest"/>
  <title>
   React App
  </title>
  <script defer="defer" src="/static/js/main.940cc95d.js">
  </script>
  <link href="/static/css/main.073c9b0a.css" rel="stylesheet"/>
 </head>
 <body>
  <noscript>
   You need to enable JavaScript to run this app.
  </noscript>
  <div id="root">
  </div>
 </body>
</html>


# Get an element by tag

In [12]:
# Dotted access by tag name returns the <title> tag itself, not just its text.
soup.title


<title>React App</title>

# Get the name of the tag

In [13]:
# .name is the tag's name as a plain string.
soup.title.name


'title'

# Get the inner text of the tag


In [14]:
# .string is the text contained in the tag.
soup.title.string


'React App'

# Reference the parent


In [15]:
# .parent steps up one level in the tree: <title> sits inside <head>.
soup.title.parent.name


'head'

In [16]:
# The page has several <link> tags; dotted access returns only the first one.
soup.link


<link href="/favicon.ico" rel="icon"/>

In [17]:
# find_all() returns every matching tag as a list, in document order.
soup.find_all('link')


[<link href="/favicon.ico" rel="icon"/>,
 <link href="/logo192.png" rel="apple-touch-icon"/>,
 <link href="/manifest.json" rel="manifest"/>,
 <link href="/static/css/main.073c9b0a.css" rel="stylesheet"/>]

In [18]:
# Keyword arguments filter on attributes; the result is a list even when
# only one element matches.
soup.find_all(id='root')


[<div id="root"></div>]

In [19]:
# Parse a local HTML file instead of an HTTP response.
# The encoding is passed explicitly so the result does not depend on the
# platform's default locale encoding (assumes simple.html is UTF-8/ASCII).
# NOTE: this rebinds `soup` -- every cell below operates on simple.html,
# not on the page fetched from localhost.
with open('simple.html', 'r', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'html.parser')
soup


<!DOCTYPE html>

<html>
<head>
<title>
        The Dormouse's story
       </title>
</head>
<body>
<p class="title">
<b>
         The Dormouse's story
        </b>
</p>
<p class="story">
        Once upon a time there were three little sisters; and their names were
        <a class="sister" href="http://example.com/elsie" id="link1">
         Elsie
        </a>
        ,
        <a class="sister" href="http://example.com/lacie" id="link2">
         Lacie
        </a>
        and
        <a class="sister" href="http://example.com/tillie" id="link3">
         Tillie
        </a>
        ; and they lived at the bottom of a well.
       </p>
<p class="story">
        ...
       </p>
</body>
</html>

In [20]:
# First <a> tag in the document.
soup.a


<a class="sister" href="http://example.com/elsie" id="link1">
         Elsie
        </a>

In [23]:
# Tag attributes are read with dict-style indexing.
soup.a['href']

'http://example.com/elsie'

In [24]:
# All three <a> tags, returned as a list.
soup.find_all('a')


[<a class="sister" href="http://example.com/elsie" id="link1">
          Elsie
         </a>,
 <a class="sister" href="http://example.com/lacie" id="link2">
          Lacie
         </a>,
 <a class="sister" href="http://example.com/tillie" id="link3">
          Tillie
         </a>]

## Get the text content


In [25]:
# get_text() concatenates all text in the document. Whitespace from the source
# file is kept as-is, which is why the result contains long runs of '\n' and spaces.
soup.get_text()


"\n\n\n\n        The Dormouse's story\n       \n\n\n\n\n         The Dormouse's story\n        \n\n\n        Once upon a time there were three little sisters; and their names were\n        \n         Elsie\n        \n        ,\n        \n         Lacie\n        \n        and\n        \n         Tillie\n        \n        ; and they lived at the bottom of a well.\n       \n\n        ...\n       \n\n\n"

## Get by id


In [26]:
# find() returns a single element rather than a list.
soup.find(id='link3')


<a class="sister" href="http://example.com/tillie" id="link3">
         Tillie
        </a>

## Get by class


In [27]:
# Filter on the class attribute by passing an explicit attrs dict.
soup.find_all(attrs={"class": "story"})


[<p class="story">
         Once upon a time there were three little sisters; and their names were
         <a class="sister" href="http://example.com/elsie" id="link1">
          Elsie
         </a>
         ,
         <a class="sister" href="http://example.com/lacie" id="link2">
          Lacie
         </a>
         and
         <a class="sister" href="http://example.com/tillie" id="link3">
          Tillie
         </a>
         ; and they lived at the bottom of a well.
        </p>,
 <p class="story">
         ...
        </p>]

In [28]:
# Same query via the class_ keyword (trailing underscore because `class` is a
# reserved word in Python); it yields the same two <p> elements.
soup.find_all(class_="story")


[<p class="story">
         Once upon a time there were three little sisters; and their names were
         <a class="sister" href="http://example.com/elsie" id="link1">
          Elsie
         </a>
         ,
         <a class="sister" href="http://example.com/lacie" id="link2">
          Lacie
         </a>
         and
         <a class="sister" href="http://example.com/tillie" id="link3">
          Tillie
         </a>
         ; and they lived at the bottom of a well.
        </p>,
 <p class="story">
         ...
        </p>]

## CSS Selectors


In [29]:
# select() takes a CSS selector and returns a list: here, <p> elements with class "title".
soup.select("p.title")


[<p class="title">
 <b>
          The Dormouse's story
         </b>
 </p>]

In [30]:
# select() always returns a list, so `elsie` is a one-element list here;
# later cells index it with elsie[0] to get the tag itself.
elsie = soup.select('a#link1')
elsie

[<a class="sister" href="http://example.com/elsie" id="link1">
          Elsie
         </a>]

## Reference the siblings

In [31]:
# find_next_sibling() returns the next *tag* at the same level; the text node
# (", ") between the two links is skipped.
elsie[0].find_next_sibling()

<a class="sister" href="http://example.com/lacie" id="link2">
         Lacie
        </a>

In [32]:
# No output is shown because this evaluates to None: inside its <p>, the first
# <a> is preceded only by a text node, not by another tag.
elsie[0].find_previous_sibling()

## Reference the parent


In [33]:
# find_parent() with no arguments returns the enclosing tag -- here the
# <p class="story"> paragraph that contains the link.
story = elsie[0].find_parent()
story


<p class="story">
        Once upon a time there were three little sisters; and their names were
        <a class="sister" href="http://example.com/elsie" id="link1">
         Elsie
        </a>
        ,
        <a class="sister" href="http://example.com/lacie" id="link2">
         Lacie
        </a>
        and
        <a class="sister" href="http://example.com/tillie" id="link3">
         Tillie
        </a>
        ; and they lived at the bottom of a well.
       </p>

In [34]:
# list() materialises the children for display. Note that the text nodes
# between the links appear alongside the <a> tags.
list(story.children)


['\n        Once upon a time there were three little sisters; and their names were\n        ',
 <a class="sister" href="http://example.com/elsie" id="link1">
          Elsie
         </a>,
 '\n        ,\n        ',
 <a class="sister" href="http://example.com/lacie" id="link2">
          Lacie
         </a>,
 '\n        and\n        ',
 <a class="sister" href="http://example.com/tillie" id="link3">
          Tillie
         </a>,
 '\n        ; and they lived at the bottom of a well.\n       ']

## Find by attribute value


In [35]:
# Attribute-presence selector: every <a> that has an href attribute, whatever its value.
soup.select("a[href]")


[<a class="sister" href="http://example.com/elsie" id="link1">
          Elsie
         </a>,
 <a class="sister" href="http://example.com/lacie" id="link2">
          Lacie
         </a>,
 <a class="sister" href="http://example.com/tillie" id="link3">
          Tillie
         </a>]

In [36]:
# Attribute-value selector: only the <a> whose href equals this exact URL.
soup.select('a[href="http://example.com/lacie"]')


[<a class="sister" href="http://example.com/lacie" id="link2">
          Lacie
         </a>]