In [1]:
# Load necessary libraries

import re

import requests
from bs4 import BeautifulSoup as bs

In [4]:
# Load webpage content
# The timeout (in seconds) keeps this cell from hanging forever if the server
# never answers; raise_for_status() stops on a 4xx/5xx reply so that an error
# page is not parsed as if it were the example document.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Convert to a beautifulsoup object
# Name the parser explicitly: without it bs4 picks whichever parser happens to
# be installed (and warns about it), so the parsed tree can differ between machines.
soup = bs(r.content, "html.parser")

# Print out our html
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [7]:
# Using find and find_all
# find() returns only the first matching tag; find_all() returns a list of every match.
tag_name = "h2"
first_header = soup.find(tag_name)
headers = soup.find_all(tag_name)

print(first_header, headers, sep="\n")

<h2>A Header</h2>
[<h2>A Header</h2>, <h2>Another header</h2>]


In [10]:
# Pass a list of tag names to find / find_all to match any one of them
heading_tags = ["h1", "h2"]
first_header = soup.find(heading_tags)
headers = soup.find_all(heading_tags)

print(first_header, headers, sep="\n")

<h1>HTML Webpage</h1>
[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]


In [11]:
# Filter on an attribute value by passing `attrs` to find / find_all
id_filter = {"id": "paragraph-id"}
paragraph = soup.find_all("p", attrs=id_filter)
print(paragraph)

[<p id="paragraph-id"><b>Some bold text</b></p>]


In [14]:
# Nest find / find_all calls: a tag returned by find() can itself be searched,
# so each step below looks only inside the result of the previous one.
body = soup.find(name="body")
div = body.find(name="div")
header = div.find(name="h1")
print(header)

<h1>HTML Webpage</h1>


In [30]:
# Searching for specific strings
# `re` is imported in the import cell at the top of the notebook.
# Passing a compiled pattern as `string=` keeps only the tags whose text
# contains a match for that regular expression.
some_pattern = re.compile("Some")
paragraphs = soup.find_all("p", string=some_pattern)
print(paragraphs)

# Matches both "Header" and "header"
header_pattern = re.compile("(H|h)eader")
headers = soup.find_all("h2", string=header_pattern)
print(headers)

[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]
[<h2>A Header</h2>, <h2>Another header</h2>]


In [34]:
# Select (CSS selector)
descendant_selector = "div p"   # <p> tags nested inside a <div>
sibling_selector = "h2 ~ p"     # <p> tags that come after an <h2> with the same parent

content = soup.select(descendant_selector)
paragraphs = soup.select(sibling_selector)

print(content, paragraphs, sep="\n")

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]
[<p><i>Some italicized text</i></p>, <p id="paragraph-id"><b>Some bold text</b></p>]


In [35]:
# <b> tags inside the <p> whose id is "paragraph-id"
bold_selector = "p#paragraph-id b"
body_text = soup.select(bold_selector)
print(body_text)

[<b>Some bold text</b>]


In [37]:
# Grabbing elements with a specific attribute value
# [align=middle] is a CSS attribute selector: any tag whose align attribute is "middle".
aligned_elements = soup.select("[align=middle]")
print(aligned_elements)

[<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>]


In [39]:
## Getting different properties of the HTML
# .string gives the text held by a tag

header = soup.find("h2")
print(header, header.string, sep="\n")

<h2>A Header</h2>
A Header


In [41]:
# Use get_text when an element has multiple child elements
div = soup.find("div")
pretty_markup = div.prettify()
all_text = div.get_text()
print(pretty_markup, all_text, sep="\n")

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [49]:
## Get a specific attribute from an element
# Indexing a tag like a dictionary returns the value of that attribute.
link = soup.find("a")
print(link)
print(link["href"])

id_selector = "p#paragraph-id"
paragraphs = soup.select(id_selector)
print(paragraphs)
first_match = paragraphs[0]  # select() returns a list, so take an element before indexing by attribute
print(first_match["id"])

<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>
https://keithgalli.github.io/web-scraping/webpage.html
[<p id="paragraph-id"><b>Some bold text</b></p>]
paragraph-id
