## Load necessary libraries

In [9]:
import re

import requests
from bs4 import BeautifulSoup as bs

## Load our first page

In [10]:
# Load the webpage content. The timeout stops the cell from hanging forever on
# a stalled connection, and raise_for_status() fails loudly on an HTTP error
# instead of letting an error page be parsed as if it were the example page.
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)
r.raise_for_status()

# Convert to a BeautifulSoup object. Naming the parser explicitly keeps the
# parse tree the same on every machine, instead of depending on which optional
# parsers happen to be installed.
soup = bs(r.content, "html.parser")

# Print the HTML, indented one tag per line
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Start using beautiful soup to scrape

### find and find_all

In [11]:
# find() returns only the first element that matches
first_header = soup.find(name='h2')
first_header

<h2>A Header</h2>

In [12]:
# find_all() returns every element that matches
headers = soup.find_all(name='h2')
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [18]:
# A list of tag names matches any of them; find() still returns only the first hit
header_tags = ['h1', 'h2']
first_header = soup.find(header_tags)
first_header

<h1>HTML Webpage</h1>

In [17]:
# find_all() with a list of tag names returns every <h1> and <h2> match.
# The result is a collection of tags, so it gets its own plural name instead of
# overwriting `first_header`, which holds a single tag.
all_headers = soup.find_all(['h1', 'h2'])
all_headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

#### Attributes in find/find_all function.

In [24]:
# find/find_all also accept a dict of attributes to filter on
id_filter = {"id": "paragraph-id"}
paragraph = soup.find_all('p', attrs=id_filter)
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [25]:
# find/find_all calls can be nested: begin by grabbing the <body> element
body = soup.find(name='body')
body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [26]:
# Search only inside the <body> element found earlier
div = body.find(name='div')
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [27]:
# Narrow the search further: look for the <h1> inside the <div>
header = div.find(name='h1')
header

<h1>HTML Webpage</h1>

In [31]:
# find/find_all can also search for a specific string.
# Print the page again for reference.
page_html = soup.prettify()
print(page_html)

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



In [36]:
# With a plain string, the text has to be written out exactly to get a match
exact_text = 'Some bold text'
paragraphs = soup.find_all('p', string=exact_text)
paragraphs

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [38]:
# Alternatively, pass a compiled regex to match on just part of the text
# (`re` is imported with the other libraries in the notebook's first cell).
paragraphs = soup.find_all('p', string=re.compile('Some'))  # every <p> whose text matches "Some"
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [45]:
# The alternation lets one pattern match both "Header" and "header"
header_pattern = re.compile('(H|h)eader')
headers = soup.find_all('h2', string=header_pattern)
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

### select (CSS selector)

In [97]:
# Pretty-print only the <body> element
body_html = soup.body.prettify()
print(body_html)

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [103]:
# Descendant combinator: <p> elements located inside a <div>
descendant_selector = 'div p'
content = soup.select(descendant_selector)
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [104]:
# Sibling combinator: every <p> that comes after an <h2> at the same level
sibling_selector = "h2 ~ p"
paragraphs = soup.select(sibling_selector)
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [109]:
# <b> elements inside the <p> whose id is "paragraph-id"
id_selector = 'p#paragraph-id b'
bold_text = soup.select(id_selector)
bold_text

[<b>Some bold text</b>]

In [111]:
# Child combinator: only <p> elements that are direct children of <body>
child_selector = "body > p"
paragraphs = soup.select(child_selector)
paragraphs

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [115]:
# For each element in `paragraphs`, print the list of <i> tags found inside it
for p_tag in paragraphs:
    italics = p_tag.select('i')
    print(italics)

[<i>Some italicized text</i>]
[]


In [118]:
# Attribute selector: grab elements whose align attribute equals "middle"
align_selector = '[align=middle]'
soup.select(align_selector)

[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]