In [1]:
import re

import requests #pip install requests
from bs4 import BeautifulSoup as bs

In [2]:
# Load the webpage content; the timeout keeps the cell from hanging indefinitely
# if the server never responds
r = requests.get("https://keithgalli.github.io/web-scraping/example.html", timeout=10)

# Fail fast on an HTTP error status instead of parsing an error page
r.raise_for_status()

# Convert to a BeautifulSoup object. The parser is named explicitly so that the
# parse tree does not depend on which optional parsers happen to be installed
# (and to avoid bs4's GuessedAtParserWarning)
soup = bs(r.content, "html.parser")

# Print out our HTML, indented
print(soup.prettify())

<html>
 <head>
  <title>
   HTML Example
  </title>
 </head>
 <body>
  <div align="middle">
   <h1>
    HTML Webpage
   </h1>
   <p>
    Link to more interesting example:
    <a href="https://keithgalli.github.io/web-scraping/webpage.html">
     keithgalli.github.io/web-scraping/webpage.html
    </a>
   </p>
  </div>
  <h2>
   A Header
  </h2>
  <p>
   <i>
    Some italicized text
   </i>
  </p>
  <h2>
   Another header
  </h2>
  <p id="paragraph-id">
   <b>
    Some bold text
   </b>
  </p>
 </body>
</html>



## Start using Beautiful Soup to Scrape

### find and find_all

In [3]:
# find() returns only the first element that matches the given tag name
first_header = soup.find("h2")
first_header

<h2>A Header</h2>

In [4]:
# find_all() collects every matching element, not just the first one
headers = soup.find_all("h2")
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

In [5]:
# A list of tag names can be passed in; find() returns the first occurrence of
# any of the listed tags (here the <h1>, which comes before the <h2> elements)
first_header = soup.find(["h1","h2"])
first_header

<h1>HTML Webpage</h1>

In [6]:
# With a list of tag names, find_all() returns every <h1> and <h2> element
headers = soup.find_all(["h1","h2"])
headers

[<h1>HTML Webpage</h1>, <h2>A Header</h2>, <h2>Another header</h2>]

In [7]:
# Attribute filters can be passed to find/find_all through attrs;
# here only <p> elements whose id is "paragraph-id" are returned
paragraph = soup.find_all("p",attrs={"id":"paragraph-id"})
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [8]:
# find/find_all calls can be nested: start by grabbing the <body> element,
# which the following cells search within
body = soup.find('body')
body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [9]:
# Search only inside the <body> element found earlier for its first <div>
div = body.find("div")
div

<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [10]:
# Narrow the search once more: the first <h1> inside that <div>
header = div.find("h1")
header

<h1>HTML Webpage</h1>

In [11]:
# The string argument filters on an element's text; a plain str has to match
# the text exactly
paragraph = soup.find_all("p",string="Some bold text")
paragraph

[<p id="paragraph-id"><b>Some bold text</b></p>]

In [12]:
# Passing a compiled regular expression as `string` matches any <p> whose text
# contains the pattern, so a partial match ("Some") is enough here
paragraph = soup.find_all("p",string=re.compile("Some"))
paragraph

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [13]:
# The pattern accepts either capitalisation, so it matches both "Header" and "header"
headers = soup.find_all("h2",string=re.compile("(H|h)eader"))
headers

[<h2>A Header</h2>, <h2>Another header</h2>]

### Select (CSS selector)

In [14]:
# Reprint the indented <body> markup as a reference for the selector examples that follow
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [15]:
# select() takes a CSS selector; "p" matches every <p> element in the document
content = soup.select("p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>,
 <p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [16]:
# Descendant selector: only <p> elements located inside a <div>
content = soup.select("div p")
content

[<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>]

In [17]:
# Sibling selector (~): <p> elements that come after an <h2> and share its parent
paragraph = soup.select("h2 ~ p")
paragraph

[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [18]:
# "#" selects by id: grab the <b> element inside the <p> whose id is "paragraph-id"
boldtext = soup.select("p#paragraph-id b")
boldtext

[<b>Some bold text</b>]

In [19]:
# Child selector (>): only <p> elements that are direct children of <body>,
# so the <p> nested inside the <div> is excluded.
# The results are kept so that select() can be called on each of them next.
paragraphs = soup.select("body > p")
paragraphs


[<p><i>Some italicized text</i></p>,
 <p id="paragraph-id"><b>Some bold text</b></p>]

In [20]:
# select() also works on individual elements: look for <i> inside each paragraph.
# A paragraph that contains no <i> yields an empty list.
for paragraph in paragraphs:
    print(paragraph.select("i"))

[<i>Some italicized text</i>]
[]


In [21]:
# Attribute selector: any element whose align attribute equals "middle"
soup.select("[align=middle]")


[<div align="middle">
 <h1>HTML Webpage</h1>
 <p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
 </div>]

### Getting different properties of HTML 

In [22]:
# Grab the first <h2> so that its text can be read in the next cell
header = soup.find("h2")
header

<h2>A Header</h2>

In [23]:
# .string gives the text of an element that holds a single piece of text
header.string

'A Header'

In [24]:
# Unlike the <h2> above, the <div> has two child elements (<h1> and <p>)
div = soup.find("div")
print(div.prettify())

<div align="middle">
 <h1>
  HTML Webpage
 </h1>
 <p>
  Link to more interesting example:
  <a href="https://keithgalli.github.io/web-scraping/webpage.html">
   keithgalli.github.io/web-scraping/webpage.html
  </a>
 </p>
</div>



In [25]:
# .string is None here: the <div> has more than one child, so there is no single
# string to return (it is ambiguous whether the <h1> text or the <p> text is meant)
print(div.string)

None


In [26]:
# When an element has multiple children, use get_text() to collect all of their text
print(div.get_text())


HTML Webpage
Link to more interesting example: keithgalli.github.io/web-scraping/webpage.html



In [27]:
# To read a specific property (attribute) of an element, first grab the element:
# here, the first <a> in the document
link = soup.find("a")
link

<a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a>

In [28]:
# Attribute values are read with dictionary-style indexing on the element
link["href"]

'https://keithgalli.github.io/web-scraping/webpage.html'

In [29]:
# select() returns a list, so index into it before reading the id attribute
paragraphs = soup.select("p#paragraph-id")
paragraphs[0]['id']

'paragraph-id'

### Code Navigation

In [30]:
# Path syntax: tags can be reached as attributes; soup.body is the <body> element
soup.body

<body>
<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>
<h2>A Header</h2>
<p><i>Some italicized text</i></p>
<h2>Another header</h2>
<p id="paragraph-id"><b>Some bold text</b></p>
</body>

In [31]:
# Paths can be chained: the first <div> inside <body>
soup.body.div


<div align="middle">
<h1>HTML Webpage</h1>
<p>Link to more interesting example: <a href="https://keithgalli.github.io/web-scraping/webpage.html">keithgalli.github.io/web-scraping/webpage.html</a></p>
</div>

In [32]:
# Chain down to the <h1> inside that <div> and read its text
soup.body.div.h1.string

'HTML Webpage'

In [33]:
# Navigation uses three relationships: parent, sibling and child.
# Reprint the <body> markup to see how its elements relate to each other.
print(soup.body.prettify())

<body>
 <div align="middle">
  <h1>
   HTML Webpage
  </h1>
  <p>
   Link to more interesting example:
   <a href="https://keithgalli.github.io/web-scraping/webpage.html">
    keithgalli.github.io/web-scraping/webpage.html
   </a>
  </p>
 </div>
 <h2>
  A Header
 </h2>
 <p>
  <i>
   Some italicized text
  </i>
 </p>
 <h2>
  Another header
 </h2>
 <p id="paragraph-id">
  <b>
   Some bold text
  </b>
 </p>
</body>



In [34]:
# find_next_siblings() returns the elements that follow the <div> at the same level
soup.body.find("div").find_next_siblings()

[<h2>A Header</h2>,
 <p><i>Some italicized text</i></p>,
 <h2>Another header</h2>,
 <p id="paragraph-id"><b>Some bold text</b></p>]