In [7]:
!pip install bs4




In [8]:
from bs4 import BeautifulSoup # this module helps in web scraping.
import requests  # this module helps us to download a web page

Example HTML

In [9]:
%%html
<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<!-- Sample data: each h3 holds a player name and the p that follows holds that player's salary. -->
<h3><b id='boldest'>Lebron James</b></h3>
<p> Salary: $ 92,000,000 </p>
<h3> Stephen Curry</h3>
<p> Salary: $85,000, 000 </p>
<h3> Kevin Durant </h3>
<p> Salary: $73,200, 000</p>
</body>
</html>



We can store the same markup as a string in the variable `html`:

In [10]:
# The example page as a single string. Adjacent string literals are joined by
# Python at compile time, so the value contains no line breaks or extra spaces.
html = (
    "<!DOCTYPE html>"
    "<html>"
    "<head><title>Page Title</title></head>"
    "<body>"
    "<h3><b id='boldest'>Lebron James</b></h3>"
    "<p> Salary: $ 92,000,000 </p>"
    "<h3> Stephen Curry</h3>"
    "<p> Salary: $85,000, 000 </p>"
    "<h3> Kevin Durant </h3>"
    "<p> Salary: $73,200, 000</p>"
    "</body>"
    "</html>"
)

To parse a document, pass it into the BeautifulSoup constructor. The resulting BeautifulSoup object represents the document as a nested data structure:

In [11]:
# Build the parse tree from the in-memory markup with the 'html.parser' backend.
soup = BeautifulSoup(markup=html, features='html.parser')

The document is converted to Unicode and HTML entities are converted to Unicode characters.

We can use the method prettify() to display the HTML in the nested structure:

In [12]:
# Render the parse tree as indented text, then display it.
pretty_markup = soup.prettify()
print(pretty_markup)

<!DOCTYPE html>
<html>
 <head>
  <title>
   Page Title
  </title>
 </head>
 <body>
  <h3>
   <b id="boldest">
    Lebron James
   </b>
  </h3>
  <p>
   Salary: $ 92,000,000
  </p>
  <h3>
   Stephen Curry
  </h3>
  <p>
   Salary: $85,000, 000
  </p>
  <h3>
   Kevin Durant
  </h3>
  <p>
   Salary: $73,200, 000
  </p>
 </body>
</html>


Downloading And Scraping The Contents Of A Web Page

In [13]:
# Page whose links and images are extracted in the cells that follow.
url = "http://www.ibm.com"

In [14]:
# Download the page. The timeout keeps the cell from hanging forever on an
# unresponsive server, and raise_for_status() surfaces HTTP errors (4xx/5xx)
# instead of silently handing an error page to the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [16]:
# Parse the downloaded markup with the 'html.parser' backend.
soup = BeautifulSoup(markup=data, features="html.parser")

In [17]:
# href=True restricts the search to <a> tags that actually carry an href
# attribute, so indexing with ['href'] below cannot fail.
anchors = soup.find_all('a', href=True)
for anchor in anchors:
    print(anchor['href'])

https://www.ibm.com/ca/en

https://www.ibm.com/ca-en/products?lnk=hpmps_bupr_caen&lnk2=link
https://www.ibm.com/industries?lnk=hpmps_buin_caen&lnk2=link
https://www.ibm.com/artificial-intelligence?lnk=hpmps_buai_caen&lnk2=link
https://www.ibm.com/automation?lnk=hpmps_buau_caen&lnk2=link
https://www.ibm.com/blockchain?lnk=hpmps_bubc_caen&lnk2=link
https://www.ibm.com/business-operations?lnk=hpmps_buop_caen&lnk2=link
https://www.ibm.com/cloud?lnk=hpmps_bucl_caen&lnk2=link
https://www.ibm.com/analytics?lnk=hpmps_buda_caen&lnk2=link
https://www.ibm.com/it-infrastructure?lnk=hpmps_buit_caen&lnk2=link
https://www.ibm.com/security?lnk=hpmps_buse_caen&lnk2=link
https://www.ibm.com/supply-chain?lnk=hpmps_busc_caen&lnk2=link
https://www.ibm.com/financing?lnk=hpmps_bufi_caen&lnk2=link
https://www.ibm.com/ca-en/products?lnk=hpmps_buall_caen&lnk2=link

https://www.ibm.com/services/process?lnk=hpmsc_bups_caen&lnk2=link
https://www.ibm.com/services/ibmix/?lnk=hpmsc_budbs_caen&lnk2=link
https://www.ib

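# In HTML, a link is written with the anchor tag `<a>`

If we pass <code>href=True</code> to <code>find_all</code>, the code finds every tag that has an <code>href</code> attribute, regardless of what its value is. In the same way, we can collect every <code>img</code> tag and print its <code>src</code> attribute: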


In [18]:
# For every <img> tag, show the full tag followed by its src attribute
# (None is printed when a tag has no src).
for image in soup.find_all('img'):
    print(image, image.get('src'), sep='\n')

<img alt="Whales swimming away from a melting piece of ice" class="" loading="lazy" src="//1.cms.s81c.com/sites/default/files/2021-06-07/Sustainability_site_card_1.jpg"/>
//1.cms.s81c.com/sites/default/files/2021-06-07/Sustainability_site_card_1.jpg
<img alt="A group of impalas' silhouettes in front of a sunset" class="" loading="lazy" src="//1.cms.s81c.com/sites/default/files/2021-06-07/Impala_blog_card.jpg"/>
//1.cms.s81c.com/sites/default/files/2021-06-07/Impala_blog_card.jpg
<img alt="Aerial view of a forest next to a corn field" class="" loading="lazy" src="//1.cms.s81c.com/sites/default/files/2021-05-30/Sustainability_at_Think_2021_Card.jpg"/>
//1.cms.s81c.com/sites/default/files/2021-05-30/Sustainability_at_Think_2021_Card.jpg
<img alt="" class="" loading="lazy" src="//1.cms.s81c.com/sites/default/files/2021-05-30/Automation_Page_Card.jpg"/>
//1.cms.s81c.com/sites/default/files/2021-05-30/Automation_Page_Card.jpg
<img alt="Screenshot of IBM Cloud Pak for Business Automation" cla

Scrape data from HTML tables into a DataFrame using BeautifulSoup and Pandas

In [27]:
import pandas as pd

In [19]:
# The page at this URL contains HTML tables with data about world population.
url = "https://en.wikipedia.org/wiki/World_population"

In [20]:
# Get the contents of the web page as text and store it in a variable called data.
# The timeout keeps the cell from hanging forever on an unresponsive server, and
# raise_for_status() surfaces HTTP errors (4xx/5xx) instead of silently handing
# an error page to the parser.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [21]:
# Parse the downloaded markup with the 'html.parser' backend.
soup = BeautifulSoup(markup=data, features="html.parser")

In [22]:
# Find all HTML tables in the web page; in HTML a table is represented by the <table> tag.
tables = soup.find_all('table')

In [23]:
# The number of tables found is the length of the tables list.
len(tables)

26


Assume that we are looking for the "10 most densely populated countries" table. We can either look through the tables list and pick the right one based on the data in each table, or search for the table's name if it appears inside the table, although this second option might not always work.

In [25]:
# Locate the table whose markup contains the caption text we are after.
# Starting from None means a failed search is reported clearly instead of
# raising NameError (or silently reusing a value left over from an earlier run).
table_index = None
for index, table in enumerate(tables):
    if "10 most densely populated countries" in str(table):
        # No break: when several tables match, the last one is kept.
        table_index = index

if table_index is None:
    raise ValueError('No table containing "10 most densely populated countries" was found')

print(table_index)

5


When you use enumerate(), the function gives you back two loop variables:

The count of the current iteration
The value of the item at the current iteration

In this case, the table is at index 5 of the tables list, which makes it the sixth table because indexing starts at 0.

In [26]:
# Pick out the matching table and display its markup as indented text.
target_table = tables[table_index]
print(target_table.prettify())

<table class="wikitable sortable" style="text-align:right">
 <caption>
  10 most densely populated countries
  <small>
   (with population above 5 million)
  </small>
 </caption>
 <tbody>
  <tr>
   <th>
    Rank
   </th>
   <th>
    Country
   </th>
   <th>
    Population
   </th>
   <th>
    Area
    <br/>
    <small>
     (km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
   <th>
    Density
    <br/>
    <small>
     (pop/km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
  </tr>
  <tr>
   <td>
    1
   </td>
   <td align="left">
    <span class="flagicon">
     <img alt="" class="thumbborder" data-file-height="2880" data-file-width="4320" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/48/Flag_of_Singapore.svg/23px-Flag_of_Singapore.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/48/Flag_of_Singapore.svg/35px-Flag_of_Singapore.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/4/48/Flag_of_Singa

In [28]:
# Collect one record per data row, then build the DataFrame in a single call.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies the whole frame on every iteration.
column_names = ["Rank", "Country", "Population", "Area", "Density"]
records = []

for row in tables[table_index].tbody.find_all("tr"):
    cells = row.find_all("td")
    # The header row has only <th> cells, so it yields no <td>; skip it, along
    # with any row that has too few cells to fill every column.
    if len(cells) < len(column_names):
        continue
    # Strip surrounding whitespace from every field so that all columns are
    # cleaned the same way.
    values = [cell.text.strip() for cell in cells[:len(column_names)]]
    records.append(dict(zip(column_names, values)))

# Passing columns= keeps the column order fixed, even when no rows were found.
population_data = pd.DataFrame(records, columns=column_names)

population_data

Unnamed: 0,Rank,Country,Population,Area,Density
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,170800000,143998,1186
2,3,Lebanon,6856000,10452,656
3,4,Taiwan,23604000,36193,652
4,5,South Korea,51781000,99538,520
5,6,Rwanda,12374000,26338,470
6,7,Haiti,11578000,27065,428
7,8,Netherlands,17600000,41526,424
8,9,Israel,9360000,22072,424
9,10,India,1377930000,3287240,419


`tr` is a table row, `td` is a data cell, and `th` is a header cell.