In [3]:
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page

In [4]:
%%html
<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<h3><b id='boldest'>Lebron James</b></h3>
<p> Salary: $ 92,000,000 </p>
<h3> Stephen Curry</h3>
<p> Salary: $85,000, 000 </p>
<h3> Kevin Durant </h3>
<p> Salary: $73,200, 000</p>
</body>
</html>

In [5]:
# The HTML is now stored in a string
html="<!DOCTYPE html><html><head><title>Page Title</title></head><body><h3><b id='boldest'>Lebron James</b></h3><p> Salary: $ 92,000,000 </p><h3> Stephen Curry</h3><p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body></html>"

In [6]:
# To parse a document, pass it into the BeautifulSoup constructor, the BeautifulSoup object, which represents the document as a nested data structure:

soup = BeautifulSoup(html, "html.parser")

In [7]:
# to view in nested structure

print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Page Title
  </title>
 </head>
 <body>
  <h3>
   <b id="boldest">
    Lebron James
   </b>
  </h3>
  <p>
   Salary: $ 92,000,000
  </p>
  <h3>
   Stephen Curry
  </h3>
  <p>
   Salary: $85,000, 000
  </p>
  <h3>
   Kevin Durant
  </h3>
  <p>
   Salary: $73,200, 000
  </p>
 </body>
</html>


In [8]:
tag_object=soup.title
print("tag object:",tag_object)

tag object: <title>Page Title</title>


In [9]:
print("tag object type:",type(tag_object))

tag object type: <class 'bs4.element.Tag'>


In [10]:
tag_object=soup.h3
tag_object

<h3><b id="boldest">Lebron James</b></h3>

In [11]:
tag_child =tag_object.b
tag_child

<b id="boldest">Lebron James</b>

In [12]:
parent_tag=tag_child.parent
parent_tag

<h3><b id="boldest">Lebron James</b></h3>

In [13]:
tag_object.parent

<body><h3><b id="boldest">Lebron James</b></h3><p> Salary: $ 92,000,000 </p><h3> Stephen Curry</h3><p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body>

In [14]:
sibling_1=tag_object.next_sibling
sibling_1

<p> Salary: $ 92,000,000 </p>

In [15]:
sibling_2=sibling_1.next_sibling
sibling_2

<h3> Stephen Curry</h3>

In [16]:
sibling_3 = sibling_2.next_sibling
sibling_3

<p> Salary: $85,000, 000 </p>

In [17]:
tag_child['id']

'boldest'

In [18]:
tag_child.attrs # accessing the dictionary of attributes


{'id': 'boldest'}

In [19]:
# We can also obtain the content if the attribute of the tag using the Python get() method.
tag_child.get('id')

'boldest'

In [20]:
tag_string=tag_child.string # extracting the string
tag_string

'Lebron James'

In [21]:
type(tag_string)

bs4.element.NavigableString

In [22]:
unicode_string = str(tag_string)
unicode_string

'Lebron James'

In [23]:
# # Filters allow you to find complex patterns, the simplest filter is a string. In this section we will pass a string to a different filter method and Beautiful Soup will perform a match against that exact string.  Consider the following HTML of rocket launches:

# %%html
# # <!DOCTYPE html>
# <table>
#   <tr>
#     <td id='flight' >Flight No</td>
#     <td>Launch site</td> 
#     <td>Payload mass</td>
#   </tr>
#   <tr> 
#     <td>1</td>
#     <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a></td>
#     <td>300 kg</td>
#   </tr>
#   <tr>
#     <td>2</td>
#     <td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td>
#     <td>94 kg</td>
#   </tr>
#   <tr>
#     <td>3</td>
#     <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td>
#     <td>80 kg</td>
#   </tr>
# </table>

In [24]:
table="<table><tr><td id='flight'>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr> <td>1</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a></td><td>300 kg</td></tr><tr><td>2</td><td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td><td>94 kg</td></tr><tr><td>3</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td><td>80 kg</td></tr></table>"

In [25]:
table_bs = BeautifulSoup(table, "html.parser")
table_bs

<table><tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td><td>300 kg</td></tr><tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr><tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a></td><td>80 kg</td></tr></table>

In [26]:
table_rows=table_bs.find_all('tr')
table_rows

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td><td>300 kg</td></tr>,
 <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>,
 <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a></td><td>80 kg</td></tr>]

In [27]:
first_row =table_rows[0]
first_row

<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>

In [28]:
first_row =table_rows[0]
first_row

<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>

In [29]:
for i,row in enumerate(table_rows):
    print("row",i)
    cells=row.find_all('td')
    for j,cell in enumerate(cells):
        print('colunm',j,"cell",cell)

row 0
colunm 0 cell <td id="flight">Flight No</td>
colunm 1 cell <td>Launch site</td>
colunm 2 cell <td>Payload mass</td>
row 1
colunm 0 cell <td>1</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td>
colunm 2 cell <td>300 kg</td>
row 2
colunm 0 cell <td>2</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>
colunm 2 cell <td>94 kg</td>
row 3
colunm 0 cell <td>3</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a></td>
colunm 2 cell <td>80 kg</td>


In [30]:
for i,row in enumerate(table_rows):
    print("row",i)
    cells=row.find_all('td')
    for j,cell in enumerate(cells):
        print('colunm',j,"cell",cell)

row 0
colunm 0 cell <td id="flight">Flight No</td>
colunm 1 cell <td>Launch site</td>
colunm 2 cell <td>Payload mass</td>
row 1
colunm 0 cell <td>1</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td>
colunm 2 cell <td>300 kg</td>
row 2
colunm 0 cell <td>2</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>
colunm 2 cell <td>94 kg</td>
row 3
colunm 0 cell <td>3</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a></td>
colunm 2 cell <td>80 kg</td>


In [31]:
table_bs.find_all(id="flight")

[<td id="flight">Flight No</td>]

In [32]:
# If we set the href attribute to True, regardless of what the value is, the code finds all tags with href value:
table_bs.find_all(href=True)

[<a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a>,
 <a href="https://en.wikipedia.org/wiki/Texas">Texas</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a>]

In [33]:
table_bs.find_all(href=False)

[<table><tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr><tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td><td>300 kg</td></tr><tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr><tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a> </a></a></td><td>80 kg</td></tr></table>,
 <tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <td id="flight">Flight No</td>,
 <td>Launch site</td>,
 <td>Payload mass</td>,
 <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td><td>300 kg</td></tr>,
 <td>1</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida<a></a></a></td>,
 <a></a>,
 <td>300 kg</td>,
 <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>,
 <td>2</td>,
 <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>,
 <td>94 k

<h2 id="DSCW">Downloading And Scraping The Contents Of A Web Page</h2> 


In [34]:
url = "http://www.ibm.com"
data  = requests.get(url).text

In [35]:
soup = BeautifulSoup(data,"html.parser")  # create a soup object using the variable 'data'

In [36]:
for link in soup.find_all('a',href=True):  # in html anchor/link is represented by the tag <a>
    print(link.get('href'))


/products/linuxone-emperor-4
https://www.ibm.com/consulting/
https://www.ibm.com/cloud/aiops
https://www.ibm.com/cloud/campaign/cloud-simplicity
https://www.ibm.com/about/secure-your-business/
https://www.ibm.com/analytics/data-fabric
/products/offers-and-discounts?lnk=hpv18t5
/cloud/free?lnk=hpv18t1
/products/cloud-pak-for-data?lnk=hpv18t2
/cloud/watson-assistant?lnk=hpv18t3
/security/identity-access-management/cloud-identity?lnk=hpv18t4
https://developer.ibm.com/depmodels/cloud/?lnk=hpv18ct16
https://developer.ibm.com/technologies/artificial-intelligence?lnk=hpv18ct19
https://developer.ibm.com/?lnk=hpv18ct9
https://www.ibm.com/docs/en?lnk=hpv18ct14
https://www.redbooks.ibm.com/?lnk=ushpv18ct10
https://www.ibm.com/support/home/?lnk=hpv18ct11
https://www.ibm.com/training/?lnk=hpv18ct15
/cloud/hybrid?lnk=hpv18pt14
/cloud/learn/public-cloud?lnk=hpv18ct1
/watson?lnk=ushpv18pt17
/garage?lnk=hpv18pt13
/blockchain?lnk=hpv18pt4
https://www.ibm.com/thought-leadership/institute-business-value/?

## Scrape  all images  Tags


In [37]:
for link in soup.find_all('img'):# in html image is represented by the tag <img>
    print(link)
    print(link.get('src'))

<img alt="Two consultants work together in a data center" class="" loading="lazy" src="//1.cms.s81c.com/sites/default/files/2022-02-16/consulting-five-levers-444x254_8.jpg"/>
//1.cms.s81c.com/sites/default/files/2022-02-16/consulting-five-levers-444x254_8.jpg
<img alt="A technician works on the IBM Quantum computer" class="" loading="lazy" src="//1.cms.s81c.com/sites/default/files/2022-02-16/automate-five-levers-444x254_8.jpg"/>
//1.cms.s81c.com/sites/default/files/2022-02-16/automate-five-levers-444x254_8.jpg
<img alt="Two medical engineers review data" class="" loading="lazy" src="//1.cms.s81c.com/sites/default/files/2022-02-16/cloud-five-levers-444x254_8.jpg"/>
//1.cms.s81c.com/sites/default/files/2022-02-16/cloud-five-levers-444x254_8.jpg
<img alt="A developer works at his station" class="" loading="lazy" src="//1.cms.s81c.com/sites/default/files/2022-02-16/security-five-levers-444x254_8.jpg"/>
//1.cms.s81c.com/sites/default/files/2022-02-16/security-five-levers-444x254_8.jpg
<img 

## Scrape data from HTML tables


In [38]:
#The below url contains an html table with data about colors and color codes.
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

In [39]:
# get the contents of the webpage in text format and store in a variable called data
data  = requests.get(url).text
soup = BeautifulSoup(data,"html.parser")

In [40]:
#find a html table in the web page
table = soup.find('table') # in html table is represented by the tag <table>

In [41]:
#Get all rows from the table
for row in table.find_all('tr'): # in html table row is represented by the tag <tr>
    # Get all columns in each row.
    cols = row.find_all('td') # in html a column is represented by the tag <td>
    color_name = cols[2].string # store the value in column 3 as color_name
    color_code = cols[3].string # store the value in column 4 as color_code
    print("{}--->{}".format(color_name,color_code))

Color Name--->None
lightsalmon--->#FFA07A
salmon--->#FA8072
darksalmon--->#E9967A
lightcoral--->#F08080
coral--->#FF7F50
tomato--->#FF6347
orangered--->#FF4500
gold--->#FFD700
orange--->#FFA500
darkorange--->#FF8C00
lightyellow--->#FFFFE0
lemonchiffon--->#FFFACD
papayawhip--->#FFEFD5
moccasin--->#FFE4B5
peachpuff--->#FFDAB9
palegoldenrod--->#EEE8AA
khaki--->#F0E68C
darkkhaki--->#BDB76B
yellow--->#FFFF00
lawngreen--->#7CFC00
chartreuse--->#7FFF00
limegreen--->#32CD32
lime--->#00FF00
forestgreen--->#228B22
green--->#008000
powderblue--->#B0E0E6
lightblue--->#ADD8E6
lightskyblue--->#87CEFA
skyblue--->#87CEEB
deepskyblue--->#00BFFF
lightsteelblue--->#B0C4DE
dodgerblue--->#1E90FF


In [42]:
import pandas as pd

#The below url contains html tables with data about world population.
url = "https://en.wikipedia.org/wiki/World_population"

In [43]:
# get the contents of the webpage in text format and store in a variable called data
data  = requests.get(url).text
soup = BeautifulSoup(data,"html.parser")

In [44]:
#find all html tables in the web page
tables = soup.find_all('table') # in html table is represented by the tag <table>

In [45]:
# we can see how many tables were found by checking the length of the tables list
len(tables)

25

In [46]:
for index,table in enumerate(tables):
    if ("10 most densely populated countries" in str(table)):
        table_index = index
print(table_index)

5


In [47]:
print(tables[table_index].prettify())

<table class="wikitable sortable" style="text-align:right">
 <caption>
  10 most densely populated countries
  <small>
   (with population above 5 million)
  </small>
 </caption>
 <tbody>
  <tr>
   <th>
    Rank
   </th>
   <th>
    Country
   </th>
   <th>
    Population
   </th>
   <th>
    Area
    <br/>
    <small>
     (km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
   <th>
    Density
    <br/>
    <small>
     (pop/km
     <sup>
      2
     </sup>
     )
    </small>
   </th>
  </tr>
  <tr>
   <td>
    1
   </td>
   <td align="left">
    <span class="flagicon">
     <img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/48/Flag_of_Singapore.svg/23px-Flag_of_Singapore.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/48/Flag_of_Singapore.svg/35px-Flag_of_Singapore.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/4/48/Flag_of_Singapo

## Scrape data from HTML tables into a DataFrame using BeautifulSoup and read_html



In [48]:
import pandas as pd
pd.read_html(str(tables[5]), flavor='bs4')

[   Rank      Country  Population  Area (km2)  Density (pop/km2)
 0     1    Singapore     5704000         710               8033
 1     2   Bangladesh   173690000      143998               1206
 2     3    Palestine     5266785        6020                847
 3     4      Lebanon     6856000       10452                656
 4     5       Taiwan    23604000       36193                652
 5     6  South Korea    51781000       99538                520
 6     7       Rwanda    12374000       26338                470
 7     8       Israel     9600000       22072                435
 8     9        Haiti    11578000       27065                428
 9    10  Netherlands    17760000       41526                428]

In [49]:
population_data_read_html = pd.read_html(str(tables[5]), flavor='bs4')[0]

population_data_read_html

Unnamed: 0,Rank,Country,Population,Area (km2),Density (pop/km2)
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,173690000,143998,1206
2,3,Palestine,5266785,6020,847
3,4,Lebanon,6856000,10452,656
4,5,Taiwan,23604000,36193,652
5,6,South Korea,51781000,99538,520
6,7,Rwanda,12374000,26338,470
7,8,Israel,9600000,22072,435
8,9,Haiti,11578000,27065,428
9,10,Netherlands,17760000,41526,428


In [50]:
dataframe_list = pd.read_html(url, flavor='bs4')

In [51]:
len(dataframe_list)

25

In [55]:
dataframe_list[5]

Unnamed: 0,Rank,Country,Population,Area (km2),Density (pop/km2)
0,1,Singapore,5704000,710,8033
1,2,Bangladesh,173690000,143998,1206
2,3,Palestine,5266785,6020,847
3,4,Lebanon,6856000,10452,656
4,5,Taiwan,23604000,36193,652
5,6,South Korea,51781000,99538,520
6,7,Rwanda,12374000,26338,470
7,8,Israel,9600000,22072,435
8,9,Haiti,11578000,27065,428
9,10,Netherlands,17760000,41526,428
