In [1]:
!pip install html5lib

Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
     -------------------------------------- 112.2/112.2 kB 6.4 MB/s eta 0:00:00
Installing collected packages: html5lib
Successfully installed html5lib-1.1


In [2]:
!pip install bs4
!pip install requests

Collecting bs4
  Downloading bs4-0.0.1.tar.gz (1.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: bs4
  Building wheel for bs4 (setup.py): started
  Building wheel for bs4 (setup.py): finished with status 'done'
  Created wheel for bs4: filename=bs4-0.0.1-py3-none-any.whl size=1257 sha256=a270433de2182379037eb4d1e484e933f6b3138f0debf3050cac1a29b4cd7a29
  Stored in directory: c:\users\admin\appdata\local\pip\cache\wheels\e4\62\1d\d4d1bc4f33350ff84227f89b258edb552d604138e3739f5c83
Successfully built bs4
Installing collected packages: bs4
Successfully installed bs4-0.0.1


In [6]:
pip install beautifulsoup4


Note: you may need to restart the kernel to use updated packages.


# Beautiful Soup Objects

Consider the following HTML

In [1]:
%%html
<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<h3><b id='boldest'>Lebron James</b></h3>
<p> Salary: $ 92,000,000 </p>
<h3> Stephen Curry</h3>
<p> Salary: $85,000, 000 </p>
<h3> Kevin Durant </h3>
<p> Salary: $73,200, 000</p>
</body>
</html>

In [2]:
# Same document as the rendered cell above, written as one whitespace-free
# string (adjacent literals are concatenated) so there are no text nodes
# between the tags.
html = (
    "<!DOCTYPE html>"
    "<html>"
    "<head><title>Page Title</title></head>"
    "<body>"
    "<h3><b id='boldest'>Lebron James</b></h3>"
    "<p> Salary: $ 92,000,000 </p>"
    "<h3> Stephen Curry</h3>"
    "<p> Salary: $85,000, 000 </p>"
    "<h3> Kevin Durant </h3>"
    "<p> Salary: $73,200, 000</p>"
    "</body>"
    "</html>"
)

In [4]:
from bs4 import BeautifulSoup
import requests


In [6]:
# Parse the sample markup with the html5lib parser; `soup` is the object the
# following cells navigate.
soup = BeautifulSoup(html, 'html5lib')

In [7]:
# Display the parsed document as an indented, nested tree.
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Page Title
  </title>
 </head>
 <body>
  <h3>
   <b id="boldest">
    Lebron James
   </b>
  </h3>
  <p>
   Salary: $ 92,000,000
  </p>
  <h3>
   Stephen Curry
  </h3>
  <p>
   Salary: $85,000, 000
  </p>
  <h3>
   Kevin Durant
  </h3>
  <p>
   Salary: $73,200, 000
  </p>
 </body>
</html>


Tags

In [8]:
# Attribute access by tag name: soup.title yields the page's <title> element.
tag_object = soup.title
print('tag object:', tag_object)

tag object: <title>Page Title</title>


In [9]:
# Check what kind of object the tag lookup returned (a bs4 Tag).
type(tag_object)

bs4.element.Tag

In [11]:
# Rebind tag_object to the first <h3> in the document (the Lebron James heading).
tag_object = soup.h3
tag_object

<h3><b id="boldest">Lebron James</b></h3>

Children, Parents and Siblings

In [13]:
# Drill into the <h3> to reach the <b> element nested inside it.
tag_child = tag_object.b
tag_child

<b id="boldest">Lebron James</b>

In [14]:
# .parent walks one level back up: from the <b> to its enclosing <h3>.
tag_parent = tag_child.parent
tag_parent

<h3><b id="boldest">Lebron James</b></h3>

In [15]:
# One more level up: the parent of that <h3> is <body>.
tag_parent.parent

<body><h3><b id="boldest">Lebron James</b></h3><p> Salary: $ 92,000,000 </p><h3> Stephen Curry</h3><p> Salary: $85,000, 000 </p><h3> Kevin Durant </h3><p> Salary: $73,200, 000</p></body>

In [18]:
# The node right after the first <h3> is Lebron James's salary <p>. The `html`
# string has no whitespace between tags, so next_sibling lands on an element.
sibling1 = tag_object.next_sibling
sibling1

<p> Salary: $ 92,000,000 </p>

In [20]:
# Find the salary of Stephen Curry: two hops from the first <p>, over the
# Stephen Curry <h3> and onto the <p> that follows it.
sibling2 = sibling1.next_sibling.next_sibling
sibling2

<p> Salary: $85,000, 000 </p>

HTML Attributes

In [21]:
# Show the <b> tag again; its `id` attribute is what this section reads.
print(tag_child)

<b id="boldest">Lebron James</b>


In [22]:
# Dictionary-style indexing on a tag reads a single attribute value.
tag_child['id']

'boldest'

In [23]:
# .attrs exposes all of the tag's attributes as a dict.
tag_child.attrs

{'id': 'boldest'}

In [24]:
# .get('id') reads the same attribute through a method call.
tag_child.get('id')

'boldest'

Navigable String

In [25]:
# .string pulls out the text held inside the <b> tag.
tag_string = tag_child.string
tag_string

'Lebron James'

In [26]:
# The extracted text is a bs4 NavigableString rather than a plain str.
type(tag_string)

bs4.element.NavigableString

In [28]:
# str() converts the NavigableString into a built-in Python str.
unicode_str = str(tag_string)
type(unicode_str)

str

Filter

In [29]:
%%html
<table>
  <tr>
    <td id='flight' >Flight No</td>
    <td>Launch site</td> 
    <td>Payload mass</td>
   </tr>
  <tr> 
    <td>1</td>
    <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a></td>
    <td>300 kg</td>
  </tr>
  <tr>
    <td>2</td>
    <td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td>
    <td>94 kg</td>
  </tr>
  <tr>
    <td>3</td>
    <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a> </td>
    <td>80 kg</td>
  </tr>
</table>

0,1,2
Flight No,Launch site,Payload mass
1,Florida,300 kg
2,Texas,94 kg
3,Florida,80 kg


In [30]:
# Flight table markup as one string (adjacent literals are concatenated).
# NOTE(review): in rows 1 and 3 the Florida link is followed by "<a>" instead
# of a closing "</a>"; confirm whether that is intentional. It is kept as-is
# here because the parsed output shown in later cells depends on it.
table = (
    "<table>"
    "<tr><td id='flight'>Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>"
    "<tr> <td>1</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a></td><td>300 kg</td></tr>"
    "<tr><td>2</td><td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td><td>94 kg</td></tr>"
    "<tr><td>3</td><td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td><td>80 kg</td></tr>"
    "</table>"
)

In [31]:
# Parse the flight table markup with the html5lib parser.
table_bs = BeautifulSoup(table, 'html5lib')

In [33]:
# Signature: find_all(name, attrs, recursive, string, limit, **kwargs)
# - name: filter by tag name; here, collect every <tr> in the table.
table_rows = table_bs.find_all('tr')  # list-like result holding the four <tr> tags
table_rows

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td><td>300 kg</td></tr>,
 <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>,
 <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr>]

In [36]:
# Take the header row; attribute access (.td) returns only its first <td>.
first_row = table_rows[0]
first_row.td

<td id="flight">Flight No</td>

In [37]:
# Print each <tr> next to its zero-based position in table_rows.
for i, row in enumerate(table_rows):
    print('row', i, 'is', row)

row 0 is <tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>
row 1 is <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td><td>300 kg</td></tr>
row 2 is <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>
row 3 is <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr>


In [38]:
# Walk the table as a grid: for every row, list its <td> cells by column index.
for i, row in enumerate(table_rows):
    print('row', i)
    # find_all on a single row only searches inside that <tr>.
    cells = row.find_all('td')
    for j, cell in enumerate(cells):
        print('column', j, 'cell', cell)

row 0
column 0 cell <td id="flight">Flight No</td>
column 1 cell <td>Launch site</td>
column 2 cell <td>Payload mass</td>
row 1
column 0 cell <td>1</td>
column 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td>
column 2 cell <td>300 kg</td>
row 2
column 0 cell <td>2</td>
column 1 cell <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>
column 2 cell <td>94 kg</td>
row 3
column 0 cell <td>3</td>
column 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>
column 2 cell <td>80 kg</td>


In [40]:
# Passing a list of names matches any of them: the result interleaves each
# <tr> with the <td> cells it contains.
list_input = table_bs.find_all(['tr', 'td'])
list_input

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <td id="flight">Flight No</td>,
 <td>Launch site</td>,
 <td>Payload mass</td>,
 <tr> <td>1</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td><td>300 kg</td></tr>,
 <td>1</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td>,
 <td>300 kg</td>,
 <tr><td>2</td><td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td><td>94 kg</td></tr>,
 <td>2</td>,
 <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>,
 <td>94 kg</td>,
 <tr><td>3</td><td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td><td>80 kg</td></tr>,
 <td>3</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>,
 <td>80 kg</td>]

In [41]:
# - attributes: a keyword argument that is not one of find_all's own parameters
#   (such as id=...) is used as a filter on the tag attribute of that name.
table_bs.find_all(id='flight')

[<td id="flight">Flight No</td>]

In [42]:
# Filter on an exact href value: only the two Florida links match.
list_input = table_bs.find_all(href="https://en.wikipedia.org/wiki/Florida")
list_input

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

In [43]:
# href=True matches every tag that carries an href attribute, whatever its value.
table_bs.find_all(href=True)

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Texas">Texas</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

In [45]:
# The same attribute filter on the first document: look up the element with id='boldest'.
soup.find_all(id='boldest')

[<b id="boldest">Lebron James</b>]

In [48]:
# - string: match on text content instead of tags; the result holds the
#   matching strings themselves.
table_bs.find_all(string='Florida')

['Florida', 'Florida']

# Downloading and Scraping the Contents of a Web Page

In [49]:
# Fetch the IBM home page and print the target of every hyperlink on it.
url = "http://www.ibm.com"

# requests waits indefinitely unless a timeout is given, and it does not raise
# on 4xx/5xx responses by itself; without these two guards a stalled server
# hangs the kernel and an error page gets parsed as if it were the real content.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

soup = BeautifulSoup(data, 'html5lib')

# href=True keeps only the <a> tags that actually carry an href attribute.
for link in soup.find_all('a', href=True):
    print(link.get('href'))

https://www.ibm.com/fr-fr/cloud?lnk=hpfrpr1


In [50]:
# For every <img> on the page, show the full tag and then just its src attribute.
# NOTE(review): the loop variable is called `link` although it holds <img> tags.
for link in soup.find_all('img'):
    print(link)
    print(link.get('src'))

<img alt="Portraits des consultants IBM" class="bx--image__img" id="image--1662819746" loading="lazy" src="/content/dam/adobe-cms/default-images/home-consultants.component.crop-16by9-xl.ts=1695221366270.jpg/content/adobe-cms/fr/fr/homepage/_jcr_content/root/table_of_contents/simple_image"/>
/content/dam/adobe-cms/default-images/home-consultants.component.crop-16by9-xl.ts=1695221366270.jpg/content/adobe-cms/fr/fr/homepage/_jcr_content/root/table_of_contents/simple_image


Scrape data from HTML tables

In [51]:
# Page to scrape next: a sample HTML document containing a table of colour
# names and codes.
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

In [52]:
# Download the colour-table page and parse it. The timeout keeps a stalled
# connection from hanging the kernel, and raise_for_status() stops on an HTTP
# error response instead of parsing an error page as if it were the table.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text
soup = BeautifulSoup(data, 'html5lib')

In [53]:
# Grab the page's <table> element.
# NOTE(review): `table` held a markup string earlier in the notebook; from
# here on it is rebound to a parsed tag.
table = soup.find('table')

In [54]:
# Print "<color name>----><hex code>" for each row of the colour table.
for row in table.find_all('tr'):
    cols = row.find_all('td')
    if len(cols) < 4:
        # A row without both a name cell and a hex-code cell (for example a
        # header built from <th>, or an empty row) would raise IndexError
        # below, so skip it.
        continue
    # get_text() is used for both cells: .string yields None as soon as a cell
    # holds more than one child node (the header's hex cell contains a <br>).
    color_name = cols[2].get_text()
    color_code = cols[3].get_text()
    print("{}---->{}".format(color_name, color_code))

Color Name---->Hex Code#RRGGBB
lightsalmon---->#FFA07A
salmon---->#FA8072
darksalmon---->#E9967A
lightcoral---->#F08080
coral---->#FF7F50
tomato---->#FF6347
orangered---->#FF4500
gold---->#FFD700
orange---->#FFA500
darkorange---->#FF8C00
lightyellow---->#FFFFE0
lemonchiffon---->#FFFACD
papayawhip---->#FFEFD5
moccasin---->#FFE4B5
peachpuff---->#FFDAB9
palegoldenrod---->#EEE8AA
khaki---->#F0E68C
darkkhaki---->#BDB76B
yellow---->#FFFF00
lawngreen---->#7CFC00
chartreuse---->#7FFF00
limegreen---->#32CD32
lime---->#00FF00
forestgreen---->#228B22
green---->#008000
powderblue---->#B0E0E6
lightblue---->#ADD8E6
lightskyblue---->#87CEFA
skyblue---->#87CEEB
deepskyblue---->#00BFFF
lightsteelblue---->#B0C4DE
dodgerblue---->#1E90FF


In [56]:
# Dump the raw HTML that was downloaded, to inspect the table's structure.
print(data)

<html>
   <body>
      <h1>Partital List  of HTML5 Supported Colors</h1>
<table border ="1" class="main-table">
   <tr>
      <td>Number </td>
      <td>Color</td>
      <td>Color Name</td>
      <td>Hex Code<br>#RRGGBB</td>
      <td>Decimal Code<br>(R,G,B)</td>
   </tr>
   <tr>
      <td>1</td>
      <td style="background:lightsalmon;">&nbsp;</td>
      <td>lightsalmon</td>
      <td>#FFA07A</td>
      <td>rgb(255,160,122)</td>
   </tr>
   <tr>
      <td>2</td>
      <td style="background:salmon;">&nbsp;</td>
      <td>salmon</td>
      <td>#FA8072</td>
      <td>rgb(250,128,114)</td>
   </tr>
   <tr>
      <td>3</td>
      <td style="background:darksalmon;">&nbsp;</td>
      <td>darksalmon</td>
      <td>#E9967A</td>
      <td>rgb(233,150,122)</td>
   </tr>
   <tr>
      <td>4</td>
      <td style="background:lightcoral;">&nbsp;</td>
      <td>lightcoral</td>
      <td>#F08080</td>
      <td>rgb(240,128,128)</td>
   </tr>
   <tr>
      <td>5</td>
      <td style="background:coral;">

In [58]:
import numpy as np

# Element-wise product of two complementary 0/1 arrays: no position holds a 1
# in both, so every entry of the result is 0.
a = np.array([0, 1, 0, 1, 0])
b = np.array([1, 0, 1, 0, 1])
np.multiply(a, b)



array([0, 0, 0, 0, 0])