# Web Scraping

In [5]:
!pip install html5lib



In [6]:
# !pip install requests
!pip install bs4



In [7]:
from bs4 import BeautifulSoup # this module helps in web scrapping.
import requests  # this module helps us to download a web page

## Beautiful Soup Object

In [8]:
%%html
<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<h3><b id='boldest'>Lebron James</b></h3>
<p> Salary: $ 92,000,000 </p>
<h3> Stephen Curry</h3>
<p> Salary: $85,000, 000 </p>
<h3> Kevin Durant </h3>
<p> Salary: $73,200, 000</p>
</body>
</html>

In [9]:
# We can store it as a string in the variable HTML:
html = "<!DOCTYPE html><html><head><title>Page Title</title></head><body><h3> \
<b id='boldest'>Lebron James</b></h3><p> Salary: $ 92,000,000 </p> \
<h3>Stephen Curry</h3><p> Salary: $85,000,000</p> \
<h3>Kevin Durant</h3><p> Salary: $73,200,000</p></body></html>"

In [10]:
# To parse a document, pass it into the BeautifulSoup constructor. The BeautifulSoup object represents the document as a nested data structure:
soup = BeautifulSoup(html, 'html5lib')

In [11]:
# We can use the method prettify() to display the HTML in the nested structure:
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   Page Title
  </title>
 </head>
 <body>
  <h3>
   <b id="boldest">
    Lebron James
   </b>
  </h3>
  <p>
   Salary: $ 92,000,000
  </p>
  <h3>
   Stephen Curry
  </h3>
  <p>
   Salary: $85,000,000
  </p>
  <h3>
   Kevin Durant
  </h3>
  <p>
   Salary: $73,200,000
  </p>
 </body>
</html>



## Tags

In [12]:
tag_object = soup.title
print("tag object:", tag_object)

tag object: <title>Page Title</title>


In [13]:
# we can see the tag type bs4.element.Tag
print("tag object type:", type(tag_object))

tag object type: <class 'bs4.element.Tag'>


In [14]:
# If there is more than one Tag with the same name, the first element with that Tag name is called, this corresponds to the most paid player:
tag_object = soup.h3
tag_object

<h3> <b id="boldest">Lebron James</b></h3>

## Children, Parents, and Siblings


In [15]:
# As stated above the Tag object is a tree of objects we can access the child of the tag or navigate down the branch as follows:
tag_child = tag_object.b
tag_child

<b id="boldest">Lebron James</b>

In [16]:
# You can access the parent with the  parent
parent_tag = tag_child.parent
parent_tag

<h3> <b id="boldest">Lebron James</b></h3>

In [17]:
tag_object

<h3> <b id="boldest">Lebron James</b></h3>

In [18]:
# tag_object parent is the body element.
tag_object.parent

<body><h3> <b id="boldest">Lebron James</b></h3><p> Salary: $ 92,000,000 </p> <h3>Stephen Curry</h3><p> Salary: $85,000,000</p> <h3>Kevin Durant</h3><p> Salary: $73,200,000</p></body>

In [19]:
sibling_1 = tag_object.next_sibling
sibling_1

<p> Salary: $ 92,000,000 </p>

In [20]:
sibling_2 = sibling_1.next_sibling
sibling_2

' '

## Exercise: next_sibling

In [21]:
# Use the object sibling_2 and the method next_sibling to find the salary of Stephen Curry:
sibling_2.next_sibling

<h3>Stephen Curry</h3>

## HTML Attributes

In [22]:
# If the tag has attributes, the tag id="boldest" has an attribute id whose value is boldest. You can access a tag's attributes by treating the tag like a dictionary:
tag_child['id']

'boldest'

In [23]:
# You can access that dictionary directly as attrs
tag_child.attrs

{'id': 'boldest'}

In [24]:
# We can also obtain the content of the attribute of the tag using the Python get() method.
tag_child.get('id')

'boldest'

## Navigable String

In [25]:
# n our HTML we can obtain the name of the first player by extracting the string of the Tag object tag_child as follows:
tag_string = tag_child.string
tag_string

'Lebron James'

In [26]:
# we can verify the type is Navigable String
type(tag_string)

bs4.element.NavigableString

In [27]:
# A NavigableString is similar to a Python string or Unicode string. To be more precise, the main difference is that it also supports some BeautifulSoup features. We can convert it to string object in Python:
unicode_string = str(tag_string)
unicode_string

'Lebron James'

## Filter

Filters allow you to find complex patterns, the simplest filter is a string. In this section we will pass a string to a different filter method and Beautiful Soup will perform a match against that exact string. Consider the following HTML of rocket launches:

In [28]:
%%html
<table>
  <tr>
    <td id='flight' >Flight No</td>
    <td>Launch site</td>
    <td>Payload mass</td>
   </tr>
  <tr>
    <td>1</td>
    <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida</a></td>
    <td>300 kg</td>
  </tr>
  <tr>
    <td>2</td>
    <td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td>
    <td>94 kg</td>
  </tr>
  <tr>
    <td>3</td>
    <td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td>
    <td>80 kg</td>
  </tr>
</table>

0,1,2
Flight No,Launch site,Payload mass
1,Florida,300 kg
2,Texas,94 kg
3,Florida,80 kg


In [29]:
# We can store it as a string in the variable table:
table = "<table><tr><td id='flight'>Flight No</td><td>Launch site</td> \
<td>Payload mass</td></tr><tr> <td>1</td> \
<td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a></td> \
<td>300 kg</td></tr><tr><td>2</td> \
<td><a href='https://en.wikipedia.org/wiki/Texas'>Texas</a></td> \
<td>94 kg</td></tr><tr><td>3</td> \
<td><a href='https://en.wikipedia.org/wiki/Florida'>Florida<a> </td> \
<td>80 kg</td></tr></table>"

In [30]:
table_bs = BeautifulSoup(table, 'html5lib')

## find_All

The find_all() method looks through a tag's descendants and retrieves all descendants that match your filters.

The Method signature for find_all(name, attrs, recursive, string, limit, **kwargs)

### Name

In [31]:
table_rows = table_bs.find_all('tr')
table_rows

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <tr> <td>1</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td> <td>300 kg</td></tr>,
 <tr><td>2</td> <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td> <td>94 kg</td></tr>,
 <tr><td>3</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td> <td>80 kg</td></tr>]

In [32]:
first_row = table_rows[0]
first_row

<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>

In [33]:
# The type is tag
print(type(first_row))

<class 'bs4.element.Tag'>


In [34]:
# We can obtain the child
first_row.td

<td id="flight">Flight No</td>

In [35]:
# If we iterate through the list, each element corresponds to a row in the table:
for i, row in enumerate(table_rows):
  print("row", i, "is", row)

row 0 is <tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>
row 1 is <tr> <td>1</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td> <td>300 kg</td></tr>
row 2 is <tr><td>2</td> <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td> <td>94 kg</td></tr>
row 3 is <tr><td>3</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td> <td>80 kg</td></tr>


As row is a cell object, we can apply the method find_all to it and extract table cells in the object cells using the tag td, this is all the children with the name td. The result is a list, each element corresponds to a cell and is a Tag object, we can iterate through this list as well. We can extract the content using the string attribute.

In [36]:
for i, row in enumerate(table_rows):
    print("row", i)
    cells = row.find_all('td')
    for j, cell in enumerate(cells):
        print('colunm', j, "cell", cell)

row 0
colunm 0 cell <td id="flight">Flight No</td>
colunm 1 cell <td>Launch site</td>
colunm 2 cell <td>Payload mass</td>
row 1
colunm 0 cell <td>1</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td>
colunm 2 cell <td>300 kg</td>
row 2
colunm 0 cell <td>2</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>
colunm 2 cell <td>94 kg</td>
row 3
colunm 0 cell <td>3</td>
colunm 1 cell <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>
colunm 2 cell <td>80 kg</td>


In [37]:
list_input = table_bs.find_all(name=["tr", "td"])
list_input

[<tr><td id="flight">Flight No</td><td>Launch site</td> <td>Payload mass</td></tr>,
 <td id="flight">Flight No</td>,
 <td>Launch site</td>,
 <td>Payload mass</td>,
 <tr> <td>1</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td> <td>300 kg</td></tr>,
 <td>1</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a></a></td>,
 <td>300 kg</td>,
 <tr><td>2</td> <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td> <td>94 kg</td></tr>,
 <td>2</td>,
 <td><a href="https://en.wikipedia.org/wiki/Texas">Texas</a></td>,
 <td>94 kg</td>,
 <tr><td>3</td> <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td> <td>80 kg</td></tr>,
 <td>3</td>,
 <td><a href="https://en.wikipedia.org/wiki/Florida">Florida</a><a> </a></td>,
 <td>80 kg</td>]

### Attributes

If the argument is not recognized it will be turned into a filter on the tag's attributes. For example with the id argument, Beautiful Soup will filter against each tag's id attribute. For example, the first td elements have a value of id of flight, therefore we can filter based on that id value.

In [38]:
table_bs.find_all(id="flight")

[<td id="flight">Flight No</td>]

In [39]:
# We can find all the elements that have links to the Florida Wikipedia page:
list_input = table_bs.find_all(href="https://en.wikipedia.org/wiki/Florida")
list_input

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

In [40]:
# If we set the href attribute to True, regardless of what the value is, the code finds all anchor tags with href value:
table_bs.find_all('a', href=True)

[<a href="https://en.wikipedia.org/wiki/Florida">Florida</a>,
 <a href="https://en.wikipedia.org/wiki/Texas">Texas</a>,
 <a href="https://en.wikipedia.org/wiki/Florida">Florida</a>]

### Exercise: find_All

In [41]:
# Using the logic above, find all the anchor tags without href value
table_bs.find_all('a', href=False)

[<a></a>, <a> </a>]

In [42]:
# Using the soup object soup, find the element with the id attribute content set to "boldest".
soup.find_all(id="boldest")

[<b id="boldest">Lebron James</b>]

### string

In [43]:
# With string you can search for strings instead of tags, where we find all the elments with Florida:
table_bs.find_all(string="Florida")

['Florida', 'Florida']

### find

The find_all() method scans the entire document looking for results. It’s useful if you are looking for one element, as you can use the find() method to find the first element in the document. Consider the following two tables:

In [44]:
%%html
<h3>Rocket Launch </h3>

<p>
<table class='rocket'>
  <tr>
    <td>Flight No</td>
    <td>Launch site</td>
    <td>Payload mass</td>
  </tr>
  <tr>
    <td>1</td>
    <td>Florida</td>
    <td>300 kg</td>
  </tr>
  <tr>
    <td>2</td>
    <td>Texas</td>
    <td>94 kg</td>
  </tr>
  <tr>
    <td>3</td>
    <td>Florida </td>
    <td>80 kg</td>
  </tr>
</table>
</p>
<p>

<h3>Pizza Party</h3>


<table class='pizza'>
  <tr>
    <td>Pizza Place</td>
    <td>Orders</td>
    <td>Slices </td>
   </tr>
  <tr>
    <td>Domino's Pizza</td>
    <td>10</td>
    <td>100</td>
  </tr>
  <tr>
    <td>Little Caesars</td>
    <td>12</td>
    <td >144 </td>
  </tr>
  <tr>
    <td>Papa John's </td>
    <td>15 </td>
    <td>165</td>
  </tr>


0,1,2
Flight No,Launch site,Payload mass
1,Florida,300 kg
2,Texas,94 kg
3,Florida,80 kg

0,1,2
Pizza Place,Orders,Slices
Domino's Pizza,10,100
Little Caesars,12,144
Papa John's,15,165


In [45]:
# We store the HTML as a Python string and assign two_tables:
two_tables="<h3>Rocket Launch </h3> \
<p><table class='rocket'> \
<tr><td>Flight No</td><td>Launch site</td><td>Payload mass</td></tr> \
<tr><td>1</td><td>Florida</td><td>300 kg</td></tr> \
<tr><td>2</td><td>Texas</td><td>94 kg</td></tr> \
<tr><td>3</td><td>Florida </td><td>80 kg</td></tr></table></p>\
<p><h3>Pizza Party</h3> \
<table class='pizza'> \
<tr><td>Pizza Place</td><td>Orders</td><td>Slices </td></tr> \
<tr><td>Domino's Pizza</td><td>10</td><td>100</td></tr> \
<tr><td>Little Caesars</td><td>12</td><td >144 </td></tr> \
<tr><td>Papa John's</td><td>15 </td><td>165</td></tr>"

In [46]:
# We create a BeautifulSoup object two_tables_bs
two_tables_bs = BeautifulSoup(two_tables, 'html.parser')

In [47]:
# We can find the first table using the tag name table
two_tables_bs.find("table")

<table class="rocket"> <tr><td>Flight No</td><td>Launch site</td><td>Payload mass</td></tr> <tr><td>1</td><td>Florida</td><td>300 kg</td></tr> <tr><td>2</td><td>Texas</td><td>94 kg</td></tr> <tr><td>3</td><td>Florida </td><td>80 kg</td></tr></table>

In [48]:
# We can filter on the class attribute to find the second table, but because class is a keyword in Python, we add an underscore to differentiate them.
two_tables_bs.find("table", class_='pizza')

<table class="pizza"> <tr><td>Pizza Place</td><td>Orders</td><td>Slices </td></tr> <tr><td>Domino's Pizza</td><td>10</td><td>100</td></tr> <tr><td>Little Caesars</td><td>12</td><td>144 </td></tr> <tr><td>Papa John's</td><td>15 </td><td>165</td></tr></table>

##Downloading And Scraping The Contents Of A Web Page

In [49]:
url = "http://www.ibm.com"

In [50]:
# We use get to download the contents of the webpage in text format and store in a variable called data:
data = requests.get(url).text

In [51]:
# We create a BeautifulSoup object using the BeautifulSoup constructor
soup = BeautifulSoup(data,"html5lib")        # create a soup object using the variable 'data'

In [52]:
# Scrape all links
for link in soup.find_all('a', href=True):     # in html anchor/link is represented by the tag <a>
  print(link.get('href'))

https://www.ibm.com/hybrid-cloud?lnk=hpUSbt1


###Scrape all images Tags

In [53]:
for link in soup.find_all('img'):      # in html image is represented by the tag <img>
  print(link)
  print(link.get('src'))

Scrape data from HTML tables

In [54]:
# The below url contains an html table with data about colors and color codes.
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

In [55]:
# Before proceeding to scrape a web site, you need to examine the contents and the way data is organized on the website. Open the above url in your browser and check how many rows and columns there are in the color table.
# # get the contents of the webpage in text format and store in a variable called data
data = requests.get(url).text

In [56]:
soup = BeautifulSoup(data, "html5lib")

In [57]:
# find a html table in the web page
table = soup.find('table')  # in html table is represented by the tag <table>

In [58]:
# Get all rows from the table
for row in table.find_all('tr'):  # in html table row represented by tag <tr>
    # Get all columns in each row.
    cols = row.find_all('td')  # in html a column is represented by tag <td>
    color_name = cols[2].string  # store the value in column 3 as color_name
    color_code = cols[3].text  # store the value in column 4 as color_code
    print("{}--->{}".format(color_name, color_code))

Color Name--->Hex Code#RRGGBB
lightsalmon--->#FFA07A
salmon--->#FA8072
darksalmon--->#E9967A
lightcoral--->#F08080
coral--->#FF7F50
tomato--->#FF6347
orangered--->#FF4500
gold--->#FFD700
orange--->#FFA500
darkorange--->#FF8C00
lightyellow--->#FFFFE0
lemonchiffon--->#FFFACD
papayawhip--->#FFEFD5
moccasin--->#FFE4B5
peachpuff--->#FFDAB9
palegoldenrod--->#EEE8AA
khaki--->#F0E68C
darkkhaki--->#BDB76B
yellow--->#FFFF00
lawngreen--->#7CFC00
chartreuse--->#7FFF00
limegreen--->#32CD32
lime--->#00FF00
forestgreen--->#228B22
green--->#008000
powderblue--->#B0E0E6
lightblue--->#ADD8E6
lightskyblue--->#87CEFA
skyblue--->#87CEEB
deepskyblue--->#00BFFF
lightsteelblue--->#B0C4DE
dodgerblue--->#1E90FF


## Scrapping tables from a Web page using Pandas

Particularly for extracting tabular data from a web page, you may also use the read_html() method of the Pandas library.

In [59]:
# The below url contains an html table with data about colors and color codes.
url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBM-DA0321EN-SkillsNetwork/labs/datasets/HTMLColorCodes.html"

In [60]:
# You may extract all the tables from the given webpage simply by using the following commands.
import pandas as pd
tables = pd.read_html(url)
tables

[         0      1               2                 3                     4
 0   Number  Color      Color Name  Hex Code #RRGGBB  Decimal Code (R,G,B)
 1        1    NaN     lightsalmon           #FFA07A      rgb(255,160,122)
 2        2    NaN          salmon           #FA8072      rgb(250,128,114)
 3        3    NaN      darksalmon           #E9967A      rgb(233,150,122)
 4        4    NaN      lightcoral           #F08080      rgb(240,128,128)
 5        5    NaN           coral           #FF7F50       rgb(255,127,80)
 6        6    NaN          tomato           #FF6347        rgb(255,99,71)
 7        7    NaN       orangered           #FF4500         rgb(255,69,0)
 8        8    NaN            gold           #FFD700        rgb(255,215,0)
 9        9    NaN          orange           #FFA500        rgb(255,165,0)
 10      10    NaN      darkorange           #FF8C00        rgb(255,140,0)
 11      11    NaN     lightyellow           #FFFFE0      rgb(255,255,224)
 12      12    NaN    lem

In [61]:
tables[0]

Unnamed: 0,0,1,2,3,4
0,Number,Color,Color Name,Hex Code #RRGGBB,"Decimal Code (R,G,B)"
1,1,,lightsalmon,#FFA07A,"rgb(255,160,122)"
2,2,,salmon,#FA8072,"rgb(250,128,114)"
3,3,,darksalmon,#E9967A,"rgb(233,150,122)"
4,4,,lightcoral,#F08080,"rgb(240,128,128)"
5,5,,coral,#FF7F50,"rgb(255,127,80)"
6,6,,tomato,#FF6347,"rgb(255,99,71)"
7,7,,orangered,#FF4500,"rgb(255,69,0)"
8,8,,gold,#FFD700,"rgb(255,215,0)"
9,9,,orange,#FFA500,"rgb(255,165,0)"
