# BeautifulSoup Tutorial

## basic-level web scraping exercises

we will master beautifulsoup with concrete exercises

In [26]:
# import the necessary libraries
import requests
from bs4 import BeautifulSoup

### 1. Extract the title of a webpage.

In [27]:
# create a variable containing the URL of the desired website
url = "https://www.github.com"

# send a request to the website and store the response
response = requests.get(url)

# parse the HTML content of the website using BeautifulSoup
soup = BeautifulSoup(response.text, "html.parser")

# extract the title of the website from the parsed HTML
title = soup.title.string

# print the title of the website
print(title)

GitHub: Let’s build from here · GitHub


### 2. Extract all the links from a webpage.

In [51]:
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
links = soup.find_all("a")

# for link in links:
#     print(link.get("href"))
print(len(links))

109


### 3. Extract all the images from a webpage.

In [29]:
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
images = soup.find_all("img")

# for image in images:
#     print(image.get("src"))
print(len(images))

53


### 4. Extract all the text from a webpage.

In [30]:
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
text = soup.get_text()

print(len(text))

10610


### 5. Extract the first paragraph of a webpage.

In [33]:
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
paragraph = soup.p.string

print(paragraph.strip())


Harnessed for productivity. Designed for collaboration. Celebrated for built-in security. Welcome to the platform developers love.


### 6. Extract all the headers from a webpage.

In [32]:
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
headers = soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"])

# for header in headers:
#     print(header.string)
print(len(headers))

19


### 7. Extract all the tables from a webpage. 

In [37]:
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
tables = soup.find_all("table")

# for table in tables:
#     print(table)
len(tables)

0

### 8. Extract all the list items from a webpage.

In [43]:
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
lists = soup.find_all("li")

# for li in lists:
#     if li.string is not None:
#         print(li.string.strip())

print(len(lists))

92


### 9. Extract all the divs with a specific class from a webpage.

In [46]:
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
divs = soup.find_all("div", class_="d-none")

# for div in divs:
#     print(div)
print(len(divs))

19


### 10. Extract all the paragraphs with a specific attribute from a webpage.

In [52]:
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
paragraphs = soup.find_all("p", attrs={"class": "f1-mktg"})
# for p in paragraphs:
#     print(p.string)
print(len(paragraphs))

1


## intermediate-level web scraping exercises

### 1. Scraping table data from a webpage using Beautiful Soup:

In [73]:
# the URL of the webpage containing the table
url = 'https://www.w3schools.com/html/html_tables.asp'

# make a GET request to the webpage
response = requests.get(url)

# parse the HTML content using Beautiful Soup
soup = BeautifulSoup(response.content, 'html.parser')

# find the table element by its ID attribute
table = soup.find(id='customers')

# find all rows of the table (excluding the header row)
rows = table.find_all('tr')
# print the header
print('\t'.join([col.text.strip().ljust(30) for col in rows[0].find_all('th')]))
print('\t'.join(['**************'.rjust(30) for col in rows[0].find_all('th')]))
# loop through each row and extract the data from the columns
for row in rows[1:]:
    # get the column values as a list
    columns = row.find_all('td')
    # print the values of each column separated by a tab
    print('\t'.join([col.text.strip().ljust(30) for col in columns]))


Company                       	Contact                       	Country                       
                **************	                **************	                **************
Alfreds Futterkiste           	Maria Anders                  	Germany                       
Centro comercial Moctezuma    	Francisco Chang               	Mexico                        
Ernst Handel                  	Roland Mendel                 	Austria                       
Island Trading                	Helen Bennett                 	UK                            
Laughing Bacchus Winecellars  	Yoshi Tannamuri               	Canada                        
Magazzini Alimentari Riuniti  	Giovanni Rovelli              	Italy                         


### 2. Scraping data from multiple pages using a loop:

In [None]:
# the base URL of the webpage
url = 'https://www.goodreads.com/quotes/tag/love?page='

# loop through page numbers 1 to 3
for page in range(1, 3):
    # construct the full URL of the page
    page_url = url + str(page)
    # make a GET request to the page
    response = requests.get(page_url)
    # parse the HTML content using Beautiful Soup
    soup = BeautifulSoup(response.content, 'html.parser')
    # find the elements containing the data you want
    # and do something with them (e.g., print)
    # print(soup.title.string)
    s = soup.select('.leftContainer > .mediumText')
    print(s[0].text.strip().split('\n')[1])


Showing 1-30 of 90,391
Showing 31-60 of 90,391


### 3. Scraping data from nested HTML elements:

In [101]:
# the URL of the webpage containing the nested elements
url = 'https://www.goodreads.com/quotes'

# make a GET request to the webpage
response = requests.get(url)

# parse the HTML content using Beautiful Soup
soup = BeautifulSoup(response.content, 'html.parser')

# find the parent element that contains the nested elements
parent = soup.find('div', class_='quote')

# find the nested elements within the parent element
nested_elements = parent.find_all('div', class_='quoteText')
print(sou)
# loop through the nested elements and extract the data
for nested_element in nested_elements:
    # extract the data from the nested element and do something with it
    # ...
    pass


<div class="quote">
<div class="quoteDetails">
<a class="leftAlignedImage" href="/author/show/3565.Oscar_Wilde">
<img alt="Oscar Wilde" src="https://images.gr-assets.com/authors/1673611182p2/3565.jpg"/>
</a>
<div class="quoteText">
      “Be yourself; everyone else is already taken.”
  <br/>  ―
  <span class="authorOrTitle">
    Oscar Wilde
  </span>
</div>
<div class="quoteFooter">
<div class="greyText smallText left">
     tags:
       <a href="/quotes/tag/attributed-no-source">attributed-no-source</a>,
       <a href="/quotes/tag/be-yourself">be-yourself</a>,
       <a href="/quotes/tag/gilbert-perreira">gilbert-perreira</a>,
       <a href="/quotes/tag/honesty">honesty</a>,
       <a href="/quotes/tag/inspirational">inspirational</a>,
       <a href="/quotes/tag/misattributed-oscar-wilde">misattributed-oscar-wilde</a>,
       <a href="/quotes/tag/quote-investigator">quote-investigator</a>
</div>
<div class="right">
<a class="smallText" href="/quotes/19884-be-yourself-everyone-else-