In [None]:
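# Install needed dependencies (uncomment and run once if they are missing).
# %pip installs into the environment of the running kernel.
# %pip install lxml
# %pip install requests
# %pip install beautifulsoup4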

In [None]:
# Import dependencies
from bs4 import BeautifulSoup
import requests

In [None]:
# Specify the URL to get data from
url = "https://webscraper.io/test-sites/e-commerce/allinone/computers/laptops"

# Fetch the page over HTTP and keep the response object.
# requests applies no timeout by default, so one is passed explicitly;
# without it a stalled server would hang this cell indefinitely.
page = requests.get(url, timeout=10)

# Show the HTTP status code of the response (200 means success)
page.status_code

In [None]:
# Display the response body decoded to text (the raw HTML of the page)
page.text

In [None]:
# Parse the downloaded HTML into a tree of tags with Python's built-in "html.parser"
soup = BeautifulSoup(page.text, features="html.parser")

# Render the tree as indented text and print it
pretty_html = soup.prettify()
print(pretty_html)

### Navigate the tree
- Tag (name, attributes)
- NavigableString (the text inside a tag)
- BeautifulSoup (the object created above; it represents the parsed document as a whole and can be treated like a Tag object)
- Comments and special characters

#### Tag

In [None]:
# Using a tag name as an attribute returns the first tag with that name: here, <title>
title_tag = soup.title

# Print the tag itself and, on the next line, the Python type that represents it
print(title_tag, f"Type: {type(title_tag)}", sep="\n")

In [None]:
# Tag name: the element type as a string ("title" for the <title> tag)
title_tag.name

In [None]:
# Name of the tag that directly encloses <title> (normally "head" in an HTML document)
title_tag.parent.name

In [None]:
# Grab the first <header> element via attribute-style access; on this page it is
# the top navigation header. soup.header is None if the document has no <header>.
header_tag = soup.header
header_tag

In [None]:
# Index a tag like a dictionary to read one attribute; "class" can hold several
# values, so Beautiful Soup returns it as a list of class names
header_tag['class']

In [None]:
# .attrs exposes all attributes of the tag as a dictionary (attribute name -> value)
header_tag.attrs

- Multi-valued attributes such as 'class' (shown above) are returned as a list.
- Tag attributes can be modified like dictionary entries (e.g. `tag['class'] = 'new-class'` to set a value, `del tag['class']` to remove the attribute).

In [None]:
# get_attribute_list() always returns a list, whether the attribute holds one value or several
header_tag.get_attribute_list("class")

#### NavigableString

In [None]:
# The text contained in the tag (.string is None when a tag has more than one child)
title_tag.string

In [None]:
# The text is a NavigableString: Beautiful Soup's str subclass for text inside the tree
type(title_tag.string)

In [None]:
# Attribute-style access returns the first <div> in the document;
# on this page that is the container holding the top navigation
soup.div

In [None]:
# CSS classes of the first <div>, returned as a list of class names
soup.div['class']

In [None]:
# Chain tag names to drill down: the first <p> inside the first <header>
soup.header.p

In [None]:
# Text of the first paragraph in the header (None if that <p> has more than one child)
soup.header.p.string

### Searching the tree
- find_all(): looks through a tag’s descendants and retrieves all descendants that match your filters
- find(): returns only the first matching result (or None when nothing matches)

In [None]:
# limit=1 stops the search after the first match; the result is still a list
soup.find_all('title', limit=1)

In [None]:
# Same search as find_all('title', limit=1), but find() returns the tag itself
# (or None when nothing matches) rather than a one-element list
soup.find("title")

In [None]:
# The second positional argument is an attribute filter. A class value containing
# a space is compared with the tag's full class string, so the order of the class
# names matters: "test-site container" would not match.
soup.find('div', {'class':'container test-site'})

In [None]:
# First <h4> whose class attribute is exactly "pull-right price"
soup.find('h4', {'class':'pull-right price'})

In [None]:
# A plain string filter is matched against tag names exactly,
# so this collects every <a> tag in the document
all_anchor_tags = soup.find_all('a')
all_anchor_tags

In [None]:
# find_all() also accepts a compiled regular expression; Beautiful Soup tests
# each tag name against it using the pattern's search() method.

import re

# "^b" is anchored at the start, so only tag names beginning with "b" match
starts_with_b = re.compile("^b")
for tag in soup.find_all(starts_with_b):
    print(tag.name)

In [None]:
# An unanchored pattern can match anywhere in the name,
# so this lists every tag whose name contains the letter 't'
contains_t = re.compile("t")
for tag in soup.find_all(contains_t):
    print(tag.name)

In [None]:
# A list filter matches any one of its items: every <a> tag and every <b> tag
soup.find_all(["a", "b"])

In [None]:
# True matches every tag in the tree while skipping the text strings
every_tag = soup.find_all(True)
for tag in every_tag:
    print(tag.name)

In [None]:
# Define a function that takes an element as its only argument.
# The function should return True if the argument matches and False otherwise;
# it can then be passed to find_all() as a filter.

In [None]:
def has_class_but_no_id(tag):
    """Filter for find_all(): accept tags that have a class but no id.

    Args:
        tag: An object exposing ``has_attr(name)``, such as a Beautiful Soup Tag.

    Returns:
        True when the tag has a ``class`` attribute and no ``id`` attribute,
        False otherwise.
    """
    # Any tag carrying an id is rejected outright
    if tag.has_attr('id'):
        return False
    return tag.has_attr('class')

In [None]:
# Pass the function itself as the filter: find_all() calls it on each tag
# and keeps the tags for which it returns True
soup.find_all(has_class_but_no_id)

In [None]:
# Search by CSS class. The keyword is class_ (trailing underscore) because "class"
# is a reserved word in Python; this matches <p> tags that have the class "pull-right".
soup.find_all("p", class_="pull-right")

In [None]:
# find_all() returns a list, so the result can be sliced: skip the first six
# <h4 class="pull-right price"> matches and keep the rest
soup.find_all('h4', {'class':'pull-right price'})[6:]

In [None]:
# A regular expression works as a class filter too:
# at most three <p> tags having a class that contains "pull"
soup.find_all('p', class_ = re.compile('pull'), limit = 3)

In [None]:
# string= searches the text strings instead of the tags;
# a list matches any string that equals one of its items exactly
soup.find_all(string = ['Cloud Scraper', 'Tutorials'])

#### Comment object
- A Comment is just a special type of NavigableString:

In [None]:
# Parse a tiny document whose <b> tag contains nothing but an HTML comment
markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>"
soup1 = BeautifulSoup(markup, 'html.parser')
# .string on that tag yields the comment text, represented as a Comment object
comment = soup1.b.string
type(comment)