#### Beautiful Soup Crash Course

##### Note: The lxml parsing library was installed separately via pip (`pip install lxml`)

In [1]:
from bs4 import BeautifulSoup
import requests
import csv

#### Option #1 - reading an HTML file saved to the repo folder

In [2]:
# Read the saved page into memory first; the handle is only needed for the read.
with open('ComputerScraperTest.htm', 'r') as html_file:
    content = html_file.read()

# Parse the markup with the lxml parser once the file has been closed.
soup = BeautifulSoup(content, 'lxml')

#### Option #2 (preferred) - using the Requests library to fetch a remote HTML web page

In [3]:
# Fetch the live page. The timeout stops this cell from hanging forever
# if the server never responds.
page_to_scrape = requests.get(
    "https://webscraper.io/test-sites/e-commerce/allinone/computers/laptops",
    timeout=30,
)
# Fail loudly on a 4xx/5xx response instead of silently parsing an error page.
page_to_scrape.raise_for_status()

# Parse the returned HTML with the lxml parser.
remote_soup = BeautifulSoup(page_to_scrape.text, 'lxml')

#### Preparing CSV file for writing
* The `newline=''` parameter prevents a blank line from appearing between records (the csv module writes its own line endings)

In [4]:
# Open the output file for the rest of the notebook; it is closed in the final cell.
# - newline='' lets the csv module control line endings (avoids blank rows).
# - encoding is pinned to UTF-8 so scraped text containing non-ASCII characters
#   is written the same way on every platform instead of depending on the
#   locale's default encoding.
file = open("computer_listings.csv", "w", newline='', encoding="utf-8")
writer = csv.writer(file)

# Header row; the extraction loop writes its columns in this same order.
writer.writerow(["Name","Description","Price","Average_Rating","Number_of_Reviews"])

57

#### Desired Data:
* Product Name - a tag with class title, title attribute holds name
* Product Price - h4 tag with class "pull-right price"
* Product Description - p tag class description, inner text holds description
* Average Rating - p tag with data-rating attribute, holds string version of rating
* Number of Reviews - p tag with class "pull-right" holds number of reviews.

#### Test - attempting to pull product names

In [5]:
# Quick sanity check: list the visible link text of every product title.
# find_all is the current method name (findAll is a deprecated alias), and it
# matches the call used for the product cards later in the notebook.
product_names = remote_soup.find_all("a", attrs={"class":"title"})
for item in product_names:
    # The link text is the truncated display name; the full name lives in
    # the tag's title attribute.
    print(item.text)

Asus VivoBook X4...
Prestigio SmartB...
Prestigio SmartB...
Aspire E1-510
Lenovo V110-15IA...
Lenovo V110-15IA...
Hewlett Packard...
Acer Aspire 3 A3...
Acer Aspire A315...
Acer Aspire ES1-...
Acer Aspire 3 A3...
Acer Aspire 3 A3...
Asus VivoBook Ma...
Asus VivoBook E5...
Lenovo ThinkPad...
Acer Aspire 3 A3...
Lenovo V110-15IS...
Acer Aspire ES1-...
Asus VivoBook 15...
Packard 255 G2
Asus EeeBook R41...
Acer Aspire 3 A3...
Acer Aspire ES1-...
Acer Extensa 15...
Acer Aspire ES1-...
Lenovo V110-15IS...
Acer Aspire A315...
Lenovo V110-15IK...
Asus VivoBook 15...
Acer Aspire ES1-...
Lenovo V510 Blac...
Acer Aspire ES1-...
Lenovo V510 Blac...
Acer Swift 1 SF1...
Dell Vostro 15
Acer Aspire 3 A3...
Dell Vostro 15 (...
Lenovo V510 Blac...
HP 250 G3
Acer Spin 5
HP 350 G1
Aspire E1-572G
Pavilion
Acer Aspire A515...
Dell Inspiron 15
Asus VivoBook S1...
ProBook
Inspiron 15
Asus ROG STRIX G...
Acer Nitro 5 AN5...
Asus ROG STRIX G...
Lenovo ThinkPad...
ThinkPad Yoga
Lenovo ThinkPad...
Dell Inspiron 

#### Pulling all product cards, and saving to a variable
* All product cards live in a div with class "thumbnail"

In [6]:
# Collect every product card, i.e. every <div> whose CSS class is "thumbnail".
# class_ is Beautiful Soup's keyword for filtering on the class attribute;
# passing attrs={"class": "thumbnail"} is an equivalent spelling.
product_cards = remote_soup.find_all('div', class_="thumbnail")

#### Drilling down into cards to identify key pieces, then writing to CSV file
* Using structure ["Name","Description","Price","Average_Rating","Number_of_Reviews"]

In [7]:
# Write one CSV row per product card, in header order:
# Name, Description, Price, Average_Rating, Number_of_Reviews.
for card in product_cards:
    # Locate the tags that carry each field.
    name_link = card.find('a', class_="title")
    description_tag = card.find('p', class_="description")
    price_tag = card.find("h4", class_="pull-right price")
    reviews_tag = card.find('p', class_="pull-right")
    # Passing True as the value matches any <p> that has a data-rating attribute.
    rating_tag = card.find("p", {"data-rating": True})

    # Pull the values out of the tags.
    name = name_link['title']                  # full name is in the title attribute
    blurb = description_tag.text
    cost = price_tag.text
    review_count = reviews_tag.text.split()[0]  # first whitespace-separated token
    stars = rating_tag["data-rating"]

    writer.writerow([name, blurb, cost, stars, review_count])

#### Closing the CSV file

In [8]:
# Close the handle opened in the CSV setup cell so all buffered rows are
# flushed to computer_listings.csv. Re-running the writing cells after this
# point requires re-running the setup cell first.
file.close()