# Nike Web Scraping Project with BeautifulSoup

In [1]:
# Import the libraries that we need for this project

from bs4 import BeautifulSoup
import requests
import smtplib
import time
import datetime

In [2]:
# Product page to scrape (nike.com/ca listing for the Liverpool F.C. Strike jacket)
URL = "https://www.nike.com/ca/t/liverpool-fc-strike-football-jacket-kdH9Cz/DB6861-376"

In [3]:
# Request headers that make the scraper look like a regular browser.
# The User-Agent value was taken from the headers echoed back by httpbin.org/get.
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.109 Safari/537.36",
}


In [4]:
# Request the product page HTML.
# The timeout stops the cell from hanging indefinitely if the server never answers,
# and raise_for_status() fails immediately on an HTTP error (4xx/5xx) instead of
# letting the following cells try to parse an error page.
page = requests.get(URL, headers=headers, timeout=30)
page.raise_for_status()

In [5]:
# Parse the raw response body into a BeautifulSoup tree with Python's built-in HTML parser
bs1 = BeautifulSoup(markup=page.content, features='html.parser')

# Show the parsed document
print(bs1)


<!DOCTYPE html>

<!-- segmenter ran at 2022-03-29T16:28:48.002Z -->
<script>
if (!(() =>
            document.cookie.includes('audience_segmentation_performed'))([])) {
  (fragmentPath => {
  const error = new Error(
    // eslint-disable-next-line prefer-template
    'ESI fragment ' + fragmentPath + ' requested, but not found in client',
  );
  // eslint-disable-next-line no-undef
  if (window.newrelic) {
    console.warn(error);
    // eslint-disable-next-line no-undef
    newrelic.noticeError(error, {
      type: 'ESI_LOAD_ERROR',
      fragment: fragmentPath,
    });
  } else {
    throw error; // let another monitoring system catch the error.
  }
})('https://www.nike.com/fragments/audience')
} else {
  (fragmentPath => {
  // eslint-disable-next-line no-undef
  if (window.newrelic) {
    // eslint-disable-next-line no-undef
    newrelic.addPageAction('ESI_LOAD', { fragment: fragmentPath });
  }
})('https://www.nike.com/fragments/audience')
}
</script>
<html lang="en-CA">
<head>
<

In [6]:
# Render the tree with prettify() (BeautifulSoup's indented layout) and parse that
# rendering again, so the tree we search below is built from the readable markup
pretty_markup = bs1.prettify()
bs2 = BeautifulSoup(pretty_markup, 'html.parser')

print(bs2)

<!DOCTYPE html>

<!-- segmenter ran at 2022-03-29T16:28:48.002Z -->
<script>
 if (!(() =>
            document.cookie.includes('audience_segmentation_performed'))([])) {
  (fragmentPath => {
  const error = new Error(
    // eslint-disable-next-line prefer-template
    'ESI fragment ' + fragmentPath + ' requested, but not found in client',
  );
  // eslint-disable-next-line no-undef
  if (window.newrelic) {
    console.warn(error);
    // eslint-disable-next-line no-undef
    newrelic.noticeError(error, {
      type: 'ESI_LOAD_ERROR',
      fragment: fragmentPath,
    });
  } else {
    throw error; // let another monitoring system catch the error.
  }
})('https://www.nike.com/fragments/audience')
} else {
  (fragmentPath => {
  // eslint-disable-next-line no-undef
  if (window.newrelic) {
    // eslint-disable-next-line no-undef
    newrelic.addPageAction('ESI_LOAD', { fragment: fragmentPath });
  }
})('https://www.nike.com/fragments/audience')
}
</script>
<html lang="en-CA">
<head>
<

In [7]:
# Pull the product title, subtitle and current price out of the parsed page.
# The subtitle and price are located by tag name, CSS class string, and the
# presence of a data-test attribute (True matches any value).
subtitle_filter = {"class": "headline-5 pb1-sm d-sm-ib", "data-test": True}
price_filter = {"class": "product-price css-11s12ax is--current-price css-tpaepq", "data-test": True}

title = bs2.find(id='pdp_product_title').get_text()
subtitle = bs2.find("h2", subtitle_filter).get_text()
price = bs2.find("div", price_filter).get_text()

# The raw text still carries the surrounding whitespace from the markup
for scraped_text in (title, subtitle, price):
    print(scraped_text)


            Liverpool F.C. Strike
           

            Men's Nike Therma-FIT Football Jacket
           

               $242
              


In [8]:
# Remove the surrounding white space and the leading currency symbol.
title = title.strip()
subtitle = subtitle.strip()
# lstrip('$') is used instead of slicing with [1:]: the slice removed one character
# on every run, so re-running this cell turned "242" into "42". Stripping the "$"
# gives the same result the first time and is safe to repeat.
price = price.strip().lstrip('$')

print(title)
print(subtitle)
print(price)

Liverpool F.C. Strike
Men's Nike Therma-FIT Football Jacket
242


In [9]:
# Record the local date on which the data was pulled
today = datetime.date.today()

# str() of a date is its ISO form (YYYY-MM-DD), so this prints the same text as print(today)
print(today.isoformat())

2022-03-29


In [10]:
# Prepare the CSV column names and the matching row of scraped values
import csv

header = [
    'Title',
    'Subtitle',
    'Price',
    'ScrapeDate',
]

# csv.writer.writerow takes one sequence per row, so the separate values are gathered into a list
data = [
    title,
    subtitle,
    price,
    today,
]

In [11]:
# Create (or overwrite) the CSV file: header row first, then the first data row.
# The header is only written here, on the initial creation of the file.
with open('NikeBSdata.csv', 'w', newline='', encoding='UTF8') as csv_file:
    csv.writer(csv_file).writerows([header, data])

In [12]:
# Use pandas within the notebook to check the csv file
import pandas as pd

# Read the same relative path the CSV was written to, instead of an absolute
# path that only exists on the original author's machine.
df = pd.read_csv('NikeBSdata.csv')

df

Unnamed: 0,Title,Subtitle,Price,ScrapeDate
0,Liverpool F.C. Strike,Men's Nike Therma-FIT Football Jacket,242,2022-03-29


In [13]:
# Append one more data row to the existing CSV (no header row this time)
with open('NikeBSdata.csv', 'a+', newline='', encoding='UTF8') as csv_file:
    csv.writer(csv_file).writerow(data)

In [14]:
# Doing all of this by hand is tedious, so we wrap the scrape-and-append steps in a function

def _element_text(soup, label, *find_args, **find_kwargs):
    """Return the stripped text of the first element matching the find() arguments.

    Raises ValueError naming `label` when nothing matches, instead of the
    AttributeError that calling .get_text() on None would produce.
    """
    element = soup.find(*find_args, **find_kwargs)
    if element is None:
        raise ValueError(
            f"Could not find the product {label} on the page; the page layout may have changed."
        )
    return element.get_text().strip()


def pricecheck(url='https://www.nike.com/ca/t/liverpool-fc-strike-football-jacket-kdH9Cz/DB6861-376',
               csv_path='NikeBSdata.csv',
               timeout=30):
    """Scrape a product page and append one row (title, subtitle, price, date) to a CSV.

    Calling pricecheck() with no arguments scrapes the same page and appends to the
    same file as before.

    Args:
        url: Product page to request.
        csv_path: CSV file to append to. If the file is missing or empty, the header
            row is written first so the file can be read back with column names.
        timeout: Seconds to wait for the server before the request is abandoned.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If the title, subtitle or price element is not found.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.109 Safari/537.36"}

    # The timeout keeps a stalled connection from blocking the daily loop forever;
    # raise_for_status() stops us from parsing an error page as if it were the product page.
    page = requests.get(url, headers=headers, timeout=timeout)
    page.raise_for_status()

    # The prettified markup is parsed again, as in the exploratory cells, so the
    # extracted text has the same whitespace handling as rows written earlier.
    bs1 = BeautifulSoup(page.content, 'html.parser')
    bs2 = BeautifulSoup(bs1.prettify(), 'html.parser')

    title = _element_text(bs2, 'title', id='pdp_product_title')
    subtitle = _element_text(
        bs2, 'subtitle', "h2", {"class": "headline-5 pb1-sm d-sm-ib", "data-test": True}
    )
    # lstrip('$') removes only the currency symbol; slicing with [1:] would drop a
    # digit if the symbol were ever absent.
    price = _element_text(
        bs2, 'price', "div",
        {"class": "product-price css-11s12ax is--current-price css-tpaepq", "data-test": True},
    ).lstrip('$')

    today = datetime.date.today()

    header = ['Title', 'Subtitle', 'Price', 'ScrapeDate']
    data = [title, subtitle, price, today]

    with open(csv_path, 'a+', newline='', encoding='UTF8') as f:
        writer = csv.writer(f)
        # Append mode positions the stream at the end of the file, so a position
        # of 0 means the file is new or empty and still needs its header row.
        if f.tell() == 0:
            writer.writerow(header)
        writer.writerow(data)

In [15]:
# Run the price check automatically once a day.
# The loop never ends on its own; interrupt the kernel to stop it.
SECONDS_PER_DAY = 86400

while True:
    pricecheck()
    time.sleep(SECONDS_PER_DAY)

KeyboardInterrupt: 

In [17]:
# Re-read the CSV to check the rows appended by pricecheck().
# The relative path matches the file the rows were written to, instead of an
# absolute path that only exists on the original author's machine.
df = pd.read_csv('NikeBSdata.csv')

df

Unnamed: 0,Title,Subtitle,Price,ScrapeDate
0,Liverpool F.C. Strike,Men's Nike Therma-FIT Football Jacket,242,2022-03-29
1,Liverpool F.C. Strike,Men's Nike Therma-FIT Football Jacket,242,2022-03-29
2,Liverpool F.C. Strike,Men's Nike Therma-FIT Football Jacket,242,2022-03-29
