In [26]:
# import libraries

import pandas as pd
from bs4 import BeautifulSoup
import requests
import time
import datetime
import csv

import smtplib

## Scraping Amazon Page for a Single Product

The code below scrapes a single Amazon page from the given URL, getting the product name, price, and current date. This information is written (or appended) to a csv file and a while loop will check the price daily and append it to the csv file.

In [131]:
# Connect to Website

URL = 'https://www.amazon.com/Henkelion-Carrier-Approved-Carriers-Collapsible/dp/B07JZ31KX9/ref=sr_1_10?crid=3OO1WPMT19XXP&keywords=airplane+pet+carrier&qid=1694806450&sprefix=airplane+pet+carrier%2Caps%2C160&sr=8-10'

# User-Agent from http://httpbin.org/get
headers = {
        'dnt': '1',
        'upgrade-insecure-requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '?1',
        'sec-fetch-dest': 'document',
        'referer': 'https://www.amazon.com/',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',}

page = requests.get(URL, headers = headers)

soup1 = BeautifulSoup(page.content, "lxml")

soup2 = BeautifulSoup(soup1.prettify(), "html.parser")

# print(soup2)

In [27]:
# Get title of the product
title = soup2.find(id = 'productTitle').get_text().strip()
print(title)

# Get price of the product
price = soup2.find_all('span', class_ = 'a-price-whole')[0].get_text().split()[0].strip() + '.' \
        + soup2.find_all('span', class_ = 'a-price-fraction')[0].get_text().strip()
print(price)

#Get today's date
today = datetime.date.today()
print(today)

Henkelion Cat, Dog Carrier for Small Medium Cats Puppies up to 15 Lbs, TSA Airline Approved Carrier Soft Sided, Collapsible Travel Puppy Carrier - Grey
23.99
2023-09-15


In [28]:
# Writes data to a new csv; comment out if don't want it to create a new csv
header = ['Title', 'Price', 'Date']
data = [title, price, today]

with open('AmazonScraperData.csv', 'w', newline = '', encoding = 'UTF8') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    writer.writerow(data)

In [31]:
df = pd.read_csv(r'C:\Users\molly\AmazonScraperData.csv')

df.head()

Unnamed: 0,Title,Price,Date
0,"Henkelion Cat, Dog Carrier for Small Medium Ca...",23.99,2023-09-15


In [None]:
# Append data to the csv

with open('AmazonScraperData.csv', 'a+', newline = '', encoding = 'UTF8') as f:
    writer = csv.writer(f)
    writer.writerow(data)

In [32]:
def check_price():
    URL = 'https://www.amazon.com/Henkelion-Carrier-Approved-Carriers-Collapsible/dp/B07JZ31KX9/ref=sr_1_10?crid=3OO1WPMT19XXP&keywords=airplane+pet+carrier&qid=1694806450&sprefix=airplane+pet+carrier%2Caps%2C160&sr=8-10'

    headers = {
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
            'sec-fetch-dest': 'document',
            'referer': 'https://www.amazon.com/',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',}

    page = requests.get(URL, headers = headers)

    soup1 = BeautifulSoup(page.content, "lxml")

    soup2 = BeautifulSoup(soup1.prettify(), "html.parser")
    
    title = soup2.find(id = 'productTitle').get_text().strip()

    price = soup2.find_all('span', class_ = 'a-price-whole')[0].get_text().split()[0].strip() + '.' \
            + soup2.find_all('span', class_ = 'a-price-fraction')[0].get_text().strip()

    today = datetime.date.today()
    
    header = ['Title', 'Price', 'Date']
    data = [title, price, today]
    
    with open('AmazonScraperData.csv', 'a+', newline = '', encoding = 'UTF8') as f:
        writer = csv.writer(f)
        writer.writerow(data)

In [None]:
# Checks the price every day for single product
while(True):
    check_price()
    time.sleep(86400)

## Scraping Amazon Best Sellers under Pet Supplies 

The code below scrapes the best sellers for pet supplies on Amazon based on the number of pages we want to scrape. It gets the product name, rating, number of ratings, and price.

In [33]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [101]:
def get_data(pageNo):
    URL = 'https://www.amazon.com/Best-Sellers-Pet-Supplies/zgbs/pet-supplies/ref=zg_bs_pg_' + str(pageNo) + '_pet-supplies?_encoding=UTF8&pg=' + str(pageNo)

    headers = {
            'dnt': '1',
            'upgrade-insecure-requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0',
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'sec-fetch-site': 'same-origin',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-user': '?1',
            'sec-fetch-dest': 'document',
            'referer': 'https://www.amazon.com/',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8',}

    page = requests.get(URL, headers = headers)

    soup = BeautifulSoup(page.content, "lxml")
    
    result = []
    for d in soup.find_all('div', attrs={'id': 'gridItemRoot'}):
        name = d.find_all(attrs={'class': 'a-link-normal'})[1]
        name = name.find('div')
        
        rating = d.find('span', attrs={'class': 'a-icon-alt'})
        
        usersrated = d.find('span', attrs={'class': 'a-size-small'})
        
        price = d.find('span', attrs={'class': 'a-color-price'})
        
        
        data = []
        if name is not None:
            data.append(name.get_text().strip())
        else:
            data.append('unknown-product')
        if rating is not None:
            data.append(rating.get_text().split()[0])
        else:
            data.append('-1')
        if usersrated is not None:
            data.append(usersrated.get_text().replace(',', ''))
        else:
            data.append('0')
        if price is not None:
            data.append(price.find('span').get_text().strip()[1:])
        else:
            data.append('0')
#         print(data)
        result.append(data)
    return result                               

In [106]:
no_pages = 10
results = []
for i in range(1, no_pages+1):
    results.append(get_data(i))

# Flattens the list of lists into a single list
flatten = lambda l: [item for sublist in l for item in sublist]

df = pd.DataFrame(flatten(results), columns = ['product_name', 'rating', 'customers_rated', 'price'])
df.to_csv('amazon_products.csv', index = False, encoding = 'utf-8')

In [107]:
# Read the csv file we saved

df = pd.read_csv('amazon_products.csv')

In [108]:
df.shape

(240, 4)

In [109]:
df.head()

Unnamed: 0,product_name,rating,customers_rated,price
0,"Earth Rated Dog Poop Bags, New Look, Guarantee...",4.8,132770,13.72
1,Amazon Basics Dog and Puppy Pee Pads with Leak...,4.5,217908,28.59
2,"Dr. Elsey's Precious Cat Ultra Cat Litter, 18 ...",4.3,69955,11.69
3,Purina Fancy Feast Gravy Lovers Poultry and Be...,4.7,81899,23.2
4,TEMPTATIONS Classic Crunchy and Soft Cat Treat...,4.8,97541,8.48


In [111]:
# Convert numeric columns to numeric 

df['rating'] = pd.to_numeric(df['rating'])
df['customers_rated'] = pd.to_numeric(df['customers_rated'])
df['price'] = pd.to_numeric(df['price'])

df.dtypes

product_name        object
rating             float64
customers_rated      int64
price              float64
dtype: object

In [112]:
# Replace 0 values with NaN

df.replace(str(0), np.nan, inplace = True)
df.replace(0, np.nan, inplace = True)

In [113]:
#Find number of NaNs in the df

print(len(df) - df.count())

product_name       0
rating             0
customers_rated    0
price              5
dtype: int64


There are 5 prices that aren't available which will be removed

In [115]:
df = df.dropna()
df.shape

(235, 4)

In [None]:
# Remove duplicates if there are any

df.drop_duplicates(keep = 'first', inplace = True)

In [127]:
# Find the highest priced items

data = df.sort_values(['price'], axis = 0, ascending = False)[:20]
data
# ax = sns.scatterplot(
#     data,
#     x = data['product_name'],
#     y = data['price']
# )
# ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
# plt.show()

Unnamed: 0,product_name,rating,customers_rated,price
78,"Hill's Science Diet Dry Dog Food, Adult, Sensi...",4.7,17373,79.99
221,Hill's Science Diet Adult Large Breed Dry Dog ...,4.8,7049,78.99
191,Blue Buffalo Wilderness High Protein Natural A...,4.6,660,78.17
223,Wellness Complete Health Dry Dog Food with Gra...,4.5,7565,69.98
19,"PetSafe ScoopFree Crystal Litter Tray Refills,...",4.6,35972,69.95
138,Purina Pro Plan High Protein Dog Food With Pro...,4.7,10234,69.48
33,Purina Pro Plan High Protein Dog Food With Pro...,4.7,18428,69.48
76,Seresto Large Dog Vet-Recommended Flea & Tick ...,4.5,96135,67.98
11,Blue Buffalo Life Protection Formula Natural A...,4.7,33625,64.98
44,Advantage II Large Cat Vet-Recommended Flea Tr...,4.5,66349,63.98


In [129]:
# Find top rated items with over 1000 customer ratings

data = df[df['customers_rated'] > 1000]
data = data.sort_values(['rating'], axis = 0, ascending = False)[:20]
data

Unnamed: 0,product_name,rating,customers_rated,price
0,"Earth Rated Dog Poop Bags, New Look, Guarantee...",4.8,132770,13.72
67,"Purina ONE Natural, Low Fat, Weight Control, I...",4.8,21192,33.92
48,"FELINE GREENIES Adult Dental Cat Treats, Oven ...",4.8,52864,10.98
38,GREENIES Original Large Natural Dog Dental Car...,4.8,23967,39.28
36,IAMS PROACTIVE HEALTH Adult Indoor Weight Cont...,4.8,35772,18.98
158,"Hill's Science Diet Dry Dog Food, Adult, Small...",4.8,11033,18.99
34,TEMPTATIONS MIXUPS Crunchy and Soft Cat Treats...,4.8,42614,8.48
159,"Hill's Science Diet Dry Cat Food, Adult, Perfe...",4.8,9108,37.49
163,Vceoa Carriers Soft-Sided Pet Carrier for Cats,4.8,26878,19.99
169,"Canine Carry Outs Beef Flavor Dog Treats, 47 O...",4.8,8571,9.98


In [130]:
# Find which products have the most ratings

data = data.sort_values(['customers_rated'], axis = 0, ascending = False)[:20]
data

Unnamed: 0,product_name,rating,customers_rated,price
0,"Earth Rated Dog Poop Bags, New Look, Guarantee...",4.8,132770,13.72
8,TEMPTATIONS Classic Crunchy and Soft Cat Treat...,4.8,67454,15.48
13,"Milk-Bone Original Dog Biscuits, Medium Crunch...",4.8,56102,14.98
48,"FELINE GREENIES Adult Dental Cat Treats, Oven ...",4.8,52864,10.98
34,TEMPTATIONS MIXUPS Crunchy and Soft Cat Treats...,4.8,42614,8.48
55,Best Pet Supplies Dog Poop Bags for Waste Refu...,4.8,40039,6.99
73,"FELINE GREENIES Adult Dental Cat Treats, Savor...",4.8,37164,10.98
36,IAMS PROACTIVE HEALTH Adult Indoor Weight Cont...,4.8,35772,18.98
46,GREENIES Original Petite Natural Dog Dental Ca...,4.8,31047,37.87
163,Vceoa Carriers Soft-Sided Pet Carrier for Cats,4.8,26878,19.99
