Amazon Web Scraping Using Python

In [14]:
# import libraries
# smtplib is used to send emails to yourself

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import datetime
import smtplib


In [15]:
# we have to tell beautiful soup where to get the data from

URL = 'https://www.amazon.ca/Analyst-Definition-Scientist-Computer-Science/dp/B0CG2L51GZ'

# we will need headers
# get user-agent

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1"
}

# we are requesting -> pulling the data from the webpage and using the libraries
page = requests.get(URL, headers=headers)

# we will now use the soup library
soup1 = BeautifulSoup(page.content, "html.parser")

# this will pull all the html on the webpage

#print(soup1)


In [16]:
# soup.pretify function will make the html that's being pulled look 'pretty'
# better format

soup2 = BeautifulSoup(soup1.prettify(), 'html.parser')

#print(soup2)

In [17]:
# let's try to get the title of the listing item
#use strip to get rid of trailing and leading spaces

title = soup2.find(id='productTitle').get_text().strip()

# let's also get the price of the item

price= soup2.find('span',class_='a-price-whole').get_text().strip()

print(title)
print(price)

Data Analyst Tshirt Funny Definition Data Scientist Computer Science Gift T-Shirt for Men Women
39
                      
                       .


In [18]:
# its important to have a timestamp of when you collected the data
# we will use the library from datatime package

today = datetime.date.today()

print(today)

# we are going to create a csv to copy the data into it
# create csv

import csv

# we want headers 
# we also want data
# we need to make sure our data is a list

header = ['Title', 'Price', 'Date']

# we need to make sure our data is a list
data = [title,price,today]

#type(data) to check is its a list

# create csv and name it
# 'w' means write
# newline -> when we insert data we want space
# everythingelse is default plug ins

with open('AmazonWebScraperProject.csv', 'w', newline ='', encoding ='UTF8') as f:
    writer = csv.writer(f)
    # create header, the inital insertion of the data
    writer.writerow(header)
    # inserting data
    writer.writerow(data)
    
# this will create  csv file in the specified jupyter folder 

2024-05-10


In [19]:
# we are going to create a df for the csv we just created
# instead of opening the file everytime we can just use python to open it 

df = pd.read_csv(r'/Users/abu/Data Science Bootcamp/Python for Data Science/AmazonWebScraperProject.csv')


print(df)

                                               Title  \
0  Data Analyst Tshirt Funny Definition Data Scie...   

                                               Price        Date  
0  39\n                      \n                  ...  2024-05-10  


In [25]:
# assuming you want to get data over time
# we can appeding data to the csv
# use a+ to append data to the csv
# we can also automate this step instead of running the script everytime 
# it can run in the background


with open('AmasonWebScraperProject.csv', 'a+', newline ='', encoding ='UTF8') as f:
    writer = csv.writer(f)
    writer.writerow(data)

    

In [24]:
# everything we just wrote we will need to plug it in the function below
# we will use this function we make on a timer


def check_price():
    URL = 'https://www.amazon.ca/Analyst-Definition-Scientist-Computer-Science/dp/B0CG2L51GZ'
    
    headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1"
    }
    
    page = requests.get(URL, headers=headers)
    
    soup1 = BeautifulSoup(page.content, "html.parser")
    
    soup2 = BeautifulSoup(soup1.prettify(), 'html.parser')
    
    title = soup2.find(id='productTitle').get_text().strip()
    
    price= soup2.find('span',class_='a-price-whole').get_text().strip()

    #import datatime
    
    today = datetime.date.today()
    
    import csv
    
    header = ['Title', 'Price', 'Date']
    data = [title,price,today]
    
    with open('AmazonWebScraperProject.csv', 'a+', newline ='', encoding ='UTF8') as f:
        writer = csv.writer(f)
        writer.writerow(data)




In [22]:

# let's make a while loop
# let's update the timer every day
# we can also you a certain price point for which if the item hits, we can email ourselves to point the drop

while(True):
    check_price()
    time.sleep(86400)
    
# we can also you a certain price point for which if the item hits, we can email ourselves to point the drop
# if the price drops to 30 it will send us an email

    if(price < 30):
        send_mail()

AttributeError: 'NoneType' object has no attribute 'get_text'

In [23]:
# If uou want to try sending yourself an email (just for fun) when a price hits below a certain level you can try it out with this script

def send_mail():
    server = smtplib.SMTP_SSL('smtp.gmail.com',465)
    server.ehlo()
    #server.starttls()
    server.ehlo()
    server.login('abugetthecar@gmail.com','xxxxxxxxxxxxxx')
    
    subject = "The Shirt you want is below $30! Now is your chance to buy!"
    body = "Abu, This is the moment we have been waiting for. Now is your chance to pick up the shirt of your dreams. Don't mess it up! Link here: https://www.amazon.ca/Analyst-Definition-Scientist-Computer-Science/dp/B0CG2JHH2L?th=1&psc=1"
   
    msg = f"Subject: {subject}\n\n{body}"
    
    server.sendmail(
        'abugetthecar@gmail.com',
        msg
     
    )

In [None]:
df = pd.read_csv(r'/Users/abu/Data Science Bootcamp/Python for Data Science/AmazonWebScraperProject.csv')


print(df)