In [7]:
# Import libraries needed for the web scraping project
from bs4 import BeautifulSoup  # To parse HTML content
import requests  # To make HTTP requests to websites
import time  # To add sleep intervals for looping
import datetime  # To work with date and time
import smtplib  # To send email notifications


In [20]:
# Step 1: Define the URL of the product you want to scrape
url = 'https://www.amazon.co.uk/Apple-iPhone-15-128-GB/dp/B0CHX3RPHZ/ref=sr_1_1_sspa?crid=3C54B0GO9TVKX&dib=eyJ2IjoiMSJ9.ZIvfLG9rkG36sPq4SXKS3gqmDUfhmWtkE2sMG81-tq-u8z2JYYCNo4lv8Zitotoy95s7fvTlsqjRSu-46hs9Cjdz76mbnSrYImvTcsGtTUKsbGXcRYqlATkp8-AWEbZxiP53uBAMlnj3v12C5BOjFupPJK0PloZ9K5KxTytKRdijk7iXhmgHTug_NyVhUaFzRKDewMZDHd8ZXqb8ZijcE3sf9JrovbmKTBcJef4OPGI.mkiP_bMBb1knlcBLmvImeLrfIfff26j32Y_k0K6Ljrs&dib_tag=se&keywords=iphone&qid=1725551476&sprefix=iphone%2Caps%2C197&sr=8-1-spons&sp_csd=d2lkZ2V0TmFtZT1zcF9hdGY&psc=1'

# Step 2: Set up headers to mimic a browser request
# These headers help Amazon recognize the request as coming from a browser
headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
    "Sec-Fetch-Mode": "navigate", 
    "Sec-Fetch-Site": "none", 
    "Sec-Fetch-User": "?1", 
    "Upgrade-Insecure-Requests": "1", 
    "X-Amzn-Trace-Id": "Root=1-66d9d341-22d960847d4a2e833c263cd5"
}

In [21]:
# Step 3: Fetch the page content
# Use requests to get the webpage's HTML content
page = requests.get(url, headers=headers)

# Step 4: Parse the HTML using BeautifulSoup
# Parse the page content with BeautifulSoup
soup1 = BeautifulSoup(page.content, "html.parser")  # Get the raw HTML


In [35]:
# Step 5: Extract the Product Title
# Use the HTML "id" tag to locate and extract the title of the product
title = soup1.find(id="productTitle").get_text().strip()  # .strip() removes unnecessary spaces
print(f"Product: {title}")  # Print the product title to verify

# Step 6: Extract the Product Price
# Extract the Whole and Fractional Price

# Get the whole part of the price (e.g., "687"), handling any extra spaces or newline characters
whole_price = soup1.find("span", {"class": "a-price-whole"}).get_text().strip()

# Check if the fractional part exists, handle cases where it might be missing
fractional_price_element = soup1.find("span", {"class": "a-price-fraction"})

if fractional_price_element:
    # Get the fractional part, but handle extra spaces and ensure it's cleaned
    fractional_price = fractional_price_element.get_text().strip()
else:
    # Default fractional price if it's missing (for prices like "£687" with no decimals)
    fractional_price = "00"

# Ensure both whole_price and fractional_price are cleaned up correctly
whole_price = whole_price.replace(",", "").strip()  # Remove commas and extra spaces
fractional_price = fractional_price.strip()  # Clean any spaces in the fractional part

# Combine the whole and fractional parts
# Fix: Use string concatenation carefully to avoid extra periods or characters
price = f"{whole_price}.{fractional_price}".strip()  # Strip ensures no extra spaces or periods

# Print the cleaned-up price to verify the output
print(f"Price: £{price}")


Product: Apple iPhone 15 (128 GB) - Blue
Price: £687..00


In [23]:
# Step 7: Add a Date Stamp
# Track the date when the product price was scraped for historical reference
today = datetime.today().strftime('%Y-%m-%d')
print(f"Date: {today}")  # Print the current date


Date: 2024-09-05


In [33]:
# Step 8: Save the Data to a CSV file
# Import the CSV module to handle CSV file operations
import csv

# We'll store the product title, price, and date in a CSV file
header = ['Product', 'Price', 'Date']  # CSV headers
data = [title, price, today]  # The data we want to store

# Create a CSV file (or open if it exists) and write the data
with open('AmazonWebScraperDataset.csv', 'w', newline='', encoding='UTF8') as f:
    writer = csv.writer(f)
    writer.writerow(header)  # Write the header (only once)
    writer.writerow(data)  # Write the data (title, price, date)
# Confirm data has been saved
    print(f"Data saved: {data}")

Data saved: ['Apple iPhone 15 (128 GB) - Blue', '687..00', '2024-09-05']


In [26]:
# Import pandas to read the CSV file
import pandas as pd

# Read the CSV file to verify the data has been saved
df = pd.read_csv(r'AmazonWebScraperDataset.csv')

# Print the dataframe to check the content
print(df)


                           Product    Price        Date
0  Apple iPhone 15 (128 GB) - Blue  687..00  2024-09-05


In [39]:
# Function to fetch data and append it to the existing CSV file
def check_price():
    page = requests.get(url, headers=headers)
    soup1 = BeautifulSoup(page.content, "html.parser")
    
    # Extract the product title
    title = soup1.find(id="productTitle").get_text().strip()
    
    # Extract the price using the fixed logic
    whole_price = soup1.find("span", {"class": "a-price-whole"}).get_text().strip()
    fractional_price_element = soup1.find("span", {"class": "a-price-fraction"})
    
    if fractional_price_element:
        fractional_price = fractional_price_element.get_text().strip()
    else:
        fractional_price = "00"
    
    price = f"{whole_price}.{fractional_price}".strip()

    # Add the current date
    today = datetime.today().strftime('%Y-%m-%d')
    
    # Append data to the CSV
    data = [title, price, today]
    with open('AmazonWebScraperDataset.csv', 'a+', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(data)
    
    # Print the data for verification
    print(f"Data scraped and saved: {data}")

In [40]:
import time

# Test: Run the scraper every 2 seconds for 5 iterations
for i in range(5):
    check_price()  # Run the price check function
    time.sleep(2)  # Wait for 2 seconds before the next run
    print(f"Iteration {i+1} complete.")


Data scraped and saved: ['Apple iPhone 15 (128 GB) - Blue', '687..00', '2024-09-05']
Iteration 1 complete.
Data scraped and saved: ['Apple iPhone 15 (128 GB) - Blue', '687..00', '2024-09-05']
Iteration 2 complete.
Data scraped and saved: ['Apple iPhone 15 (128 GB) - Blue', '687..00', '2024-09-05']
Iteration 3 complete.
Data scraped and saved: ['Apple iPhone 15 (128 GB) - Blue', '687..00', '2024-09-05']
Iteration 4 complete.
Data scraped and saved: ['Apple iPhone 15 (128 GB) - Blue', '687..00', '2024-09-05']
Iteration 5 complete.


In [None]:
# Step 10: Run the scraper repeatedly (automating the process)
# The following loop runs the scraper at set intervals (e.g., once a day)
import time

# The loop will continuously scrape the product page and append new data every 24 hours (86400 seconds)
while True:
    check_price()  # Run the price check function
    time.sleep(86400)  # Wait 24 hours before running the script again (86400 seconds in a day)


In [41]:
# Import pandas to read the CSV file
import pandas as pd

# Read the CSV file to verify the data has been saved
df = pd.read_csv(r'AmazonWebScraperDataset.csv')

# Print the dataframe to check the content
print(df)


                           Product    Price        Date
0  Apple iPhone 15 (128 GB) - Blue  687..00  2024-09-05
1  Apple iPhone 15 (128 GB) - Blue  687..00  2024-09-05
2  Apple iPhone 15 (128 GB) - Blue  687..00  2024-09-05
3  Apple iPhone 15 (128 GB) - Blue  687..00  2024-09-05
4  Apple iPhone 15 (128 GB) - Blue  687..00  2024-09-05
5  Apple iPhone 15 (128 GB) - Blue  687..00  2024-09-05
6  Apple iPhone 15 (128 GB) - Blue  687..00  2024-09-05
