In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import requests as re
import numpy as np

In [None]:
# Function to extract Product Title
def get_title(soup):
    """Return the product title found in ``soup``, stripped of whitespace.

    Looks up the ``<span id="productTitle">`` element and returns its text.
    If the element is missing (``find`` returns ``None``) the resulting
    ``AttributeError`` is caught and an empty string is returned instead.
    """
    try:
        # Tag lookup, text extraction and trimming in one chained expression.
        return soup.find("span", attrs={"id": "productTitle"}).text.strip()
    except AttributeError:
        return ""

# Function to extract Product Price
def get_price(soup):
    """Return the product price as a stripped string, or ``None`` if absent.

    Two selectors are tried in order:

    1. the first ``<span class="a-offscreen">`` element on the page;
    2. the ``<span id="priceblock_dealprice">`` element (deal price).

    The fallback is attempted whenever the first selector yields nothing.
    Previously it was only reachable from an ``except AttributeError``
    handler, which an empty ``find_all`` result never triggers, so the deal
    price was effectively never read.

    Returns:
        The price text, or ``None`` when neither selector produces a value.
    """
    try:
        span_elements = soup.find_all("span", attrs={"class": "a-offscreen"})
        if span_elements:
            return span_elements[0].text.strip()

        deal = soup.find("span", attrs={"id": "priceblock_dealprice"})
        # ``.string`` is None when the tag is missing a single text child.
        if deal is not None and deal.string is not None:
            return deal.string.strip()
    except AttributeError:
        # Best effort: an unexpected object shape means "no price found".
        pass

    return None

# Function to extract Product Rating
def get_rating(soup):
    """Return the product rating text, or ``""`` when it cannot be found.

    Two selectors are tried in order and the first one that yields text wins:

    1. ``<i class="a-icon a-icon-star a-star-4-5">`` -- note this class string
       is specific to one star icon variant;
    2. ``<span class="a-icon-alt">``.

    A selector "fails" when the tag is missing or has no single string child
    (both surface as ``AttributeError``). Only that exception is handled, so
    unrelated errors and interrupts are no longer silently swallowed.
    """
    selectors = (
        ("i", {"class": "a-icon a-icon-star a-star-4-5"}),
        ("span", {"class": "a-icon-alt"}),
    )
    for tag_name, attrs in selectors:
        try:
            return soup.find(tag_name, attrs=attrs).string.strip()
        except AttributeError:
            continue

    return ""

# Function to extract Number of User Reviews
def get_review_count(soup):
    """Return the review-count text from ``soup``, or ``""`` if unavailable.

    Reads the ``.string`` of ``<span id="acrCustomerReviewText">`` and strips
    surrounding whitespace. A missing tag or a ``None`` string raises
    ``AttributeError``, which is translated into an empty string.
    """
    try:
        return soup.find("span", attrs={"id": "acrCustomerReviewText"}).string.strip()
    except AttributeError:
        return ""

# Function to extract Availability Status
def get_availability(soup):
    """Return the availability text, defaulting to ``"Not Available"``.

    The text is taken from the first ``<span>`` inside
    ``<div id="availability">``. If either element is missing, or the span
    has no single string child, the ``AttributeError`` is caught and the
    default is returned.
    """
    status = "Not Available"
    try:
        container = soup.find("div", attrs={"id": "availability"})
        status = container.find("span").string.strip()
    except AttributeError:
        # Keep the default when any step of the lookup fails.
        pass
    return status

def get_category(soup):
    """Return the category token from the "Best Sellers Rank" row, or ``None``.

    Scans ``<th class="a-color-secondary a-size-base prodDetSectionEntry">``
    cells for one whose text is exactly "Best Sellers Rank", then splits the
    text of its sibling ``<td>`` on whitespace and returns the third token
    (index 2).

    Returns ``None`` when no such row exists, when the ``<td>`` sibling is
    missing, or when the cell has fewer than three tokens; previously the
    last two cases raised ``AttributeError`` / ``IndexError``.
    """
    th_elements = soup.find_all(
        "th", attrs={"class": "a-color-secondary a-size-base prodDetSectionEntry"}
    )

    for th_element in th_elements:
        if th_element.text.strip() != "Best Sellers Rank":
            continue

        rank_element = th_element.find_next_sibling("td")
        if rank_element is None:
            continue

        # split() with no arguments already ignores surrounding whitespace.
        rank_info = rank_element.text.split()
        if len(rank_info) > 2:
            return rank_info[2]

    return None
        
# Function to get sales rank
def get_sales_rank(soup):
    """Return the rank token from the "Best Sellers Rank" row, or ``None``.

    Scans ``<th class="a-color-secondary a-size-base prodDetSectionEntry">``
    cells for one whose text is exactly "Best Sellers Rank", then splits the
    text of its sibling ``<td>`` on whitespace and returns the first token
    unchanged (as a string).

    Returns ``None`` when no such row exists, when the ``<td>`` sibling is
    missing, or when the cell is empty; previously the last two cases raised
    ``AttributeError`` / ``IndexError``.
    """
    th_elements = soup.find_all(
        "th", attrs={"class": "a-color-secondary a-size-base prodDetSectionEntry"}
    )

    for th_element in th_elements:
        if th_element.text.strip() != "Best Sellers Rank":
            continue

        rank_element = th_element.find_next_sibling("td")
        if rank_element is None:
            continue

        # split() with no arguments already ignores surrounding whitespace.
        rank_info = rank_element.text.split()
        if rank_info:
            return rank_info[0]

    return None

def get_brand_name(soup):
    """Return the brand from the product-details table, or ``None``.

    Scans ``<th class="a-color-secondary a-size-base prodDetSectionEntry">``
    cells for one whose text is exactly "Brand" and returns the stripped text
    of its sibling ``<td>``.

    Returns ``None`` when there is no "Brand" row or the row has no ``<td>``
    sibling; the latter previously raised ``AttributeError``.
    """
    th_elements = soup.find_all(
        "th", attrs={"class": "a-color-secondary a-size-base prodDetSectionEntry"}
    )

    for th_element in th_elements:
        if th_element.text.strip() != "Brand":
            continue

        brand_element = th_element.find_next_sibling("td")
        if brand_element is not None:
            # Stop at the first "Brand" row that actually has a value cell.
            return brand_element.text.strip()

    return None

def scrape_and_save_data(links_list, filename):
    """Scrape each product link and append the results to a CSV file.

    For every relative link in ``links_list`` the product page under
    ``https://www.amazon.com.au`` is downloaded (using the module-level
    ``HEADERS``) and parsed, and one row of product fields is extracted.
    Rows with an empty title are dropped. If ``filename`` already exists,
    the new rows are appended to its contents; the file is then rewritten.

    Args:
        links_list: Iterable of product URL paths (relative to the site root).
        filename: Path of the CSV file to create or extend.

    Notes:
        A link that fails for any reason is reported via ``print`` and
        skipped entirely. Each row is built completely before being stored,
        so a failure midway through a product can no longer leave the columns
        with different lengths (which made DataFrame construction fail for
        the whole batch).
    """
    columns = ["title", "price", "rating", "reviews", "availability",
               "category", "sales_rank", "brand"]
    rows = []

    for link in links_list:
        try:
            # A timeout keeps one stalled connection from hanging the run.
            new_webpage = re.get("https://www.amazon.com.au" + link,
                                 headers=HEADERS, timeout=30)
            new_soup = BeautifulSoup(new_webpage.content, "html.parser")

            row = {
                "title": get_title(new_soup),
                "price": get_price(new_soup),
                "rating": get_rating(new_soup),
                "reviews": get_review_count(new_soup),
                "availability": get_availability(new_soup),
                "category": get_category(new_soup),
                "sales_rank": get_sales_rank(new_soup),
                "brand": get_brand_name(new_soup),
            }
        except Exception as e:
            print(f"Error scraping data for link {link}: {e}")
            continue

        rows.append(row)

    # Explicit columns keep the schema stable even when no row was scraped.
    new_data = pd.DataFrame(rows, columns=columns)
    # Assign the result instead of a chained in-place replace, which does not
    # update the frame under pandas Copy-on-Write.
    new_data["title"] = new_data["title"].replace("", np.nan)
    new_data = new_data.dropna(subset=["title"])

    # Append to existing data when the CSV file is already present.
    try:
        existing_data = pd.read_csv(filename)
        combined_data = pd.concat([existing_data, new_data], ignore_index=True)
    except FileNotFoundError:
        combined_data = new_data

    combined_data.to_csv(filename, header=True, index=False)

In [None]:
# Scraping data of the first page of the results (search: "full desktop computer set")
if __name__ == "__main__":
    # Request headers; HEADERS is also read by scrape_and_save_data.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept-Language': 'en-Au, en; q=0.5',
    }
    url = "https://www.amazon.com.au/s?k=full+desktop+computer+set&crid=21Q9QRHBP5HLP&sprefix=desktop+full+comp%2Caps%2C246&ref=nb_sb_ss_ts-doa-p_1_17"

    # Download and parse the search results page.
    webpage = re.get(url, headers=HEADERS)
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Collect the href of every product link on the page.
    links = soup.find_all("a", attrs={'class': "a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"})
    links_list = [anchor.get('href') for anchor in links]

    # Split the links into complete batches of `batch_size`; any leftover
    # links that do not fill a whole batch are not scraped.
    batch_size = 7
    num_batches = len(links_list) // batch_size
    links_list_batches = [
        links_list[start:start + batch_size]
        for start in range(0, num_batches * batch_size, batch_size)
    ]

    # One CSV file per batch, numbered from 0.
    for i, batch in enumerate(links_list_batches):
        filename = f"amazon_data_batch_{i}.csv"
        scrape_and_save_data(batch, filename)

In [None]:
#scraping data of the second page of the results
if __name__ == "__main__":
    # Request headers; HEADERS is also read by scrape_and_save_data.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept-Language': 'en-Au, en; q=0.5',
    }
    url = "https://www.amazon.com.au/s?k=desktop+computer&page=2&crid=S4MN14AIGALS&qid=1711942878&sprefix=desktop+computer%2Caps%2C285&ref=sr_pg_1"

    # Download and parse the search results page.
    webpage = re.get(url, headers=HEADERS)
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Collect the href of every product link on the page.
    links = soup.find_all("a", attrs={'class': "a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"})
    links_list = [anchor.get('href') for anchor in links]

    # Split the links into complete batches of `batch_size`; any leftover
    # links that do not fill a whole batch are not scraped.
    batch_size = 7
    num_batches = len(links_list) // batch_size
    links_list_batches = [
        links_list[start:start + batch_size]
        for start in range(0, num_batches * batch_size, batch_size)
    ]

    # File numbering starts at 8 for this page (presumably so it follows the
    # files written for the first page -- confirm the offset if page sizes change).
    for i, batch in enumerate(links_list_batches):
        filename = f"amazon_data_batch_{i+8}.csv"
        scrape_and_save_data(batch, filename)

In [None]:
#scraping data of the third page of the results
if __name__ == "__main__":
    # Request headers; HEADERS is also read by scrape_and_save_data.
    HEADERS = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept-Language': 'en-Au, en; q=0.5',
    }
    url = "https://www.amazon.com.au/s?k=desktop+computer&page=3&crid=S4MN14AIGALS&qid=1711942885&sprefix=desktop+computer%2Caps%2C285&ref=sr_pg_2"

    # Download and parse the search results page.
    webpage = re.get(url, headers=HEADERS)
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Collect the href of every product link on the page.
    links = soup.find_all("a", attrs={'class': "a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"})
    links_list = [anchor.get('href') for anchor in links]

    # Split the links into complete batches of `batch_size`; any leftover
    # links that do not fill a whole batch are not scraped.
    batch_size = 7
    num_batches = len(links_list) // batch_size
    links_list_batches = [
        links_list[start:start + batch_size]
        for start in range(0, num_batches * batch_size, batch_size)
    ]

    # File numbering starts at 16 for this page (presumably so it follows the
    # files written for the earlier pages -- confirm the offset if page sizes change).
    for i, batch in enumerate(links_list_batches):
        filename = f"amazon_data_batch_{i+16}.csv"
        scrape_and_save_data(batch, filename)

In [None]:
#for git
"""
Amazon Product Scraper

This Python script scrapes product data from Amazon's search results page for a given query and saves the data into CSV files in batches.

It utilizes BeautifulSoup for web scraping and pandas for data manipulation and saving to CSV.

Usage:
- Modify the `url` variable in each scraping cell to set the search query and parameters.
- Run the script to scrape product data from Amazon in batches and save it to separate CSV files.

Dependencies:
- BeautifulSoup (bs4)
- pandas
- requests
- numpy

Author: [Your Name]
GitHub: [Your GitHub Profile]

"""

from bs4 import BeautifulSoup
import pandas as pd
import requests as re
import numpy as np
