In [1]:
!pip install bs4
!pip install requests


Collecting bs4
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Downloading bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Installing collected packages: bs4
Successfully installed bs4-0.0.2


In [15]:
import pandas as pd
import numpy as np
from  bs4 import BeautifulSoup
import requests
import os

In [22]:

# 1) Extract Book Title
def get_title(soup):
    try:
        title_element = soup.find("span", attrs={"id": "productTitle"})
        title = title_element.text.strip() if title_element else ""
    except AttributeError:
        title = ""
    return title


# 2) Extract Average Rating
def get_average_rating(soup):
    try:
        rating_element = soup.find("span", class_="a-icon-alt")  # Example: "4.5 out of 5 stars"
        rating_text = rating_element.text.strip() if rating_element else ""
        rating_value = rating_text.split()[0] if rating_text else ""
    except AttributeError:
        rating_value = ""
    return rating_value


# 3) Extract Number of Reviews
def get_number_of_reviews(soup):
    try:
        reviews_element = soup.find("span", attrs={"id": "acrCustomerReviewText"})
        reviews_text = reviews_element.text.strip() if reviews_element else ""
        reviews_numeric = "".join(filter(str.isdigit, reviews_text))  # Extract only numbers
    except AttributeError:
        reviews_numeric = ""
    return reviews_numeric


# 4) Extract Price
def get_price(soup):
    try:
        price_element = soup.find("span", class_="a-price-whole")  # Extract main price
        price_cents = soup.find("span", class_="a-price-fraction")  # Extract cents
        if price_element:
            price = price_element.text.strip()
            if price_cents:
                price += "." + price_cents.text.strip()
        else:
            price = ""
    except AttributeError:
        price = ""
    return price


# 5) Extract Discount Percentage
def get_discount_percentage(soup):
    try:
        discount_element = soup.find("span", class_="a-size-base a-color-price")  # Look for "Save 20%"
        discount_text = discount_element.text.strip() if discount_element else ""
        discount_value = "".join(filter(str.isdigit, discount_text))  # Extract numbers
    except AttributeError:
        discount_value = ""
    return discount_value


# 6) Extract Book Format (e.g., Hardcover, Kindle, Paperback)
def get_book_format(soup):
    try:
        format_element = soup.find("span", class_="a-size-base a-color-secondary")  # Format info
        format_string = format_element.text.strip() if format_element else ""
    except AttributeError:
        format_string = ""
    return format_string


# 7) Extract Amazon Bestseller Rank
def get_bestseller_rank(soup):
    try:
        rank_element = soup.find("span", class_="a-list-item")  # Rank is usually within a list item
        rank_text = rank_element.text.strip() if rank_element else ""
        rank_numeric = "".join(filter(str.isdigit, rank_text))  # Extract rank as a number
    except AttributeError:
        rank_numeric = ""
    return rank_numeric


# 8) Extract Category (e.g., "Data Science", "Machine Learning")
def get_category(soup):
    try:
        category_element = soup.select("a.a-link-normal.a-color-tertiary")  # Breadcrumb navigation
        category = category_element[-1].text.strip() if category_element else ""
    except AttributeError:
        category = ""
    return category


def get_publisher(soup):
    """
    Extracts the publisher name from the Amazon product details section.
    """
    try:

        # Locate the product details section
        details_section = soup.find("div", id="detailBullets_feature_div")

        if details_section:
            # Find all list items
            details_list = details_section.find_all("li")

            for detail in details_list:
                label = detail.find("span", class_="a-text-bold")
                if label and "Publisher" in label.text:
                    # Extract the publisher name (text after the label)
                    publisher_info = detail.text.strip().replace("Publisher ‏ : ‎", "").strip()
                    return publisher_info

        return ""  # Return empty if not found
    except AttributeError:
        return ""


# 11) Extract Customers Also Bought (List)
def get_customers_also_bought(soup):
    try:
        also_bought_section = soup.find("div", id="sims-fbt")  # "Frequently Bought Together" section
        also_bought_titles = also_bought_section.find_all("img") if also_bought_section else []
        also_bought_list = [img["alt"].strip() for img in also_bought_titles if "alt" in img.attrs]
    except AttributeError:
        also_bought_list = []
    return also_bought_list

#12) Availability

def get_availability(soup):
    """
    Extracts the availability status of the book from the Amazon product page.
    """
    try:
        # Locate the availability div
        availability_div = soup.find("div", id="availability")

        if availability_div:
            # Find the span containing the stock information
            stock_status = availability_div.find("span", class_="a-size-medium a-color-success")

            # Extract and clean the text
            availability = stock_status.text.strip() if stock_status else "Unavailable"
        else:
            availability = "Unavailable"

    except AttributeError:
        availability = "Unavailable"

    return availability



In [29]:


URL = 'https://www.amazon.com/s?k=data+engineering+books&page=3&crid=1IXU4QSIFZTSA&qid=1741382825&sprefix=data+engineer%2Caps%2C137&xpid=_aGOBw7H26cnD&ref=sr_pg_3'

HEADERS = ({'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36', 'Accept-Language': 'en-US, en;q=0.5'}) #add your user agent

webpage = requests.get(URL, headers=HEADERS)

# Soup Object containiang all data
soup = BeautifulSoup(webpage.content, "html.parser")

links = soup.find_all("a", attrs={'class':'a-link-normal s-no-outline'}) #will find links

# Store the links
links_list = []

# Loop for extracting links from Tag Objects
for link in links:
  links_list.append(link.get('href'))

d = {
    "title":[],
    "average_rating": [],
    "number_of_reviews": [],
    "price": [],
    "availability": [],
    "discount_percentage": [],
    "book_format": [],
    "bestseller_rank": [],
    "category": [],
    "publication_year": [],
    "customers_also_bought": []
}


# Loop for extracting product details from each link
for link in links_list:
  new_webpage = requests.get("https://www.amazon.com" + link, headers=HEADERS)

  new_soup = BeautifulSoup(new_webpage.content, "html.parser")
  d['title'].append(get_title(new_soup))
  d['average_rating'].append(get_average_rating(new_soup))
  d['number_of_reviews'].append(get_number_of_reviews(new_soup))
  d['price'].append(get_price(new_soup))
  d['availability'].append(get_availability(new_soup))
  d['discount_percentage'].append(get_discount_percentage(new_soup))
  d['book_format'].append(get_book_format(new_soup))
  d['bestseller_rank'].append(get_bestseller_rank(new_soup))
  d['category'].append(get_category(new_soup))
  d['publication_year'].append(get_publisher(new_soup))
  d['customers_also_bought'].append(get_customers_also_bought(new_soup))

In [30]:
new_data_df = pd.DataFrame.from_dict(d)

# Handle missing values by replacing empty strings with NaN
new_data_df.replace('', np.nan, inplace=True)

# Drop rows where 'average_rating' is missing (or choose another key if more important)
new_data_df = new_data_df.dropna(subset=['average_rating'])

# Define file name
csv_filename = "amazon_books_data.csv"

# Check if the file already exists
if os.path.exists(csv_filename):
    # If exists, load existing data and append new data
    existing_df = pd.read_csv(csv_filename)
    amazon_df = pd.concat([existing_df, new_data_df], ignore_index=True)
else:
    # If file doesn't exist, just use new data
    amazon_df = new_data_df

# Save updated DataFrame back to CSV
amazon_df.to_csv(csv_filename, header=True, index=False)

print("Data successfully appended and saved!")

Data successfully appended and saved!


  new_data_df.replace('', np.nan, inplace=True)


In [31]:
amazon_df

Unnamed: 0,title,average_rating,number_of_reviews,price,availability,discount_percentage,book_format,bestseller_rank,category,publication_year,customers_also_bought
0,Fundamentals of Data Engineering: Plan and Bui...,4.7,664.0,43..99,In Stock,,$41.79,,Data Modeling & Design,Publisher\n ...,[]
1,Designing Data-Intensive Applications: The Big...,4.8,5119.0,23..65,In Stock,,$22.47,,Data Modeling & Design,Publisher\n ...,[]
2,Data Engineering with AWS: Acquire the skills ...,4.3,43.0,24..96,In Stock,,$5.00,,Data Modeling & Design,Publisher\n ...,[]
3,AI Engineering: Building Applications with Fou...,4.6,56.0,67..97,In Stock,,$61.35,,Intelligence & Semantics,Publisher\n ...,[]
4,Data Pipelines Pocket Reference: Moving and Pr...,4.6,378.0,17..29,In Stock,,$5.80 - $11.99,,Data Modeling & Design,Publisher\n ...,[]
...,...,...,...,...,...,...,...,...,...,...,...
139,AI 101 for TECH LEADERS and ENGINEERS: JUST EN...,5.0,1,0..00,Unavailable,,1 global rating,,Software Development,,[]
140,Winning with Data Science: A Handbook for Busi...,4.8,11,19..80,In Stock,,$14.57,,Business,Publisher\n ...,[]
141,Cracking the Coding Interview: 189 Programming...,4.7,9280,30..99,In Stock,,Ships from and sold by Amazon.com.,,Data Structures,Publisher\n ...,[]
142,The Model Thinker: What You Need to Know to Ma...,4.5,547,0..00,Unavailable,,$14.99,,Research,,[]
