In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import numpy as np
import random

# Amazon search results URL to scrape (modify based on your needs)
URL = "https://www.amazon.in/s?rh=n%3A6612025031&fs=true&ref=lp_6612025031_sar"

# Candidate request headers, each mimicking a real desktop browser.
# The User-Agent strings follow the token layout the corresponding browser
# actually sends: Chromium-based Edge appends "Edg/<version>" after the
# Chrome/Safari tokens, and Safari reports AppleWebKit/605.1.15 together with
# Safari/605.1.15. Malformed combinations are easy for a server to flag.
HEADERS_LIST = [
    {
        # Chrome on Windows
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    },
    {
        # Chrome on macOS
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9",
    },
    {
        # Firefox on Ubuntu
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0",
        "Accept-Language": "en-US,en;q=0.9",
    },
    {
        # Edge (Chromium) on Windows
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.46",
        "Accept-Language": "en-US,en;q=0.9",
    },
    {
        # Safari on macOS
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
        "Accept-Language": "en-US,en;q=0.9",
    },
]
def get_random_header():
    """Return one randomly selected entry of ``HEADERS_LIST``.

    The returned dict is the list element itself, not a copy.
    """
    chosen = random.choice(HEADERS_LIST)
    return chosen

# One shared session, so cookies persist across all requests made below
session = requests.Session()

# The header set is picked once here and then reused for every request sent
# through the session; it is not re-rolled per request.
HEADERS = get_random_header()
session.headers.update(HEADERS)

def get_product_links(url):
    """Extract product page URLs from an Amazon search results page.

    Args:
        url: Address of the search results page to fetch.

    Returns:
        A list of absolute product URLs in page order with duplicates
        removed. An empty list is returned when the request fails (the
        error is printed, not raised).
    """
    # Local import so this function does not depend on edits to the
    # file-level import block.
    from urllib.parse import urljoin

    try:
        response = session.get(url, timeout=10)  # timeout avoids waiting forever
        response.raise_for_status()  # turn 4xx/5xx responses into exceptions
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching search results: {e}")
        return []

    soup = BeautifulSoup(response.content, "html.parser")
    anchors = soup.find_all("a", attrs={"class": "a-link-normal s-no-outline"})

    # urljoin resolves relative hrefs against the site root and leaves
    # already-absolute hrefs untouched; plain string concatenation would
    # produce a broken URL in the latter case (and for hrefs lacking a
    # leading slash).
    base_url = "https://www.amazon.in"
    absolute_links = (
        urljoin(base_url, anchor.get("href"))
        for anchor in anchors
        if anchor.get("href")
    )

    # dict.fromkeys drops repeated links while keeping first-seen order, so
    # the same product page is not fetched more than once.
    return list(dict.fromkeys(absolute_links))

def get_product_details(url):
    """Extract product details from an individual product page.

    Sleeps for two seconds before each request to throttle the scraper.

    Args:
        url: Address of the product page to fetch.

    Returns:
        A dict with the keys "Title", "Price", "Rating", "Seller",
        "Availability" and "Total Ratings". A field whose selector matches
        nothing is reported as "N/A". Returns None when the request fails
        (the error is printed, not raised).
    """
    import re  # local import; only needed for the ratings clean-up below

    time.sleep(2)  # delay between requests to reduce the chance of being blocked

    try:
        response = session.get(url, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching product page {url}: {e}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    def safe_extract(selector, attr=None, default="N/A"):
        """Return stripped text (or attribute ``attr``) of the first match.

        Falls back to ``default`` when nothing matches ``selector`` or the
        requested attribute is absent.
        """
        element = soup.select_one(selector)
        if element is None:
            return default
        if attr is None:
            return element.text.strip()
        value = element.get(attr)
        if value is None:
            return default
        if isinstance(value, list):
            # Multi-valued attributes (e.g. class) come back as a list.
            value = " ".join(value)
        return value.strip()

    # Each selector takes the first matching element on the page.
    # NOTE(review): ".a-icon-alt" and ".a-price-whole" are generic classes;
    # the first match is presumably the main product's value - confirm
    # against the current page layout.
    title = safe_extract("#productTitle")
    price = safe_extract(".a-price-whole")
    rating = safe_extract(".a-icon-alt")
    seller = safe_extract("#sellerProfileTriggerId")
    availability = safe_extract("#availability")
    total_ratings = safe_extract("#acrCustomerReviewLink")

    if total_ratings != "N/A":
        # Strip the trailing word in both plural and singular form
        # ("1,234 ratings" -> "1,234", "1 rating" -> "1").
        total_ratings = re.sub(r"\s*ratings?\b", "", total_ratings).strip()

    return {
        "Title": title,
        "Price": price,
        "Rating": rating,
        "Seller": seller,
        "Availability": availability,
        "Total Ratings": total_ratings,
    }

# Step 1: collect the product page URLs listed on the search results page
print("🔍 Fetching product links...")
product_links = get_product_links(URL)
total = len(product_links)
print(f"✅ Found {total} products.")

# Step 2: visit every product page; pages that fail to load are skipped
product_data = []
for position, product_url in enumerate(product_links, start=1):
    print(f"📦 Scraping Product: ({position}/{total})")
    record = get_product_details(product_url)
    if not record:
        continue
    product_data.append(record)

# Step 3: write the collected rows to CSV, or report that nothing was scraped
if not product_data:
    print("⚠️ No data scraped. Please check your script or Amazon's page structure.")
else:
    df = pd.DataFrame(product_data)
    df.to_csv("amazon_products.csv", index=False)
    print("✅ Data saved to 'amazon_products.csv' successfully!")


🔍 Fetching product links...
✅ Found 33 products.
📦 Scraping Product: (1/33)
📦 Scraping Product: (2/33)
📦 Scraping Product: (3/33)
📦 Scraping Product: (4/33)
📦 Scraping Product: (5/33)
📦 Scraping Product: (6/33)
📦 Scraping Product: (7/33)
📦 Scraping Product: (8/33)
📦 Scraping Product: (9/33)
📦 Scraping Product: (10/33)
📦 Scraping Product: (11/33)
📦 Scraping Product: (12/33)
📦 Scraping Product: (13/33)
📦 Scraping Product: (14/33)
📦 Scraping Product: (15/33)
📦 Scraping Product: (16/33)
📦 Scraping Product: (17/33)
📦 Scraping Product: (18/33)
📦 Scraping Product: (19/33)
📦 Scraping Product: (20/33)
📦 Scraping Product: (21/33)
📦 Scraping Product: (22/33)
📦 Scraping Product: (23/33)
📦 Scraping Product: (24/33)
📦 Scraping Product: (25/33)
📦 Scraping Product: (26/33)
📦 Scraping Product: (27/33)
📦 Scraping Product: (28/33)
📦 Scraping Product: (29/33)
📦 Scraping Product: (30/33)
📦 Scraping Product: (31/33)
📦 Scraping Product: (32/33)
📦 Scraping Product: (33/33)
✅ Data saved to 'amazon_products.csv