In [None]:
!pip install requests
!pip install tqdm
!pip install beautifulsoup4
!pip install pandas
!pip install random

In [4]:
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
import pandas as pd
import time
import random

# Endpoint and request settings shared by every search-page fetch.
_AMAZON_SEARCH_URL = 'https://www.amazon.in/s'
_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
_REQUEST_TIMEOUT_SECONDS = 15


def _text_or_na(tag):
    """Return the whitespace-stripped text of ``tag``, or 'N/A' when ``tag`` is None."""
    if tag is None:
        return 'N/A'
    return tag.get_text(strip=True)


def _amount_or_na(tag):
    """Return the text of ``tag`` with '₹' and ',' removed, or 'N/A' when ``tag`` is None."""
    if tag is None:
        return 'N/A'
    return tag.get_text(strip=True).replace('₹', '').replace(',', '')


def get_product_details(search, page_number):
    """Fetch one Amazon India search-results page and extract product fields.

    Args:
        search: Search term; sent as the ``k`` query parameter (URL-encoded
            by ``requests``).
        page_number: Results page to fetch; sent as the ``page`` query
            parameter.

    Returns:
        A list of dicts with the keys ``Product_Name``, ``MRP``, ``Discount``,
        ``Price`` and ``Ratings``. Fields whose tag is not found are set to
        ``'N/A'``. An empty list is returned when the request fails or the
        response status is not 200 (an error line is printed in both cases).
    """
    try:
        response = requests.get(
            _AMAZON_SEARCH_URL,
            params={'k': search, 'page': page_number},
            # The browser-like User-Agent must actually be sent with the request;
            # building the dict without passing it has no effect.
            headers=_REQUEST_HEADERS,
            # Without a timeout a stalled connection would block the loop forever.
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )
    except requests.RequestException as exc:
        # Treat network failures like a bad status: report and skip this page.
        print()
        print(f"Error fetching page {page_number}: {exc}")
        return []

    if response.status_code != 200:
        # Leading blank print keeps the message off the tqdm progress-bar line.
        print()
        print(f"Error fetching page {page_number}: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    products = []

    # NOTE(review): 'sg-col-inner' containers may be nested in the live markup,
    # which could yield the same product more than once — confirm against real pages.
    for item in soup.find_all('div', class_='sg-col-inner'):
        name_tag = item.find('a', class_='a-link-normal s-line-clamp-2 s-link-style a-text-normal')
        if name_tag is None:
            # Containers without a product title link are not product cards.
            continue

        # The list price is read from the 'a-offscreen' span inside the struck-through price.
        mrp_tag = item.find('span', class_='a-price a-text-price')
        mrp_span = mrp_tag.find('span', class_='a-offscreen') if mrp_tag is not None else None

        # The discount text is taken from the first <span> following the spacer span.
        discount_tag = item.find('span', class_='a-letter-space')
        discount_span = discount_tag.find_next('span') if discount_tag is not None else None

        products.append({
            'Product_Name': name_tag.get_text(strip=True),
            'MRP': _amount_or_na(mrp_span),
            'Discount': _text_or_na(discount_span),
            'Price': _amount_or_na(item.find('span', class_='a-price-whole')),
            'Ratings': _text_or_na(item.find('span', class_='a-icon-alt')),
        })

    return products

def scrape_amazon(search, pages_to_scrape, output_path='amazon_products.csv'):
    """Scrape several search-result pages and save the products to a CSV file.

    Args:
        search: Search term passed to ``get_product_details`` for every page.
        pages_to_scrape: Number of result pages to fetch, starting at page 1.
            Zero or a negative value fetches nothing and writes a header-only CSV.
        output_path: Destination CSV path. Defaults to ``'amazon_products.csv'``.

    Returns:
        None. The CSV is written to ``output_path`` and a confirmation is printed.
    """
    columns = ['Product_Name', 'MRP', 'Discount', 'Price', 'Ratings']
    all_products = []

    for page in tqdm(range(1, pages_to_scrape + 1), desc="Scraping", unit="page"):
        all_products.extend(get_product_details(search, page))

        # Random 2-5 second pause between requests to avoid detection;
        # skipped after the final page, where there is no next request to space out.
        if page < pages_to_scrape:
            time.sleep(random.uniform(2, 5))

    # `columns=` fixes the column order and also yields a header-only frame
    # when nothing was scraped.
    df = pd.DataFrame(all_products, columns=columns)

    df.to_csv(output_path, index=False)
    print()
    print(f"Data saved to {output_path}")

# Ask the user what to look for and how many result pages to walk through.
search = input("Enter what you want to search (e.g., laptop, phone): ")
pages_reply = input("Enter the number of pages to scrape: ")
# int() raises ValueError if the reply is not a whole number.
user_wanted_to_surf = int(pages_reply)

# Run the scrape; results are written to CSV by scrape_amazon.
scrape_amazon(search=search, pages_to_scrape=user_wanted_to_surf)


Enter what you want to search (e.g., laptop, phone): clothes
Enter the number of pages to scrape: 1


Scrappinng:   0%|          | 0/1 [00:00<?, ?page/s]


Error fetching page 1: 503


Scrappinng: 100%|██████████| 1/1 [00:04<00:00,  4.37s/page]

Data saved to amazon_products.csv





## **If Error 503**
    Try using a proxy, as Amazon tries to prevent its site data from being scraped.

In [3]:
# Offer the scraped CSV as a browser download when running inside Google Colab.
try:
    from google.colab import files
except ImportError:
    # The Colab helper module is not installed in other Jupyter environments;
    # report where to look for the file instead of failing the cell.
    print("Not running in Google Colab; look for amazon_products.csv in the current working directory.")
else:
    files.download('amazon_products.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>