In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os


In [2]:

# Brand keywords to search for on Amazon; each one is scraped separately and
# is also used as the name of its sub-folder under Raw-Csvs/.
searchstrings = [
    "iphone",
    "oneplus",
    "oppo",
    "realme",
    "samsung",
    "vivo",
    "xiaomi",
]


In [3]:

# Scrape Amazon.in search-result pages for every brand in `searchstrings` and
# save each page's products to Raw-Csvs/<brand>/page_<n>.csv.

# Base URL for the Amazon search page; the search term and page number are
# passed as query parameters so that requests URL-encodes them.
base_url = 'https://www.amazon.in/s'

# Headers to simulate a real browser request. They are the same for every
# request, so they are built once rather than once per search string.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.5"
}

# Seconds to wait for a response before giving up; without a timeout a
# stalled connection would block the notebook indefinitely.
REQUEST_TIMEOUT = 30

# Column order of every exported CSV. Passing it to the DataFrame constructor
# guarantees a header row even when a page yields no products.
PRODUCT_COLUMNS = [
    'Product Name',
    'Rating (Stars)',
    'Number of Reviews',
    'Bought Last Month',
    'Current MRP',
    'Dashed MRP',
    'Discount (%)',
    'Free Delivery',
]


def _text_or_na(tag):
    """Return the stripped text of ``tag``, or "N/A" when ``tag`` is missing."""
    return tag.text.strip() if tag else "N/A"


def parse_product(product_section):
    """Extract the details of one search result.

    Parameters
    ----------
    product_section : bs4 tag
        The ``div`` with ``data-component-type="s-search-result"`` that wraps
        a single product on the search page.

    Returns
    -------
    dict
        One value per entry of ``PRODUCT_COLUMNS``. Fields that cannot be
        found are "N/A", except 'Free Delivery', which is "Yes" or "No".
    """
    # Product name
    name_tag = product_section.find('span', class_='a-size-medium a-color-base a-text-normal')
    product_name = _text_or_na(name_tag)

    # Rating (stars): keep only the leading number of e.g. '4.5 out of 5 stars'
    rating_tag = product_section.find('span', class_='a-icon-alt')
    rating_words = rating_tag.text.split() if rating_tag else []
    product_rating = rating_words[0] if rating_words else "N/A"

    # Number of reviews
    reviews_tag = product_section.find('span', class_='a-size-base s-underline-text')
    product_reviews = _text_or_na(reviews_tag)

    # 'Bought last month' text (if available)
    bought_last_month_tag = product_section.find('span', class_='a-size-base a-color-secondary')
    bought_last_month = _text_or_na(bought_last_month_tag)

    # Current MRP (price), with thousands separators removed
    current_price_tag = product_section.find('span', class_='a-price-whole')
    current_price = current_price_tag.text.replace(',', '').strip() if current_price_tag else "N/A"

    # Dashed (original) MRP. The amount sits in a nested 'a-offscreen' span,
    # which may be absent even when the outer price span exists.
    original_price_tag = product_section.find('span', class_='a-price a-text-price')
    offscreen_tag = original_price_tag.find('span', class_='a-offscreen') if original_price_tag else None
    dashed_mrp = offscreen_tag.text.replace(',', '').strip() if offscreen_tag else "N/A"

    # Discount percentage: the span whose text contains both '(' and '%'
    discount_tag = product_section.find('span', string=lambda text: text and '(' in text and '%' in text)
    discount_percentage = discount_tag.text if discount_tag else "N/A"

    # Free delivery status
    free_delivery_tag = product_section.find('span', string=lambda x: x and 'free delivery' in x.lower())
    free_delivery = "Yes" if free_delivery_tag else "No"

    return {
        'Product Name': product_name,
        'Rating (Stars)': product_rating,
        'Number of Reviews': product_reviews,
        'Bought Last Month': bought_last_month,
        'Current MRP': current_price,
        'Dashed MRP': dashed_mrp,
        'Discount (%)': discount_percentage,
        'Free Delivery': free_delivery
    }


for searchstring in searchstrings:
    # Create the brand's output directory if it doesn't exist
    directory = f'Raw-Csvs/{searchstring}'
    os.makedirs(directory, exist_ok=True)

    # Loop through pages, adjust the range to control the number of pages to scrape
    for page in range(1, 6):  # Scraping pages 1 to 5
        # Send a GET request to the Amazon search URL. A network failure on
        # one page is reported and skipped instead of aborting the whole run.
        try:
            response = requests.get(
                base_url,
                params={'k': searchstring, 'page': page},
                headers=headers,
                timeout=REQUEST_TIMEOUT,
            )
        except requests.RequestException as error:
            print(f"Failed to retrieve page {page} for '{searchstring}': {error}")
            continue

        # Check if the request was successful
        if response.status_code != 200:
            print(f"Failed to retrieve page {page}. Status code: {response.status_code}")
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all product sections on the page and extract their details
        product_sections = soup.find_all('div', {'data-component-type': 's-search-result'})
        products = [parse_product(product_section) for product_section in product_sections]
        if not products:
            # A 200 response without any result blocks; the CSV below will
            # contain only the header row.
            print(f"No products found on page {page} for '{searchstring}'")

        # Convert list of products to DataFrame
        df = pd.DataFrame(products, columns=PRODUCT_COLUMNS)

        # Save each page's data to a separate CSV file
        file_name = f'{directory}/page_{page}.csv'
        df.to_csv(file_name, index=False)
        print(f"Data exported to {file_name}")


Data exported to Raw-Csvs/iphone/page_1.csv
Data exported to Raw-Csvs/iphone/page_2.csv
Data exported to Raw-Csvs/iphone/page_3.csv
Data exported to Raw-Csvs/iphone/page_4.csv
Data exported to Raw-Csvs/iphone/page_5.csv
Data exported to Raw-Csvs/oneplus/page_1.csv
Data exported to Raw-Csvs/oneplus/page_2.csv
Data exported to Raw-Csvs/oneplus/page_3.csv
Data exported to Raw-Csvs/oneplus/page_4.csv
Data exported to Raw-Csvs/oneplus/page_5.csv
Data exported to Raw-Csvs/oppo/page_1.csv
Data exported to Raw-Csvs/oppo/page_2.csv
Data exported to Raw-Csvs/oppo/page_3.csv
Data exported to Raw-Csvs/oppo/page_4.csv
Data exported to Raw-Csvs/oppo/page_5.csv
Data exported to Raw-Csvs/realme/page_1.csv
Data exported to Raw-Csvs/realme/page_2.csv
Data exported to Raw-Csvs/realme/page_3.csv
Data exported to Raw-Csvs/realme/page_4.csv
Data exported to Raw-Csvs/realme/page_5.csv
Data exported to Raw-Csvs/samsung/page_1.csv
Data exported to Raw-Csvs/samsung/page_2.csv
Data exported to Raw-Csvs/samsung/p

In [7]:
# Merge the per-page CSVs of every brand into a single de-duplicated CSV.
for brand in searchstrings:
    page_frames = []
    for page in range(1, 6):  # pages 1 to 5, matching the scraping loop
        page_file = f'Raw-Csvs/{brand}/page_{page}.csv'

        # A page file is absent when that page could not be downloaded.
        if not os.path.exists(page_file):
            print(f"Skipping missing file {page_file}")
            continue

        try:
            page_frames.append(pd.read_csv(page_file))
        except pd.errors.EmptyDataError:
            # The file has no header row, i.e. the page yielded no products.
            print(f"Skipping empty file {page_file}")

    if not page_frames:
        print(f"No page data available for {brand}; nothing written")
        continue

    df = pd.concat(page_frames, ignore_index=True)

    # Keep the first row per product name. NOTE: products whose name was
    # scraped as "N/A" all share that name, so only one of them survives.
    df = df.drop_duplicates(subset='Product Name')

    # Write next to the per-brand folders. The directory name must match the
    # 'Raw-Csvs' casing used when the pages were saved; the previous
    # 'Raw-csvs' spelling names a different (non-existent) directory on
    # case-sensitive filesystems.
    df.to_csv(f'Raw-Csvs/{brand}.csv', index=False)
