In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Define headers to mimic a browser visit
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US, en;q=0.9",
}

# Function to scrape a single Amazon page
def scrape_amazon(url):
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.content, "html.parser")
    
    # Scraping product details
    products = []
    for product in soup.find_all("div", class_="s-main-slot s-result-list s-search-results sg-row")[0].find_all("div", {"data-component-type": "s-search-result"}):
        title = product.h2.a.text.strip()
        price_whole = product.find("span", class_="a-price-whole")
        price_fraction = product.find("span", class_="a-price-fraction")
        price = f"{price_whole.text}.{price_fraction.text}" if price_whole and price_fraction else "N/A"
        rating = product.find("span", class_="a-icon-alt")
        products.append({
            "Title": title,
            "Price": price,
            "Rating": rating.text if rating else "N/A"
        })
    return products

# URL of the Amazon page to scrape
URL = "https://amzn.in/d/cRBK2D8"

# Scrape data
product_data = scrape_amazon(URL)

# Convert to DataFrame for better viewing and saving
df = pd.DataFrame(product_data)
print(df)

# Save to CSV
df.to_csv("amazon_products.csv", index=False)

IndexError: list index out of range

In [3]:
import requests
from bs4 import BeautifulSoup

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US, en;q=0.9",
}

def scrape_product_page(url):
    response = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(response.content, "html.parser")

    # Extract product title
    try:
        title = soup.find("span", id="productTitle").text.strip()
    except:
        title = "N/A"

    # Extract price
    try:
        price = soup.find("span", class_="a-price-whole").text.strip()
    except:
        price = "N/A"

    # Extract rating
    try:
        rating = soup.find("span", class_="a-icon-alt").text.strip()
    except:
        rating = "N/A"

    return {
        "Title": title,
        "Price": price,
        "Rating": rating,
    }

# URL of the product page
URL = "https://amzn.in/d/cRBK2D8"
product_details = scrape_product_page(URL)
print(product_details)


{'Title': 'Xiaomi Pad 6| Qualcomm Snapdragon 870| Powered by HyperOS |144Hz Refresh Rate| 8GB, 256GB| 2.8K+ Display (11-inch/27.81cm) Tablet| Dolby Vision Atmos| Quad Speakers| Wi-Fi| Gray', 'Price': '23,999.', 'Rating': '4.5 out of 5 stars'}


In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_product_details(soup):
    # Extract product name, price, description, and rating
    product_name = soup.find("span", {"id": "productTitle"})
    product_name = product_name.get_text(strip=True) if product_name else "No product name available"
    
    price = soup.find("span", {"class": "a-price-whole"})
    price = price.get_text(strip=True) if price else "Not Available"
    
    description = soup.find("div", {"id": "feature-bullets"})
    description = description.get_text(strip=True) if description else "No description available"
    
    rating = soup.find("span", {"class": "a-icon-alt"})
    rating = rating.get_text(strip=True) if rating else "No rating"

    return {
        "Product Name": product_name,
        "Price": price,
        "Description": description,
        "Rating": rating,
    }

def extract_reviews(soup):
    reviews = []
    
    # Find all review items
    review_elements = soup.find_all("li", {"data-hook": "review"})
    for review in review_elements:
        reviewer_name = review.find("span", class_="a-profile-name")
        reviewer_name = reviewer_name.get_text(strip=True) if reviewer_name else "Anonymous"
        
        review_title = review.find("a", {"data-hook": "review-title"})
        review_title = review_title.get_text(strip=True) if review_title else "No title"
        
        review_date = review.find("span", {"data-hook": "review-date"})
        review_date = review_date.get_text(strip=True) if review_date else "No date"
        
        review_rating = review.find("i", {"data-hook": "review-star-rating"})
        review_rating = review_rating.get_text(strip=True) if review_rating else "No rating"
        
        review_text = review.find("span", {"data-hook": "review-body"})
        review_text = review_text.get_text(strip=True) if review_text else "No review text"
        
        helpful_votes = review.find("span", {"data-hook": "helpful-vote-statement"})
        helpful_votes = helpful_votes.get_text(strip=True) if helpful_votes else "0 helpful votes"
        
        reviews.append({
            "Reviewer Name": reviewer_name,
            "Review Title": review_title,
            "Review Date": review_date,
            "Rating": review_rating,
            "Review Text": review_text,
            "Helpful Votes": helpful_votes,
        })
    
    return reviews

def scrape_amazon_product(url):
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print("Failed to retrieve the webpage")
        return None, None

    soup = BeautifulSoup(response.content, "html.parser")

    # Extract product details
    product_details = extract_product_details(soup)
    
    # Extract reviews
    reviews = extract_reviews(soup)
    
    return product_details, reviews

def save_to_csv(product_details, reviews):
    if product_details:
        # Convert product details into DataFrame and save to CSV
        product_df = pd.DataFrame([product_details])
        product_filename = "product_details.csv"
        product_df.to_csv(product_filename, index=False)
        print(f"Product details saved to {product_filename}")
    
    if reviews:
        # Convert reviews into DataFrame and save to CSV
        reviews_df = pd.DataFrame(reviews)
        reviews_filename = "product_reviews.csv"
        reviews_df.to_csv(reviews_filename, index=False)
        print(f"Reviews saved to {reviews_filename}")

if __name__ == "__main__":
    # Example Amazon Product URL
    product_url = "https://amzn.in/d/2fYeMTZ"

    # Scrape product details and reviews
    product_details, reviews = scrape_amazon_product(product_url)

    if product_details and reviews:
        # Save to CSV files
        save_to_csv(product_details, reviews)
    else:
        print("No data to save.")


Product details saved to product_details.csv
Reviews saved to product_reviews.csv


In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def extract_product_details(soup):
    # Extract product name, price, description, image URL, and rating
    product_name = soup.find("span", {"id": "productTitle"})
    product_name = product_name.get_text(strip=True) if product_name else "No product name available"
    
    price = soup.find("span", {"class": "a-price-whole"})
    price = price.get_text(strip=True) if price else "Not Available"
    
    img = soup.find("img", {"id": "landingImage"})
    img_url = img['src'] if img else "No image found"
    
    description = soup.find("div", {"id": "feature-bullets"})
    description = description.get_text(strip=True) if description else "No description available"
    
    rating = soup.find("span", {"class": "a-icon-alt"})
    rating = rating.get_text(strip=True) if rating else "No rating"

    return {
        "Product Name": product_name,
        "Price": price,
        "Image URL": img_url,
        "Description": description,
        "Overall Rating": rating,
    }

def extract_reviews(soup):
    # Extract all review texts
    reviews = []
    review_elements = soup.find_all("li", {"data-hook": "review"})
    for review in review_elements:
        review_text = review.find("span", {"data-hook": "review-body"})
        if review_text:
            reviews.append(review_text.get_text(strip=True))  # Append only valid review texts
    return reviews

def scrape_amazon_product(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f"Failed to retrieve the webpage. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")

    # Extract product details
    product_details = extract_product_details(soup)
    
    # Extract review texts
    reviews = extract_reviews(soup)
    
    # Combine product details and reviews into one dictionary
    for i, review_text in enumerate(reviews, start=1):
        product_details[f"Review Text {i}"] = review_text
    
    return product_details

def save_to_csv(product_data):
    if product_data:
        # Convert product data into DataFrame and save to CSV
        product_df = pd.DataFrame([product_data])
        filename = "product_data.csv"
        product_df.to_csv(filename, index=False)
        print(f"Data saved to {filename}")

if __name__ == "__main__":
    # Example Amazon Product URL
    product_url = "https://amzn.in/d/g5fNUYy"

    # Scrape product details and reviews
    product_data = scrape_amazon_product(product_url)

    if product_data:
        # Save to a single CSV file
        save_to_csv(product_data)
    else:
        print("No data to save.")


Data saved to product_data.csv
