<a href="https://colab.research.google.com/github/maheshalalapati2/50-Days-of-Data-Analysis-with-Python/blob/main/web_scrape_top_digestive_brands.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np

# Function to extract Product Title
def get_title(soup):
    """Return the product title found in a parsed product page.

    Looks up the ``<span id="productTitle">`` element on ``soup`` and returns
    its text with leading/trailing whitespace removed.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped title text, or ``""`` when the lookup hits a missing
        attribute (for example when ``find`` returns ``None``).
    """
    try:
        title_tag = soup.find("span", attrs={"id": 'productTitle'})
        return title_tag.text.strip()
    except AttributeError:
        # No title element on this page: report an empty title.
        return ""

# Function to extract Product Price per Count
def get_price(soup):
    """Return the price-per-count text found in a parsed product page.

    Looks up the ``<span class="a-size-mini aok-offscreen">`` element on
    ``soup`` and returns its string with surrounding whitespace removed.

    The previous implementation retried the exact same lookup in a nested
    ``try`` guarded by a bare ``except``; the retry could never succeed where
    the first attempt had failed, so a single lookup is performed here.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped price text, or ``""`` when the element is missing or
        has no usable ``.string``.
    """
    try:
        price_tag = soup.find("span", attrs={'class': 'a-size-mini aok-offscreen'})
        return price_tag.string.strip()
    except AttributeError:
        # Either find() returned None or the tag's .string was None.
        return ""

# Function to extract Product Rating
def get_rating(soup):
    """Return the star-rating text found in a parsed product page.

    Two lookups are tried in order and the first one that yields text wins:

    1. ``<i class="a-icon a-icon-star a-star-4-5">``
    2. ``<span class="a-icon-alt">``

    Only ``AttributeError`` (missing tag or missing ``.string``) moves on to
    the next lookup. The earlier bare ``except`` also swallowed unrelated
    errors such as ``KeyboardInterrupt``; those now propagate, matching the
    other extractor functions in this file.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped rating text, or ``""`` when neither lookup succeeds.
    """
    selectors = (
        ("i", {'class': 'a-icon a-icon-star a-star-4-5'}),
        ("span", {'class': 'a-icon-alt'}),
    )
    for tag_name, attrs in selectors:
        try:
            return soup.find(tag_name, attrs=attrs).string.strip()
        except AttributeError:
            # This selector did not match (or had no string); try the next.
            continue
    return ""

# Function to extract Number of User Reviews
def get_review_count(soup):
    """Return the customer-review count text from a parsed product page.

    Looks up the ``<span id="acrCustomerReviewText">`` element on ``soup``
    and returns its string with surrounding whitespace removed.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped review-count text, or ``""`` when the lookup hits a
        missing attribute (for example when ``find`` returns ``None``).
    """
    try:
        review_tag = soup.find("span", attrs={'id': 'acrCustomerReviewText'})
        review_text = review_tag.string
        return review_text.strip()
    except AttributeError:
        # No review element (or no string inside it): report nothing.
        return ""

# Function to extract Availability Status
def get_availability(soup):
    """Return the availability status text from a parsed product page.

    Finds the ``<div id="availability">`` container on ``soup``, then the
    first ``<span>`` inside it, and returns that span's string with
    surrounding whitespace removed.

    Args:
        soup: Parsed page object exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped availability text, or ``"Not Available"`` when any step
        of the lookup hits a missing attribute.
    """
    try:
        container = soup.find("div", attrs={'id': 'availability'})
        status_span = container.find("span")
        return status_span.string.strip()
    except AttributeError:
        # Container, span, or span text is missing.
        return "Not Available"

if __name__ == '__main__':
    # Script section: fetch an Amazon search results page, visit the first
    # few product pages, extract their details with the helper functions
    # above, and save the result to "amazon_data.csv".

    # Local import keeps this section self-contained; urljoin copes with
    # hrefs that are root-relative ("/dp/...") or already absolute.
    from urllib.parse import urljoin

    # Adding user agent to avoid request blocking
    HEADERS = ({'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36', 'Accept-Language':'en-US,en;q=0.5'})

    # Site root used to resolve product links found on the results page
    BASE_URL = "https://www.amazon.com/"
    # Number of product links (from the top of the results) to process
    MAX_PRODUCTS = 5
    # Seconds to wait on each HTTP request; without a timeout a stalled
    # connection would block the script indefinitely
    REQUEST_TIMEOUT = 30

    # URL of the Amazon search results page
    URL = "https://www.amazon.com/s?k=bloating+and+gas&rh=n%3A3760901%2Cp_n_feature_forty-one_browse-bin%3A119653281011&dc&ds=v1%3ARPPhU%2BQ0FhUc0VxoosKFu%2By7AvjUUZhq2hf1tlkAFSo&crid=3BXVFYIYNJOZ8&qid=1723470783&rnid=119653280011&sprefix=bloading+and+gas%2Caps%2C108&ref=sr_nr_p_n_feature_forty-one_browse-bin_1"

    # Send an HTTP GET request to the URL
    webpage = requests.get(URL, headers=HEADERS, timeout=REQUEST_TIMEOUT)

    # Parsing the webpage content using BeautifulSoup
    soup = BeautifulSoup(webpage.content, "html.parser")

    # Finding all product links on the search results page
    links = soup.find_all("a", attrs={'class':'a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal'})

    # Keep the hrefs of the first MAX_PRODUCTS links, skipping anchors
    # that carry no href (they cannot be turned into a URL)
    links_list = [
        link.get('href') for link in links[:MAX_PRODUCTS] if link.get('href')
    ]

    # Dictionary to store product details
    d = {"Title":[], "Price":[], "Rating":[], "Reviews":[],"Availability":[]}

    # Loop through each product link to extract details
    for link in links_list:
        new_webpage = requests.get(
            urljoin(BASE_URL, link), headers=HEADERS, timeout=REQUEST_TIMEOUT
        )
        new_soup = BeautifulSoup(new_webpage.content, "html.parser")

        # Append extracted details to the dictionary
        d['Title'].append(get_title(new_soup))
        d['Price'].append(get_price(new_soup))
        d['Rating'].append(get_rating(new_soup))
        d['Reviews'].append(get_review_count(new_soup))
        d['Availability'].append(get_availability(new_soup))

    # Create a DataFrame from the dictionary
    amazon_df = pd.DataFrame.from_dict(d)
    # Replace empty titles with NaN and drop rows with NaN titles.
    # The column is reassigned instead of using a chained inplace call,
    # which is deprecated and has no effect under pandas Copy-on-Write.
    amazon_df['Title'] = amazon_df['Title'].replace('', np.nan)
    amazon_df = amazon_df.dropna(subset=['Title'])
    # Save the DataFrame to a CSV file
    amazon_df.to_csv("amazon_data.csv", header=True, index=False)


In [None]:
amazon_df

Unnamed: 0,Title,Price,Rating,Reviews,Availability
