# Scraping Webpage Data - Amazon

In [1]:
# Import Libraries
from bs4 import BeautifulSoup
import requests
import time
import datetime

import pandas as pd
import numpy as np

In [2]:
# To change cell output font color
class bcolors:
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

In [3]:
# User Agent - use: whatismyuseragent.com
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.5.1 Safari/605.1.15"

In [4]:
# Headers for the request
HEADERS = ({'User-Agent': USER_AGENT, 'Accept-Language': "en-US, en;q=0.5"})

In [5]:
# Url of the page to scrap - kettlebell items
#URL = "https://www.amazon.com/s?k=kettlebell&crid=2PEDPQD374AZF&sprefix=kettlebell%2Caps%2C104&ref=nb_sb_noss_1"

In [6]:
# Dictionary to hold the data scraped
d = {'title': [], 'price': [], 'rating': []}

In [7]:
# Function to extract Product Title
def get_title(product):

    try:
        # Outer Tag Object
        title = product.find("span", class_="a-text-normal")

        # Inner NavigableString Object
        title_value = title.text

        # Title as a string value
        title_string = title_value.strip()

    except AttributeError:
        title_string = ""

    return title_string


In [8]:
# Function to extract Product Price
def get_price(product):

    try:
        price = product.find("span", class_="a-offscreen").string.strip()[1:]

    except AttributeError:

        try:
            # If there is some deal price
            price = product.find('span', attrs = {'id', 'priceblock_dealprice'}).string.strip()[1:]

        except:
            price = ""

    return price

In [9]:
# Function to extract Product Rating
def get_rating(soup):

    try:
        rating = soup.find('i', attrs={'class': 'a-icon a-icon-star a-star-4-5'}).string.strip()

    except AttributeError:
        try: 
            rating = soup.find('span', attrs={'class': 'a-icon-alt'}).string.strip()
        except:
            rating = ''

    return rating

In [10]:
def scraper(keyword):
    # Url of the page to scrap - kettlebell items
    URL = "https://www.amazon.com/s?k=" + keyword

    page = requests.get(URL, headers=HEADERS)
    status_code = page.status_code
    print('Status Code', page.status_code)

    if status_code != 200:
        print('Status Code', status_code)
        return
    else:
        soup = BeautifulSoup(page.content, "html.parser")
        
        # Find all the product containers on the page
        product_containers = soup.find_all("div", class_="s-result-item")
        print('total items found =', len(product_containers))

        for product in product_containers:
            #title = product.find("span", class_="a-text-normal")
            #price = product.find("span", class_="a-offscreen")

            #link = product.find("a", class_="a-link-normal")
            #rating = product.find('span', attrs={'class': 'a-icon-alt'})

            d['title'].append(get_title(product))
            d['price'].append(get_price(product))
            d['rating'].append(get_rating(product))
        

In [11]:
# 
keyword = 'magna+safe+iphone+powerbank'

**A search keyword or a phrase is required**

In [12]:
# PROMPTING THE USER TO ENTER A `word` or a `phrase` TO SEARCH
user_input = input("Please enter a product name or a phrase: ")

# Displaying the input
if len(user_input) < 1:
    print(f"{bcolors.FAIL}That was not a valid entry, check and try again!!!{bcolors.ENDC}")
else:
    print("You entered:", "`"+user_input+"`")
    num = len(user_input.split(' '))

    if( num > 1):
        keyword = user_input.split()
        keyword = '+'.join(keyword)
        print('keyword', f"{bcolors.OKGREEN} {keyword} {bcolors.ENDC}")

Please enter a product name or a phrase:  kettlebell


You entered: `kettlebell`


In [13]:
scraper(keyword)

Status Code 200
total items found = 39


In [14]:
# Create DataFrame from dictionary
amazon_df = pd.DataFrame.from_dict(d)

In [15]:
# Get the dimensions of the DataFrame
num_rows, num_columns = amazon_df.shape

print("Number of Rows:", num_rows)
print("Number of Columns:", num_columns)

Number of Rows: 39
Number of Columns: 3


In [16]:
amazon_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39 entries, 0 to 38
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   39 non-null     object
 1   price   39 non-null     object
 2   rating  39 non-null     object
dtypes: object(3)
memory usage: 1.0+ KB


In [17]:
# Replace the title on those entries whose title is empty with a `nan` value, then drop them.
amazon_df['title'].replace('', np.nan, inplace=True)
amazon_df = amazon_df.dropna(subset=['title'])

In [18]:
amazon_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30 entries, 3 to 34
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   30 non-null     object
 1   price   30 non-null     object
 2   rating  30 non-null     object
dtypes: object(3)
memory usage: 960.0+ bytes


In [19]:
# Save `df` as a `csv` file
amazon_df.to_csv('_'.join(user_input.split(' '))+'_amazon_scraped_data.csv', header=True, index=False)