**Importing required modules**

In [136]:
import re                               #for regex operations
from datetime import datetime as dt     #for date/time operations
from urllib.parse import quote_plus     #for url-encoding search keywords

import numpy as np                      #for wrangling data
import pandas as pd                     #for wrangling data
import requests                         #for making http requests to target server
from bs4 import BeautifulSoup as bs     #for parsing http response

In [137]:
def generate_search_keywords():
    """
    Collects search keywords interactively.

    The prompt is repeated until the user submits an empty line; every
    non-empty answer is kept, in the order it was typed.

    Returns:
        list: [*keywords]
    """

    prompt = "Please enter search keyword (Press enter to submit)"
    #two-argument iter() keeps calling input() until it returns the sentinel ""
    return list(iter(lambda: input(prompt), ""))

In [138]:
def generate_search_results(keywords: list):
    """
    Function generates url path(s) to results of search keyword(s)

    Each keyword is url-encoded before it is placed in the `k` query
    parameter, so spaces and reserved characters such as '&', '#' or '+'
    remain part of the keyword instead of splitting or truncating the query.

    Args:
        keywords (list): contains search keyword(s)

    Returns:
        dict: ({keyword: url}), keyed by the keyword exactly as supplied
    """
    
    results = {keyword: f"https://www.amazon.com/s?k={quote_plus(keyword)}&ref=nb_sb_noss" for keyword in keywords}
    return results
#prompts for keywords, builds one search url per keyword, then displays the {keyword: url} mapping
search = generate_search_results(generate_search_keywords())
search

{'pants': 'https://www.amazon.com/s?k=pants&ref=nb_sb_noss',
 'sunglasses': 'https://www.amazon.com/s?k=sunglasses&ref=nb_sb_noss'}

In [None]:
#headers passed to every requests.get call: a desktop Chrome user agent and an English language preference
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/130.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

In [146]:
def generate_links(results: dict, timeout: float = 30):
    """
    Generates one url per results page, for each keyword.

    The first results page of every keyword is fetched and its pagination
    strip is read to find the number of pages (N). The strip's page
    hyperlink then serves as a template in which only the page number
    (the `page=` argument and the `sr_pg_` suffix) is rewritten, so other
    digits in the url, such as the `qid` value or digits in the keyword,
    are left untouched.

    Args:
        results (dict): {keyword: url of the keyword's search result}
        timeout (float): seconds to wait for the server before giving up

    Returns:
        dict: {keyword_n: url}, n ranging 1 <= n <= N. A keyword whose
        page has no usable pagination strip gets the single entry
        {keyword_1: original search url}.
    """
    
    links = {}
    for keyword, search_url in results.items():
        response = requests.get(search_url, headers=headers, timeout=timeout)  #makes http request
        soup = bs(response.content, 'html.parser')                             #parses http response
        
        strip = soup.find_all('span', class_='s-pagination-item')                  #locates pagination element
        page_numbers = [int(item.text) for item in strip if item.text.isdigit()]   #numeric entries of the strip
        anchor = soup.find('a', class_='s-pagination-item')                        #first page hyperlink in the strip
        template = anchor.get('href') if anchor is not None else None
        
        #no numbered strip or no page hyperlink (e.g. single-page or unexpected response):
        #fall back to the search url itself instead of crashing on max([]) / None
        if not page_numbers or not template or not re.search(r"[?&]page=\d+", template):
            print(search_url, 'pagination not found, keeping first page only')
            links["{0}_{1}".format(keyword, 1)] = search_url
            continue
        
        #rewrites only the page number arguments of the template, for 1 <= i <= number of pages
        for i in range(1, max(page_numbers) + 1):
            path = re.sub(r"([?&]page=)\d+", r"\g<1>{}".format(i), template)
            path = re.sub(r"(sr_pg_)\d+", r"\g<1>{}".format(i), path)
            links["{0}_{1}".format(keyword, i)] = "https://www.amazon.com{}".format(path)
        
    return links
#expands every keyword's search url into one url per results page, then displays the mapping
g_links = generate_links(search)
g_links


{'pants_1': 'https://www.amazon.com/s?k=pants&page=1&qid=1730900756&ref=sr_pg_1',
 'pants_2': 'https://www.amazon.com/s?k=pants&page=2&qid=1730900756&ref=sr_pg_2',
 'pants_3': 'https://www.amazon.com/s?k=pants&page=3&qid=1730900756&ref=sr_pg_3',
 'pants_4': 'https://www.amazon.com/s?k=pants&page=4&qid=1730900756&ref=sr_pg_4',
 'pants_5': 'https://www.amazon.com/s?k=pants&page=5&qid=1730900756&ref=sr_pg_5',
 'pants_6': 'https://www.amazon.com/s?k=pants&page=6&qid=1730900756&ref=sr_pg_6',
 'pants_7': 'https://www.amazon.com/s?k=pants&page=7&qid=1730900756&ref=sr_pg_7',
 'sunglasses_1': 'https://www.amazon.com/s?k=sunglasses&page=1&qid=1730900761&ref=sr_pg_1',
 'sunglasses_2': 'https://www.amazon.com/s?k=sunglasses&page=2&qid=1730900761&ref=sr_pg_2',
 'sunglasses_3': 'https://www.amazon.com/s?k=sunglasses&page=3&qid=1730900761&ref=sr_pg_3',
 'sunglasses_4': 'https://www.amazon.com/s?k=sunglasses&page=4&qid=1730900761&ref=sr_pg_4',
 'sunglasses_5': 'https://www.amazon.com/s?k=sunglasses&pa

In [None]:
def scrape_links(links: dict, timeout: float = 30):
    """
    Webscrapes each of the supplied urls and parses relevant product information.

    A url whose request fails is reported and skipped, so products scraped
    from the remaining urls are still returned. A field that cannot be
    located on a product card is stored as an empty string.

    Args:
        links (dict): {keyword_n: url} of the result pages to be scraped.
        timeout (float): seconds to wait for each response before giving up.

    Returns:
        pd.DataFrame: collection of products and relevant information,
        one row per product card (empty when nothing could be scraped).
    """
    
    cols = ['keyword', 'title', 'sale_price($)', 'anchor_price($)', 
            'ratings(5)', 'reviews', 'orders', 'scrape_datetime']  #product information
    rows = []                                                       #rows are collected here and framed once at the end
    
    for link, url in links.items():
        try:
            response = requests.get(url, headers=headers, timeout=timeout)  #making http request for url resource
        except requests.RequestException as error:
            print(url, 'get request failed', error)
            continue                                                        #other pages are still scraped
        
        soup = bs(response.content, "html.parser")                          #parsing http response
        keyword = link.rsplit('_', 1)[0]                                    #drops only the trailing page number
        
        for product in soup.find_all('div', class_='puis-card-container'):
            #extracting product title
            #NOTE(review): the first match can be a badge rather than the product name - confirm selector against live markup
            title = _find_text(product, 'span', "a-color-base")
            
            #extracting product rating, e.g. "4.5 out of 5 stars" -> "4.5"
            ratings = _find_text(product, 'span', "a-icon-alt").split(" out")[0]
            
            #extracting product review count, without thousand separators
            rating_cnt = _find_text(product, 'span', "a-size-base s-underline-text").replace(",", "")
            
            #extracting product order count, e.g. "4K+ bought in past month" -> "4000+"
            purchase_cnt = _find_text(product, 'span', "a-size-base a-color-secondary")
            purchase_cnt = purchase_cnt.split(' bought')[0].replace('K', '000')
            
            #extracting product sale price; both parts must be present
            price_whole = product.find('span', class_='a-price-whole')
            price_fraction = product.find('span', class_='a-price-fraction')
            if price_whole is not None and price_fraction is not None:
                price = price_whole.text + price_fraction.text
            else:
                price = ''
            
            #extracting product anchor price: the first 'a-offscreen' span on a card is the
            #sale price, so the struck-through price is looked up first
            #NOTE(review): assumes the list price sits in a span marked data-a-strike="true" - confirm against live markup
            struck = product.find('span', attrs={'data-a-strike': 'true'})
            old_price = _find_text(struck, 'span', 'a-offscreen') if struck is not None else ''
            if not old_price:
                old_price = _find_text(product, 'span', 'a-offscreen')      #no struck price found: first price on the card
            old_price = old_price.split('$')[-1]
            
            rows.append([keyword, title, price, 
                         old_price, ratings, rating_cnt, 
                         purchase_cnt, dt.now().strftime("%Y-%m-%d %H:%M:%S")])  #adding product info
    
    return pd.DataFrame(rows, columns=cols)


def _find_text(node, tag, css_class):
    """Returns the text of the first `tag` with class `css_class` inside `node`, or '' when there is none."""
    match = node.find(tag, class_=css_class)
    return match.text if match is not None else ''

In [148]:
#scrapes every results page listed in g_links into one dataframe
data = scrape_links(g_links)

In [149]:
#previewing the first rows of the scraped data
data.head()

Unnamed: 0,keyword,title,sale_price($),anchor_price($),ratings(5),reviews,orders
0,pants,Amazon's Choice: Overall Pick,25.6,25.6,4.5,19156,600+
1,pants,PURE CHAMP Mens 3 Pack Fleece Active Athletic ...,39.99,39.99,4.2,13649,4000+
2,pants,Amazon Essentials Men's Classic-Fit Wrinkle-Re...,3.75,3.75,4.2,55310,2000+
3,pants,G Gradual Men's Sweatpants with Zipper Pockets...,29.99,29.99,4.5,8074,3000+
4,pants,AUTOMET Womens Cargo Sweatpants Oversized Flee...,32.99,32.99,4.4,478,800+


In [150]:
data_c = data.copy()    #working copy for the cleaning steps below; `data` is left as scraped
data_c.head()

Unnamed: 0,keyword,title,sale_price($),anchor_price($),ratings(5),reviews,orders
0,pants,Amazon's Choice: Overall Pick,25.6,25.6,4.5,19156,600+
1,pants,PURE CHAMP Mens 3 Pack Fleece Active Athletic ...,39.99,39.99,4.2,13649,4000+
2,pants,Amazon Essentials Men's Classic-Fit Wrinkle-Re...,3.75,3.75,4.2,55310,2000+
3,pants,G Gradual Men's Sweatpants with Zipper Pockets...,29.99,29.99,4.5,8074,3000+
4,pants,AUTOMET Womens Cargo Sweatpants Oversized Flee...,32.99,32.99,4.4,478,800+


In [151]:
#text attributes and the scrape timestamp are excluded: none of them can be cast to float
num_cols = [col for col in data_c.columns if col not in ['keyword', 'title', 'orders', 'scrape_datetime']] #highlighting numerical attributes
num_cols

['sale_price($)', 'anchor_price($)', 'ratings(5)', 'reviews']

In [152]:
for col in num_cols:                                #parsing numerical attributes
    data_c[col] = pd.to_numeric(
        data_c[col]
        .astype(str)                                # Convert all values to strings
        .str.replace(',', '', regex=False),         # Remove thousand separators
        errors='coerce',                            # '', 'nan' and any other unparseable text become NaN
    ).astype(float)                                 # Keep float64 even when a column holds whole numbers only
data_c.dtypes


keyword             object
title               object
sale_price($)      float64
anchor_price($)    float64
ratings(5)         float64
reviews            float64
orders              object
dtype: object

In [153]:
#standardizing orders attribute: counts such as '600+' are kept, anything else becomes NaN
#the comparison runs on a string view, so values that are already NaN do not raise and the cell can be re-run
data_c['orders'] = data_c['orders'].where(data_c['orders'].astype(str).str.endswith('+'))
data_c['orders']

0       600+
1      4000+
2      2000+
3      3000+
4       800+
       ...  
612      50+
613      NaN
614      NaN
615     100+
616     100+
Name: orders, Length: 617, dtype: object

In [154]:
#confirming column types after the cleaning steps
data_c.dtypes

keyword             object
title               object
sale_price($)      float64
anchor_price($)    float64
ratings(5)         float64
reviews            float64
orders              object
dtype: object

In [155]:
#writes the cleaned dataframe to the local file amazon_search.csv
#NOTE(review): index=None presumably acts like index=False (no index column written) - confirm
data_c.to_csv('amazon_search.csv', index=None)

In [161]:
#to_csv returns None when it is given a file path, so its return value cannot signal success;
#a failed write raises instead, which means reaching print() implies the file was written
data_c.to_csv('amazon_search.csv', index=None)
print('successful')

In [163]:
#scratch check: substring test between two string literals; it does not inspect the written file
'.csv' in 'amazon_search.csv'

True