# Amazon Product Scraping

In [3]:
import requests
from bs4 import BeautifulSoup
import os
import json
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.common.exceptions import NoSuchElementException
import math
import time
import pandas as pd

In [4]:
df = pd.read_excel("../productpages.xlsx")

In [5]:
df

Unnamed: 0,Brand,Line,URL
0,Not Your Mother's,Naturals,https://www.amazon.com/stores/page/AD6500C9-1C...
1,Carol's Daughter,Curly Hair Products,https://www.amazon.com/stores/page/A5D6BF5C-26...
2,Carol's Daughter,Shampoos and Conditioners,https://www.amazon.com/stores/page/FB3658B5-14...
3,Carol's Daughter,Leave-Ins,https://www.amazon.com/stores/page/0E98909E-6E...
4,Carol's Daughter,Masks and Treatments,https://www.amazon.com/stores/page/DE3D32F1-89...
5,Carol's Daughter,Styling,https://www.amazon.com/stores/page/FAFBE68C-47...
6,SheaMoisture,Jamaican Black Castor Oil,https://www.amazon.com/stores/page/A03541FA-F1...
7,SheaMoisture,Coconut and Hibiscus,https://www.amazon.com/stores/page/6013C3E4-60...
8,SheaMoisture,100% Virgin Coconut Oil,https://www.amazon.com/stores/page/7D202725-D6...
9,SheaMoisture,Manuka Honey and Yogurt,https://www.amazon.com/stores/page/FDA558AB-CB...


In [6]:
driver = webdriver.Firefox(executable_path=r'/Users/morganoneka/Documents/PersonalProjects/geckodriver')

In [14]:
def get_item_list(brand_page):
    # pull up brand page
    driver.get(brand_page)
    time.sleep(5)
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    
    item_listing = []
    
    # note that some pages have combinations of the different item listings so that's why this is 3 separate if statements not if/else
    # parse page like this: https://www.amazon.com/stores/page/AD6500C9-1CC9-47A6-AF55-30CAC721D436?ingress=2&visitId=6a918af0-3d48-4b0c-8963-93feb0ebbb20&ref_=ast_bln
    if len(soup.find_all("div", class_="columns")) > 0:
        items = soup.find_all("div", class_="columns")[0].find_all("li")
        for item in items:
            item_listing.append(get_item_url_from_listing(item))
    
    # parse page like this: https://www.amazon.com/s?k=As+I+Am&rh=n%3A3760911%2Cp_89%3AAs+I+Am&dc&qid=1650565270&rnid=2528832011&ref=sr_nr_p_89_1
    if len(soup.find_all("h2")) > 0:
        items = soup.find_all("h2")
        for item in items:
            item_listing.append(get_item_url_from_listing(item))
    
    # parse page like this: https://www.amazon.com/stores/page/1DB8A150-0F33-43F4-B681-6911B16BF1BD?ingress=2&visitId=3dec1212-38b5-45f5-b1df-26f94b3a7df5&ref_=ast_bln
    if len(soup.find_all("a", class_="ProductShowcase__title__3eXnB")) > 0:
        items = soup.find_all("a", class_="ProductShowcase__title__3eXnB")
        for item in items:
            item_listing.append(get_item_url_from_listing(item))
    
    
#     return(["https://www.amazon.com" + item.find_all("a")[0].get("href") for item in items])
    return(item_listing)

def get_item_url_from_listing(listing):
    try:
        if listing.name == 'a':
            return ("https://www.amazon.com" + listing.get("href") )
        else:
            return ("https://www.amazon.com" + listing.find_all("a")[0].get("href") )
    except:
        return ""

def get_review_url(product_url):
    sp = product_url.split("/")
    return('/'.join(sp[:4]) + "/product-reviews/" + sp[-1].split("?")[0] + "/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews")

def get_url_img(url):
     # navigate to product home page
    driver.get(url)

    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # get product title
    product = soup.find(id="productTitle").text.strip()
    image = soup.find("img", id="landingImage").get("src")
    
    return [product, url, image]
    

# url = page for the individual project
def get_product_reviews(url):
    # navigate to product home page
    driver.get(url)

    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # get product title
    product = soup.find(id="productTitle").text.strip()

    # determine url for reviews and navigate
    review_url = get_review_url(item_page)
    driver.get(review_url)
    time.sleep(5)

    # get data from review page
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # identify how many reviews with text there are
    review_info = soup.find_all("div", class_="a-row a-spacing-base a-size-base")[0].text.split()
    num_reviews = int([x for x in review_info if x.strip().isnumeric()][-1])

    # iterate over all reviews
    revs = []
    for pg in range(math.ceil(num_reviews/10)):
        # get page - either base review url, or with page number added
        if pg > 0:
            driver.get(review_url + "&pageNumber=" + str(pg+1))
            # wait for page to load
            time.sleep(5)

            # get html
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")

        # get list of reviews
        page_reviews = soup.find("div", id="cm_cr-review_list").find_all("div", class_="review")

        # get info from reviews
        for single_review in page_reviews:
            rating = single_review.find_all("a", class_="a-link-normal")[0].text
            title = single_review.find_all("a", class_="a-link-normal")[1].text.strip()
            review = single_review.find("span", class_="review-text-content").text.strip()
            revs.append([product,rating,title,review])


    return(pd.DataFrame(revs, columns=['Product', 'Rating', 'Title', 'Review']))

In [15]:
for index, row in df.iterrows():
    # get page for brand's line
    brand_page = row.URL
    
    brand = row.Brand
    line = row.Line
    
    # get links and images
    if os.path.exists("../output/amazon_links/" + brand + "_" + line + ".csv"):
        continue
        
    print (brand + "\t" + line)
    
    # get list of urls for each item
    items = [x for x in get_item_list(brand_page) if x != '']
    
    if len(items) == 0:
        continue
    
    item_info = []
    for item_page in items:
        try:
            item_info.append(get_url_img(item_page))
        except:
            print("issue")
            
    item_df = pd.DataFrame(item_info)
    item_df.columns = ['Product', 'URL', 'Image']
    item_df.to_csv("../output/amazon_links/" + brand + "_" + line + ".csv")
    
    if os.path.exists("../output/amazon/" + brand + "_" + line + ".csv"):
        continue
    
    review_list = []
    
    
    for item_page in items:
        print(item_page)
        
        try:
            item_reviews = get_product_reviews(item_page)
            review_list.append(item_reviews)
        except:
            print("issue")
        
    pd.concat(review_list).to_csv("../output/amazon/" + brand + "_" + line + ".csv")
        
        

Carol's Daughter	Curly Hair Products
SheaMoisture	Manuka Honey and Mafura Oil
Curls	Blueberry Bliss
Moptop	Shampoo
Moptop	Conditioner
Moptop	Gel
Cake Beauty	Hair
Giovanni	Curly Girl Products
TRESemme	Curly & Wavy Hair
Pacifica	Hair
Uncle Funky's Daughter	Cleansers
Uncle Funky's Daughter	Conditioners
https://www.amazon.com/Uncle-Funkys-Daughter-Moisturizing-Conditioner/dp/B00P8B25HW?ref_=ast_sto_dp
https://www.amazon.com/Uncle-Funkys-Daughter-Heal-Renew/dp/B07CMJ2WXF?ref_=ast_sto_dp
https://www.amazon.com/Uncle-Funkys-Daughter-Midnite-Conditioner/dp/B00P8BFN8K?ref_=ast_sto_dp&th=1&psc=1
https://www.amazon.com/Maximum-Thermal-Protection-Deep-Conditioner/dp/B096YJ5CSB?ref_=ast_sto_dp
https://www.amazon.com/Uncle-Funkys-Daughter-Richee-Rich/dp/B08V4XFBV7?ref_=ast_sto_dp
Uncle Funky's Daughter	Moisturizers
https://www.amazon.com/Extra-Butter-Curl-Forming-Creme/dp/B00P7AEXSI?ref_=ast_sto_dp&th=1&psc=1
https://www.amazon.com/Uncle-Funkys-Daughter-Supercurl-Moisture/dp/B00P7C007E?ref_=ast_sto_

ValueError: No objects to concatenate

In [None]:
row = df.iloc[1]

In [None]:
brand_page = row.URL
brand = row.Brand
line = row.Line

In [None]:
get_item_list(brand_page)

In [None]:
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")

In [None]:
items = soup.find_all("div", class_="columns")

In [None]:
len(items)

In [None]:
items = soup.find_all("div", class_="columns")[0].find_all("li")

In [None]:
len(items)