# Amazon Product Scraping

In [70]:
import requests
from bs4 import BeautifulSoup
import os
import json
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.common.exceptions import NoSuchElementException
import math
import time
import pandas as pd

In [76]:
df = pd.read_excel("../productpages.xlsx")

In [77]:
df

Unnamed: 0,Brand,Line,URL
0,Not Your Mother's,Naturals,https://www.amazon.com/stores/page/AD6500C9-1C...
1,Carol's Daughter,Curly Hair Products,https://www.amazon.com/stores/page/A5D6BF5C-26...
2,Carol's Daughter,Shampoos and Conditioners,https://www.amazon.com/stores/page/FB3658B5-14...
3,Carol's Daughter,Leave-Ins,https://www.amazon.com/stores/page/0E98909E-6E...
4,Carol's Daughter,Masks and Treatments,https://www.amazon.com/stores/page/DE3D32F1-89...
5,Carol's Daughter,Styling,https://www.amazon.com/stores/page/FAFBE68C-47...
6,SheaMoisture,Jamaican Black Castor Oil,https://www.amazon.com/stores/page/A03541FA-F1...
7,SheaMoisture,Coconut and Hibiscus,https://www.amazon.com/stores/page/6013C3E4-60...
8,SheaMoisture,100% Virgin Coconut Oil,https://www.amazon.com/stores/page/7D202725-D6...
9,SheaMoisture,Manuka Honey and Yogurt,https://www.amazon.com/stores/page/FDA558AB-CB...


In [73]:
driver = webdriver.Firefox(executable_path=r'/Users/morganoneka/Documents/PersonalProjects/geckodriver')

In [74]:
def get_item_list(brand_page):
    # pull up brand page
    driver.get(brand_page)
    time.sleep(5)
    
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")
    
    # get list of all items
    items = soup.find_all("div", class_="columns")[0].find_all("li")
    
#     return(["https://www.amazon.com" + item.find_all("a")[0].get("href") for item in items])
    return([get_item_url_from_listing(item) for item in items])

def get_item_url_from_listing(listing):
    try:
        return ("https://www.amazon.com" + listing.find_all("a")[0].get("href") )
    except:
        return ""

def get_review_url(product_url):
    sp = product_url.split("/")
    return('/'.join(sp[:4]) + "/product-reviews/" + sp[-1].split("?")[0] + "/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews")

# url = page for the individual project
def get_product_reviews(url):
    # navigate to product home page
    driver.get(url)

    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # get product title
    product = soup.find(id="productTitle").text.strip()

    # determine url for reviews and navigate
    review_url = get_review_url(item_page)
    driver.get(review_url)
    time.sleep(5)

    # get data from review page
    html = driver.page_source
    soup = BeautifulSoup(html, "html.parser")

    # identify how many reviews with text there are
    review_info = soup.find_all("div", class_="a-row a-spacing-base a-size-base")[0].text.split()
    num_reviews = int([x for x in review_info if x.strip().isnumeric()][-1])

    # iterate over all reviews
    revs = []
    for pg in range(math.ceil(num_reviews/10)):
        # get page - either base review url, or with page number added
        if pg > 0:
            driver.get(review_url + "&pageNumber=" + str(pg+1))
            # wait for page to load
            time.sleep(5)

            # get html
            html = driver.page_source
            soup = BeautifulSoup(html, "html.parser")

        # get list of reviews
        page_reviews = soup.find("div", id="cm_cr-review_list").find_all("div", class_="review")

        # get info from reviews
        for single_review in page_reviews:
            rating = single_review.find_all("a", class_="a-link-normal")[0].text
            title = single_review.find_all("a", class_="a-link-normal")[1].text.strip()
            review = single_review.find("span", class_="review-text-content").text.strip()
            revs.append([product,rating,title,review])


    return(pd.DataFrame(revs, columns=['Product', 'Rating', 'Title', 'Review']))

In [78]:
for index, row in df.iterrows():
    # get page for brand's line
    brand_page = row.URL
    
    brand = row.Brand
    line = row.Line
    
    if os.path.exists("../output/amazon/" + brand + "_" + line + ".csv"):
        continue
        
    print (brand + "\t" + line)
    
    
    # get list of urls for each item
    items = [x for x in get_item_list(brand_page) if x != '']
    
    review_list = []
    
    
    for item_page in items:
        print(item_page)
        
        try:
            item_reviews = get_product_reviews(item_page)
            review_list.append(item_reviews)
        except:
            print("issue")
        
    pd.concat(review_list).to_csv("../output/amazon/" + brand + "_" + line + ".csv")
        
        

Giovanni	Curly Girl Products
https://www.amazon.com/GIOVANNI-COSMETICS-Leave-Conditioner-Weightless/dp/B001ET78BU?ref_=ast_sto_dp
https://www.amazon.com/GIOVANNI-Eco-Chic-Natural-Styling/dp/B001ET78CO?ref_=ast_sto_dp
https://www.amazon.com/GIOVANNI-COSMETICS-Natural-Styling-Air-Turbo/dp/B001ET789M?ref_=ast_sto_dp
https://www.amazon.com/GIOVANNI-Chic-Frizz-Gone-2-75/dp/B001ET78CY?ref_=ast_sto_dp&th=1&psc=1
https://www.amazon.com/Giovanni-conditioner-weightless-18615-Grapefruit/dp/B084445YKB?ref_=ast_sto_dp
https://www.amazon.com/GIOVANNI-Smooth-Silk-Shampoo-Pack/dp/B001G7PLDW?ref_=ast_sto_dp&th=1&psc=1
https://www.amazon.com/Giovanni-Conditioner-Smooth-Damaged-Bottle/dp/B00066D2JE?ref_=ast_sto_dp
https://www.amazon.com/GIOVANNI-COSMETICS-Conditioner-Moisture-Damaged/dp/B001ET7892?ref_=ast_sto_dp
https://www.amazon.com/Giovanni-Moisturizing-Conditioner-Detangles-Sulfate/dp/B07NP3X7PV?ref_=ast_sto_dp
https://www.amazon.com/GIOVANNI-Smooth-Conditioner-Moisture-Damaged/dp/B00B620YU2?ref_=as

IndexError: list index out of range

In [55]:
row = df.iloc[1]

In [56]:
brand_page = row.URL
brand = row.Brand
line = row.Line

In [57]:
get_item_list(brand_page)

IndexError: list index out of range

In [58]:
html = driver.page_source
soup = BeautifulSoup(html, "html.parser")

In [60]:
items = soup.find_all("div", class_="columns")

In [61]:
len(items)

1

In [62]:
items = soup.find_all("div", class_="columns")[0].find_all("li")

In [64]:
len(items)

8