In [1]:
import re
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

In [2]:
# Root URL of the site being scraped: fetched for the landing page and prefixed to product hrefs
base_url = 'http://deciem.com'
# Written to the 'username' column of the output and used as the CSV file name
username = 'deciem'

In [3]:
# Fetch the landing page and parse it so the navigation links can be extracted.
# The timeout keeps this cell from hanging forever on an unresponsive server,
# and raise_for_status() stops an HTTP error page from being parsed silently.
r = requests.get(base_url, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

In [4]:
# Collect the href of the first <a> inside every <li> that carries a data-navid attribute
categories = []

for li in soup.findAll('li', {'data-navid':True}):
    try:
        categories.append(li.find('a')['href'])
    except (TypeError, KeyError):
        # TypeError: the <li> contains no <a> (find() returned None).
        # KeyError: the <a> has no href attribute.
        # Only these expected cases are skipped; anything else should surface.
        continue

# create list of pages to scrape e.g. /hair /body
# NOTE(review): keeps only the first six links -- presumably the product
# category pages; confirm against the site's current navigation.
categories = categories[:6]

In [5]:
def get_product_name(product_card):
    """Build the display name from the card's first <h3> and <h2> texts.

    Returns the two texts joined as '<h3 text> - <h2 text>'. Raises
    AttributeError if either heading is missing from the card.
    """
    heading_text = product_card.find('h3').text
    subheading_text = product_card.find('h2').text
    return ' - '.join([heading_text, subheading_text])

def get_product_link(product_card):
    """Return the product page URL: base_url followed by the href of the card's first <a>."""
    href = product_card.find('a')['href']
    return base_url + href

def get_product_price(product_card):
    """Return the card's price as a float, or None when it cannot be read.

    The text of the first <span class="price"> has its ' USD' suffix removed
    and is converted with float(). A missing span or non-numeric text yields
    None instead of raising, so one unpriced product does not abort a scrape.
    """
    try:
        price_text = product_card.find('span', {'class':'price'}).text
        return float(price_text.replace(' USD', ''))
    except (AttributeError, ValueError, TypeError):
        # AttributeError: no price span (find() returned None).
        # ValueError/TypeError: the remaining text is not a plain number.
        # Deliberately not a bare except, so KeyboardInterrupt/SystemExit propagate.
        return None

def get_product_sku(product_card):
    """Return the href of the card's first <a> with every '/product/' occurrence removed."""
    href = product_card.find('a')['href']
    sku = href.replace('/product/', '')
    return sku

def get_product_img(product_card):
    """Return the src attribute of the first <img> found in the card."""
    image_tag = product_card.find('img')
    return image_tag['src']

In [6]:
# Output field name -> extractor called with a product card to fill that field
funcs = {
    'name': get_product_name,
    'product_url': get_product_link,
    'price': get_product_price,
    'sku': get_product_sku,
    'image_url': get_product_img,
}

In [7]:
# One dict per scraped product card, keyed by the field names in `funcs`
products = []
# launches selenium so that we can interact with dynamically loaded parts of webpage
driver = webdriver.Chrome()

try:
    for cat in categories:
        # Navigation hrefs may be site-relative (e.g. /hair), which driver.get()
        # cannot load. urljoin resolves them against base_url and leaves
        # already-absolute URLs unchanged.
        driver.get(urljoin(base_url, cat))
        innerHTML = driver.execute_script("return document.body.innerHTML") #returns the inner HTML as a string
        # Parsed under its own name so the landing-page `soup` is not overwritten,
        # which would change what the category-collection cell sees on a re-run.
        category_soup = BeautifulSoup(innerHTML, 'lxml')

        # should hold all pertinent info for each product
        product_cards = category_soup.findAll('div', {'class':'brief'})

        for product_card in product_cards:
            products.append({key: func(product_card) for key, func in funcs.items()})
finally:
    # Always shut the browser down, even when a page fails to load or parse,
    # so no Chrome process is left running.
    driver.quit()

In [8]:
# Column order of the exported table
output_columns = ['username', 'name', 'product_url', 'main_category', 'sub_category', 'product_category', 'price', 'sku', 'image_url']

# Tabulate the scraped products, tag each row with the username, and add the
# empty category columns (placeholder for client).
df = pd.DataFrame(products).assign(
    username=username,
    main_category=None,
    product_category=None,
    sub_category=None,
)
# Reorder the columns and keep only the first row seen for each sku
df = df[output_columns].drop_duplicates(subset=['sku'])

In [9]:
# Export the table to ../Output/<username>.csv with a header row and no index column
output_path = '../Output/' + username + '.csv'
df.to_csv(output_path, index=False, header=True)