In [18]:
import json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
from time import sleep
from collections import defaultdict

In [19]:
# Root of the storefront; relative product links are joined onto this.
base_url = 'https://www.sigmabeauty.com'

# Identifier for this retailer; also used as the stem of the exported CSV name.
username = 'sigmabeauty'

# Category listing paths to scrape. pageSize=500 is requested on every listing.
categories = [
    '/c/317?pageSize=500',
    '/c/316?pageSize=500',
    '/brush-care/c/1151?pageSize=500',
]


In [20]:
def get_image_url(soup):
    """Return one image URL per product listed on a category page.

    Parameters
    ----------
    soup : BeautifulSoup-like object
        Parsed category page; must support ``find_all(name, class_=...)``.

    Returns
    -------
    list
        One entry per ``li.mz-productlist-item``, in page order. Each entry is
        the image URL as a string, or ``None`` when the product has no image
        wrapper, no ``<img>`` tag, or no ``src`` attribute. Emitting ``None``
        (instead of raising) keeps one entry per listed product.
    """
    image_urls = []

    for product in soup.find_all('li', class_='mz-productlist-item'):
        wrapper = product.find('div', class_='mz-productlisting-image')
        img = wrapper.img if wrapper is not None else None
        src = img.get('src') if img is not None else None

        if not src:
            image_urls.append(None)
        elif src.startswith(('http://', 'https://')):
            # Already absolute: prefixing a scheme would yield 'https:https://...'
            image_urls.append(src)
        else:
            # Protocol-relative ('//cdn...') sources only need the scheme added
            image_urls.append('https:' + src)

    return image_urls

def get_sku(soup):
    """Return the product code (SKU) of every product on a category page.

    The code is read from the ``data-mz-product-code`` attribute of each
    product's ``div.bvr-inline-rating`` element, in page order.
    """
    skus = []
    for item in soup.find_all('li', class_='mz-productlist-item'):
        rating_div = item.find('div', class_='bvr-inline-rating')
        skus.append(rating_div['data-mz-product-code'])
    return skus
    
    

def get_price(soup):
    """Return one price per product listed on a category page.

    For each ``li.mz-productlist-item`` the crossed-out price span
    (``mz-price is-crossedout``) is used when present; otherwise the plain
    ``mz-price`` span is used. The crossed-out value is presumably the
    pre-discount price -- confirm against the live markup.

    Parameters
    ----------
    soup : BeautifulSoup-like object
        Parsed category page; must support ``find_all(name, class_=...)``.

    Returns
    -------
    list
        One entry per product, in page order: the price as ``float``, or
        ``None`` when the product has no price element or its text cannot be
        parsed as a number.
    """
    prices = []

    for product in soup.find_all('li', class_='mz-productlist-item'):
        price_tag = product.find('span', class_='mz-price is-crossedout')
        if price_tag is None:
            price_tag = product.find('span', class_='mz-price')

        if price_tag is None:
            # No price markup at all: keep one entry per product
            prices.append(None)
            continue

        # Trim whitespace first so a surrounding newline/space cannot shield
        # the '$' from being stripped; drop thousands separators for float().
        raw = price_tag.text.strip().strip('$').replace(',', '').strip()

        try:
            prices.append(float(raw))
        except ValueError:
            prices.append(None)

    return prices


def get_product_url(soup):
    """Return the absolute product-page URL of every product on a category page.

    The ``href`` of the first ``<a>`` inside each ``li.mz-productlist-item``
    is appended to the module-level ``base_url``.
    """
    product_urls = []
    for item in soup.find_all('li', class_='mz-productlist-item'):
        first_link = item.find('a')
        product_urls.append(base_url + first_link['href'])
    return product_urls

def get_product_name(soup):
    """Return the display text of every product-name element on a category page.

    Names are taken from the ``.text`` of each
    ``div.mz-productlisting-productcode`` element, in page order.
    """
    names = []
    for name_div in soup.find_all('div', class_='mz-productlisting-productcode'):
        names.append(name_div.text)
    return names

In [21]:
# Maps each output column name to the extractor that produces it from a
# parsed category page. Every extractor takes the page soup and returns a list.
funcs = dict(
    name=get_product_name,
    product_url=get_product_url,
    price=get_price,
    sku=get_sku,
    image_url=get_image_url,
)

In [22]:
# Accumulates one list per output column across all category pages.
product_dict = defaultdict(list)

for category in categories:
    try:
        # A timeout prevents the cell from hanging forever on a stalled server.
        r = requests.get(base_url + category, timeout=30)
        # Treat HTTP error statuses as failures instead of parsing an error page.
        r.raise_for_status()
    except requests.exceptions.RequestException as err:
        # requests raises its own exception hierarchy; the builtin
        # ConnectionError is unrelated and would never match here.
        print('Request Failed', category, err)
        break

    soup = BeautifulSoup(r.text, 'lxml')

    for key, func in funcs.items():
        product_dict[key].extend(func(soup))

    # Every extractor must yield the same number of items per page, otherwise
    # rows of the final table would mix values from different products.
    column_lengths = {key: len(values) for key, values in product_dict.items()}
    if len(set(column_lengths.values())) > 1:
        raise ValueError(
            'Scraped columns are misaligned after %s: %s' % (category, column_lengths)
        )


# Creating additional columns
product_dict['username'] = [username] * len(product_dict['name'])
product_dict['main_category'] = None
product_dict['sub_category'] = None
product_dict['product_category'] = None


In [23]:
# Column order of the output table; keys of product_dict not listed here
# are left out of the DataFrame.
columns = [
    'name',
    'product_url',
    'main_category',
    'sub_category',
    'product_category',
    'price',
    'sku',
    'image_url',
]
df = pd.DataFrame(product_dict, columns=columns)
df.head()

Unnamed: 0,name,product_url,main_category,sub_category,product_category,price,sku,image_url
0,3DHD® - Kabuki Brush,https://www.sigmabeauty.com/3dhd-kabuki/p/3DK,,,,25.0,3DK,https://cdn-tp1.mozu.com/7907-10193/cms/10193/...
1,3DHD® - Precision Brush,https://www.sigmabeauty.com/3dhd-precision/p/3DP,,,,20.0,3DP,https://cdn-tp1.mozu.com/7907-10193/cms/10193/...
2,3DHD® Max Kabuki,https://www.sigmabeauty.com/3dhd-max-kabuki/p/...,,,,27.0,3DK-M,https://cdn-tp1.mozu.com/7907-10193/cms/10193/...
3,3DHD® Precision - Pink,https://www.sigmabeauty.com/3dhd-precision-pin...,,,,20.0,3DP-P,https://cdn-tp1.mozu.com/7907-10193/cms/10193/...
4,3DHD® Precision - White,https://www.sigmabeauty.com/3dhd-precision-whi...,,,,20.0,3DP-W,https://cdn-tp1.mozu.com/7907-10193/cms/10193/...


In [24]:
# Remove rows that repeat the same (name, sku) pair, keeping the first
# occurrence, then show the resulting (rows, columns) shape.
dedup_keys = ['name', 'sku']
df2 = df.drop_duplicates(subset=dedup_keys)
df2.shape

(234, 8)

In [25]:
# Write the de-duplicated table to '<username>.csv' in the working directory,
# without the DataFrame index column.
output_path = username + '.csv'
df2.to_csv(output_path, index=False)