In [3]:
import json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
from time import sleep
from collections import defaultdict

In [4]:
# Site root; category pages are requested as base_url + <category> + '.html'.
base_url = 'https://www.wetnwildbeauty.com/'

# Category slugs to scrape, one listing page per entry.
categories = [
    'new',
    'eyes',
    'lips',
    'nails',
    'face',
    'accessories',
    'sale',
    'limited-editions',
]

# Brand handle; also used as the stem of the exported CSV file name.
username = 'wetnwildbeauty'


In [5]:
def get_image_url(prod):
    """Collect the image source of every product photo under ``prod``.

    Args:
        prod: Parsed page exposing ``find_all`` (the scraping loop passes a
            BeautifulSoup object).

    Returns:
        list: The ``src`` attribute of each ``<img>`` whose class is
        ``product-image-photo``, in document order.

    Raises:
        KeyError: If a matching ``<img>`` has no ``src`` attribute.
    """
    image_tags = prod.find_all('img', class_='product-image-photo')

    sources = []
    for tag in image_tags:
        sources.append(tag['src'])
    return sources

def get_sku(prod):
    """Collect the ``data-product-id`` of every product card under ``prod``.

    Each card is a ``<div>`` with class
    ``product details product-item-details``. Inside a card the id is read
    from the first price box found, trying the class strings below in order.

    Args:
        prod: Parsed page exposing ``find_all`` (the scraping loop passes a
            BeautifulSoup object).

    Returns:
        list: One entry per card: the ``data-product-id`` value, or ``None``
        when the card contains neither kind of price box.

    Raises:
        KeyError: If a price box is found but lacks ``data-product-id``.
    """
    # Candidate price-box class strings, in lookup priority order.
    price_box_classes = (
        'price-box price-final_price',
        'price-box custom-price-wrapper price-final_price',
    )

    skus = []
    for card in prod.find_all('div', class_='product details product-item-details'):
        sku = None
        for css_class in price_box_classes:
            price_box = card.find('div', class_=css_class)
            if price_box is not None:
                sku = price_box['data-product-id']
                break
        skus.append(sku)

    return skus
    

def _parse_price(tag):
    """Turn a price tag's text (e.g. ``' $4.99 '``) into a float, or ``None``."""
    if tag is None:
        return None
    # Trim whitespace BEFORE removing '$': strip('$') only touches the ends of
    # the string, so padded text like '\n $4.99\n' would otherwise keep its '$'.
    text = tag.text.strip().strip('$').strip()
    try:
        return float(text)
    except ValueError:
        return None


def get_price(prod):
    """Collect the listed price of every product card under ``prod``.

    Each card is a ``<div>`` with class
    ``product details product-item-details``. The ``old-price`` element is
    preferred when present; otherwise the
    ``price-container price-final_price tax weee`` span is used.

    Args:
        prod: Parsed page exposing ``find_all`` (the scraping loop passes a
            BeautifulSoup object).

    Returns:
        list: Exactly one entry per card, so the result stays aligned with the
        other per-card extractors. An entry is a ``float``, or ``None`` when
        the card has no price element or its text is not a number.
    """
    prices = []
    for card in prod.find_all('div', class_='product details product-item-details'):
        price_tag = card.find('div', class_='old-price')
        if price_tag is None:
            price_tag = card.find('span', class_='price-container price-final_price tax weee')
        # Failures are handled per card; one bad price must not drop the
        # prices of every card that follows it.
        prices.append(_parse_price(price_tag))

    return prices

def get_product_url(prod):
    """Collect the link target of every product title under ``prod``.

    Args:
        prod: Parsed page exposing ``find_all`` (the scraping loop passes a
            BeautifulSoup object).

    Returns:
        list: The ``href`` attribute of each ``<a>`` whose class is
        ``product-item-link``, in document order.

    Raises:
        KeyError: If a matching ``<a>`` has no ``href`` attribute.
    """
    links = prod.find_all('a', class_='product-item-link')

    urls = []
    for link in links:
        urls.append(link['href'])
    return urls

def get_product_name(prod):
    """Collect the display name of every product title under ``prod``.

    Args:
        prod: Parsed page exposing ``find_all`` (the scraping loop passes a
            BeautifulSoup object).

    Returns:
        list: The whitespace-stripped text of each ``<a>`` whose class is
        ``product-item-link``, in document order.
    """
    links = prod.find_all('a', {'class': 'product-item-link'})

    names = []
    for link in links:
        names.append(link.text.strip())
    return names

In [6]:
# Output column name -> extractor. Every extractor takes the parsed page and
# returns a list, which the scraping loop appends to the matching column.
funcs = dict(
    name=get_product_name,
    product_url=get_product_url,
    price=get_price,
    sku=get_sku,
    image_url=get_image_url,
)

In [7]:
# Column name -> accumulated values across all category pages.
product_dict = defaultdict(list)

for category in categories:
    category_url = base_url + category + '.html'
    try:
        # A timeout keeps a stalled connection from hanging the notebook.
        r = requests.get(category_url, timeout=30)
    except requests.exceptions.RequestException as exc:
        # requests raises its own exception hierarchy; the builtin
        # ConnectionError is not a base class of it and would never match.
        print('Request Failed', category_url, exc)
        break

    if not r.ok:
        # An error page carries no product cards; report it and move on.
        print('Skipping', category_url, '- HTTP status', r.status_code)
        sleep(1)
        continue

    soup = BeautifulSoup(r.text, 'lxml')

    # Run every extractor on the page before merging, so a length mismatch
    # can be reported against the category that produced it.
    page_columns = {key: func(soup) for key, func in funcs.items()}
    page_lengths = {key: len(values) for key, values in page_columns.items()}
    if len(set(page_lengths.values())) > 1:
        print('Warning: extractor lengths differ for', category, page_lengths)

    for key, values in page_columns.items():
        product_dict[key].extend(values)

    # Pause between requests to avoid hammering the site.
    sleep(1)

# Creating additional columns
n_products = len(product_dict['name'])
product_dict['username'] = [username] * n_products
product_dict['main_category'] = [None] * n_products
product_dict['sub_category'] = [None] * n_products
product_dict['product_category'] = [None] * n_products


In [8]:
# Column order of the final table. 'username' is stored in product_dict but is
# not listed here, so it does not appear in the DataFrame.
columns = [
    'name',
    'product_url',
    'main_category',
    'sub_category',
    'product_category',
    'price',
    'sku',
    'image_url',
]
df = pd.DataFrame(data=product_dict, columns=columns)
df.head()

Unnamed: 0,name,product_url,main_category,sub_category,product_category,price,sku,image_url
0,Pro Brush Line Brush Bundle,https://www.wetnwildbeauty.com/new/pro-brush-l...,,,,69.9,7178,https://www.wetnwildbeauty.com/media/catalog/p...
1,MegaCushion Foundation SPF 15,https://www.wetnwildbeauty.com/new/megacushion...,,,,8.99,7134,https://www.wetnwildbeauty.com/media/catalog/p...
2,MegaGlo Liquid Highlighter,https://www.wetnwildbeauty.com/new/megaglo-liq...,,,,5.99,7131,https://www.wetnwildbeauty.com/media/catalog/p...
3,Color Icon Eyeshadow Quad,https://www.wetnwildbeauty.com/new/color-icon-...,,,,2.99,7107,https://www.wetnwildbeauty.com/media/catalog/p...
4,Color Icon Eyeshadow 10 Pan Palette,https://www.wetnwildbeauty.com/new/color-icon-...,,,,4.99,7086,https://www.wetnwildbeauty.com/media/catalog/p...


In [11]:
# Dropping duplicates: rows sharing both name and sku are collapsed to the
# first occurrence (presumably products listed on several category pages).
dedupe_keys = ['name', 'sku']
df2 = df.drop_duplicates(subset=dedupe_keys)
df2.shape

(167, 8)

In [10]:
# Exporting to csv: the file is named after the brand handle and the
# DataFrame index is not written.
csv_path = '{}.csv'.format(username)
df2.to_csv(csv_path, index=False)