In [1]:
import json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
from time import sleep
from collections import defaultdict

In [2]:
# Root of the storefront; relative product links are appended to this.
base_url = 'https://www.pixibeauty.com'
# Label for the scraped account; also reused later as the CSV file stem.
username = 'pixibeauty'
# Collection paths to crawl, built as '/collections/<slug>' for each slug.
categories = [
    '/collections/' + slug
    for slug in ('skintreats', 'makeup', 'kits', 'tools', 'last-chance')
]


In [3]:
def get_image_url(soup):
    """Return the image URL of every product tile on a listing page.

    Parameters
    ----------
    soup : parsed page exposing ``find_all`` (a BeautifulSoup document here).

    Returns
    -------
    list of str
        One ``'https:' + data-original`` string per product tile, in page order.

    Raises
    ------
    TypeError
        If a tile has no ``<img class="lazy">`` (``find`` returns None).
    """
    tiles = soup.find_all('div', class_='grid__item large--one-quarter medium-down--one-half')

    image_urls = []
    for tile in tiles:
        lazy_img = tile.find('img', class_='lazy')
        # The attribute is prefixed with the scheme, so it is presumably a
        # protocol-relative URL ('//cdn...') in the page markup.
        image_urls.append('https:' + lazy_img['data-original'])
    return image_urls

def get_sku(soup):
    """Return one identifier per product tile, taken from its image URL.

    The identifier is everything that follows the first ``v=`` in the URL
    returned by ``get_image_url`` for that tile.

    NOTE(review): this is the image URL's ``v`` query value, which looks like
    a cache-busting timestamp rather than a catalogue SKU - confirm that it is
    stable enough to de-duplicate on.

    Parameters
    ----------
    soup : parsed page, passed straight through to ``get_image_url``.

    Returns
    -------
    list of (str or None)
        Same length and order as ``get_image_url(soup)``; ``None`` where the
        image URL contains no ``v=`` so the columns stay aligned instead of
        the whole scrape aborting with ``IndexError``.
    """
    skus = []
    for image_url in get_image_url(soup):
        found = re.search('(?<=v=).*', image_url)
        skus.append(found.group(0) if found is not None else None)
    return skus
    
    

def get_price(soup):
    """Return the list price of every product tile on a listing page.

    For each tile the ``<p class="price">`` element is read. When it contains
    a struck-through ``<s>`` element, that element's text is used; otherwise
    the whole paragraph text is used. The text is cleaned of the ``NEW!`` and
    ``From`` labels, thousands separators, surrounding whitespace and the
    ``$`` sign before conversion.

    Parameters
    ----------
    soup : parsed page exposing ``find_all`` (a BeautifulSoup document here).

    Returns
    -------
    list of (float or None)
        One entry per product tile, in page order. ``None`` marks a tile with
        no price element or with text that is not a number, so the result
        always lines up with the other per-tile columns.
    """
    each_product = soup.find_all('div', class_='grid__item large--one-quarter medium-down--one-half')
    product_list = []

    for product in each_product:
        price_tag = product.find('p', class_='price')
        if price_tag is None:
            # No price element at all: keep the slot so columns stay aligned.
            product_list.append(None)
            continue

        struck = price_tag.find('s')
        raw_text = struck.text if struck is not None else price_tag.text

        # Same cleaning for both branches; whitespace must go before '$' or
        # strip('$') cannot reach the sign.
        cleaned = (
            raw_text
            .replace('NEW!', '')
            .replace('From', '')
            .replace(',', '')
            .strip()
            .strip('$')
        )

        try:
            product_list.append(float(cleaned))
        except ValueError:
            product_list.append(None)

    return product_list

def get_product_url(soup):
    """Return the absolute product-page URL of every tile on a listing page.

    Parameters
    ----------
    soup : parsed page exposing ``find_all`` (a BeautifulSoup document here).

    Returns
    -------
    list of str
        ``base_url`` joined with the ``href`` of the first ``<a>`` in each
        product tile, in page order.
    """
    tiles = soup.find_all('div', class_='grid__item large--one-quarter medium-down--one-half')

    links = []
    for tile in tiles:
        first_anchor = tile.find('a')
        links.append(base_url + first_anchor['href'])
    return links

def get_product_name(soup):
    """Return the display name of every product tile on a listing page.

    Parameters
    ----------
    soup : parsed page exposing ``find_all`` (a BeautifulSoup document here).

    Returns
    -------
    list of str
        Whitespace-trimmed text of the ``<p class="h6">`` element in each
        product tile, in page order.

    Raises
    ------
    AttributeError
        If a tile has no ``<p class="h6">`` (``find`` returns None).
    """
    tiles = soup.find_all('div', class_='grid__item large--one-quarter medium-down--one-half')

    names = []
    for tile in tiles:
        title_tag = tile.find('p', class_='h6')
        names.append(title_tag.text.strip())
    return names

In [4]:
# Output column name -> extractor. Every extractor takes the parsed listing
# page and returns one value per product tile, so the columns line up row by row.
funcs = dict([
    ('name', get_product_name),
    ('product_url', get_product_url),
    ('price', get_price),
    ('sku', get_sku),
    ('image_url', get_image_url),
])

In [5]:
# Column name -> values accumulated over every page of every category.
product_dict = defaultdict(list)

for category in categories:
    i = 1  # 1-based page number within the current category

    while True:
        url = base_url + category + '?page=' + str(i)
        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            r = requests.get(url, timeout=30)
            r.raise_for_status()
        except requests.exceptions.RequestException as err:
            # requests raises its own exception hierarchy; the builtin
            # ConnectionError does not cover it. Give up on this category
            # only and carry on with the next one.
            print('Request Failed', url, err)
            break

        soup = BeautifulSoup(r.text, 'lxml')

        # Run every extractor before touching product_dict, so an extractor
        # error cannot leave some columns longer than others.
        page_columns = {key: func(soup) for key, func in funcs.items()}
        for key, values in page_columns.items():
            product_dict[key].extend(values)

        # Pause between requests to stay polite to the server.
        sleep(1)

        # The "next page" link disappears on the last page of a category.
        if soup.find('a', class_='pagination-right') is None:
            break
        i += 1

# Creating additional columns: a constant `username` column plus three
# category columns that are filled with None for every scraped row.
n_products = len(product_dict['name'])
product_dict['username'] = [username] * n_products
product_dict.update({
    'main_category': [None] * n_products,
    'sub_category': [None] * n_products,
    'product_category': [None] * n_products,
})


In [6]:
# Column order of the final table. `username` is stored in product_dict but
# is not listed here, so it is left out of the frame.
columns = [
    'name', 'product_url',
    'main_category', 'sub_category', 'product_category',
    'price', 'sku', 'image_url',
]
df = pd.DataFrame(data=product_dict, columns=columns)
df.head()

Unnamed: 0,name,product_url,main_category,sub_category,product_category,price,sku,image_url
0,Skintreat Duos - Glow Tonic (250ml) + Glow Mist,https://www.pixibeauty.com/collections/skintre...,,,,38.0,1506474199,https://cdn.shopify.com/s/files/1/1463/5858/pr...
1,Glow Tonic 250ml,https://www.pixibeauty.com/collections/skintre...,,,,29.0,1512415419,https://cdn.shopify.com/s/files/1/1463/5858/pr...
2,Glow Tonic 100ml,https://www.pixibeauty.com/collections/skintre...,,,,15.0,1512415257,https://cdn.shopify.com/s/files/1/1463/5858/pr...
3,Glow Tonic 100ml (Holiday),https://www.pixibeauty.com/collections/skintre...,,,,15.0,1512005196,https://cdn.shopify.com/s/files/1/1463/5858/pr...
4,Glow Tonic 15ml,https://www.pixibeauty.com/collections/skintre...,,,,6.0,1512415349,https://cdn.shopify.com/s/files/1/1463/5858/pr...


In [7]:
# Dropping duplicates: a product listed in several collections is scraped
# once per collection, so rows sharing both name and sku are collapsed to one.
dedupe_keys = ['name', 'sku']
df2 = df.drop_duplicates(subset=dedupe_keys)
df2.shape

(166, 8)

In [8]:
# Exporting to csv: the file is named after `username` and written to the
# working directory without the DataFrame index.
csv_path = username + '.csv'
df2.to_csv(csv_path, index=False)