In [121]:
import json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
import re
from time import sleep
from collections import defaultdict

In [98]:
# Scraper configuration for the target store.
# `username` is the brand handle: it fills the `username` entry of the scraped
# data and names the exported CSV file.
username = 'dermae'
# A listing-page URL is built as base_url + suffix_url + <page number>.
base_url = 'https://dermae.com'
suffix_url = '/collections/all?page='


In [99]:
def get_image_url(soup):
    """Return the image URL of every product tile on a listing page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of one collection listing page.

    Returns
    -------
    list
        One entry per product tile, in document order. Each entry is an
        ``https://`` URL string, or ``None`` when the tile has no ``<img>``
        element or the image carries no ``src`` attribute.
    """
    each_product = soup.find_all('div',{'class':'grid__item small--one-whole medium--one-half large--one-fifth product-grid-item'})

    image_urls = []
    for product in each_product:
        img = product.img
        src = img.get('src') if img is not None else None

        if not src:
            # Keep a placeholder so the result still has one entry per tile.
            image_urls.append(None)
            continue

        src = src.strip()
        if src.startswith(('http://', 'https://')):
            # Already absolute: prefixing a scheme again would corrupt the URL.
            image_urls.append(src)
        else:
            # Protocol-relative ('//host/path') or bare host paths get a scheme.
            image_urls.append('https://' + src.strip('/'))

    return image_urls


def get_sku(soup):
    """Collect the ``id`` attribute of every product tile on a listing page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of one collection listing page.

    Returns
    -------
    list
        The ``id`` attribute values of the product tiles, in document order.
    """
    tile_attrs = {'class':'grid__item small--one-whole medium--one-half large--one-fifth product-grid-item'}

    skus = []
    for tile in soup.find_all('div', tile_attrs):
        skus.append(tile['id'])
    return skus
    
    

def get_price(soup):
    """Return the price of every product tile on a listing page.

    For each tile the ``<span class="compare-at">`` element is used when
    present (presumably the pre-sale list price - confirm against the store
    markup); otherwise ``<span class="price">`` is used.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of one collection listing page.

    Returns
    -------
    list
        One entry per product tile, in document order: the price as a
        ``float``, or ``None`` when the tile has no price element or its text
        cannot be parsed as a number.
    """
    each_product = soup.find_all('div',{'class':'grid__item small--one-whole medium--one-half large--one-fifth product-grid-item'})
    product_list = []

    for product in each_product:

        # Look each element up once; fall back to the regular price span.
        price_tag = product.find('span',class_='compare-at')
        if price_tag is None:
            price_tag = product.find('span',class_='price')

        if price_tag is None:
            # No price markup at all: record a gap instead of raising.
            product_list.append(None)
            continue

        # Drop surrounding whitespace, the currency sign and thousands
        # separators so that values such as "$1,299.00" parse.
        raw_price = price_tag.text.strip().strip('$').replace(',', '')

        try:
            product_list.append(float(raw_price))
        except ValueError:
            # Non-numeric label (e.g. free text instead of an amount).
            product_list.append(None)

    return product_list
                                                
                                              
def get_product_url(soup):
    """Return the absolute product-page URL of every tile on a listing page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of one collection listing page.

    Returns
    -------
    list of str
        ``base_url`` joined with the ``href`` of the first link found inside
        each tile's ``figcaption under`` div, in document order.
    """
    tiles = soup.find_all('div',{'class':'grid__item small--one-whole medium--one-half large--one-fifth product-grid-item'})

    product_urls = []
    for tile in tiles:
        caption = tile.find('div',{'class':'figcaption under'})
        link = caption.find('a')
        # The href is appended to the site root as-is.
        product_urls.append(base_url + link['href'])
    return product_urls

def get_product_name(soup):
    """Return the display name of every product tile on a listing page.

    Parameters
    ----------
    soup : bs4.BeautifulSoup
        Parsed HTML of one collection listing page.

    Returns
    -------
    list of str
        The whitespace-stripped text of the first ``<p>`` inside each tile's
        ``figcaption under`` div, in document order.
    """
    tiles = soup.find_all('div',{'class':'grid__item small--one-whole medium--one-half large--one-fifth product-grid-item'})

    names = []
    for tile in tiles:
        caption = tile.find('div',class_ = 'figcaption under')
        title = caption.find('p')
        names.append(title.text.strip())
    return names

In [100]:
# Output column name -> extractor that returns that column's values for one
# parsed listing page. Every extractor takes the page's soup and returns a list.
funcs = dict(
    name=get_product_name,
    product_url=get_product_url,
    price=get_price,
    sku=get_sku,
    image_url=get_image_url,
)

In [125]:
# Scrape the collection page by page, accumulating one list per column in
# `product_dict` (keys come from `funcs`).
product_dict = defaultdict(list)

i = 1
while True:
    try:
        # requests raises its own exception hierarchy (not the builtin
        # ConnectionError), so catch requests' base class. The timeout keeps a
        # stalled connection from hanging the notebook, and raise_for_status
        # turns HTTP error responses into exceptions as well.
        r = requests.get(base_url+suffix_url+str(i), timeout=30)
        r.raise_for_status()
    except requests.exceptions.RequestException as exc:
        print('Request Failed', exc)
        break

    soup = BeautifulSoup(r.text,'lxml')

    for key,func in funcs.items():
        product_dict[key].extend(func(soup))

    # Keep going only while the pagination bar still marks an active page;
    # a page without a pagination bar at all also ends the crawl.
    pagination = soup.find('ul',class_='pagination-custom')
    if pagination is None or pagination.find('li',class_='active') is None:
        break

    i += 1
    # Pause between consecutive requests to avoid hammering the site.
    sleep(1)
    
    
    

# Extra entries that are not scraped from the page:
# `username` repeats the brand handle once per scraped product name, and the
# three category entries are placeholders holding no value.
product_dict['username'] = [username] * len(product_dict['name'])
for category_key in ('main_category', 'sub_category', 'product_category'):
    product_dict[category_key] = None


In [127]:
# Column order of the resulting table. Only the keys listed here are taken
# from `product_dict`; 'username' is not listed, so it is left out of the frame.
columns = [
    'name',
    'product_url',
    'main_category',
    'sub_category',
    'product_category',
    'price',
    'sku',
    'image_url',
]
df = pd.DataFrame(data=product_dict, columns=columns)
df.head()

Unnamed: 0,name,product_url,main_category,sub_category,product_category,price,sku,image_url
0,$20 Gift Card,https://dermae.com/products/gift-card,,,,20.0,10815783368,https://cdn.shopify.com/s/files/1/1525/1400/pr...
1,$40 Gift Card,https://dermae.com/products/40-gift-card,,,,40.0,60826943496,https://cdn.shopify.com/s/files/1/1525/1400/pr...
2,3-in-1 Hair Protection Serum,https://dermae.com/products/3-in-1-hair-protec...,,,,10.99,10079341576,https://cdn.shopify.com/s/files/1/1525/1400/pr...
3,Age-Defying Antioxidant Day Cream,https://dermae.com/products/age-defying-antiox...,,,,39.5,7966871368,https://cdn.shopify.com/s/files/1/1525/1400/pr...
4,Age-Defying Antioxidant Eye Cream,https://dermae.com/products/age-defying-antiox...,,,,24.75,7967103368,https://cdn.shopify.com/s/files/1/1525/1400/pr...


In [128]:
# Remove repeated products: rows that share both `name` and `sku` are
# collapsed to their first occurrence. `df` itself is left untouched.
df2 = df.drop_duplicates(subset=['name', 'sku'], keep='first')
df2.shape

(100, 8)

In [129]:
# Write the de-duplicated table to '<username>.csv' in the working directory,
# without the DataFrame index column.
df2.to_csv(path_or_buf=username + '.csv', index=False)