In [10]:
import json
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import requests
from time import sleep
from collections import defaultdict

In [11]:
# Scraper configuration.
# Site root; links extracted from the listing page are appended to it.
base_url = "https://www.glamglow.com"
# Path of the product listing page, appended to base_url for the request.
suffix_url = "/products/18842/shop-all"
# Brand identifier; also used as the stem of the exported CSV file name.
username = "glamglow"


In [13]:
def get_image_url(soup):
    """Return one image URL per product tile found in ``soup``.

    For every ``div`` with class ``product-grid__item`` the ``src`` attribute
    of the first ``<img>`` inside it is appended to the module-level
    ``base_url``.

    Args:
        soup: parsed listing page exposing ``find_all`` (a BeautifulSoup
            object in this notebook).

    Returns:
        list[str]: ``base_url + src`` for each tile, in document order.
    """
    image_urls = []
    for tile in soup.find_all('div', class_='product-grid__item'):
        image_tag = tile.find('img')
        image_urls.append(base_url + image_tag['src'])
    return image_urls

def get_sku(soup):
    """Return the SKU code of every product tile found in ``soup``.

    The code is read from the ``data-default-sku-pcode`` attribute of the
    ``<img>`` nested in each tile's ``div.product-thumb__image-wrapper``.

    Args:
        soup: parsed listing page exposing ``find_all`` (a BeautifulSoup
            object in this notebook).

    Returns:
        list: one attribute value per tile, in document order.
    """
    skus = []
    for tile in soup.find_all('div', class_='product-grid__item'):
        wrapper = tile.find('div', {'class': 'product-thumb__image-wrapper'})
        skus.append(wrapper.img['data-default-sku-pcode'])
    return skus
    

def get_price(soup):
    """Return the listed price of every product tile found in ``soup``.

    Each tile's price text is read from its ``span`` with class
    ``'product_brief '``, stripped of surrounding whitespace and of ``$``
    characters, and converted to ``float``.

    Prices are parsed tile by tile so that one bad entry cannot discard the
    whole column, and so the result always has exactly one entry per tile
    (the other extractors return one entry per tile too, and the columns
    must line up when they are combined into a table).

    Args:
        soup: parsed listing page exposing ``find_all`` (a BeautifulSoup
            object in this notebook).

    Returns:
        list[float]: one price per tile, in document order. A tile whose
        price span is missing or whose text is not a plain number yields
        ``nan`` instead of raising.
    """
    prices = []
    for tile in soup.find_all('div', class_='product-grid__item'):
        price_tag = tile.find('span', class_='product_brief ')
        try:
            prices.append(float(price_tag.text.strip().strip('$')))
        except (AttributeError, ValueError):
            # AttributeError: find() returned None, i.e. no price span here.
            # ValueError: the text is not a plain number.
            # Use NaN as a placeholder so row alignment is preserved.
            prices.append(float('nan'))
    return prices

def get_product_url(soup):
    """Return one product-page URL per product tile found in ``soup``.

    For every ``div`` with class ``product-grid__item`` the ``href`` of the
    first ``<a>`` inside it is appended to the module-level ``base_url``.

    Args:
        soup: parsed listing page exposing ``find_all`` (a BeautifulSoup
            object in this notebook).

    Returns:
        list[str]: ``base_url + href`` for each tile, in document order.
    """
    product_urls = []
    for tile in soup.find_all('div', class_='product-grid__item'):
        link = tile.find('a')
        product_urls.append(base_url + link['href'])
    return product_urls

def get_product_name(soup):
    """Return the display name of every product tile found in ``soup``.

    The name is the text of each tile's ``a.product-thumb__headline-link``
    with leading and trailing ``#`` characters removed. Whitespace is left
    untouched.

    Args:
        soup: parsed listing page exposing ``find_all`` (a BeautifulSoup
            object in this notebook).

    Returns:
        list[str]: one name per tile, in document order.
    """
    names = []
    for tile in soup.find_all('div', class_='product-grid__item'):
        headline = tile.find('a', class_='product-thumb__headline-link')
        names.append(headline.text.strip('#'))
    return names

In [14]:
# Dispatch table: output column name -> extractor that builds that column's
# list of values from the parsed listing page.
funcs = dict(
    name=get_product_name,
    product_url=get_product_url,
    price=get_price,
    sku=get_sku,
    image_url=get_image_url,
)

In [19]:
# Column name -> list of scraped values, filled in by the extractors below.
product_dict = defaultdict(list)

try:
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    r = requests.get(base_url + suffix_url, timeout=30)
    # Treat 4xx/5xx responses as failures instead of parsing an error page.
    r.raise_for_status()
except requests.exceptions.RequestException:
    # requests raises its own exception hierarchy, which the builtin
    # ConnectionError does not cover. Re-raise so the cell stops here rather
    # than failing later on an undefined response.
    print('Request Failed')
    raise

soup = BeautifulSoup(r.text, 'lxml')

# Run every extractor over the page; each one returns one value per tile.
for key, func in funcs.items():
    product_dict[key].extend(func(soup))

# Creating additional columns
product_dict['username'] = [username] * len(product_dict['name'])
# Category columns are not scraped here; they are left empty.
product_dict['main_category'] = None
product_dict['sub_category'] = None
product_dict['product_category'] = None

In [16]:
# Columns of the final table, in display/export order.
# NOTE(review): 'username' is populated in product_dict but is not listed
# here, so it does not appear in df — confirm that this is intended.
columns = [
    'name',
    'product_url',
    'main_category',
    'sub_category',
    'product_category',
    'price',
    'sku',
    'image_url',
]
df = pd.DataFrame(data=product_dict, columns=columns)
df.head()

Unnamed: 0,name,product_url,main_category,sub_category,product_category,price,sku,image_url
0,GLITTERMASK GRAVITYMUD™ FIRMING TREATMENT,https://www.glamglow.com/product/18842/51072/s...,,,,69.0,G0HY01,https://www.glamglow.com/media/export/cms/prod...
1,SUPERMUD® CLEARING TREATMENT,https://www.glamglow.com/product/18842/39399/s...,,,,69.0,G06201,https://www.glamglow.com/media/export/cms/prod...
2,MULTIMASKING MASK TREATMENT SET,https://www.glamglow.com/product/18842/47230/s...,,,,39.0,G0G101,https://www.glamglow.com/media/export/cms/prod...
3,GLOWSTARTER™ MEGA ILLUMINATING MOISTURIZER,https://www.glamglow.com/product/18842/41472/s...,,,,49.0,G05902,https://www.glamglow.com/media/export/cms/prod...
4,YOUTHMUD® TINGLEXFOLIATE TREATMENT,https://www.glamglow.com/product/18842/36600/s...,,,,69.0,G06501,https://www.glamglow.com/media/export/cms/prod...


In [20]:
# Dropping duplicates

# Rows sharing both name and sku count as the same product; the first
# occurrence is kept and later repeats are discarded.
df2 = df.drop_duplicates(
    subset=['name', 'sku'],
    keep='first',
)
df2.shape

(54, 8)

In [18]:
# Exporting to csv

# Write the de-duplicated table to '<username>.csv' in the working
# directory, without the DataFrame index column.
df2.to_csv(
    path_or_buf=username + '.csv',
    index=False,
)