In [2]:
import re
from bs4 import BeautifulSoup
import requests
import urllib
import json

In [3]:
# ---------------------------------------------------------------------------
# URLs used by the scrapers
# ---------------------------------------------------------------------------

# Retailer home pages.
zara = "https://www.zara.com/ca/"
uniqlo = "https://www.uniqlo.com/ca/en/"
handm = "https://www2.hm.com/en_ca/index.html"
urbout = "https://www.urbanoutfitters.com/en-ca/"
arit = "https://www.aritzia.com/"

# Base endpoint of the Uniqlo product API; a product id is appended to it.
uniqlo_base_api = "https://www.uniqlo.com/ca/api/commerce/v3/en/products/"

# One sample product page per retailer.
zara_p = "https://www.zara.com/ca/en/minimal-leather-bucket-bag-p16751010.html?v1=227311474&v2=2114083"
uniqlo_p = "https://www.uniqlo.com/ca/en/products/E425974-000?colorCode=COL74&sizeCode=SMA002"
handm_p = "https://www2.hm.com/en_ca/productpage.1024122005.html"
urbout_p = "https://www.urbanoutfitters.com/en-ca/shop/bdg-eliana-bootcut-cargo-pant?category=womens-clothing&color=031&viewcode=b&type=REGULAR&quantity=1"
arit_p = "https://www.aritzia.com/en/product/harper-sweater/65627.html?dwvar_65627_color=7325"
lulu_p = "https://shop.lululemon.com/p/jackets-and-hoodies-jackets/Wool-Long-Coat/_/prod10642997?color=55842"

# Several lululemon product pages, scraped in a loop further down.
lulu_ps = [
    "https://shop.lululemon.com/p/jackets-and-hoodies-jackets/Wunder-Puff-Cropped-Jacket/_/prod9961198?color=56496",
    "https://shop.lululemon.com/p/mens-jackets-and-hoodies-jackets/Down-For-It-All-Hoodie/_/prod9200476?color=47759",
    "https://shop.lululemon.com/p/accessories/Power-Stride-Hiking-Crew-Sock-M/_/prod11120013?color=57579",
]


In [33]:
#webscraping functions
class webscrape():
    '''
    Namespace of product-page scrapers for several clothing retailers.

    Every function is a static method, so it can be called on the class
    (``webscrape.zara(url)``) or on an instance. The scrapers issue a live
    HTTP GET and depend on the markup each site serves at that moment; how a
    missing element is reported is described in each scraper's docstring.
    '''

    # Header sent with every request.
    REQUEST_HEADERS = {'User-Agent': 'Custom'}
    # Seconds to wait for a server before giving up; without a timeout
    # requests.get can block forever on an unresponsive host.
    REQUEST_TIMEOUT = 30
    # Default endpoint a Uniqlo product id is appended to.
    UNIQLO_BASE_API = 'https://www.uniqlo.com/ca/api/commerce/v3/en/products/'

    #helper functions

    @staticmethod
    def _fetch(url):
        '''
        performs the GET request shared by every scraper and returns the response
        '''
        return requests.get(url,
                            headers=webscrape.REQUEST_HEADERS,
                            timeout=webscrape.REQUEST_TIMEOUT)

    @staticmethod
    def _soup(url):
        '''
        downloads url and returns its body parsed with the lxml parser
        '''
        return BeautifulSoup(webscrape._fetch(url).text, 'lxml')

    @staticmethod
    def _text_or_none(tag):
        '''
        returns tag.text, or None when the element was not found
        '''
        return tag.text if tag is not None else None

    @staticmethod
    def _scrape_title_price(url, title_tag, title_class, price_tag, price_class):
        '''
        fetches url and returns {'product title': ..., 'price': ...} using the
        given tag/class selectors; a field whose element is missing is None
        '''
        soup = webscrape._soup(url)
        return {
            'product title': webscrape._text_or_none(soup.find(title_tag, class_=title_class)),
            'price': webscrape._text_or_none(soup.find(price_tag, class_=price_class)),
        }

    @staticmethod
    def _detect_gender(*texts):
        '''
        returns 'women', 'men' or 'gender not found' for the given texts

        None / empty texts are ignored. Matching is case-insensitive and is
        anchored at the start of a word so that words merely containing
        "men" (e.g. "Alignment") are not classified as men's products.
        '''
        combined = ' '.join(text for text in texts if text).lower()
        # "women" is tested first; both patterns are word-start anchored.
        if re.search(r'\bwomen', combined):
            return 'women'
        if re.search(r'\bmen', combined):
            return 'men'
        return 'gender not found'

    @staticmethod
    def clean_n(text):
        '''
        removes every newline character from text
        '''
        return text.replace("\n", "")

    @staticmethod
    def clean_price(price):
        '''
        cleans price for lululemon

        Extracts the first "$<amount> CAD" occurrence from price and returns
        it normalised to a single space before "CAD" (e.g. '$318 CAD',
        '$1,298 CAD'). The amount must contain at least one digit and may
        carry thousands separators and a decimal part. Returns None when
        price is empty/None or holds no such amount.
        '''
        if not price:
            return None
        match = re.search(r'\$\s*([0-9]+(?:,[0-9]{3})*(?:\.[0-9]+)?)\s*CAD', price)
        if match is None:
            return None
        return '$' + match.group(1) + ' CAD'

    @staticmethod
    def uniqlo_to_api(url, base_api = None):
        '''
        gets uniqlo product url and transforms to api url

        The product id is the last non-empty path segment of url; the query
        string, if any, is carried over unchanged. base_api overrides the
        endpoint prefix and defaults to webscrape.UNIQLO_BASE_API.
        '''
        if base_api is None:
            base_api = webscrape.UNIQLO_BASE_API
        # Split the query off first so a "/" inside it cannot be mistaken
        # for a path separator.
        path, separator, query = url.partition('?')
        product_id = path.rstrip('/').rsplit('/', 1)[-1]
        return base_api + product_id + separator + query

    @staticmethod
    def show_html(url = ""):
        '''
        shows html text from response
        '''
        print(webscrape._fetch(url).text)

    @staticmethod
    def show_response(url = ""):
        '''
        shows response from request.get
        '''
        print(webscrape._fetch(url))


    #search functions
    @staticmethod
    def zara(url = ""):
        '''
        Input a zara.com product url and return product and price as a dictionary

        Keys: 'product title', 'price'. A value is None when the matching
        element is not present in the downloaded page.
        '''
        return webscrape._scrape_title_price(
            url,
            'h1', 'product-detail-info__header-name',
            'span', 'money-amount__main',
        )

    @staticmethod
    def lululemon(url = ""):
        '''
        Input a lululemon.com product url and return product, price, category, and gender as a dictionary

        Keys: 'product title', 'price', 'gender', 'category'. Title, price
        and category are None when their element is missing (price is also
        None when no "$<amount> CAD" can be extracted); gender falls back to
        'gender not found'.
        '''
        soup = webscrape._soup(url)
        title_text = webscrape._text_or_none(soup.find('div', itemprop ='name'))
        price_text = webscrape._text_or_none(soup.find('span', class_ ='OneLinkNoTx'))
        category_text = webscrape._text_or_none(soup.find('a', class_ ='link OneLinkTx'))

        #find gender in the title or category
        gender = webscrape._detect_gender(title_text, category_text)

        #clean
        price = None
        if price_text is not None:
            # Blank out stray characters but keep "," and "." so amounts such
            # as "$1,298" or "$59.50" survive until clean_price parses them.
            normalised_price = re.sub(r'[^A-Za-z0-9$.,]', ' ', price_text)
            price = webscrape.clean_price(normalised_price)

        return {
            'product title': title_text,
            'price': price,
            'gender': gender,
            'category': category_text,
        }

    @staticmethod
    def uniqlo(url = ""):
        '''
        Input a uniqlo.com product url and return product and price as a dictionary

        The product page url is converted to the API url and the JSON reply
        is read. Keys: 'title', 'gender', 'price'. Raises KeyError/IndexError
        when the reply does not have the expected result/items structure.
        '''
        response = webscrape._fetch(webscrape.uniqlo_to_api(url))
        payload = json.loads(response.text)
        item = payload['result']['items'][0]
        base_price = item['prices']['base']
        # The last two characters of the value are dropped before the currency
        # code is appended -- presumably the value is a string with two
        # trailing characters to discard; TODO confirm against the API reply.
        price = base_price['value'][:-2] + " " + base_price['currency']['code']

        return {
            'title': item['name'],
            'gender': item['genderName'],
            'price': price,
        }

    @staticmethod
    def handm(url = ""):
        '''
        Input a h&m product url and return product title and price as a dictionary

        Keys: 'price', 'title'. A key is left out when its value cannot be
        found, so the result is an empty dictionary when no script containing
        the product data is present.
        '''
        soup = webscrape._soup(url)

        #find script with all data (the last matching script wins)
        desc_script = ""
        for script in soup.find_all('script'):
            if "regularPrice:" in script.text:
                desc_script = script.text

        result = {}
        if desc_script:
            #(?s) flag makes regex recognize ALL characters including \n
            price = re.search(r'(?s)(?<=regularPrice:")(.*?)(?=")', desc_script)
            title = re.search(r'(?s)(?<=title:")(.*?)(?=")', desc_script)
            if price is not None:
                result['price'] = price.group(1)
            if title is not None:
                result['title'] = title.group(1)

        return result

    @staticmethod
    def aritzia(url = ""):
        '''
        Input a aritzia.com product url and return product and price as a dictionary

        Keys: 'product title' and, when a price element exists, 'price'
        (both with newlines removed). When the title element is missing the
        string 'product title not found' is returned instead of a dictionary.
        '''
        soup = webscrape._soup(url)
        product_title = soup.find('h1', class_='js-product-detail__product-name f2 ttc ttu-fr')
        price = soup.find('div', class_ ='product-price')

        if product_title is None:
            return 'product title not found'

        result = {'product title': webscrape.clean_n(product_title.text)}
        if price is not None:
            result['price'] = webscrape.clean_n(price.text)

        return result

    @staticmethod
    def urbout(url = ""):
        '''
        Input an urbanoutfitters.com product url and return product and price as a dictionary

        Keys: 'product title', 'price'. A value is None when the matching
        element is not present in the downloaded page.

        NOTE(review): the selectors used here are identical to the ones in
        zara(); confirm they match the urbanoutfitters.com markup.
        '''
        return webscrape._scrape_title_price(
            url,
            'h1', 'product-detail-info__header-name',
            'span', 'money-amount__main',
        )



In [34]:
# Live smoke test of the scrapers (needs network access).
# The other retailers can be tried by uncommenting the matching line:
#print(webscrape.zara(zara_p))
#print(webscrape.aritzia(arit_p))
#print(webscrape.handm(handm_p))
#print(webscrape.uniqlo(uniqlo_p))

# Scrape each lululemon sample page and show the resulting dictionary.
for product_url in lulu_ps:
    scraped = webscrape.lululemon(product_url)
    print(scraped)


{'product title': 'Wunder Puff Cropped Jacket Online Only', 'price': '$318 CAD', 'gender': 'women', 'category': "Women's Clothes"}
{'product title': 'Down For It All Hoodie', 'price': '$198 CAD', 'gender': 'men', 'category': "Men's Clothes"}
{'product title': "Men's Power Stride Hiking Crew Sock Online Only", 'price': '$28 CAD', 'gender': 'men', 'category': 'Accessories'}


In [31]:
def clean_price(price):
    '''
    Standalone counterpart of webscrape.clean_price, kept for quick experiments.

    Extracts the first "$<amount> CAD" occurrence from price and returns it
    normalised to a single space before "CAD" (e.g. '$318 CAD',
    '$1,298 CAD'). The amount must contain at least one digit and may carry
    thousands separators and a decimal part. Returns None when price is
    empty/None or holds no such amount.
    '''
    if not price:
        return None
    # [0-9]+ (not *) so that a digit-less "$ CAD" is rejected.
    match = re.search(r'\$\s*([0-9]+(?:,[0-9]{3})*(?:\.[0-9]+)?)\s*CAD', price)
    if match is None:
        return None
    return '$' + match.group(1) + ' CAD'

print(clean_price('$318 CAD'))


$318 CAD
