# Web Scraping SSENSE.com - Men's Products

In [1]:
#Importing packages

from bs4 import BeautifulSoup
import requests
import json
from time import sleep

## Getting a sense of the HTML code on the site

In [2]:
# Fetch the men's landing page. The timeout keeps a stalled connection from
# hanging the kernel; 5 seconds is the value the scraping cells below use.
mens_page = requests.get("https://ssense.com/en-ca/men", timeout = 5)
# Fail loudly on an HTTP error instead of parsing an error page.
mens_page.raise_for_status()
mens_soup = BeautifulSoup(mens_page.content, 'html.parser')

#print(mens_soup.prettify())

## Testing individual steps

### Category Level

In [3]:
# Names of the men's product categories, read from the first stacked nav list

cat = mens_soup.find_all('ul', class_='nav nav--stacked')[0]
# The first <li> entry is dropped; the remaining ones are the category names.
categories = [entry.get_text().strip() for entry in cat.find_all('li')[1:]]

print(categories)

['ACCESSORIES', 'BAGS', 'CLOTHING', 'SHOES']


In [4]:
# Relative URL (href) of each category link, skipping the first link in the list

categories_url = [link['href'] for link in cat.find_all('a', href=True)[1:]]

In [5]:
# Dictionary mapping each men's category name to its relative URL

mens_categories = {name: url for name, url in zip(categories, categories_url)}
print(mens_categories)

{'ACCESSORIES': '/en-ca/men/accessories', 'BAGS': '/en-ca/men/bags', 'CLOTHING': '/en-ca/men/clothing', 'SHOES': '/en-ca/men/shoes'}


In [6]:
#Diving into the individual page of each category - Example: Accessories

# The timeout keeps a stalled connection from hanging the kernel; 5 seconds is
# the value the scraping cells below use.
cat_page = requests.get("https://ssense.com" + mens_categories['ACCESSORIES'], timeout = 5)
# Fail loudly on an HTTP error instead of parsing an error page.
cat_page.raise_for_status()
cat_soup = BeautifulSoup(cat_page.content, 'html.parser')

#print(cat_soup.prettify())

In [7]:
# Number of pages for a particular category: text of the first 'last-page' list item

last_page_items = cat_soup.find_all('li', class_='last-page')
last_page_items[0].get_text()

'91'

In [8]:
# Product information of one item on the page: the JSON-LD script tag at index 59

ld_json_tags = cat_soup.find_all("script", {"type": "application/ld+json"})
json.loads(ld_json_tags[59].get_text())

{'@context': 'https://schema.org',
 '@type': 'Product',
 'productID': 5064611,
 'name': 'Black & Yellow Monopoly 101 Sunglasses',
 'sku': '201463M134010',
 'brand': {'@type': 'Brand', 'name': 'Thierry Lasry'},
 'offers': {'@type': 'Offer',
  'price': 515,
  'priceCurrency': 'CAD',
  'availability': 'https://schema.org/InStock',
  'url': '/men/product/thierry-lasry/black-and-yellow-monopoly-101-sunglasses/5064611'},
 'url': '/men/product/thierry-lasry/black-and-yellow-monopoly-101-sunglasses/5064611',
 'image': 'https://res.cloudinary.com/ssenseweb/image/upload/201463M134010_1.jpg'}

In [9]:
# Number of products on a page = number of JSON-LD script tags found on it

product_tags = cat_soup.find_all("script", {"type": "application/ld+json"})
len(product_tags)

60

### Product Level

In [10]:
#Getting the HTML of a particular product page

# The timeout keeps a stalled connection from hanging the kernel; 5 seconds is
# the value the scraping cells below use.
product = requests.get("https://ssense.com/en-ca/men/product/random-identities/black-faux-leather-hoodie/5230331", timeout = 5)
# Fail loudly on an HTTP error instead of parsing an error page.
product.raise_for_status()
product_soup = BeautifulSoup(product.content, 'html.parser')

#print(product_soup.prettify())

In [11]:
# Parse the first JSON-LD script tag of the product page into a Python object

first_ld_json = product_soup.find_all("script", {"type": "application/ld+json"})[0]
js = json.loads(first_ld_json.get_text())

js

{'@context': 'https://schema.org',
 '@type': 'Product',
 'productID': 5230331,
 'name': 'Black Faux-Leather Hoodie',
 'sku': '192172M202054',
 'brand': {'@type': 'Brand', 'name': 'Random Identities'},
 'offers': {'@type': 'Offer',
  'price': 761,
  'priceCurrency': 'CAD',
  'availability': 'https://schema.org/InStock',
  'url': '/en-ca/men/product/random-identities/black-faux-leather-hoodie/5230331'},
 'url': '/en-ca/men/product/random-identities/black-faux-leather-hoodie/5230331',
 'description': 'Long sleeve grained faux-leather hoodie in black. Bungee-style drawstring at hood. Zip closure at front. Patch pockets at waist. Elasticized cuffs and hem. Raglan sleeves. Full faux-fur lining in green. Silver-tone hardware. \r\n\r\nSupplier color: Black',
 'image': ['https://res.cloudinary.com/ssenseweb/image/upload/192172M202054_1.jpg',
  'https://res.cloudinary.com/ssenseweb/image/upload/192172M202054_2.jpg',
  'https://res.cloudinary.com/ssenseweb/image/upload/192172M202054_3.jpg',
  'ht

The product page provides more information than the category page: in particular, it includes the product description, which the category-page data does not have.

In [12]:
# Product name: the 'name' field of the product-page JSON-LD record `js`

test_name = js['name']
test_name

'Black Faux-Leather Hoodie'

In [13]:
# Product SKU: the 'sku' field of the product-page JSON-LD record `js`

test_sku = js['sku']
test_sku

'192172M202054'

In [14]:
# Product price, taken from the nested 'offers' record

offers = js['offers']
test_price = offers['price']
test_price

761

In [15]:
# Currency of the product price, taken from the nested 'offers' record

offers = js['offers']
test_currency = offers['priceCurrency']
test_currency

'CAD'

In [16]:
# Product description: the 'description' field of the product-page JSON-LD record `js`

test_description = js['description']
test_description

'Long sleeve grained faux-leather hoodie in black. Bungee-style drawstring at hood. Zip closure at front. Patch pockets at waist. Elasticized cuffs and hem. Raglan sleeves. Full faux-fur lining in green. Silver-tone hardware. \r\n\r\nSupplier color: Black'

In [17]:
# Sizes currently listed for the product: text of every <option> element except the first

size_options = product_soup.find_all('option')[1:]
test_size = [option.get_text().strip() for option in size_options]

print(test_size)

['XS - Sold Out', 'S', 'L']


### Putting Everything Together

Loop through every page of each category to obtain all info (except sizes for now) for all products, and combine the results into a list of lists.

Since there are only 4 categories, it should be easier to scrape each category one by one to avoid connection request timeouts.

In [18]:
# Accumulator for the scraped product rows (one list per product)
all_products = list()

In [19]:
#Accessories

# The first page is fetched on its own because it also gives the number of pages.
cat_page = requests.get("https://ssense.com" + mens_categories['ACCESSORIES'], timeout = 5)
# Stop on an HTTP error instead of parsing an error page, which could silently
# contribute no products.
cat_page.raise_for_status()
cat_soup = BeautifulSoup(cat_page.content, 'html.parser')
cat_page.close()

last_page = int(cat_soup.find_all('li', class_='last-page')[0].get_text())

# `page` is zero-based: iteration `page` handles site page `page + 1`.
# max(..., 1) guarantees that the page already fetched above is always processed.
for page in range(max(last_page, 1)):
    if page > 0:
        cat_page = requests.get("https://ssense.com" + mens_categories['ACCESSORIES'] + '?page=' + str(page+1), timeout = 5)
        cat_page.raise_for_status()
        cat_soup = BeautifulSoup(cat_page.content, 'html.parser')
        cat_page.close()

    # Look the JSON-LD <script> tags up once per page rather than once per product.
    for product_tag in cat_soup.find_all("script", {"type": "application/ld+json"}):
        product_json = json.loads(product_tag.get_text())

        # Row layout: category, id, brand, name, SKU, price, currency, URL, image
        prod_details = [
            'ACCESSORIES',
            product_json['productID'],
            product_json['brand']['name'],
            product_json['name'],
            product_json['sku'],
            product_json['offers']['price'],
            product_json['offers']['priceCurrency'],
            product_json['url'],
            product_json['image'],
        ]

        all_products.append(prod_details)

    # Pause 5 seconds after each page before requesting the next one.
    sleep(5)

In [20]:
#Bags

# The first page is fetched on its own because it also gives the number of pages.
cat_page = requests.get("https://ssense.com" + mens_categories['BAGS'], timeout = 5)
# Stop on an HTTP error instead of parsing an error page, which could silently
# contribute no products.
cat_page.raise_for_status()
cat_soup = BeautifulSoup(cat_page.content, 'html.parser')
cat_page.close()

last_page = int(cat_soup.find_all('li', class_='last-page')[0].get_text())

# `page` is zero-based: iteration `page` handles site page `page + 1`.
# max(..., 1) guarantees that the page already fetched above is always processed.
for page in range(max(last_page, 1)):
    if page > 0:
        cat_page = requests.get("https://ssense.com" + mens_categories['BAGS'] + '?page=' + str(page+1), timeout = 5)
        cat_page.raise_for_status()
        cat_soup = BeautifulSoup(cat_page.content, 'html.parser')
        cat_page.close()

    # Look the JSON-LD <script> tags up once per page rather than once per product.
    for product_tag in cat_soup.find_all("script", {"type": "application/ld+json"}):
        product_json = json.loads(product_tag.get_text())

        # Row layout: category, id, brand, name, SKU, price, currency, URL, image
        prod_details = [
            'BAGS',
            product_json['productID'],
            product_json['brand']['name'],
            product_json['name'],
            product_json['sku'],
            product_json['offers']['price'],
            product_json['offers']['priceCurrency'],
            product_json['url'],
            product_json['image'],
        ]

        all_products.append(prod_details)

    # Pause 5 seconds after each page before requesting the next one.
    sleep(5)

In [21]:
#Shoes

# The first page is fetched on its own because it also gives the number of pages.
cat_page = requests.get("https://ssense.com" + mens_categories['SHOES'], timeout = 5)
# Stop on an HTTP error instead of parsing an error page, which could silently
# contribute no products.
cat_page.raise_for_status()
cat_soup = BeautifulSoup(cat_page.content, 'html.parser')
cat_page.close()

last_page = int(cat_soup.find_all('li', class_='last-page')[0].get_text())

# `page` is zero-based: iteration `page` handles site page `page + 1`.
# max(..., 1) guarantees that the page already fetched above is always processed.
for page in range(max(last_page, 1)):
    if page > 0:
        cat_page = requests.get("https://ssense.com" + mens_categories['SHOES'] + '?page=' + str(page+1), timeout = 5)
        cat_page.raise_for_status()
        cat_soup = BeautifulSoup(cat_page.content, 'html.parser')
        cat_page.close()

    # Look the JSON-LD <script> tags up once per page rather than once per product.
    for product_tag in cat_soup.find_all("script", {"type": "application/ld+json"}):
        product_json = json.loads(product_tag.get_text())

        # Row layout: category, id, brand, name, SKU, price, currency, URL, image
        prod_details = [
            'SHOES',
            product_json['productID'],
            product_json['brand']['name'],
            product_json['name'],
            product_json['sku'],
            product_json['offers']['price'],
            product_json['offers']['priceCurrency'],
            product_json['url'],
            product_json['image'],
        ]

        all_products.append(prod_details)

    # Pause 5 seconds after each page before requesting the next one.
    sleep(5)

In [24]:
# Shallow copy: new rows are appended to all_products2 without growing all_products
all_products2 = list(all_products)

In [25]:
#Clothing

# The first page is fetched on its own because it also gives the number of pages.
cat_page = requests.get("https://ssense.com" + mens_categories['CLOTHING'], timeout = 5)
# Stop on an HTTP error instead of parsing an error page, which could silently
# contribute no products.
cat_page.raise_for_status()
cat_soup = BeautifulSoup(cat_page.content, 'html.parser')
cat_page.close()

last_page = int(cat_soup.find_all('li', class_='last-page')[0].get_text())

# `page` is zero-based: iteration `page` handles site page `page + 1`.
# max(..., 1) guarantees that the page already fetched above is always processed.
for page in range(max(last_page, 1)):
    if page > 0:
        cat_page = requests.get("https://ssense.com" + mens_categories['CLOTHING'] + '?page=' + str(page+1), timeout = 5)
        cat_page.raise_for_status()
        cat_soup = BeautifulSoup(cat_page.content, 'html.parser')
        cat_page.close()

    # Look the JSON-LD <script> tags up once per page rather than once per product.
    for product_tag in cat_soup.find_all("script", {"type": "application/ld+json"}):
        product_json = json.loads(product_tag.get_text())

        # Row layout: category, id, brand, name, SKU, price, currency, URL, image
        prod_details = [
            'CLOTHING',
            product_json['productID'],
            product_json['brand']['name'],
            product_json['name'],
            product_json['sku'],
            product_json['offers']['price'],
            product_json['offers']['priceCurrency'],
            product_json['url'],
            product_json['image'],
        ]

        all_products2.append(prod_details)

    # Pause 5 seconds after each page before requesting the next one.
    sleep(5)

ReadTimeout: HTTPSConnectionPool(host='www.ssense.com', port=443): Read timed out. (read timeout=5)

In [36]:
#Temp fix - resume the Clothing scrape from the point where the previous run broke down

for page in range(305,last_page): #`page` was 305 when the timeout occurred
    cat_page = requests.get("https://ssense.com" + mens_categories['CLOTHING'] + '?page=' + str(page+1), timeout = 5)
    # Stop on an HTTP error instead of parsing an error page, which could silently
    # contribute no products.
    cat_page.raise_for_status()
    cat_soup = BeautifulSoup(cat_page.content, 'html.parser')
    cat_page.close()

    # Look the JSON-LD <script> tags up once per page rather than once per product.
    for product_tag in cat_soup.find_all("script", {"type": "application/ld+json"}):
        product_json = json.loads(product_tag.get_text())

        # Row layout: category, id, brand, name, SKU, price, currency, URL, image
        prod_details = [
            'CLOTHING',
            product_json['productID'],
            product_json['brand']['name'],
            product_json['name'],
            product_json['sku'],
            product_json['offers']['price'],
            product_json['offers']['priceCurrency'],
            product_json['url'],
            product_json['image'],
        ]

        all_products2.append(prod_details)

    # Pause 5 seconds after each page before requesting the next one.
    sleep(5)

In [41]:
import pickle

# Save the scraped rows to disk; the with-block closes the file even if pickling fails.
with open("all_products_12042019.pickle","wb") as pickle_out:
    pickle.dump(all_products2,pickle_out)

In [42]:
# Reload the saved rows; the with-block makes sure the file handle is closed.
# NOTE: only unpickle files you created yourself - pickle.load can execute
# arbitrary code contained in the file.
with open("all_products_12042019.pickle","rb") as pickle_in:
    all_products_with_des = pickle.load(pickle_in)

In [43]:
# Number of product rows loaded from the pickle file
len(all_products_with_des)

30506

In [45]:
def chunkIt(seq, num):
    """Split ``seq`` into ``num`` consecutive slices of near-equal length.

    Chunk boundaries are computed with integer arithmetic
    (``i * len(seq) // num``). Accumulating a float step instead can fall just
    short of ``len(seq)`` and produce an extra chunk, e.g. 11 chunks for
    ``chunkIt([1], 10)``.

    Args:
        seq: A sliceable sequence with a length (list, range, str, ...).
        num: Number of chunks to produce; a positive integer.

    Returns:
        A list of ``num`` slices of ``seq``, in order, which together cover
        ``seq`` exactly once. Some slices are empty when ``num > len(seq)``.
        An empty ``seq`` yields an empty list.

    Raises:
        ValueError: If ``num`` is smaller than 1.
    """
    if num < 1:
        raise ValueError("num must be a positive integer")

    length = len(seq)
    if length == 0:
        return []

    # bounds[i] is where chunk i starts and chunk i - 1 ends.
    bounds = [i * length // num for i in range(num + 1)]
    return [seq[start:stop] for start, stop in zip(bounds, bounds[1:])]

In [47]:
# Split the row indices into 500 consecutive chunks and show the first five
row_indices = range(len(all_products_with_des))
scrape_chunks = chunkIt(row_indices, 500)

print(scrape_chunks[:5])

[range(0, 61), range(61, 122), range(122, 183), range(183, 244), range(244, 305)]


In [54]:
# Fetch every product page and append its description to the product's row.
# Each row starts with 9 fields; the description becomes the 10th.
for c in scrape_chunks:

    for i in c:

        # Skip rows that already carry a description, so this cell can be re-run
        # after a failure without appending the same description twice.
        if len(all_products_with_des[i]) > 9:
            continue

        prod_page = requests.get("https://www.ssense.com/en-ca" + all_products_with_des[i][7], timeout = 5)
        prod_soup = BeautifulSoup(prod_page.content, 'html.parser')
        prod_page.close()

        # The product data is read from the first JSON-LD <script> tag of the page.
        prod_json = json.loads(prod_soup.find_all("script", {"type": "application/ld+json"})[0].get_text())

        try:
            prod_desc = prod_json['description']

        except KeyError:
            # Placeholder for product data that has no 'description' key
            prod_desc = 'N/A'

        all_products_with_des[i].append(prod_desc)
    # Pause after each chunk of requests.
    sleep(5)

ReadTimeout: HTTPSConnectionPool(host='www.ssense.com', port=443): Read timed out. (read timeout=5)

In [55]:
# Chunk `c` (range of row indices) that was being processed when the timeout occurred
c

range(20683, 20744)

In [56]:
# Row index `i` whose product page was being requested when the timeout occurred
i

20742

In [69]:
# Temporary fix: re-chunk the rows from index `i` (where the previous run stopped) to the end

remaining_rows = range(i, len(all_products_with_des))
scrape_chunks2 = chunkIt(remaining_rows, 250)

print(scrape_chunks2[:5])

[range(20742, 20781), range(20781, 20820), range(20820, 20859), range(20859, 20898), range(20898, 20937)]


In [70]:
# Continue fetching product pages and appending each description to its row.
# Each row starts with 9 fields; the description becomes the 10th.
for c2 in scrape_chunks2:

    for i in c2:

        # Skip rows that already carry a description, so this cell can be re-run
        # after a failure without appending the same description twice.
        if len(all_products_with_des[i]) > 9:
            continue

        prod_page = requests.get("https://www.ssense.com/en-ca" + all_products_with_des[i][7], timeout = 5)
        prod_soup = BeautifulSoup(prod_page.content, 'html.parser')
        prod_page.close()

        # The product data is read from the first JSON-LD <script> tag of the page.
        prod_json = json.loads(prod_soup.find_all("script", {"type": "application/ld+json"})[0].get_text())

        try:
            prod_desc = prod_json['description']

        except KeyError:
            # Placeholder for product data that has no 'description' key
            prod_desc = 'N/A'

        all_products_with_des[i].append(prod_desc)
    # Pause after each chunk of requests.
    sleep(5)

ReadTimeout: HTTPSConnectionPool(host='www.ssense.com', port=443): Read timed out. (read timeout=5)

In [72]:
# Chunk `c2` (range of row indices) that was being processed when the timeout occurred
c2

range(21640, 21679)

In [73]:
# Row index `i` whose product page was being requested when the timeout occurred
i

21642

In [74]:
# Temporary fix: re-chunk the rows from index `i` (where the previous run stopped) to the end

remaining_rows = range(i, len(all_products_with_des))
scrape_chunks3 = chunkIt(remaining_rows, 250)

print(scrape_chunks3[:5])

[range(21642, 21677), range(21677, 21712), range(21712, 21748), range(21748, 21783), range(21783, 21819)]


In [76]:
# Continue fetching product pages and appending each description to its row.
# Each row starts with 9 fields; the description becomes the 10th.
for c3 in scrape_chunks3:

    for i in c3:

        # Skip rows that already carry a description, so this cell can be re-run
        # after a failure without appending the same description twice.
        if len(all_products_with_des[i]) > 9:
            continue

        prod_page = requests.get("https://www.ssense.com/en-ca" + all_products_with_des[i][7], timeout = 5)
        prod_soup = BeautifulSoup(prod_page.content, 'html.parser')
        prod_page.close()

        # The product data is read from the first JSON-LD <script> tag of the page.
        prod_json = json.loads(prod_soup.find_all("script", {"type": "application/ld+json"})[0].get_text())

        try:
            prod_desc = prod_json['description']

        except KeyError:
            # Placeholder for product data that has no 'description' key
            prod_desc = 'N/A'

        all_products_with_des[i].append(prod_desc)
    # Pause after each chunk of requests.
    sleep(7)

ReadTimeout: HTTPSConnectionPool(host='www.ssense.com', port=443): Read timed out. (read timeout=5)

In [77]:
# Chunk `c3` (range of row indices) that was being processed when the timeout occurred
c3

range(25754, 25790)

In [78]:
# Row index `i` whose product page was being requested when the timeout occurred
i

25760

In [79]:
# Temporary fix: re-chunk the rows from index `i` (where the previous run stopped) to the end

remaining_rows = range(i, len(all_products_with_des))
scrape_chunks4 = chunkIt(remaining_rows, 250)

print(scrape_chunks4[:5])

[range(25760, 25778), range(25778, 25797), range(25797, 25816), range(25816, 25835), range(25835, 25854)]


In [84]:
# Continue fetching product pages and appending each description to its row.
# Each row starts with 9 fields; the description becomes the 10th.
for c4 in scrape_chunks4:

    for i in c4:

        # Skip rows that already carry a description, so this cell can be re-run
        # after a failure without appending the same description twice.
        if len(all_products_with_des[i]) > 9:
            continue

        prod_page = requests.get("https://www.ssense.com/en-ca" + all_products_with_des[i][7], timeout = 5)
        prod_soup = BeautifulSoup(prod_page.content, 'html.parser')
        prod_page.close()

        # The product data is read from the first JSON-LD <script> tag of the page.
        prod_json = json.loads(prod_soup.find_all("script", {"type": "application/ld+json"})[0].get_text())

        try:
            prod_desc = prod_json['description']

        except KeyError:
            # Placeholder for product data that has no 'description' key
            prod_desc = 'N/A'

        all_products_with_des[i].append(prod_desc)
    # Pause after each chunk of requests.
    sleep(7)

In [92]:
#Testing to make sure the last item includes a description
# The slice is [-5:] rather than [-5:-1]: an end index of -1 would leave out the
# final row, which is the one this check is about.
all_products_with_des[-5:]

[['CLOTHING',
  3586729,
  'AMI Alexandre Mattiussi',
  'Red & White Smiley Edition Oxford Shirt',
  '191482M192012',
  195,
  'CAD',
  '/men/product/ami-alexandre-mattiussi/red-and-white-smiley-edition-oxford-shirt/3586729',
  'https://res.cloudinary.com/ssenseweb/image/upload/191482M192012_1.jpg',
  'Long sleeve cotton oxford shirt featuring check pattern in red and white. Buttoned-down spread collar. Button closure at front. Multicolor embroidered Smiley© patch at chest. Single-button barrel cuffs. Drop-tail hem. Tonal stitching. \r\n\r\nPart of the AMI Alexandre Mattiussi x Smiley Collaboration.'],
 ['CLOTHING',
  3713629,
  'Boss',
  'Blue Regular Fit Gordon Shirt',
  '182085M192010',
  103,
  'CAD',
  '/men/product/boss-hugo-boss/blue-regular-fit-gordon-shirt/3713629',
  'https://res.cloudinary.com/ssenseweb/image/upload/182085M192010_1.jpg',
  "Long sleeve cotton twill shirt in 'light' blue. Spread collar. Button closure at front. Adjustable single-button barrel cuffs. Shirttail

In [86]:
# Check that the number of rows is unchanged after the descriptions were appended

len(all_products_with_des)

30506

In [87]:
# Save the rows with their descriptions; the with-block closes the file even if pickling fails.
with open("all_products_desp_12042019.pickle","wb") as pickle_out:
    pickle.dump(all_products_with_des,pickle_out)

### Code Efficiency Improvements

The repeated timeouts most likely result from a large number of requests being sent to the site from the same IP address in quick succession. In addition, the time complexity of the nested loops over a very long range, with one request issued at a time, means a very long run time. Obtaining the 30,506 data points above took around 24 hours.

One solution is to use multiple virtual machines to help execute the code. However, a try/except pair could be used here instead: whenever a timeout occurs, the code would rest for a while before continuing.

This improvement should be incorporated into the code for the next data pull and will include sizes.