# Web-Scraping on SSENSE.COM - Men's 

In [26]:
#Importing packages

from bs4 import BeautifulSoup
import requests
import json
from time import sleep

## Getting a sense of HTML codes on the site

In [37]:
mens_page = requests.get("https://ssense.com/en-ca/men")
mens_soup = BeautifulSoup(mens_page.content, 'html.parser')

#print(mens_soup.prettify())

## Testing individual steps

### Category Level

In [3]:
#Getting all categories on men's products

cat = mens_soup.find_all('ul', class_='nav nav--stacked')[0]
categories = cat.find_all('li')
categories = [i.get_text().strip() for i in categories][1:]

print(categories)

['ACCESSORIES', 'BAGS', 'CLOTHING', 'SHOES']


In [4]:
#Getting URLs to each categories

categories_url = cat.find_all('a',href=True)
categories_url = [i['href'] for i in categories_url][1:]

In [5]:
#Dictionary containing men's categories and their URLs 

mens_categories = dict(zip(categories, categories_url))
print(mens_categories)

{'ACCESSORIES': '/en-ca/men/accessories', 'BAGS': '/en-ca/men/bags', 'CLOTHING': '/en-ca/men/clothing', 'SHOES': '/en-ca/men/shoes'}


In [36]:
#Diving into individual page of each categories - Example: Accessories

cat_page = requests.get("https://ssense.com" + mens_categories['ACCESSORIES'])
cat_soup = BeautifulSoup(cat_page.content, 'html.parser')

#print(cat_soup.prettify())

In [9]:
#Number of pages for a particular category

cat_soup.find_all('li', class_='last-page')[0].get_text()

'91'

In [10]:
#Getting product information of a particular item on a page

json.loads(cat_soup.find_all("script", {"type": "application/ld+json"})[59].get_text())

{'@context': 'https://schema.org',
 '@type': 'Product',
 'productID': 5057171,
 'name': 'Blue Dystopy 384 Glasses',
 'sku': '201463M133009',
 'brand': {'@type': 'Brand', 'name': 'Thierry Lasry'},
 'offers': {'@type': 'Offer',
  'price': 560,
  'priceCurrency': 'CAD',
  'availability': 'https://schema.org/InStock',
  'url': '/men/product/thierry-lasry/blue-dystopy-384-glasses/5057171'},
 'url': '/men/product/thierry-lasry/blue-dystopy-384-glasses/5057171',
 'image': 'https://res.cloudinary.com/ssenseweb/image/upload/201463M133009_1.jpg'}

In [11]:
#Number of products or items on a page

len(cat_soup.find_all("script", {"type": "application/ld+json"}))

60

### Product Level

In [38]:
#Getting HTML codes for a particular product

product = requests.get("https://ssense.com/en-ca/men/product/random-identities/black-faux-leather-hoodie/5230331")
product_soup = BeautifulSoup(product.content, 'html.parser')

#print(product_soup.prettify())

In [13]:
#Getting information of a product on product page.

js = json.loads(product_soup.find_all("script", {"type": "application/ld+json"})[0].get_text())

js

{'@context': 'https://schema.org',
 '@type': 'Product',
 'productID': 5230331,
 'name': 'Black Faux-Leather Hoodie',
 'sku': '192172M202054',
 'brand': {'@type': 'Brand', 'name': 'Random Identities'},
 'offers': {'@type': 'Offer',
  'price': 761,
  'priceCurrency': 'CAD',
  'availability': 'https://schema.org/InStock',
  'url': '/en-ca/men/product/random-identities/black-faux-leather-hoodie/5230331'},
 'url': '/en-ca/men/product/random-identities/black-faux-leather-hoodie/5230331',
 'description': 'Long sleeve grained faux-leather hoodie in black. Bungee-style drawstring at hood. Zip closure at front. Patch pockets at waist. Elasticized cuffs and hem. Raglan sleeves. Full faux-fur lining in green. Silver-tone hardware. \r\n\r\nSupplier color: Black',
 'image': ['https://res.cloudinary.com/ssenseweb/image/upload/192172M202054_1.jpg',
  'https://res.cloudinary.com/ssenseweb/image/upload/192172M202054_2.jpg',
  'https://res.cloudinary.com/ssenseweb/image/upload/192172M202054_3.jpg',
  'ht

On product page, it has more info regarding the description of the product which the one on category page does not have.

In [14]:
#Product name

test_name = js['name']
test_name

'Black Faux-Leather Hoodie'

In [15]:
#Product SKU

test_sku = js['sku']
test_sku

'192172M202054'

In [16]:
#Product Price

test_price = js['offers']['price']
test_price

761

In [17]:
#Product Price currency

test_currency = js['offers']['priceCurrency']
test_currency

'CAD'

In [18]:
#Product description

test_description = js['description']
test_description

'Long sleeve grained faux-leather hoodie in black. Bungee-style drawstring at hood. Zip closure at front. Patch pockets at waist. Elasticized cuffs and hem. Raglan sleeves. Full faux-fur lining in green. Silver-tone hardware. \r\n\r\nSupplier color: Black'

In [19]:
#Product's current sizes

test_size = product_soup.find_all('option')

test_size = [i.get_text().strip() for i in test_size][1:]

print(test_size)

['XS - Sold Out', 'S', 'L']


### Putting Everything Together

Looping through every pages of each categories to obtain all info (except sizes for now) of all products and combining them together into a list of list

In [30]:
all_products = []

for i, j in mens_categories.items():
    cat_page = requests.get("https://ssense.com" + j, timeout = 5)
    cat_soup = BeautifulSoup(cat_page.content, 'html.parser')
    cat_page.close()
    
    last_page = int(cat_soup.find_all('li', class_='last-page')[0].get_text())
    
    #first page
    
    num_product = len(cat_soup.find_all("script", {"type": "application/ld+json"}))
    
    for p in range(num_product):
        product_json = json.loads(cat_soup.find_all("script", {"type": "application/ld+json"})[p].get_text())
        
        prod_id = product_json['productID']
        prod_name = product_json['name']
        prod_sku = product_json['sku']
        prod_brand = product_json['brand']['name']
        prod_price = product_json['offers']['price']
        prod_currency = product_json['offers']['priceCurrency']
        prod_url = product_json['url']
        prod_img = product_json['image']                  

        prod_details = [i, prod_id, prod_brand, prod_name, prod_sku, prod_price, prod_currency, prod_url, prod_img]
        
        all_products.append(prod_details)
        
    sleep(5)
        
    for page in range(1,last_page):
        cat_page = requests.get("https://ssense.com" + j + '?page=' + str(page+1), timeout = 5)
        cat_soup = BeautifulSoup(cat_page.content, 'html.parser')
        cat_page.close()
        
        num_product = len(cat_soup.find_all("script", {"type": "application/ld+json"}))

        for p in range(num_product):
            product_json = json.loads(cat_soup.find_all("script", {"type": "application/ld+json"})[p].get_text())
        
            prod_id = product_json['productID']
            prod_name = product_json['name']
            prod_sku = product_json['sku']
            prod_brand = product_json['brand']['name']
            prod_price = product_json['offers']['price']
            prod_currency = product_json['offers']['priceCurrency']
            prod_url = product_json['url']
            prod_img = product_json['image']                  

            prod_details = [i, prod_id, prod_brand, prod_name, prod_sku, prod_price, prod_currency, prod_url, prod_img]
        
            all_products.append(prod_details)
        
        sleep(5)

ReadTimeout: HTTPSConnectionPool(host='www.ssense.com', port=443): Read timed out. (read timeout=5)

In [None]:
#Getting the product description

for prod in all_products:
     
    prod_page = requests.get("https://www.ssense.com/en-ca" + prod[7], timeout = 5)
    prod_soup = BeautifulSoup(prod_page.content, 'html.parser')        
    prod_desc = json.loads(prod_soup.find_all("script", {"type": "application/ld+json"})[0].get_text())['description']
    
    prod.append(prod_desc)
    
    sleep(5)

In [35]:
all_products[:10]

[['ACCESSORIES',
  4607571,
  'Loewe',
  'Blue William De Morgan Edition Dragon Beanie',
  '201677M138033',
  1500,
  'CAD',
  '/men/product/loewe/blue-william-de-morgan-edition-dragon-beanie/4607571',
  'https://res.cloudinary.com/ssenseweb/image/upload/201677M138033_1.jpg'],
 ['ACCESSORIES',
  4534841,
  'Off-White',
  'Black Industrial Belt',
  '201607M131235',
  285,
  'CAD',
  '/men/product/off-white/black-industrial-belt/4534841',
  'https://res.cloudinary.com/ssenseweb/image/upload/201607M131235_1.jpg'],
 ['ACCESSORIES',
  4534851,
  'Off-White',
  'Yellow Industrial Belt',
  '201607M131236',
  285,
  'CAD',
  '/men/product/off-white/yellow-industrial-belt/4534851',
  'https://res.cloudinary.com/ssenseweb/image/upload/201607M131236_1.jpg'],
 ['ACCESSORIES',
  4534861,
  'Off-White',
  'Black Mini Industrial Belt',
  '201607M131237',
  240,
  'CAD',
  '/men/product/off-white/black-mini-industrial-belt/4534861',
  'https://res.cloudinary.com/ssenseweb/image/upload/201607M131237_1.

In [32]:
len(all_products)

12277