# IMPORTS

In [1]:
import requests
import pandas as pd
import numpy  as np

from datetime import datetime

from bs4 import BeautifulSoup

## Helper functions

# DATA EXTRACTION

In [8]:
# Listing page for men's jeans.
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# Browser-style User-Agent header sent with the request.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0'}

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as the listing.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.text, 'html.parser')

## URL for all products

In [9]:
# Page size used by the page-count arithmetic below.
# NOTE(review): presumably the listing's default page size - confirm.
ITEMS_PER_PAGE = 36

# getting number of products (read from the heading's data-total attribute)
load_more_headings = soup.find_all('h2', class_='load-more-heading')
if not load_more_headings:
    # Fail with a readable message instead of an anonymous IndexError.
    raise ValueError("no <h2 class='load-more-heading'> element found; cannot read the product total")
total_item = load_more_headings[0].get('data-total')

# calculating number of pages
page_number = np.ceil(int(total_item) / ITEMS_PER_PAGE)

# generating url: a page-size covering every page, so one request returns all products
url02 = url + '?page-size=' + str(int(page_number) * ITEMS_PER_PAGE)
url02

'https://www2.hm.com/en_us/men/products/jeans.html?page-size=108'

In [10]:
# Fetch the listing again through the page-size URL and re-parse it.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as the listing.
page = requests.get(url02, headers=headers, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.text, 'html.parser')

## Products details

In [11]:
# Container <ul> that holds the product listing.
products = soup.find(name='ul', class_='products-listing small')

# One <article> element per listed product.
product_list = products.find_all(name='article', class_='hm-product-item')

### Product ID


In [12]:
# Article code of each listed product, taken from the element's
# data-articlecode attribute (None when the attribute is absent).
product_id = [article.get('data-articlecode') for article in product_list]

### Product Category

In [13]:
# Category of each listed product, taken from the element's
# data-category attribute (None when the attribute is absent).
product_category = [article.get('data-category') for article in product_list]

### Product Name

In [14]:
# product name: text of every <a class="link"> inside the listing.
# The links get their own variable. Rebinding product_list here would make the
# product-id and product-category cells read <a> elements instead of <article>
# elements when they are re-run.
product_name_links = products.find_all('a', class_='link')
product_name = [link.get_text() for link in product_name_links]

### Product Price

In [15]:
# price: text of every <span class="price regular"> inside the listing.
# The spans get their own variable. Rebinding product_list here would make the
# product-id and product-category cells read price spans instead of <article>
# elements when they are re-run.
product_price_spans = products.find_all('span', class_='price regular')
product_price = [span.get_text() for span in product_price_spans]

### Creating Dataset

In [None]:
# Stack the four scraped lists as labelled rows, then transpose so that each
# list becomes a column and each product becomes a row.
data = pd.DataFrame(
    [product_id, product_category, product_name, product_price],
    index=['product_id', 'product_category', 'product_name', 'product_price'],
).T

# Timestamp of this scraping run, stored on every row.
scrape_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
data['scrapy_datetime'] = scrape_timestamp

In [60]:
# Preview the first five rows of the assembled dataset.
data.head(n=5)

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,1024256007,men_jeans_slim,Slim Jeans,$ 19.99,2022-10-29 09:00:31
1,985159001,men_jeans_skinny,Skinny Jeans,$ 24.99,2022-10-29 09:00:31
2,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-10-29 09:00:31
3,1071707008,men_jeans_relaxed,Relaxed Jeans,$ 29.99,2022-10-29 09:00:31
4,1100162002,men_jeans_regular,Essentials No 2: THE JEANS,$ 39.99,2022-10-29 09:00:31


### Product Color

In [82]:
# Visit every product page and record the colour name and article code
# exposed by the page's active filter-option link.
product_color = []
product_id = []

# Loop-invariant, so it is built once instead of on every iteration.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0'}

# iteration for each id product
for code in data['product_id']:
    url02 = 'https://www2.hm.com/en_us/productpage.' + str(code) + '.html'

    # The timeout keeps one stalled request from hanging the whole crawl.
    page = requests.get(url02, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    # The same element carries both attributes, so look it up once.
    active_option = soup.find('a', class_='filter-option miniature active')

    if active_option is None:
        # No active option on this page: keep a row for the product (colour
        # unknown) instead of aborting the crawl with an AttributeError.
        color_name = None
        product_code = str(code)
    else:
        # color
        color_name = active_option.get('data-color')

        # product id
        product_code = active_option.get('data-articlecode')

    product_color.append(color_name)
    product_id.append(product_code)


df_color = pd.DataFrame({'product_id': product_id, 'color_name': product_color})
df_color

Unnamed: 0,product_id,color_name
0,1024256007,Dark gray
1,0985159001,Black
2,1024256001,Black
3,1071707008,Denim blue
4,1100162002,Denim blue
...,...,...
86,0985197003,Denim blue
87,0811993037,Dark blue
88,1025726003,Graphite gray
89,1024256008,White


### Product Composition


In [90]:
# Visit every product page, parse its attribute list and keep one row per
# (article number, fit) holding the concatenated composition text.
# The per-product frames are collected in a list and concatenated once at the
# end, because concatenating inside the loop copies the growing frame each time.
composition_frames = []

# Loop-invariant, so it is built once instead of on every iteration.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0'}

# iteration for each id product
for code in data['product_id']:
    url02 = 'https://www2.hm.com/en_us/productpage.' + str(code) + '.html'

    # The timeout keeps one stalled request from hanging the whole crawl.
    page = requests.get(url02, headers=headers, timeout=30)
    soup = BeautifulSoup(page.text, 'html.parser')

    product_composition_list = soup.find_all('div', class_='details-attributes-list-item')

    # One list per attribute item: its non-empty text lines.
    product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]

    # After the transpose each attribute item is a column and row 0 holds the
    # first text line of every item.
    composition = pd.DataFrame(product_composition).T

    # Use that first row as the header. A plain list is assigned so the row
    # label does not become the name of the columns axis.
    composition.columns = composition.iloc[0].tolist()

    # Keep the three attributes of interest and rows 1-2 only, forward-filling
    # the gaps left by attributes that have fewer lines.
    composition = composition[['Fit', 'Composition', 'Art. No.']]
    composition = composition.iloc[1:3].ffill()
    composition_aux = composition.groupby(['Art. No.', 'Fit']).sum().reset_index()

    composition_frames.append(composition_aux)

# pd.concat raises on an empty list, so fall back to an empty frame.
df_composition = pd.concat(composition_frames) if composition_frames else pd.DataFrame()

df_composition

Unnamed: 0,Art. No.,Fit,Composition
0,1024256007,Slim fit,"Shell: Cotton 99%, Spandex 1%Pocket lining: Po..."
0,0985159001,Skinny fit,"Shell: Cotton 99%, Spandex 1%Pocket lining: Co..."
0,1024256001,Slim fit,"Shell: Cotton 99%, Spandex 1%Pocket lining: Po..."
0,1071707008,Relaxed fit,Shell: Cotton 100%Pocket lining: Polyester 65%...
0,1100162002,Regular fit,"Shell: Cotton 99%, Spandex 1%Pocket lining: Po..."
...,...,...,...
0,0985197003,Slim fit,"Shell: Cotton 99%, Spandex 1%Pocket lining: Po..."
0,0811993037,Regular fit,"Cotton 99%, Spandex 1%Cotton 99%, Spandex 1%"
0,1025726003,Relaxed fit,Shell: Cotton 100%Pocket lining: Polyester 65%...
0,1024256008,Slim fit,"Shell: Cotton 99%, Spandex 1%Pocket lining: Co..."


In [5]:
# Every attribute item on the page currently held in `soup`.
product_composition_list = soup.find_all(name='div', class_='details-attributes-list-item')

# For each item, split its text on newlines and drop the empty pieces.
product_composition = [
    [line for line in item.get_text().split('\n') if line]
    for item in product_composition_list
]

In [7]:
# Single-product version of the composition parsing, applied to the
# `product_composition` lists built in the previous cell.
# NOTE(review): this rebinds df_composition. If the cell that builds the
# all-products frame under the same name ran earlier, its result is replaced here.
df_composition = pd.DataFrame(product_composition).T

# rename dataframe: row 0 holds the first text line of every attribute item
df_composition.columns = df_composition.iloc[0]

# keep the three attributes of interest and rows 1-2 only
df_composition = df_composition[['Fit', 'Composition', 'Art. No.']]
# .ffill() replaces the deprecated fillna(method='ffill') and gives the same result
df_composition = df_composition.iloc[1:3].ffill()
df_composition.groupby(['Art. No.', 'Fit']).sum().reset_index()

Unnamed: 0,Art. No.,Fit,Composition
0,1024256007,Slim fit,"Shell: Cotton 99%, Spandex 1%Pocket lining: Po..."
