# Libraries

In [1]:
import pandas as pd
import numpy as np
import requests

from bs4 import BeautifulSoup
from datetime import datetime

# ID, name, category, price and datetime

In [2]:
# Scrape the men's jeans listing page and build one row per product:
# id, name, category, price, plus the timestamp of the scrape.
url02 = "https://www2.hm.com/en_us/men/products/jeans.html?sort=stock&image-size=small&image=model&offset=0&page-size=72"

# Standard browser-like User-Agent header sent with the request
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url02, headers=headers, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page
page.raise_for_status()

soup = BeautifulSoup(page.text, 'html.parser')

products = soup.find('ul', 'products-listing small')
if products is None:
    raise ValueError("Listing container <ul class='products-listing small'> not found in the response")

# id and category are attributes of each product <article>
products_list = products.find_all('article', 'hm-product-item')
products_id = [p.get('data-articlecode') for p in products_list]
products_category = [p.get('data-category') for p in products_list]

# names and prices are collected over the whole listing, independently of the articles
product_name = [p.get_text() for p in products.find_all('a', 'link')]
product_price = [p.get_text() for p in products.find_all('span', 'price regular')]

# The lists are combined positionally, so differing lengths would silently
# attach names/prices to the wrong product ids.
if not (len(products_id) == len(products_category) == len(product_name) == len(product_price)):
    raise ValueError(
        "Scraped lists are misaligned: "
        f"{len(products_id)} ids, {len(products_category)} categories, "
        f"{len(product_name)} names, {len(product_price)} prices"
    )

data = pd.DataFrame({
    'id': products_id,
    'product_name': product_name,
    'product_type': products_category,
    'price': product_price,
})

# Scrape timestamp, stored as a formatted string and repeated on every row
data['datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [3]:
# Display the listing table built in the previous cell
data

Unnamed: 0,id,product_name,product_type,price,datetime
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 10:09:09
1,1024256003,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 10:09:09
2,0985159001,Skinny Jeans,men_jeans_skinny,$ 24.99,2022-01-24 10:09:09
3,0690449056,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 10:09:09
4,1024256004,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 10:09:09
...,...,...,...,...,...
59,0974597001,Slim Tapered Jeans,men_jeans_slim,$ 29.99,2022-01-24 10:09:09
60,0974202002,Regular Denim Joggers,men_jeans_loose,$ 29.99,2022-01-24 10:09:09
61,1025726003,Relaxed Jeans,men_jeans_relaxed,$ 39.99,2022-01-24 10:09:09
62,0993887002,Hybrid Regular Denim Joggers,men_jeans_regular,$ 44.99,2022-01-24 10:09:09


# Color, Fit, composition, more sustainable materials and size 

In [4]:
# Total item count, read from the 'data-total' attribute of the first
# <h2 class="load-more-heading"> in `soup`; the value is a string.
load_more_headings = soup.find_all('h2', class_='load-more-heading')
total_itens = load_more_headings[0].get('data-total')
total_itens

'64'

In [5]:
# Number of 36-item pages needed to cover every product.
# Round UP: np.round would drop a final partial page whenever the fraction is
# below .5 (e.g. 80 items / 36 -> 2 pages -> only 72 items requested).
pagination_number = np.ceil(int(total_itens) / 36)
pagination_number

2.0

In [6]:
# For every product in `data`, fetch its product page and collect:
#   - the colours listed on the page (anchors with class 'filter-option miniature')
#   - the description items (Fit, Composition, ...), one column per item
# The per-page results are combined into `df_final`.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# All columns found on website
cols = ['Art. No.', 'Composition', 'Fit', 'More sustainable materials', 'Size']
df_pattern = pd.DataFrame(columns=cols)

# Per-page frames are gathered here and concatenated once after the loop;
# calling pd.concat inside the loop re-copies the growing frame on every pass.
details_frames = []

for i in range(len(data)):

    # Product page request
    url02 = "https://www2.hm.com/en_us/productpage." + data.loc[i, 'id'] + ".html"+ "?page-size=" + str(int(pagination_number*36))

    page = requests.get(url02, headers=headers, timeout=30)
    # Stop with a clear HTTP error rather than failing later on an unparseable page
    page.raise_for_status()

    # Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # ============================= Color =========================

    # colour anchors
    product_list = soup.find_all('a', 'filter-option miniature')

    # colour name and article code of each anchor
    product_color = [p.get('data-color') for p in product_list]
    product_id = [p.get('data-articlecode') for p in product_list]

    df_color = pd.DataFrame({'id': product_id, 'color': product_color})

    # style id = code without its last three characters, colour id = last three.
    # The .str accessor leaves missing codes as NaN instead of raising.
    df_color['style_id'] = df_color['id'].str[:-3]
    df_color['color_id'] = df_color['id'].str[-3:]

    # ============================ Composition =====================

    # Description items
    product_composition_list = soup.find_all('div', 'pdp-description-list-item')

    # Each item becomes a list of its non-empty text lines
    product_composition = [list( filter( None, p.get_text().split('\n') ) ) for p in product_composition_list]

    # One column per item after transposing
    df_composition = pd.DataFrame(product_composition).T

    # The first row is used as the column names
    df_composition.columns = df_composition.iloc[0]

    # Forward-fill the gaps left by items with fewer lines
    # (DataFrame.ffill replaces the deprecated fillna(method='ffill'))
    df_composition = df_composition.iloc[1:].ffill()

    # Guarantee the expected columns exist even if the page lacks some of them
    df_composition = pd.concat( [df_pattern, df_composition] )

    # Generate Style ID + Color ID from the article number
    df_composition['style_id'] = df_composition['Art. No.'].str[:-3]
    df_composition['color_id'] = df_composition['Art. No.'].str[-3:]

    # ======================= Merging color + composition ==========================
    data_merge = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition', 'More sustainable materials', 'Size']], how='left', on='style_id')

    details_frames.append(data_merge)

# ======================= Concatenate ==========================================
# An empty `data` leaves nothing to concatenate; keep df_final an empty frame then
df_final = pd.concat(details_frames, axis=0) if details_frames else pd.DataFrame()

In [7]:
# Creating style_id + color_id
data['style_id'] = data['id'].apply(lambda x: x[:-3])
data['color_id'] = data['id'].apply(lambda x: x[-3:])

data_raw = pd.merge( data, df_final[['color', 'style_id', 'Fit', 'Composition', 'More sustainable materials', 'Size']], how='left', on='style_id')

In [8]:
# Display the listing data merged with the colour/description details
data_raw

Unnamed: 0,id,product_name,product_type,price,datetime,style_id,color_id,color,Fit,Composition,More sustainable materials,Size
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 10:09:09,1024256,001,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
1,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 10:09:09,1024256,001,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
2,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 10:09:09,1024256,001,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
3,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 10:09:09,1024256,001,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
4,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 10:09:09,1024256,001,Denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
...,...,...,...,...,...,...,...,...,...,...,...,...
3190,0993887002,Hybrid Regular Denim Joggers,men_jeans_regular,$ 44.99,2022-01-24 10:09:09,0993887,002,Black,Regular fit,"Cotton 77%, Polyester 21%, Spandex 2%",Recycled cotton 20%,"The model is 189cm/6'2"" and wears a size 32/32"
3191,0865734001,Relaxed Tapered Pull-on Jeans,men_jeans_relaxed,$ 29.99,2022-01-24 10:09:09,0865734,001,Denim blue,Relaxed fit,Cotton 100%,,
3192,0865734001,Relaxed Tapered Pull-on Jeans,men_jeans_relaxed,$ 29.99,2022-01-24 10:09:09,0865734,001,Light denim blue,Relaxed fit,Cotton 100%,,
3193,0865734001,Relaxed Tapered Pull-on Jeans,men_jeans_relaxed,$ 29.99,2022-01-24 10:09:09,0865734,001,Gray,Relaxed fit,Cotton 100%,,


# Test: black color

In [9]:
# Inspect every <a role="radio"> in `soup`. The loop above rebinds `soup` on each
# iteration, so when run in order this is the last product page it parsed.
# NOTE(review): presumably checks whether this selector also picks up the
# colour anchors that the class-based selector does not - confirm.
soup.find_all('a', role='radio')

[<a aria-checked="true" class="filter-option miniature active" data-articlecode="0865734001" data-color="Light denim blue" data-sizes="" href="/en_us/productpage.0865734001.html" id="filter-colour-0865734001" role="radio" title="Light denim blue">
 <noscript data-alt="Light denim blue" data-src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2Fba%2F62%2Fba620ca501a0869c9bd42a7d10a90a97b6f121b0.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5Bmen_jeans_tapered%5D%2Ctype%5BDESCRIPTIVESTILLLIFE%5D%2Cres%5Bm%5D%2Chmver%5B1%5D&amp;call=url[file:/product/miniature]">
 <img alt="Light denim blue" src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2Fba%2F62%2Fba620ca501a0869c9bd42a7d10a90a97b6f121b0.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5Bmen_jeans_tapered%5D%2Ctype%5BDESCRIPTIVESTILLLIFE%5D%2Cres%5Bm%5D%2Chmver%5B1%5D&amp;call=url[file:/product/miniature]"/>
 </noscript>
 <span></span>
 </a>,
 <a aria-checked="false" class="filter-option miniature" data-articlecode="0865734002" data-col

In [10]:
# For every product in `data`, fetch its product page and collect:
#   - the colours listed on the page (anchors with role="radio")
#   - the description items (Fit, Composition, ...), one column per item
# The per-page results are combined into `df_final`.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# All columns found on website
cols = ['Art. No.', 'Composition', 'Fit', 'More sustainable materials', 'Size']
df_pattern = pd.DataFrame(columns=cols)

# Per-page frames are gathered here and concatenated once after the loop;
# calling pd.concat inside the loop re-copies the growing frame on every pass.
details_frames = []

for i in range(len(data)):

    # Product page request
    url02 = "https://www2.hm.com/en_us/productpage." + data.loc[i, 'id'] + ".html"+ "?page-size=" + str(int(pagination_number*36))

    page = requests.get(url02, headers=headers, timeout=30)
    # Stop with a clear HTTP error rather than failing later on an unparseable page
    page.raise_for_status()

    # Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # ============================= Color =========================

    # colour anchors, selected by their role attribute
    product_list = soup.find_all('a', role='radio')

    # colour name and article code of each anchor
    product_color = [p.get('data-color') for p in product_list]
    product_id = [p.get('data-articlecode') for p in product_list]

    df_color = pd.DataFrame({'id': product_id, 'color': product_color})

    # style id = code without its last three characters, colour id = last three.
    # The .str accessor leaves missing codes as NaN instead of raising.
    df_color['style_id'] = df_color['id'].str[:-3]
    df_color['color_id'] = df_color['id'].str[-3:]

    # ============================ Composition =====================

    # Description items
    product_composition_list = soup.find_all('div', 'pdp-description-list-item')

    # Each item becomes a list of its non-empty text lines
    product_composition = [list( filter( None, p.get_text().split('\n') ) ) for p in product_composition_list]

    # One column per item after transposing
    df_composition = pd.DataFrame(product_composition).T

    # The first row is used as the column names
    df_composition.columns = df_composition.iloc[0]

    # Forward-fill the gaps left by items with fewer lines
    # (DataFrame.ffill replaces the deprecated fillna(method='ffill'))
    df_composition = df_composition.iloc[1:].ffill()

    # Guarantee the expected columns exist even if the page lacks some of them
    df_composition = pd.concat( [df_pattern, df_composition] )

    # Generate Style ID + Color ID from the article number
    df_composition['style_id'] = df_composition['Art. No.'].str[:-3]
    df_composition['color_id'] = df_composition['Art. No.'].str[-3:]

    # ======================= Merging color + composition ==========================
    data_merge = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition', 'More sustainable materials', 'Size']], how='left', on='style_id')

    details_frames.append(data_merge)

# ======================= Concatenate ==========================================
# An empty `data` leaves nothing to concatenate; keep df_final an empty frame then
df_final = pd.concat(details_frames, axis=0) if details_frames else pd.DataFrame()

In [11]:
# Creating style_id + color_id
data['style_id'] = data['id'].apply(lambda x: x[:-3])
data['color_id'] = data['id'].apply(lambda x: x[-3:])

data_raw = pd.merge( data, df_final[['color', 'style_id', 'Fit', 'Composition', 'More sustainable materials', 'Size']], how='left', on='style_id')

In [12]:
# Display the listing data merged with the colour/description details
data_raw

Unnamed: 0,id,product_name,product_type,price,datetime,style_id,color_id,color,Fit,Composition,More sustainable materials,Size
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 10:09:09,1024256,001,Black,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
1,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 10:09:09,1024256,001,Black,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
2,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 10:09:09,1024256,001,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
3,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 10:09:09,1024256,001,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
4,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 10:09:09,1024256,001,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
...,...,...,...,...,...,...,...,...,...,...,...,...
3652,0865734001,Relaxed Tapered Pull-on Jeans,men_jeans_relaxed,$ 29.99,2022-01-24 10:09:09,0865734,001,Light denim blue,Relaxed fit,Cotton 100%,,
3653,0865734001,Relaxed Tapered Pull-on Jeans,men_jeans_relaxed,$ 29.99,2022-01-24 10:09:09,0865734,001,Denim blue,Relaxed fit,Cotton 100%,,
3654,0865734001,Relaxed Tapered Pull-on Jeans,men_jeans_relaxed,$ 29.99,2022-01-24 10:09:09,0865734,001,Light denim blue,Relaxed fit,Cotton 100%,,
3655,0865734001,Relaxed Tapered Pull-on Jeans,men_jeans_relaxed,$ 29.99,2022-01-24 10:09:09,0865734,001,Gray,Relaxed fit,Cotton 100%,,


# Data cleaning

Check for missing (NaN) values:

In [13]:
# Count the rows whose price is missing
data_raw['price'].isnull().sum()

0

If there are no NaN values:

In [14]:
# Remove the currency symbol and the whitespace left around the number
# (replacing only '$' leaves a leading space in every value).
# A plain str method call is enough here only because no price is missing.
data_raw['price'].apply(lambda x: x.replace('$', '').strip())

0        19.99
1        19.99
2        19.99
3        19.99
4        19.99
         ...  
3652     29.99
3653     29.99
3654     29.99
3655     29.99
3656     29.99
Name: price, Length: 3657, dtype: object

If there are NaN values:

In [15]:
# NaN-safe variant: missing prices pass through untouched,
# every other value has each '$ ' occurrence removed
data_raw['price'].map(lambda price: price if pd.isnull(price) else price.replace('$ ', ''))

0       19.99
1       19.99
2       19.99
3       19.99
4       19.99
        ...  
3652    29.99
3653    29.99
3654    29.99
3655    29.99
3656    29.99
Name: price, Length: 3657, dtype: object

Another way, applying the function row by row:

In [16]:
# Same NaN-safe cleaning, done over the whole frame:
# axis=1 hands each row to the lambda, which reads that row's 'price'
data_raw.apply(
    lambda row: row['price'] if pd.isnull(row['price']) else row['price'].replace('$ ', ''),
    axis=1,
)

0       19.99
1       19.99
2       19.99
3       19.99
4       19.99
        ...  
3652    29.99
3653    29.99
3654    29.99
3655    29.99
3656    29.99
Length: 3657, dtype: object