# Libraries

In [1]:
import pandas as pd
import numpy as np
import requests

from bs4 import BeautifulSoup
from datetime import datetime

# ID, name, category, price and datetime

In [3]:
url02 = "https://www2.hm.com/en_us/men/products/jeans.html?sort=stock&image-size=small&image=model&offset=0&page-size=72"

# conteúdo de headers é padrão
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url02, headers=headers)

soup = BeautifulSoup(page.text, 'html.parser')

products = soup.find('ul', 'products-listing small')
products_list = products.find_all('article', 'hm-product-item')
products_id = [p.get('data-articlecode') for p in products_list]
products_category = [p.get('data-category') for p in products_list]

product_name = products.find_all('a', 'link')
product_name = [p.get_text() for p in product_name]

product_price = products.find_all('span', 'price regular')
product_price = [p.get_text() for p in product_price]

data = pd.DataFrame([products_id, product_name, products_category, product_price]).T
data.columns = ['id', 'product_name', 'product_type', 'price']

datetime.now()
datetime.now().strftime('%Y-%m-%d %H:%M:%S')
data['datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [4]:
data

Unnamed: 0,id,product_name,product_type,price,datetime
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-22 11:05:58
1,0985159001,Skinny Jeans,men_jeans_skinny,$ 19.99,2022-01-22 11:05:58
2,0875105023,Relaxed Jeans,men_jeans_relaxed,$ 34.99,2022-01-22 11:05:58
3,1024256003,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-22 11:05:58
4,0690449043,Skinny Jeans,men_jeans_ripped,$ 44.99,2022-01-22 11:05:58
...,...,...,...,...,...
63,1025726003,Relaxed Jeans,men_jeans_relaxed,$ 39.99,2022-01-22 11:05:58
64,0993887002,Hybrid Regular Denim Joggers,men_jeans_regular,$ 44.99,2022-01-22 11:05:58
65,1008110003,Freefit® Slim Jeans,men_jeans_slim,$ 59.99,2022-01-22 11:05:58
66,0865734001,Relaxed Tapered Pull-on Jeans,men_jeans_relaxed,$ 29.99,2022-01-22 11:05:58


# Color, Fit, composition, more sustainable materials and size 

In [31]:
total_itens = soup.find_all('h2', 'load-more-heading')[0].get('data-total')
total_itens

'68'

In [32]:
pagination_number = np.round(int(total_itens)/36)
pagination_number

2.0

In [33]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

#empty dataframe
df_final = pd.DataFrame()

# All columns found on website
cols = ['Art. No.', 'Composition', 'Fit', 'More sustainable materials', 'Size']
df_pattern = pd.DataFrame(columns=cols)

for i in range(len(data)):

    #API request
    # conteúdo de headers é padrão
    url02 = "https://www2.hm.com/en_us/productpage." + data.loc[i, 'id'] + ".html"+ "?page-size=" + str(int(pagination_number*36))

    page = requests.get(url02, headers=headers)

    #Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # ============================= Color =========================

    #product list
    product_list = soup.find_all('a', 'filter-option miniature')

    #color
    product_color = [p.get('data-color') for p in product_list] 

    #id
    product_id = [p.get('data-articlecode') for p in product_list]

    #dataframe
    df_color = pd.DataFrame([product_id, product_color]).T
    df_color.columns = ['id', 'color']

    #generate style id + color id
    df_color['style_id'] = df_color['id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['id'].apply(lambda x: x[-3:])

    # ============================ Composition =====================

    # Product list
    product_composition_list = soup.find_all('div', 'pdp-description-list-item')

    # Composition
    product_composition = [list( filter( None, p.get_text().split('\n') ) ) for p in product_composition_list]

  
    # dataframe
    df_composition = pd.DataFrame(product_composition).T

    # Columns name
    df_composition.columns = df_composition.iloc[0]

    # Filling None/NA values
    df_composition = df_composition.iloc[1:].fillna(method='ffill')

    # The same number of columns (pattern)
    df_composition = pd.concat( [df_pattern, df_composition] )
    
    # Generate Style ID + Color ID
    # All values, but the last three values
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])
    
    # ======================= Merging color + composition ==========================
    data_merge = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition', 'More sustainable materials', 'Size']], how='left', on='style_id')

    # ======================= Concatenate ==========================================
    df_final = pd.concat( [df_final, data_merge], axis=0 )

In [35]:
# Creating style_id + color_id
data['style_id'] = data['id'].apply(lambda x: x[:-3])
data['color_id'] = data['id'].apply(lambda x: x[-3:])

data_raw = pd.merge( data, df_final[['color', 'style_id', 'Fit', 'Composition', 'More sustainable materials', 'Size']], how='left', on='style_id')

In [36]:
data_raw

Unnamed: 0,id,product_name,product_type,price,datetime,style_id,color_id,color,Fit,Composition,More sustainable materials,Size
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-22 11:05:58,1024256,001,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
1,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-22 11:05:58,1024256,001,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
2,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-22 11:05:58,1024256,001,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
3,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-22 11:05:58,1024256,001,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
4,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-22 11:05:58,1024256,001,Denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
...,...,...,...,...,...,...,...,...,...,...,...,...
3411,0865734001,Relaxed Tapered Pull-on Jeans,men_jeans_relaxed,$ 29.99,2022-01-22 11:05:58,0865734,001,Gray,Relaxed fit,Cotton 100%,,
3412,0865734001,Relaxed Tapered Pull-on Jeans,men_jeans_relaxed,$ 29.99,2022-01-22 11:05:58,0865734,001,Pale denim blue,Relaxed fit,Cotton 100%,,
3413,0927966005,Slim Tapered Jeans,men_jeans_slim,$ 29.99,2022-01-22 11:05:58,0927966,005,Light denim blue,Slim fit,"Cotton 98%, Spandex 2%",,
3414,0927966005,Slim Tapered Jeans,men_jeans_slim,$ 29.99,2022-01-22 11:05:58,0927966,005,Denim blue,Slim fit,"Cotton 98%, Spandex 2%",,


# Teste cor preta

In [37]:
soup.find_all('a', role='radio')

[<a aria-checked="false" class="filter-option miniature" data-articlecode="0927966001" data-color="Light denim blue" data-sizes="" href="/en_us/productpage.0927966001.html" id="filter-colour-0927966001" role="radio" title="Light denim blue">
 <noscript data-alt="Light denim blue" data-src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2F0a%2Fe0%2F0ae04a8a72fa18b958a3bf975c9ca667b66985fe.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5Bmen_jeans_slim%5D%2Ctype%5BDESCRIPTIVESTILLLIFE%5D%2Cres%5Bm%5D%2Chmver%5B2%5D&amp;call=url[file:/product/miniature]">
 <img alt="Light denim blue" src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2F0a%2Fe0%2F0ae04a8a72fa18b958a3bf975c9ca667b66985fe.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5Bmen_jeans_slim%5D%2Ctype%5BDESCRIPTIVESTILLLIFE%5D%2Cres%5Bm%5D%2Chmver%5B2%5D&amp;call=url[file:/product/miniature]"/>
 </noscript>
 <span></span>
 </a>,
 <a aria-checked="false" class="filter-option miniature" data-articlecode="0927966002" data-color="Denim bl

In [38]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

#empty dataframe
df_final = pd.DataFrame()

# All columns found on website
cols = ['Art. No.', 'Composition', 'Fit', 'More sustainable materials', 'Size']
df_pattern = pd.DataFrame(columns=cols)

for i in range(len(data)):

    #API request
    # conteúdo de headers é padrão
    url02 = "https://www2.hm.com/en_us/productpage." + data.loc[i, 'id'] + ".html"+ "?page-size=" + str(int(pagination_number*36))

    page = requests.get(url02, headers=headers)

    #Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # ============================= Color =========================

    #product list
    product_list = soup.find_all('a', role='radio')

    #color
    product_color = [p.get('data-color') for p in product_list] 

    #id
    product_id = [p.get('data-articlecode') for p in product_list]

    #dataframe
    df_color = pd.DataFrame([product_id, product_color]).T
    df_color.columns = ['id', 'color']

    #generate style id + color id
    df_color['style_id'] = df_color['id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['id'].apply(lambda x: x[-3:])

    # ============================ Composition =====================

    # Product list
    product_composition_list = soup.find_all('div', 'pdp-description-list-item')

    # Composition
    product_composition = [list( filter( None, p.get_text().split('\n') ) ) for p in product_composition_list]

  
    # dataframe
    df_composition = pd.DataFrame(product_composition).T

    # Columns name
    df_composition.columns = df_composition.iloc[0]

    # Filling None/NA values
    df_composition = df_composition.iloc[1:].fillna(method='ffill')

    # The same number of columns (pattern)
    df_composition = pd.concat( [df_pattern, df_composition] )
    
    # Generate Style ID + Color ID
    # All values, but the last three values
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])
    
    # ======================= Merging color + composition ==========================
    data_merge = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition', 'More sustainable materials', 'Size']], how='left', on='style_id')

    # ======================= Concatenate ==========================================
    df_final = pd.concat( [df_final, data_merge], axis=0 )

In [39]:
# Creating style_id + color_id
data['style_id'] = data['id'].apply(lambda x: x[:-3])
data['color_id'] = data['id'].apply(lambda x: x[-3:])

data_raw = pd.merge( data, df_final[['color', 'style_id', 'Fit', 'Composition', 'More sustainable materials', 'Size']], how='left', on='style_id')

In [40]:
data_raw

Unnamed: 0,id,product_name,product_type,price,datetime,style_id,color_id,color,Fit,Composition,More sustainable materials,Size
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-22 11:05:58,1024256,001,Black,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
1,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-22 11:05:58,1024256,001,Black,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
2,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-22 11:05:58,1024256,001,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
3,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-22 11:05:58,1024256,001,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
4,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-22 11:05:58,1024256,001,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
...,...,...,...,...,...,...,...,...,...,...,...,...
3902,0865734001,Relaxed Tapered Pull-on Jeans,men_jeans_relaxed,$ 29.99,2022-01-22 11:05:58,0865734,001,Pale denim blue,Relaxed fit,Cotton 100%,,
3903,0927966005,Slim Tapered Jeans,men_jeans_slim,$ 29.99,2022-01-22 11:05:58,0927966,005,Light denim blue,Slim fit,"Cotton 98%, Spandex 2%",,
3904,0927966005,Slim Tapered Jeans,men_jeans_slim,$ 29.99,2022-01-22 11:05:58,0927966,005,Denim blue,Slim fit,"Cotton 98%, Spandex 2%",,
3905,0927966005,Slim Tapered Jeans,men_jeans_slim,$ 29.99,2022-01-22 11:05:58,0927966,005,Black,Slim fit,"Cotton 98%, Spandex 2%",,


# Tratando os dados

In [41]:
data_raw['price'].isna().sum()

0

In [42]:
data_raw['price'].apply(lambda x: x.replace('$', ''))

0        19.99
1        19.99
2        19.99
3        19.99
4        19.99
         ...  
3902     29.99
3903     29.99
3904     29.99
3905     29.99
3906     29.99
Name: price, Length: 3907, dtype: object

In [43]:
data_raw['price'].apply(lambda x: x.replace('$ ', '') if pd.notnull(x) else x)

0       19.99
1       19.99
2       19.99
3       19.99
4       19.99
        ...  
3902    29.99
3903    29.99
3904    29.99
3905    29.99
3906    29.99
Name: price, Length: 3907, dtype: object