## Imports

In [1]:
from datetime import datetime
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re

## Data Collection

In [2]:
# Scrape the H&M men's jeans showroom listing into `data`, one row per listed
# article, with the columns: product_id, product_category, products_name,
# products_price.
url = 'https://www2.hm.com/en_us/men/products/jeans.html?sort=stock&image-size=small&image=model&offset=0&page-size=108'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# The timeout keeps a stalled connection from hanging the kernel forever, and
# raise_for_status() stops an HTTP error page from being parsed as a listing.
page = requests.get(url, headers=headers, timeout=30)
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

products = soup.find('ul', class_="products-listing small")
if products is None:
    # Fail with a readable message instead of an AttributeError on None below.
    raise RuntimeError('Product listing container not found in {}'.format(url))

products_list = products.find_all('article', class_='hm-product-item')

### Product_ID ###
product_id = [article.get('data-articlecode') for article in products_list]

### Product_Category ###
product_category = [article.get('data-category') for article in products_list]

### Product_Name ###
name_links = products.find_all('a', class_='link')
product_name = [link.get_text() for link in name_links]

### Product_price ###
price_spans = products.find_all('span', class_='price regular')
product_price = [span.get_text() for span in price_spans]

# The four lists are collected independently, so they only line up row by row
# when they have the same length; say so loudly when they do not.
if not (len(product_id) == len(product_category) == len(product_name) == len(product_price)):
    print('Warning: listing fields have different lengths '
          '(ids={}, categories={}, names={}, prices={}); rows may be misaligned'.format(
              len(product_id), len(product_category), len(product_name), len(product_price)))

data = pd.DataFrame([product_id, product_category, product_name, product_price]).T
data.columns = ['product_id', 'product_category', 'products_name', 'products_price']

# Rows without an article code cannot be turned into a product-page URL later,
# so drop them and keep a clean 0..n-1 index for positional access.
data = data.dropna(subset=['product_id']).reset_index(drop=True)




## Collect all attributes 

In [15]:
# Visit every product page found in the showroom listing (`data`), follow each
# colour variant linked from it, and collect name, price and the description
# attributes of every variant into `df_compositions`.

# Seconds to wait for a single HTTP response before skipping that page.
REQUEST_TIMEOUT = 30

# Unique columns for all products
aux = []

# Attribute labels kept from each product page; `detail_columns` holds the
# snake_case names they are renamed to, in the same order.
df_pattern = pd.DataFrame(columns=['Art. No.', 'Composition', 'Fit', 'Product safety', 'Size','More sustainable materials'])
detail_columns = ['product_id','composition','fit','product_safety','size','sustainable_materials']

# Per-variant frames are gathered here and concatenated once after the loop
# instead of re-copying a growing DataFrame on every iteration.
composition_frames = []

for style_code in data['product_id'].dropna():
    # API Request
    url = 'https://www2.hm.com/en_us/productpage.' + style_code + '.html'
    print('Product: {}'.format(url))

    try:
        page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        print('Skipping product {}: {!r}'.format(url, exc))
        continue

    # Beautiful Soup Object
    soup = BeautifulSoup(page.text, 'html.parser')

    ################ Color Name ################
    # Colour swatches (the active one and the others) of this product
    products = soup.find_all('a', class_=['filter-option miniature active','filter-option miniature'])

    color = [swatch.get('data-color') for swatch in products]
    product_id = [swatch.get('data-articlecode') for swatch in products]

    df_color = pd.DataFrame([product_id, color]).T
    df_color.columns = ['product_id','color_name']

    # Swatches without an article code cannot be turned into a URL; skip them.
    for color_code in df_color['product_id'].dropna():
        # API Request
        url = 'https://www2.hm.com/en_us/productpage.' + color_code + '.html'
        print('Color: {}'.format(url))

        try:
            page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print('Skipping color {}: {!r}'.format(url, exc))
            continue

        # Beautiful Soup Object
        soup = BeautifulSoup(page.text, 'html.parser')

        # =================== Product name =================== #
        product_name = soup.find_all('h1', class_='primary product-item-headline')
        if len(product_name) == 0:
            # No product headline on the page: nothing to collect.
            continue
        product_name = product_name[0].get_text()

        try:
            # =================== Product price ================== #
            product_price = soup.find_all('div', class_='primary-row product-item-price')
            product_price = product_price[0].get_text()

            ################ Composition ###############
            product_composition_list = soup.find_all('div', class_='pdp-description-list-item')
            product_composition = [list(filter(None, item.get_text().split('\n'))) for item in product_composition_list]

            # First row holds the attribute labels: promote it to the header,
            # then forward-fill so every remaining row carries all attributes.
            df_composition = pd.DataFrame(product_composition).T
            df_composition.columns = df_composition.iloc[0]
            df_composition = df_composition.iloc[1:].ffill()

            # Remove pocket lining, shell and lining prefixes
            for prefix in ('Pocket lining: ', 'Shell: ', 'Lining: '):
                df_composition['Composition'] = df_composition['Composition'].replace(prefix, '', regex=True)

            # Keep exactly the expected attributes, in the expected order.
            # Missing ones become NaN and unexpected extras are dropped, so the
            # positional rename below cannot fail on a column-count mismatch.
            df_composition = df_composition.reindex(columns=df_pattern.columns).astype(object)

            # Rename Columns
            df_composition.columns = detail_columns
            df_composition['product_name'] = product_name
            df_composition['product_price'] = product_price
            aux = aux + df_composition.columns.tolist()

            # merge data color + composition
            df_composition = pd.merge(df_composition, df_color, how='left', on='product_id')

            # all products
            composition_frames.append(df_composition)
        except Exception as exc:
            # Best effort: one malformed page must not stop the crawl, but the
            # skip is reported instead of being silently swallowed.
            print('Skipping color {}: {!r}'.format(url, exc))

if composition_frames:
    df_compositions = pd.concat(composition_frames, axis=0)
else:
    # Nothing scraped: keep the expected schema so the steps below still run.
    df_compositions = pd.DataFrame(columns=detail_columns + ['product_name','product_price','color_name'])

# Join showroom data + details: the article code is the style id followed by a
# three-character colour id. The .str accessor leaves missing codes as NaN.
df_compositions['style_id'] = df_compositions['product_id'].str[:-3]
df_compositions['color_id'] = df_compositions['product_id'].str[-3:]
# Scrape datetime
df_compositions['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')



Product: https://www2.hm.com/en_us/productpage.0985197001.html
Color: https://www2.hm.com/en_us/productpage.0985197001.html
Color: https://www2.hm.com/en_us/productpage.0985197002.html
Color: https://www2.hm.com/en_us/productpage.0985197003.html
Color: https://www2.hm.com/en_us/productpage.0985197004.html
Color: https://www2.hm.com/en_us/productpage.0985197005.html
Color: https://www2.hm.com/en_us/productpage.0985197006.html
Color: https://www2.hm.com/en_us/productpage.0985197007.html
Product: https://www2.hm.com/en_us/productpage.0985159001.html
Color: https://www2.hm.com/en_us/productpage.0985159001.html
Color: https://www2.hm.com/en_us/productpage.0985159002.html
Color: https://www2.hm.com/en_us/productpage.0985159003.html
Color: https://www2.hm.com/en_us/productpage.0985159004.html
Color: https://www2.hm.com/en_us/productpage.0985159005.html
Color: https://www2.hm.com/en_us/productpage.0985159006.html
Product: https://www2.hm.com/en_us/productpage.0690449051.html
Color: https://www

## Final Data Cleaning

In [24]:
# Clean the scraped attributes in `df_compositions` into `df_data`: normalised
# text columns, parsed sizes, and one numeric share column per fabric material.

def _material_share(parts, material):
    """Return, per row, the share (0-1) of `material` in a split composition.

    parts    -- DataFrame of comma-separated composition pieces, one per column.
    material -- material name to look for (case-sensitive literal match).

    The first piece (left to right) that names the material supplies the
    number; rows that never name it, or whose piece has no digits, are NaN.
    """
    share = pd.Series(np.nan, index=parts.index, dtype='float64')
    for position in parts.columns:
        piece = parts[position]
        percent = pd.to_numeric(piece.str.extract(r'(\d+)', expand=False), errors='coerce') / 100
        names_material = piece.str.contains(material, regex=False, na=False)
        # Only fill rows still missing, so earlier positions take precedence.
        share = share.fillna(percent.where(names_material))
    return share


# product id: rows without an article number cannot be joined. reset_index
# returns a fresh frame with unique row labels, so the column assignments
# below operate on an independent object.
df_data = df_compositions.dropna(subset=['product_id']).reset_index(drop=True)

# product name
df_data['product_name'] = (
    df_data['product_name']
    .str.replace('\n', '', regex=False)
    .str.replace('\t', '', regex=False)
    .str.replace('  ', '', regex=False)
    .str.replace(' ', '_', regex=False)
    .str.lower()
)

# product price: regex=False makes '$' a literal dollar sign on every pandas
# version (as a regex it is the end-of-string anchor and removes nothing).
df_data['product_price'] = (
    df_data['product_price']
    .str.replace('\n', '', regex=False)
    .str.replace('\r', '', regex=False)
    .str.replace('$', '', regex=False)
)

# color name
df_data['color_name'] = df_data['color_name'].str.replace(' ', '_', regex=False).str.lower()

# fit (the .str accessor passes missing values through instead of raising)
df_data['fit'] = (
    df_data['fit']
    .str.replace(' ', '_', regex=False)
    .str.replace('/', '_', regex=False)
    .str.lower()
)

# size number: the three digits immediately before "cm"; NaN when absent
df_data['size_number'] = df_data['size'].str.extract(r'(\d{3})cm', expand=False)

# size model: the first "<digits>/<digits>" pair in the size text
df_data['size_model'] = df_data['size'].str.extract(r'(\d+/\d+)', expand=False)

# ================================== Composition ============================
# One column per comma-separated piece of the composition text.
df1 = df_data['composition'].str.split(',', expand=True)

# cotton | polyester | elastane | elasterell
# Each material is searched in every piece, so no fixed number of pieces is
# required and a material is found wherever it appears in the list.
df_ref = pd.DataFrame({
    name: _material_share(df1, name.capitalize())
    for name in ['cotton', 'polyester', 'elastane', 'elasterell']
})

# Join the shares with product_id (both frames share df_data's index)
df_aux = pd.concat([df_data['product_id'], df_ref], axis=1)

# Final Join: one row per product holding the maximum share seen on any of
# its composition lines; materials never listed become 0.
df_aux = df_aux.groupby('product_id').max().reset_index().fillna(0)
df_data = pd.merge(df_data, df_aux, on='product_id', how='left')

# Drop columns that have been parsed into dedicated fields or are not needed
df_data = df_data.drop(columns=['size','product_safety','sustainable_materials','composition'])
df_data = df_data.drop_duplicates()
df_data = df_data.reset_index(drop=True)



#df.head()

  df_data['product_price'] = df_data['product_price'].str.replace('\n','').str.replace('\r','').str.replace('$','')


In [33]:
# Show the column labels of df_data after the cleaning cell above.
df_data.columns

Index(['product_id', 'fit', 'product_name', 'product_price', 'color_name',
       'style_id', 'color_id', 'scrapy_datetime', 'size_number', 'size_model',
       'cotton', 'polyester', 'elastane', 'elasterell'],
      dtype='object')

In [38]:
# Display df_data as produced by the cleaning cell above.
df_data

Unnamed: 0,product_id,fit,product_name,product_price,color_name,style_id,color_id,scrapy_datetime,size_number,size_model,cotton,polyester,elastane,elasterell
0,0985197001,slim_fit,slim_jeans,19.99,black,0985197,001,2021-09-27 14:33:01,189,32/32,1.00,0.00,0.02,0.0
1,0985197002,slim_fit,slim_jeans,19.99,midnight_blue,0985197,002,2021-09-27 14:33:01,182,31/32,0.99,0.65,0.01,0.0
2,0985197003,slim_fit,slim_jeans,19.99,denim_blue,0985197,003,2021-09-27 14:33:01,182,31/32,0.99,0.65,0.01,0.0
3,0985197004,slim_fit,slim_jeans,19.99,dark_denim_blue,0985197,004,2021-09-27 14:33:01,,,1.00,0.00,0.01,0.0
4,0985197005,slim_fit,slim_jeans,19.99,dark_denim_blue,0985197,005,2021-09-27 14:33:01,,,0.99,0.65,0.01,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176,0890565001,regular_fit,regular_selvedge_jeans,28.99,denim_blue,0890565,001,2021-09-27 14:33:01,,,0.98,0.65,0.02,0.0
177,0890565004,regular_fit,regular_selvedge_jeans,28.99,denim_gray,0890565,004,2021-09-27 14:33:01,,,0.98,0.65,0.02,0.0
178,0890565005,regular_fit,regular_selvedge_jeans,49.99,black,0890565,005,2021-09-27 14:33:01,,,0.98,0.00,0.02,0.0
179,0890565006,regular_fit,regular_selvedge_jeans,49.99,denim_blue,0890565,006,2021-09-27 14:33:01,,,0.98,0.00,0.02,0.0


In [39]:
# Show the dtype of each df_data column.
df_data.dtypes

product_id          object
fit                 object
product_name        object
product_price       object
color_name          object
style_id            object
color_id            object
scrapy_datetime     object
size_number         object
size_model          object
cotton             float64
polyester          float64
elastane           float64
elasterell         float64
dtype: object