In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import math

def get_soup(url, headers, timeout=30):
    """Download ``url`` and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    headers : dict
        HTTP headers sent with the request.
    timeout : float or tuple, optional
        Passed straight to ``requests.get``. Without a timeout a stalled
        connection would block the caller indefinitely, which is costly when
        this helper is called once per page in a long scraping loop.

    Returns
    -------
    BeautifulSoup
        The response text parsed with the built-in ``'html.parser'``.
    """
    page = requests.get(url, headers=headers, timeout=timeout)
    # The HTTP status is not inspected here: whatever body came back is parsed.
    soup = BeautifulSoup(page.text, 'html.parser')
    return soup


def get_type_price(soup):
    """Collect url, article id, product type and price for every listing tile.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed listing page; tiles are the ``<article>`` elements carrying the
        ``hm-product-item`` class.

    Returns
    -------
    list of dict
        One dict per tile with the keys ``url``, ``id``, ``type`` and
        ``price`` (the price text with the ``'$ '`` prefix removed). Empty
        when the page holds no tiles.
    """
    def describe(tile):
        # Lookups run in the same order as the keys are listed.
        return {
            'url': 'https://www2.hm.com' + tile.find('a', class_='item-link').get('href'),
            'id': tile.get('data-articlecode'),
            'type': tile.find('a', class_='link').get_text(),
            'price': tile.find('span', class_='price regular').get_text().replace('$ ', ''),
        }

    return [describe(tile) for tile in soup.find_all('article', 'hm-product-item')]

def get_color(soup):
    """List the colour variants offered on a product page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed product page containing a ``<ul class="inputlist clearfix">``
        whose ``<li class="list-item">`` entries each wrap an
        ``<a role="radio">`` link.

    Returns
    -------
    list of dict
        One dict per list entry with the keys ``color_url``, ``id`` and
        ``color``, read from the link's ``href``, ``data-articlecode`` and
        ``data-color`` attributes.
    """
    swatch_list = soup.find('ul', class_='inputlist clearfix')

    variants = []
    for entry in swatch_list.find_all('li', 'list-item'):
        # Every field comes from the same anchor, so look it up once.
        anchor = entry.find('a', role='radio')
        variants.append({
            'color_url': 'https://www2.hm.com' + anchor.get('href'),
            'id': anchor.get('data-articlecode'),
            'color': anchor.get('data-color'),
        })
    return variants

def get_composition(soup):
    """Extract the fabric composition text from a product page.

    The text is read from the second ``<script>`` element inside
    ``<div class="details parbase">``. Two patterns are tried in order:

    1. ``Shell: (.+),`` on the raw script text;
    2. ``'Composition'(.+)Art. No.`` on the script text with newlines, tabs
       and carriage returns removed.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed product page.

    Returns
    -------
    dict
        ``{'composition': <captured text>}`` for the first pattern that
        matches, or an empty dict when neither does (including an empty
        script element), so one unparsable page does not abort a long
        scraping loop.

    Raises
    ------
    AttributeError
        If the details ``<div>`` lookup returns ``None``.
    IndexError
        If the details block holds fewer than two ``<script>`` elements.
    """
    details = soup.find('div', class_='details parbase')
    script = details.find_all('script')[1]
    script_text = script.get_text()

    shell = re.search("Shell: (.+),", script_text)
    if shell is not None:
        return {'composition': shell.group(1)}

    # The fallback pattern has to span lines, so strip line breaks and tabs
    # first. The last literal evaluates to two consecutive apostrophes.
    flattened = script_text.replace('\n', '').replace('\t', '').replace('\r', '').replace("'\'", '')
    listed = re.search("'Composition'(.+)Art. No.", flattened)
    if listed is not None:
        return {'composition': listed.group(1)}

    return {}

def clean_column(column, regex):
    """Replace each cell by the first capture group of ``regex`` when it matches.

    Parameters
    ----------
    column : iterable
        Cells to clean. Non-string cells (e.g. ``NaN`` or ``None``) are
        passed through unchanged.
    regex : str
        Pattern with at least one capture group; it is searched for anywhere
        in the cell.

    Returns
    -------
    list
        One entry per input cell: ``group(1)`` of the match, or the original
        cell when the pattern does not match or the cell is not a string.

    Raises
    ------
    re.error
        If ``regex`` is not a valid pattern.
    IndexError
        If ``regex`` matches but defines no capture group.
    """
    # Compiling up front surfaces a broken pattern immediately instead of
    # silently returning the column untouched.
    pattern = re.compile(regex)

    new_col = []
    for line in column:
        try:
            match = pattern.search(line)
        except TypeError:
            # Not searchable text (NaN, None, numbers): keep as is.
            match = None
        new_col.append(line if match is None else match.group(1))
    return new_col


In [2]:
# Listing page to scrape and the browser-like User-Agent sent with each request.
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'}
# A timeout keeps a stalled connection from hanging the cell indefinitely.
page = requests.get( url, headers=headers, timeout=30 )
soup = BeautifulSoup(page.text, 'html.parser')

In [3]:
# Page-size step used to round the item count up to a whole number of pages
# (presumably the site's default page size; confirm against the listing page).
PAGE_SIZE = 36

# Total number of items, as advertised by the "load more" heading.
total_item = soup.find_all( 'h2', class_='load-more-heading' )[0].get('data-total')
number_of_pages = math.ceil( int( total_item ) / PAGE_SIZE )

# Request every item in one page. Any existing query string is stripped first
# so that re-running this cell does not append a second '?page-size=...'.
url = url.split('?', 1)[0] + '?page-size=' + str( number_of_pages * PAGE_SIZE )

In [4]:
# Stage 1: listing page, one record per product tile, checkpointed to CSV.
soup = get_soup(url, headers)
type_price_list = get_type_price(soup)

df_type = pd.DataFrame(type_price_list)
df_type.to_csv('products.csv', index=False)
# NOTE(review): reading the checkpoint back lets pandas re-infer dtypes; if the
# article codes are all digits, `id` presumably comes back numeric. Confirm
# that is intended.
df_type = pd.read_csv('products.csv')

# Stage 2: product pages, collecting the colour variants of every product.
# The list must be extended on each iteration; rebinding it inside the loop
# would keep only the variants of the last product visited.
list_color = []

for i in df_type['url']:
    soup = get_soup(i, headers)
    list_color.extend(get_color(soup))

df_color = pd.DataFrame(list_color)
df_color.to_csv('color_details.csv', index=False)
df_color = pd.read_csv('color_details.csv')
# A variant appears on the page of each sibling variant, so drop repeats.
df_color = df_color.drop_duplicates().reset_index(drop=True)

# Stage 3: variant pages, one composition record per row of df_color, in the
# same order.
list_composition = []

for i in df_color['color_url']:
    soup = get_soup(i, headers)
    list_composition.append(get_composition(soup))

df_composition = pd.DataFrame(list_composition)
df_composition.to_csv('product_composition.csv', index=False)
df_composition = pd.read_csv('product_composition.csv')

# Keep only the text between [' and '] when the cell has that form. A raw
# string avoids the invalid-escape warning the non-raw literal triggers.
regex = r"\['(.+)'\]   },"
df_composition['composition'] = clean_column(df_composition['composition'], regex)

In [5]:
# Put the colour rows and their composition side by side, aligned on the index.
color_comp = pd.concat(objs=[df_color, df_composition], axis='columns')

In [6]:
# Attach listing data (url, type, price) to each colour variant; variants
# whose id is not in the listing are kept with missing values.
product_details = color_comp.merge(df_type, how='left', on='id')

In [7]:
# Derive `style_id` (the id without its last three characters) and drop the
# URL columns. Built as one chained expression so the cell does not mutate the
# previous frame in place and can be re-run safely.
product_details = (
    product_details
    .assign(
        id=lambda frame: frame['id'].astype('string'),
        # `.str[:-3]` keeps missing ids as <NA>; slicing inside `.apply`
        # raises on them.
        style_id=lambda frame: frame['id'].str[:-3],
    )
    # errors='ignore' lets a second run pass once the columns are already gone.
    .drop(columns=['color_url', 'url'], errors='ignore')
)

In [8]:
# One (price, type) pair per style: keep complete rows only, then the first
# row seen for each style_id.
unique_products = (
    product_details[['style_id', 'price', 'type']]
    .dropna()
    .drop_duplicates(subset='style_id')
    .reset_index(drop=True)
)

In [9]:
# Fill in price/type for every variant from its style, then discard the
# per-variant originals (suffixed _x by the merge) and the join key.
df = (
    product_details
    .merge(unique_products, how='left', on='style_id')
    .drop(columns=['type_x', 'price_x', 'style_id'])
)
# Positional rename: the two remaining merge columns become 'price' and 'type'.
df.columns = ['id', 'color', 'composition', 'price', 'type']

In [10]:
# Write the final table to disk without the index column.
df.to_csv(path_or_buf='df_products.csv', index=False)