# Imports

In [63]:
import pandas as pd
import numpy as np
import requests
import re
import math

from bs4 import BeautifulSoup
from datetime import datetime

# Data Collection (products)

In [146]:
# URL
url01 = "https://www2.hm.com/en_us/men/products/jeans.html?sort=stock&image-size=small&image=model&offset=0&page-size=72"

# Parameters
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Request to URL
page = requests.get(url01, headers=headers)

# Beautiful Soup object
soup = BeautifulSoup(page.text, 'html.parser')

# ========================= Product Data ====================== #

# List which contains all products
products = soup.find('ul', 'products-listing small')
  
# product_id_categort list
product_id_category = products.find_all('article', 'hm-product-item')

# product_name list
product_name = products.find_all('a', 'link')

# product_price list
product_price = products.find_all('span', 'price regular')

product_id = [p.get('data-articlecode') for p in product_id_category]
product_category = [p.get('data-category') for p in product_id_category]
product_name = [p.get_text() for p in product_name]
product_price = [p.get_text() for p in product_price]

data = pd.DataFrame([product_id, product_name, product_category, product_price]).T
data.columns = ['product_id', 'product_name', 'product_type', 'price']

data['scrape_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')


In [147]:
data.shape

(72, 5)

In [148]:
data.head()

Unnamed: 0,product_id,product_name,product_type,price,scrape_datetime
0,1024256007,Slim Jeans,men_jeans_slim,$ 19.99,2022-10-31 17:02:27
1,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-10-31 17:02:27
2,985159001,Skinny Jeans,men_jeans_skinny,$ 24.99,2022-10-31 17:02:27
3,1071707008,Relaxed Jeans,men_jeans_relaxed,$ 29.99,2022-10-31 17:02:27
4,1024711006,Slim Jeans,men_jeans_slim,$ 29.99,2022-10-31 17:02:27


# Data Collection (inside each product)

In [206]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

#empty dataframe
df_final = pd.DataFrame()

# Auxiliar list in order to monitor new columns
aux = []

# All columns found on website
df_pattern = pd.DataFrame(columns= ['Art. No.', 'Composition', 'Fit', 'More sustainable materials', 'Size'])

for i in range(len(data)):

    #API request
    # conteúdo de headers é padrão
    url02 = "https://www2.hm.com/en_us/productpage." + data.loc[i, 'product_id'] + ".html"

    page = requests.get(url02, headers=headers)

    #Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # ============================= Color =========================

    #product list
    product_list = soup.find_all('a', class_='filter-option miniature active') + soup.find_all('a', class_='filter-option miniature')

    #color
    product_color = [p.get('data-color') for p in product_list] 

    #id
    product_id = [p.get('data-articlecode') for p in product_list]

    #dataframe
    df_color = pd.DataFrame([product_id, product_color]).T
    df_color.columns = ['product_id', 'color']

    for j in range(len(df_color)):
        
        # ============== API request ========================= 
        
        # conteúdo de headers é padrão
        url03 = "https://www2.hm.com/en_us/productpage." + df_color.loc[j, 'product_id'] + ".html"

        page = requests.get(url03, headers=headers)

        #Beautiful Soup object
        soup = BeautifulSoup(page.text, 'html.parser')
        
        # ============== Product name ==========================
        
        product_name = soup.find('section', class_ = 'product-name-price').find_all('h1')
        product_name = product_name[0].get_text()
        
        # ============== Product price =========================
        
        product_price = soup.find_all('div', class_ = 'primary-row product-item-price')
        product_price = re.findall(r'\d+\.?\d+', product_price[0].get_text())[0]
        

        # ============================ Composition =====================

        # Product list -- we used find and find_all because we could not return composition list only by using find_all in the beginning
        product_composition_list = soup.find('div', class_='content pdp-text pdp-content').find_all('div')

        # Composition
        product_composition = [list( filter( None, p.get_text().split('\n') ) ) for p in product_composition_list]

        # dataframe
        df_composition = pd.DataFrame(product_composition).T

        # Columns name
        df_composition.columns = df_composition.iloc[0]

        # Filling None/NA values
        df_composition = df_composition.iloc[1:].fillna(method='ffill')

        # Removing pocket lining, shell and lining
        df_composition['Composition'] = df_composition['Composition'].str.replace('Pocket lining: ', '', regex=True)
        df_composition['Composition'] = df_composition['Composition'].str.replace('Shell: ', '', regex=True)
        df_composition['Composition'] = df_composition['Composition'].str.replace('Lining: ', '', regex=True)

        # The same number of columns (pattern)
        df_composition = pd.concat( [df_pattern, df_composition] )
        
        # Rename columns
        if j == 0:
            df_composition.columns = ['product_id', 'composition', 'fit', 'product_safety', 'size']
            df_color['product_id'] = df_color['product_id'].astype(str)
            df_composition['product_id'] = df_composition['product_id'].astype(str)
            
        else:
            break

        # Keep new columns if it shows up
        aux = aux + df_composition.columns.tolist()

        # Generate Style ID + Color ID
        # All values, but the last three values
        #df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
        #df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

        # ======================= Merging color + composition ==========================
        #df_composition.columns = ['product_id', 'composition', 'fit', 'product_safety', 'size']
        #df_color['product_id'] = df_color['product_id'].astype(str)
        #df_composition['product_id'] = df_composition['product_id'].astype(str)
        data_merge = pd.merge(df_composition[['product_id', 'composition', 'fit', 'product_safety', 'size']], df_color, 
                              how='left', on='product_id')
        data_merge.loc[j, 'product_name'] = product_name
        data_merge.loc[j, 'product_price'] = product_price
        
        # ======================= Concatenate ==========================================
        df_final = pd.concat( [df_final, data_merge], axis=0 )
        
        
# ======================= Merging color + composition ==========================
#df_composition.columns = ['product_id', 'composition', 'fit', 'product_safety', 'size']
#df_color['product_id'] = df_color['product_id'].astype(str)
#df_composition['product_id'] = df_composition['product_id'].astype(str)
#data_merge = pd.merge(df_composition[['product_id', 'composition', 'fit', 'size']], df_color, how='left', on='product_id')

# ======================= Concatenate ==========================================
#df_final = pd.concat( [df_final, data_merge], axis=0 )
        
# Creating style_id + color_id
df_final['style_id'] = df_final['product_id'].apply(lambda x: x[:-3])
df_final['color_id'] = df_final['product_id'].apply(lambda x: x[-3:])

df_final['scrape_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

data_raw = df_final.copy().reset_index().drop(columns='index')

data_raw.to_csv("data_raw.csv")

In [207]:
data_raw.head()

Unnamed: 0,product_id,composition,fit,product_safety,size,color,product_name,product_price,style_id,color_id,scrape_datetime
0,1024256007,"Cotton 99%, Spandex 1%",Slim fit,,"The model is 176cm/5'9"" and wears a size 31/32",Dark gray,Slim Jeans,19.99,1024256,7,2022-11-01 10:12:51
1,1024256007,"Polyester 65%, Cotton 35%",Slim fit,,"The model is 176cm/5'9"" and wears a size 31/32",Dark gray,,,1024256,7,2022-11-01 10:12:51
2,1024256001,"Cotton 99%, Spandex 1%",Slim fit,,"The model is 185cm/6'1"" and wears a size 31/32",Black,Slim Jeans,19.99,1024256,1,2022-11-01 10:12:51
3,1024256001,"Polyester 65%, Cotton 35%",Slim fit,,"The model is 185cm/6'1"" and wears a size 31/32",Black,,,1024256,1,2022-11-01 10:12:51
4,985159001,"Cotton 99%, Spandex 1%",Skinny fit,,"The model is 185cm/6'1"" and wears a size 31/32",Black,Skinny Jeans,24.99,985159,1,2022-11-01 10:12:51


# Data Cleaning

In [253]:
data = data_raw.dropna(subset=['product_id'])

data = data.reset_index(drop=True)

#data['product_price'] = data['product_price'].apply(lambda x: x.replace('$ ', '')).astype(str)
data['product_price'] = data['product_price'].astype(float)

data['scrape_datetime'] = pd.to_datetime(data['scrape_datetime'], format='%Y-%m-%d %H:%M:%S')

data['color'] = data['color'].apply(lambda x: x.replace(' ', '_').replace('/', '_').replace('-', '_').lower() )

data['fit'] = data['fit'].apply(lambda x: x.replace(' ', '_').lower() ) 

data['product_name'] = data['product_name'].astype(str)
data['product_name'] = data['product_name'].apply(lambda x: x.replace(' ', '_').replace(':', '').replace('®', '').replace('-', '_').lower() )

data = data[~data['composition'].str.contains('Pocket lining:')]
data = data[~data['composition'].str.contains('Lining:')]
data = data[~data['composition'].str.contains('Shell:')]
data = data[~data['composition'].str.contains('Pocket:')]

df1 = data['composition'].str.split(',', expand=True).reset_index(drop=True)

df_ref = pd.DataFrame(index=np.arange(len(data)), columns=['Cotton', 'Polyester', 'Spandex'])

# Cotton
df_cotton = df1[0]
df_cotton.name = 'cotton'
df_ref = pd.concat([df_ref, df_cotton], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated()]
df_ref = df_ref.drop(columns=['Cotton'], axis=1) 
df_ref['cotton'] = df_ref['cotton'].fillna('Cotton 0%')

# Polyester
df_polyester = df1.loc[df1[1].str.contains('Polyester', na=True), 1]
df_polyester.name = 'polyester'
df_ref = pd.concat([df_ref, df_polyester], axis=1)
df_ref = df_ref.drop(columns=['Polyester'], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated()] 
df_ref['polyester'] = df_ref['polyester'].fillna('Polyester 0%')

# Spandex
df_spandex = df1.loc[df1[1].str.contains('Spandex', na=True), 1]
df_spandex.name = 'spandex'
df_spandex = df_spandex.combine_first(df1[2])
df_ref = pd.concat([df_ref, df_spandex], axis=1)
df_ref = df_ref.drop(columns=['Spandex'], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated()]
df_ref['spandex'] = df_ref['spandex'].fillna('Spandex 0%')

data = pd.concat([data.reset_index(), df_ref.reset_index()], axis=1)
data = data.drop(columns=['index'], axis=1)
data = data.iloc[:, ~data.columns.duplicated()]

data = data.drop_duplicates()

data['cotton'] = data['cotton'].apply(lambda x: int( re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data['polyester'] = data['polyester'].apply(lambda x: int( re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data['spandex'] = data['spandex'].apply(lambda x: int( re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)


data['model_size'] = data['size'].apply(lambda x: re.search('\d{3}', x).group(0) if pd.notnull(x) else x).astype(float)
data['jeans_size'] = data['size'].str.extract('(\d+/\\d+)')
       
data = data.drop(columns=['size'], axis=1).reset_index(drop=True)

data.to_csv("data_clean.csv")

In [254]:
data.head()

Unnamed: 0,product_id,composition,fit,product_safety,color,product_name,product_price,style_id,color_id,scrape_datetime,cotton,polyester,spandex,model_size,jeans_size
0,1024256007,"Cotton 99%, Spandex 1%",slim_fit,,dark_gray,slim_jeans,19.99,1024256,7,2022-11-01 10:12:51,0.99,0.0,0.01,176.0,31/32
1,1024256007,"Polyester 65%, Cotton 35%",slim_fit,,dark_gray,,,1024256,7,2022-11-01 10:12:51,0.65,0.0,0.0,176.0,31/32
2,1024256001,"Cotton 99%, Spandex 1%",slim_fit,,black,slim_jeans,19.99,1024256,1,2022-11-01 10:12:51,0.99,0.0,0.01,185.0,31/32
3,1024256001,"Polyester 65%, Cotton 35%",slim_fit,,black,,,1024256,1,2022-11-01 10:12:51,0.65,0.0,0.0,185.0,31/32
4,985159001,"Cotton 99%, Spandex 1%",skinny_fit,,black,skinny_jeans,24.99,985159,1,2022-11-01 10:12:51,0.99,0.0,0.01,185.0,31/32
