# Imports

In [1]:
import pandas as pd
import numpy as np
import requests
import re
import math

from bs4 import BeautifulSoup
from datetime import datetime

# Data Collection (products)

In [2]:
# URL
url01 = "https://www2.hm.com/en_us/men/products/jeans.html?sort=stock&image-size=small&image=model&offset=0&page-size=72"

# Parameters
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Request to URL
page = requests.get(url01, headers=headers)

# Beautiful Soup object
soup = BeautifulSoup(page.text, 'html.parser')

# ========================= Product Data ====================== #

# List which contains all products
products = soup.find('ul', 'products-listing small')
  
# product_id_categort list
product_id_category = products.find_all('article', 'hm-product-item')

# product_name list
product_name = products.find_all('a', 'link')

# product_price list
product_price = products.find_all('span', 'price regular')

product_id = [p.get('data-articlecode') for p in product_id_category]
product_category = [p.get('data-category') for p in product_id_category]
product_name = [p.get_text() for p in product_name]
product_price = [p.get_text() for p in product_price]

data = pd.DataFrame([product_id, product_name, product_category, product_price]).T
data.columns = ['id', 'product_name', 'product_type', 'price']

data['scrape_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')


# Data Collection (inside each product)

In [13]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

#empty dataframe
df_final = pd.DataFrame()

# All columns found on website
cols = ['Art. No.', 'Composition', 'Fit', 'More sustainable materials', 'Size']
df_pattern = pd.DataFrame(columns=cols)

for i in range(len(data)):

    #API request
    # conteúdo de headers é padrão
    url02 = "https://www2.hm.com/en_us/productpage." + data.loc[i, 'id'] + ".html"

    page = requests.get(url02, headers=headers)

    #Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # ============================= Color =========================

    #product list
    product_list = soup.find_all('a', role='radio')

    #color
    product_color = [p.get('data-color') for p in product_list] 

    #id
    product_id = [p.get('data-articlecode') for p in product_list]

    #dataframe
    df_color = pd.DataFrame([product_id, product_color]).T
    df_color.columns = ['id', 'color']

    #generate style id + color id
    df_color['style_id'] = df_color['id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['id'].apply(lambda x: x[-3:])

    # ============================ Composition =====================

    # Product list
    product_composition_list = soup.find_all('div', 'pdp-description-list-item')

    # Composition
    product_composition = [list( filter( None, p.get_text().split('\n') ) ) for p in product_composition_list]


    # dataframe
    df_composition = pd.DataFrame(product_composition).T

    # Columns name
    df_composition.columns = df_composition.iloc[0]

    # Filling None/NA values
    df_composition = df_composition.iloc[1:].fillna(method='ffill')
        
    # The same number of columns (pattern)
    df_composition = pd.concat( [df_pattern, df_composition] )

    # Generate Style ID + Color ID
    # All values, but the last three values
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

    # ======================= Merging color + composition ==========================
    data_merge = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition', 'More sustainable materials', 'Size']], how='left', on='style_id')

    # ======================= Concatenate ==========================================
    df_final = pd.concat( [df_final, data_merge], axis=0 )
        
# Creating style_id + color_id
data['style_id'] = data['id'].apply(lambda x: x[:-3])
data['color_id'] = data['id'].apply(lambda x: x[-3:])

data_raw = pd.merge( data, df_final[['color', 'style_id', 'Fit', 'Composition', 'More sustainable materials', 'Size']], how='left', on='style_id')
    
data_raw.to_csv("data_raw.csv")

data_merge(data, df_final)

IndexError: single positional indexer is out-of-bounds

In [None]:
url = "https://raw.githubusercontent.com/lucasquemelli/ds_ao_dev/main/data_raw.csv"
data = pd.read_csv(url)

data['product_name'] = data['product_name'].apply(lambda x: x.replace(' ', '_').lower() )

data['price'] = data['price'].apply(lambda x: x.replace('$ ', '')).astype(float)

data['scrape_datetime'] = pd.to_datetime(data['scrape_datetime'], format='%Y-%m-%d %H:%M:%S')

data['color'] = data['color'].apply(lambda x: x.replace(' ', '_').replace('/', '_').replace('-', '_').lower() )

data['Fit'] = data['Fit'].apply(lambda x: x.replace(' ', '_').lower())

data = data[~data['Composition'].str.contains('Pocket lining:')]
data = data[~data['Composition'].str.contains('Lining:')]
data = data[~data['Composition'].str.contains('Shell:')]
data = data[~data['Composition'].str.contains('Pocket:')]

# To remove duplicates that are not considered because Na in 'compositions' columns, we selected all columns before
data = data.drop_duplicates(subset=['id', 'product_name', 'product_type', 'price', 'datetime',
                                        'style_id', 'color_id', 'color', 'Fit'], keep='last')

data = data.reset_index(drop=True)

df1 = data['Composition'].str.split(',', expand=True)

df_ref = pd.DataFrame(index=np.arange(len(data)), columns=['Cotton', 'Polyester', 'Spandex'])

# Cotton
df_cotton = df1[0]
df_cotton.name = 'cotton'
df_ref = pd.concat([df_ref, df_cotton], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated()]
df_ref = df_ref.drop(columns=['Cotton'], axis=1) 
df_ref['cotton'] = df_ref['cotton'].fillna('Cotton 0%')

# Polyester
df_polyester = df1.loc[df1[1].str.contains('Polyester', na=True), 1]
df_polyester.name = 'polyester'
df_ref = pd.concat([df_ref, df_polyester], axis=1)
df_ref = df_ref.drop(columns=['Polyester'], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated()] 
df_ref['polyester'] = df_ref['polyester'].fillna('Polyester 0%')

# Spandex
df_spandex = df1.loc[df1[1].str.contains('Spandex', na=True), 1]
df_spandex.name = 'spandex'
df_spandex = df_spandex.combine_first(df1[2])
df_ref = pd.concat([df_ref, df_spandex], axis=1)
df_ref = df_ref.drop(columns=['Spandex'], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated()]
df_ref['spandex'] = df_ref['spandex'].fillna('Spandex 0%')

data = pd.concat([data.reset_index(), df_ref.reset_index()], axis=1)
data = data.drop(columns=['index', 'Unnamed: 0'], axis=1)
data = data.iloc[:, ~data.columns.duplicated()]

data = data.drop_duplicates()

data['cotton'] = data['cotton'].apply(lambda x: int( re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data['polyester'] = data['polyester'].apply(lambda x: int( re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)
data['spandex'] = data['spandex'].apply(lambda x: int( re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)


data['model_size'] = data['Size'].apply(lambda x: re.search('\d{3}', x).group(0) if pd.notnull(x) else x).astype(float)
data['jeans_size'] = data['Size'].str.extract('(\d+/\\d+)')
       
data = data.drop(columns=['Size'], axis=1)
data = data.reset_index(drop=True)

data.to_csv("data_clean.csv")