# IMPORTS

In [78]:
import re
import requests
import pandas as pd
import numpy  as np

from datetime import datetime

from bs4 import BeautifulSoup

## Helper functions

# DATA EXTRATION

In [61]:
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:106.0) Gecko/20100101 Firefox/106.0'}

page = requests.get(url, headers=headers)

soup = BeautifulSoup(page.text, 'html.parser')

## URL for all products

In [62]:
# getting number of products
total_item = soup.find_all('h2', class_='load-more-heading')[0].get('data-total')
total_item

# calculating number of pages
page_number = np.ceil(int(total_item)/36)
page_number

# generating url
url02 = url + '?page-size=' + str(int(page_number)*36)
url02

'https://www2.hm.com/en_us/men/products/jeans.html?page-size=108'

In [63]:
page = requests.get(url02, headers=headers)

soup = BeautifulSoup(page.text, 'html.parser')

## Products details

In [64]:
products = soup.find('ul', class_='products-listing small')

product_list = products.find_all('article', class_='hm-product-item')

### Product ID


In [65]:
# product id
product_id = [p.get('data-articlecode') for p in product_list]

### Product Category

In [66]:
# product category
product_category = [p.get('data-category') for p in product_list]

### Product Name

In [67]:
# product name
product_list = products.find_all('a', class_='link')
product_name = [p.get_text() for p in product_list]

### Product Price

In [68]:
# price
product_list = products.find_all('span', class_='price regular')
product_price = [p.get_text() for p in product_list]

### Creating Dataset

In [69]:
data = pd.DataFrame([product_id, product_category, product_name, product_price]).T
data.columns = ['product_id', 'product_category', 'product_name', 'product_price']

# scrapy datetime
data['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [70]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime
0,1024256007,men_jeans_slim,Slim Jeans,$ 19.99,2022-11-01 10:59:07
1,985159001,men_jeans_skinny,Skinny Jeans,$ 24.99,2022-11-01 10:59:07
2,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-11-01 10:59:07
3,1071707008,men_jeans_relaxed,Relaxed Jeans,$ 29.99,2022-11-01 10:59:07
4,993887008,men_jeans_regular,Hybrid Regular Jeans,$ 39.99,2022-11-01 10:59:07


### Product Color and Composition

In [71]:
# empty dataframe for append
df_color = pd.DataFrame()
df_composition = pd.DataFrame()

cols = ['Additional material information', 'Fit', 'Composition', 'Art. No.']
df_pattern = pd.DataFrame(columns=cols)

# unique colors for all products
aux = []

# iteration for each id product
for code in data['product_id']:
    url02 = 'https://www2.hm.com/en_us/productpage.' + str(code) + '.html'

    page = requests.get(url02, headers=headers)
    soup = BeautifulSoup(page.text, 'html.parser')
    
    # Product Color
    color_name = soup.find('a', class_='filter-option miniature active').get('data-color')

    # product id
    product_code = soup.find('a', class_='filter-option miniature active').get('data-articlecode')

    aux1 = pd.DataFrame({'product_id': product_code, 'color_name': color_name}, index=[0])
    df_color = pd.concat([df_color, aux1])

    # Product Composition 
    product_composition_list = soup.find_all('div', class_='details-attributes-list-item')

    product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]
    
    composition = pd.DataFrame(product_composition).T

    # rename dataframe
    composition.columns = composition.iloc[0]

    # delete first row
    composition['Art. No.'] = composition['Art. No.'].fillna(method='ffill')
    composition = composition.iloc[1:]

    composition_aux = composition.fillna('').groupby(['Art. No.'], as_index=False).sum()

    df_composition = pd.concat([df_composition, composition_aux], axis=0)
    
df_composition = df_composition[['Art. No.', 'Fit', 'Size', 'Composition', 'Additional material information']]
df_composition.rename(columns={'Art. No.': 'product_id', 'Fit': 'fit', 'Size': 'size', 'Composition':
                               'compostion', 'Additional material information': 'additional_material'},
                               inplace=True)

# merging the dataframes
df_details = pd.merge(df_color, df_composition, how='left', on='product_id')

data = pd.merge(data, df_details, how='left', on='product_id')
data

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,color_name,fit,size,compostion,additional_material
0,1024256007,men_jeans_slim,Slim Jeans,$ 19.99,2022-11-01 10:59:07,Dark gray,Slim fit,,"Shell: Cotton 99%, Spandex 1%Pocket lining: Po...",
1,0985159001,men_jeans_skinny,Skinny Jeans,$ 24.99,2022-11-01 10:59:07,Black,Skinny fit,,"Shell: Cotton 99%, Spandex 1%Pocket lining: Co...",Recycled cotton 20%
2,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-11-01 10:59:07,Black,Slim fit,,"Shell: Cotton 99%, Spandex 1%Pocket lining: Po...",
3,1071707008,men_jeans_relaxed,Relaxed Jeans,$ 29.99,2022-11-01 10:59:07,Denim blue,Relaxed fit,Inner leg: Length: 81.0 cm (Size 33/32),Shell: Cotton 100%Pocket lining: Polyester 65%...,
4,0993887008,men_jeans_regular,Hybrid Regular Jeans,$ 39.99,2022-11-01 10:59:07,Dark gray,Regular fit,,"Cotton 78%, Polyester 21%, Spandex 1%",
...,...,...,...,...,...,...,...,...,...,...
87,0985197003,men_jeans_slim,Slim Jeans,$ 19.99,2022-11-01 10:59:07,Denim blue,Slim fit,,"Shell: Cotton 99%, Spandex 1%Pocket lining: Po...",Recycled cotton 20%
88,0811993037,men_jeans_regular,Regular Jeans,$ 29.99,2022-11-01 10:59:07,Dark blue,Regular fit,,"Cotton 99%, Spandex 1%",Recycled cotton 20%
89,0927964013,men_jeans_regular,Regular Tapered Crop Jeans,$ 19.99,2022-11-01 10:59:07,Black,Regular fit,,Shell: Cotton 100%Pocket lining: Cotton 100%,
90,0985197001,men_jeans_slim,Slim Jeans,$ 19.99,2022-11-01 10:59:07,Black,Slim fit,,"Shell: Cotton 98%, Spandex 2%Pocket lining: Co...",


## Saving the dataset

In [72]:
#data.to_csv('data_hm.csv', index=False)

# DATA MANIPULATION

In [111]:
data = pd.read_csv('data_hm.csv')

In [101]:
data.head()

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,color_name,fit,size,compostion,additional_material
0,1024256007,men_jeans_slim,Slim Jeans,$ 19.99,2022-11-01 10:59:07,Dark gray,Slim fit,,"Shell: Cotton 99%, Spandex 1%Pocket lining: Po...",
1,985159001,men_jeans_skinny,Skinny Jeans,$ 24.99,2022-11-01 10:59:07,Black,Skinny fit,,"Shell: Cotton 99%, Spandex 1%Pocket lining: Co...",Recycled cotton 20%
2,1024256001,men_jeans_slim,Slim Jeans,$ 19.99,2022-11-01 10:59:07,Black,Slim fit,,"Shell: Cotton 99%, Spandex 1%Pocket lining: Po...",
3,1071707008,men_jeans_relaxed,Relaxed Jeans,$ 29.99,2022-11-01 10:59:07,Denim blue,Relaxed fit,Inner leg: Length: 81.0 cm (Size 33/32),Shell: Cotton 100%Pocket lining: Polyester 65%...,
4,993887008,men_jeans_regular,Hybrid Regular Jeans,$ 39.99,2022-11-01 10:59:07,Dark gray,Regular fit,,"Cotton 78%, Polyester 21%, Spandex 1%",


In [112]:
# product_name
data['product_name'] = data['product_name'].apply(lambda x: x.replace(' ', '_').lower())

# product_price
data['product_price'] = data['product_price'].apply(lambda x: x.replace ('$ ', '')).astype(float)

# scrapy_datetime
data['scrapy_datetime'] = pd.to_datetime(data['scrapy_datetime'], format='%Y-%m-%d %H:%M:%S')

# color_name
data['color_name'] = data['color_name'].apply(lambda x: x.replace(' ', '_').lower())

# Fit
data['fit'] = data['fit'].apply(lambda x: x.replace(' ', '_').replace('/', '_').lower())

# size
data['inner_leg_length'] = data['size'].apply(lambda x: re.search('Length: (\d+\.\d)', x).group(1)
                                              if (pd.notnull(x) & re.search('Length: (\d+\.\d)', x)) else x)
# Composition
# Additional material information

TypeError: expected string or bytes-like object

In [108]:
re.search('Circumference: (\d+\.\d)', data.loc[3, 'size']).group(1)

AttributeError: 'NoneType' object has no attribute 'group'

In [107]:
pd.notna(data.loc[2, 'size'])

False