# Modulo 05 - Extração de Dados em HTML

## A biblioteca Beautiful Soup - Teoria

In [125]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

  from IPython.core.display import display, HTML


In [43]:
from bs4 import BeautifulSoup

In [42]:
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [3]:
soup = BeautifulSoup(html_doc, 'html.parser')

In [4]:
print(soup.body)

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>


In [44]:
soup.find_all('a', id= 'link1')[0]

IndexError: list index out of range

## A biblioteca Beautiful Soup - Prática I

In [45]:
import requests
import pandas as pd
from datetime import datetime
import numpy as np

In [46]:
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url, headers=headers)

In [47]:
soup = BeautifulSoup(page.text, 'html.parser')

In [48]:
products = soup.find('ul', class_='products-listing small')

In [49]:
product_list = products.find_all('article', class_='hm-product-item')

# product id
product_id = [p.get('data-articlecode') for p in product_list]

# product category
product_category = [p.get('data-category') for p in product_list]

In [50]:
product_list = products.find_all('a', class_='link')

# product name
product_name = [p.get_text() for p in product_list]

In [51]:
# price
product_list = products.find_all('span', class_='price regular')
product_price = [p.get_text() for p in product_list]

In [13]:
# product color

In [14]:
# product composition

In [52]:
data = pd.DataFrame([product_id, product_category, product_name, product_price]).T
data.columns = ['product_id', 'product_category', 'product_name', 'product_price']

# scrapy datetime
data['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

## A biblioteca Beautiful Soup - Prática II

In [53]:
url = 'https://www2.hm.com/en_us/men/products/jeans.html'
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

page = requests.get(url, headers=headers)

In [54]:
soup = BeautifulSoup(page.text, 'html.parser')

In [55]:
total_item = soup.find_all('h2', class_='load-more-heading')[0].get('data-total')
total_item

'98'

In [56]:
page_number = np.round(int(total_item)/36)
page_number

3.0

In [57]:
url02 = url + '?page-size=' + str(int(page_number*36))
url02

'https://www2.hm.com/en_us/men/products/jeans.html?page-size=108'

## A biblioteca Beautiful Soup - Prática III

### One Product

In [58]:
# API Requests
url = 'https://www2.hm.com/en_us/productpage.1019838002.html'

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url, headers=headers)

# Beautiful Soup object
soup = BeautifulSoup(page.text, 'html.parser')

# ============================ color name ============================
product_list = soup.find_all('a', class_='filter-option miniature')
color_name = [p.get('data-color') for p in product_list]

# product id
product_id = [p.get('data-articlecode')for p in product_list]

df_color = pd.DataFrame([product_id, color_name]).T
df_color.columns = ['product_id', 'color_name']

# generate style id + color id
df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

# ================================================ composition ====================================================
#product_composition_list = soup.find_all('div', class_="ProductAttributesList-module--descriptionListItem__HzutH")
##product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]

# rename dataframe
#df_composition = pd.DataFrame(product_composition).T
#df_composition.columns = df_composition.iloc[1]

# delete first row
#df_composition = df_composition.iloc[1:].fillna(method='ffill')

# generate style id + color id
#df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
#df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

# merge data color + composition
#data_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how='left', on='style_id'

### Multiple Products

In [142]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# empty dataframe
df_details = pd.DataFrame()

# unique columns for all products
aux = []

cols = ['Art. No.', 'Composition', 'Fit', 'Product safety', 'Size']
df_pattern = pd.DataFrame( columns=cols )

for i in range(len(data)):
    # API Requests
    url = 'https://www2.hm.com/en_us/productpage.' + str(data.loc[i, 'product_id']) + '.html'

    page = requests.get(url, headers=headers)

    # Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # ============================ color name ============================
    product_list = soup.find_all('a', class_='filter-option miniature')
    color_name = [p.get('data-color') for p in product_list]

    # product id
    product_id = [p.get('data-articlecode')for p in product_list]

    df_color = pd.DataFrame([product_id, color_name]).T
    df_color.columns = ['product_id', 'color_name']

    # generate style id + color id
    df_color['style_id'] = df_color['product_id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['product_id'].apply(lambda x: x[-3:])

    # ================================================ composition ====================================================
    product_composition_list = soup.find_all('div', class_="ProductAttributesList-module--descriptionListItem__HzutH")
    product_composition = [list(filter(None, p.get_text().split('\n'))) for p in product_composition_list]

    # rename dataframe
    df_composition = pd.DataFrame(product_composition).T
    df_composition.columns = df_composition.iloc[1]

    # delete first row
    df_composition = df_composition.iloc[1:].fillna(method='ffill')

    # generate style id + color id
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

    aux = aux + df_composition.columns.tolist()
    
    # merge data color + composition
    data_sku = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how='left', on='style_id'
                        
    # all details products
    df_details = pd.concat( [df_details, data_sku], axis=0 )
                        
# Join Showroom data + details
data['style_id'] = data['product_id'].apply( lambda x: x[:-3] )
data['color_id'] = data['product_id'].apply( lambda x: x[-3:] )
                        
data_raw = pd.merge( data, df_details[['style_id', 'color_name', 'Fit', 'Composition', 'Size', 'Product safety']], how='left', on='style_id' )   

SyntaxError: invalid syntax (557787055.py, line 56)

In [135]:
data = pd.read_csv('C:/Users/pedro/OneDrive/Documentos/CURSOS/Python do DS ao DEV/products_hm.csv')

In [136]:
data

Unnamed: 0,product_id,product_category,product_name,product_price,scrapy_datetime,style_id,color_id,color_name,fit,size,product safety,composition
0,636207006.0,men_jeans_slim,Slim Jeans,$ 19.99,2021-04-11 17:48:05,636207.0,6.0,Dark denim blue,Slim fit,,,"Cotton 98%, Elastane 2%"
1,636207006.0,men_jeans_slim,Slim Jeans,$ 19.99,2021-04-11 17:48:05,636207.0,6.0,Dark gray denim,Slim fit,,,"Cotton 98%, Elastane 2%"
2,636207006.0,men_jeans_slim,Slim Jeans,$ 19.99,2021-04-11 17:48:05,636207.0,6.0,Denim blue,Slim fit,,,"Cotton 98%, Elastane 2%"
3,636207006.0,men_jeans_slim,Slim Jeans,$ 19.99,2021-04-11 17:48:05,636207.0,6.0,Gray,Slim fit,,,"Cotton 98%, Elastane 2%"
4,636207006.0,men_jeans_slim,Slim Jeans,$ 19.99,2021-04-11 17:48:05,636207.0,6.0,Light denim blue,Slim fit,,,"Cotton 98%, Elastane 2%"
...,...,...,...,...,...,...,...,...,...,...,...,...
2466,814631004.0,men_jeans_slim,Freefit® Slim Jeans,$ 49.99,2021-04-11 17:48:05,814631.0,4.0,Black/No fade black,Slim fit,,,"Cotton 90%, Elasterell-P 8%, Elastane 2%"
2467,814631004.0,men_jeans_slim,Freefit® Slim Jeans,$ 49.99,2021-04-11 17:48:05,814631.0,4.0,Dark blue,Slim fit,,,"Cotton 90%, Elasterell-P 8%, Elastane 2%"
2468,814631004.0,men_jeans_slim,Freefit® Slim Jeans,$ 49.99,2021-04-11 17:48:05,814631.0,4.0,White,Slim fit,,,"Cotton 90%, Elasterell-P 8%, Elastane 2%"
2469,814631004.0,men_jeans_slim,Freefit® Slim Jeans,$ 49.99,2021-04-11 17:48:05,814631.0,4.0,Gray,Slim fit,,,"Cotton 90%, Elasterell-P 8%, Elastane 2%"
