# Módulo 05 - Extração de Dados em HTML

## A biblioteca Beautiful Soup - Teoria

In [2]:
from bs4 import BeautifulSoup

In [1]:
# Sample HTML document used by the Beautiful Soup examples below.
# Note: the markup ends without closing </body> and </html> tags; the printed
# `soup.body` output further down nevertheless shows a closed <body> element.
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [4]:
# Parse the sample document into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(markup=html_doc, features='html.parser')

In [9]:
# Show the first <body> element of the parsed document.
print(soup.find('body'))

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>


In [18]:
# First <a> tag whose id attribute is 'link1' (kept as the cell's last
# expression so the notebook displays it).
soup.find_all(name='a', attrs={'id': 'link1'})[0]

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

## A biblioteca Beautiful Soup - Prática

In [43]:
import requests
import pandas as pd
from datetime import datetime

In [20]:
# Listing page to scrape: H&M US, men's jeans.
url = 'https://www2.hm.com/en_us/men/products/jeans.html'

# Browser-like User-Agent sent with the request.
# NOTE(review): presumably needed because the site rejects the default
# python-requests User-Agent - confirm.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

# Without a timeout, requests waits indefinitely on a stalled connection and
# the notebook hangs; 30 seconds bounds both the connect and the read phase.
page = requests.get(url, headers=headers, timeout=30)

# Fail here on 4xx/5xx responses instead of parsing an error page and hitting
# a confusing failure in a later cell.
page.raise_for_status()

In [23]:
# Parse the downloaded page body into a navigable tree using the 'html.parser' backend.
soup = BeautifulSoup(markup=page.text, features='html.parser')

In [24]:
# Locate the <ul> that wraps the product grid.
products = soup.find('ul', class_='products-listing small')

# find() returns None when nothing matches (layout change, blocked request,
# empty page). Stop here with a clear message rather than letting the next
# cell fail with "'NoneType' object has no attribute 'find_all'".
if products is None:
    raise ValueError(
        "Product listing not found: no <ul class='products-listing small'> in the page"
    )

In [29]:
# <article class="hm-product-item"> elements inside the listing
# (presumably one per product - verify against the page).
product_list = products.find_all(name='article', attrs={'class': 'hm-product-item'})

# product id: value of the data-articlecode attribute, None when the attribute is absent
product_id = [article.attrs.get('data-articlecode') for article in product_list]

# product category: value of the data-category attribute, None when the attribute is absent
product_category = [article.attrs.get('data-category') for article in product_list]

In [33]:
# <a class="link"> elements inside the listing.
product_list = products.find_all(name='a', attrs={'class': 'link'})

# product name: the text content of each link
product_name = [link.text for link in product_list]

In [39]:
# product price: text content of every <span class="price regular"> in the listing
product_list = products.find_all(name='span', attrs={'class': 'price regular'})
product_price = [span.text for span in product_list]

In [40]:
# TODO: product color (not extracted yet)

In [41]:
# TODO: product composition (not extracted yet)

In [46]:
# Assemble one row per product from the scraped lists.
# Building the frame from a column mapping makes pandas raise ValueError when
# the lists differ in length. The previous list-of-lists + transpose form
# padded the shorter lists instead, silently pairing ids, names and prices
# that belong to different products.
data = pd.DataFrame({
    'product_id': product_id,
    'product_category': product_category,
    'product_name': product_name,
    'product_price': product_price,
})

# Scrape timestamp (local clock, second resolution); the same value is
# broadcast to every row.
data['scrapy_datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

'2023-01-05 11:58:14'