# Estudos da biblioteca BeautifulSoup

In [1]:
from bs4 import BeautifulSoup

In [2]:
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

In [3]:
soup = BeautifulSoup(html_doc, 'html.parser')

Dentro do soup, nós temos acesso a todo o HTML:

In [4]:
soup

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

#### Filtrando informação

In [5]:
soup.title

<title>The Dormouse's story</title>

In [6]:
soup.story

In [7]:
soup.body

<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body>

In [8]:
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [9]:
soup.html

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
</body></html>

In [10]:
soup.head

<head><title>The Dormouse's story</title></head>

In [11]:
soup.body.p

<p class="title"><b>The Dormouse's story</b></p>

#### Filtrando informação de parágrafo por índice

In [12]:
soup.find_all('p')

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [13]:
soup.find_all('p')[0]

<p class="title"><b>The Dormouse's story</b></p>

In [14]:
soup.find_all('p')[1]

<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

In [15]:
soup.find_all('p')[2]

<p class="story">...</p>

#### Filtrando informação de parágrafo por nome

In [16]:
soup.find_all('p', 'story')

[<p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [17]:
soup.find_all('p', 'title')

[<p class="title"><b>The Dormouse's story</b></p>]

#### Filtrando informação da tag a (link)

In [18]:
soup.find_all('a')[0]

<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

In [19]:
soup.find_all('a')[1]

<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>

In [20]:
soup.find_all('a')[2]

<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

#### Filtrando informação a partir do "id"

In [21]:
soup.find_all('a', id='link1')

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]

In [22]:
soup.find_all('a', id='link2')

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [23]:
soup.find_all('a', id='link3')

[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

#### Filtrando apenas o texto

In [24]:
soup.find_all('a', id='link1')[0].get_text()

'Elsie'

In [25]:
soup.find_all('a', id='link1')[0].string

'Elsie'

# Extração dos dados que planejamos

In [26]:
import pandas as pd
import numpy as np
import requests

from bs4 import BeautifulSoup
from datetime import datetime

In [27]:
url = "https://www2.hm.com/en_us/men/products/jeans.html"

# conteúdo de headers é padrão
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url, headers=headers)

In [28]:
page.text

'<!DOCTYPE HTML>\n<html lang="en-US" is-in-aem="false" class="no-js en-us" ng-app="hmApp" >\n    \n\n\n    <head>\r\n    \r\n  <script type="text/javascript" src="/ruxitagentjs_ICA27Vfgjoqru_10227211018092056.js" data-dtconfig="rid=RID_-1928280405|rpid=1730055537|domain=hmgroup.tech|reportUrl=/rb_58096d71-709f-4335-a959-44a125e69dd3|app=7a6f08f8d839fa96|rcdec=1209600000|featureHash=ICA27Vfgjoqru|vcv=2|rdnt=1|uxrgce=1|bp=2|srmcrv=10|cuc=gbmnxyo0|mel=100000|dpvc=1|lastModification=1642644240929|dtVersion=10227211018092056|srmcrl=1|tp=500,50,0,1|uxdcw=1500|agentUri=/ruxitagentjs_ICA27Vfgjoqru_10227211018092056.js"></script><link rel="dns-prefetch" href="https://s1-cdn.hm.com"/>\r\n  <link rel="preconnect" href="https://s1-cdn.hm.com" crossorigin/>\r\n\r\n  <link rel="dns-prefetch" href="https://tags.tiqcdn.com"/>\r\n  <link rel="preconnect" href="https://tags.tiqcdn.com" crossorigin/>\r\n\r\n  <link rel="dns-prefetch" href="https://lp2.hm.com"/>\r\n  <link rel="preconnect" href="https://l

In [29]:
soup = BeautifulSoup(page.text, 'html.parser')

In [30]:
soup

<!DOCTYPE HTML>

<html class="no-js en-us" is-in-aem="false" lang="en-US" ng-app="hmApp">
<head>
<script data-dtconfig="rid=RID_-1928280405|rpid=1730055537|domain=hmgroup.tech|reportUrl=/rb_58096d71-709f-4335-a959-44a125e69dd3|app=7a6f08f8d839fa96|rcdec=1209600000|featureHash=ICA27Vfgjoqru|vcv=2|rdnt=1|uxrgce=1|bp=2|srmcrv=10|cuc=gbmnxyo0|mel=100000|dpvc=1|lastModification=1642644240929|dtVersion=10227211018092056|srmcrl=1|tp=500,50,0,1|uxdcw=1500|agentUri=/ruxitagentjs_ICA27Vfgjoqru_10227211018092056.js" src="/ruxitagentjs_ICA27Vfgjoqru_10227211018092056.js" type="text/javascript"></script><link href="https://s1-cdn.hm.com" rel="dns-prefetch"/>
<link crossorigin="" href="https://s1-cdn.hm.com" rel="preconnect"/>
<link href="https://tags.tiqcdn.com" rel="dns-prefetch"/>
<link crossorigin="" href="https://tags.tiqcdn.com" rel="preconnect"/>
<link href="https://lp2.hm.com" rel="dns-prefetch"/>
<link crossorigin="" href="https://lp2.hm.com" rel="preconnect"/>
<link href="https://cdn-pci.o

In [31]:
products = soup.find('ul', 'products-listing small')

In [32]:
products

<ul class="products-listing small">
<li class="product-item">
<article class="hm-product-item" data-articlecode="1024256001" data-category="men_jeans_slim" data-energy-interval="" data-pre-access-end-date="" data-pre-access-groups="" data-pre-access-start-date="" data-style-with-articlecodes="" onclick="setOsaParameters(utag_data.category_id,'SMALL','1024256001'); setNotificationTicket('Oy9wbHAvcHJvZHVjdC1saXN0LXdpdGgtY291bnQvcHJvZHVjdC1saXN0OyM7cHJvZHVjdF9rZXk7MTAyNDI1Nl9ncm91cF8wMDFfZW5fdXM7MTAyNDI1NjAwMV9lbl91cztPQkpFQ1RJVkUkO05PTkU6Tk9ORTsyMTs','1024256001');">
<div class="image-container">
<a class="item-link" href="/en_us/productpage.1024256001.html" title="Slim Jeans">
<img alt="Slim JeansModel" class="item-image" data-altimage="//lp2.hm.com/hmgoepprod?set=source[/77/b5/77b566ce3845791ecb47a6252b6c1e16fd93ea68.jpg],origin[dam],category[],type[DESCRIPTIVESTILLLIFE],res[m],hmver[2]&amp;call=url[file:/product/style]" data-alttext="Slim Jeans" data-src="//lp2.hm.com/hmgoepprod?set=s

In [33]:
products_list = products.find_all('article', 'hm-product-item')

In [34]:
products_list

[<article class="hm-product-item" data-articlecode="1024256001" data-category="men_jeans_slim" data-energy-interval="" data-pre-access-end-date="" data-pre-access-groups="" data-pre-access-start-date="" data-style-with-articlecodes="" onclick="setOsaParameters(utag_data.category_id,'SMALL','1024256001'); setNotificationTicket('Oy9wbHAvcHJvZHVjdC1saXN0LXdpdGgtY291bnQvcHJvZHVjdC1saXN0OyM7cHJvZHVjdF9rZXk7MTAyNDI1Nl9ncm91cF8wMDFfZW5fdXM7MTAyNDI1NjAwMV9lbl91cztPQkpFQ1RJVkUkO05PTkU6Tk9ORTsyMTs','1024256001');">
 <div class="image-container">
 <a class="item-link" href="/en_us/productpage.1024256001.html" title="Slim Jeans">
 <img alt="Slim JeansModel" class="item-image" data-altimage="//lp2.hm.com/hmgoepprod?set=source[/77/b5/77b566ce3845791ecb47a6252b6c1e16fd93ea68.jpg],origin[dam],category[],type[DESCRIPTIVESTILLLIFE],res[m],hmver[2]&amp;call=url[file:/product/style]" data-alttext="Slim Jeans" data-src="//lp2.hm.com/hmgoepprod?set=source[/00/0b/000b5d6009a7bd05a76a9ec22f8bd784f4b2271a.jpg]

In [35]:
products_list[0]

<article class="hm-product-item" data-articlecode="1024256001" data-category="men_jeans_slim" data-energy-interval="" data-pre-access-end-date="" data-pre-access-groups="" data-pre-access-start-date="" data-style-with-articlecodes="" onclick="setOsaParameters(utag_data.category_id,'SMALL','1024256001'); setNotificationTicket('Oy9wbHAvcHJvZHVjdC1saXN0LXdpdGgtY291bnQvcHJvZHVjdC1saXN0OyM7cHJvZHVjdF9rZXk7MTAyNDI1Nl9ncm91cF8wMDFfZW5fdXM7MTAyNDI1NjAwMV9lbl91cztPQkpFQ1RJVkUkO05PTkU6Tk9ORTsyMTs','1024256001');">
<div class="image-container">
<a class="item-link" href="/en_us/productpage.1024256001.html" title="Slim Jeans">
<img alt="Slim JeansModel" class="item-image" data-altimage="//lp2.hm.com/hmgoepprod?set=source[/77/b5/77b566ce3845791ecb47a6252b6c1e16fd93ea68.jpg],origin[dam],category[],type[DESCRIPTIVESTILLLIFE],res[m],hmver[2]&amp;call=url[file:/product/style]" data-alttext="Slim Jeans" data-src="//lp2.hm.com/hmgoepprod?set=source[/00/0b/000b5d6009a7bd05a76a9ec22f8bd784f4b2271a.jpg],ori

In [36]:
len(products_list)

36

#### Filtrando id

In [37]:
products_list[0].get('data-articlecode')

'1024256001'

In [38]:
products_id = [p.get('data-articlecode') for p in products_list] 

In [39]:
products_id

['1024256001',
 '1024256003',
 '0690449056',
 '0985159001',
 '0690449022',
 '1024256004',
 '1024256007',
 '0690449051',
 '0690449043',
 '0690449036',
 '0875105016',
 '1013317001',
 '1004199005',
 '0985159004',
 '0875105023',
 '0985159005',
 '1008110001',
 '1004199002',
 '0985159008',
 '1024711001',
 '0875105018',
 '1013317008',
 '0974202003',
 '1013317006',
 '0875105024',
 '0938875014',
 '1008549002',
 '1024256006',
 '1013317010',
 '0993887004',
 '1013317002',
 '0985159002',
 '1028865001',
 '0938875007',
 '1008110003',
 '0985197001']

#### Filtrando category/type

In [40]:
products_category = [p.get('data-category') for p in products_list]

In [41]:
products_category

['men_jeans_slim',
 'men_jeans_slim',
 'men_jeans_ripped',
 'men_jeans_skinny',
 'men_jeans_ripped',
 'men_jeans_slim',
 'men_jeans_slim',
 'men_jeans_ripped',
 'men_jeans_ripped',
 'men_jeans_ripped',
 'men_jeans_relaxed',
 'men_jeans_regular',
 'men_jeans_skinny',
 'men_jeans_skinny',
 'men_jeans_relaxed',
 'men_jeans_skinny',
 'men_jeans_slim',
 'men_jeans_skinny',
 'men_jeans_skinny',
 'men_jeans_slim',
 'men_jeans_relaxed',
 'men_jeans_regular',
 'men_jeans_loose',
 'men_jeans_regular',
 'men_jeans_relaxed',
 'men_jeans_slim',
 'men_jeans_regular',
 'men_jeans_slim',
 'men_jeans_regular',
 'men_jeans_regular',
 'men_jeans_regular',
 'men_jeans_skinny',
 'men_jeans_relaxed',
 'men_jeans_slim',
 'men_jeans_slim',
 'men_jeans_slim']

#### Filtrando nome do produto

In [42]:
products.find_all('a', 'link')

[<a class="link" href="/en_us/productpage.1024256001.html">Slim Jeans</a>,
 <a class="link" href="/en_us/productpage.1024256003.html">Slim Jeans</a>,
 <a class="link" href="/en_us/productpage.0690449056.html">Skinny Jeans</a>,
 <a class="link" href="/en_us/productpage.0985159001.html">Skinny Jeans</a>,
 <a class="link" href="/en_us/productpage.0690449022.html">Skinny Jeans</a>,
 <a class="link" href="/en_us/productpage.1024256004.html">Slim Jeans</a>,
 <a class="link" href="/en_us/productpage.1024256007.html">Slim Jeans</a>,
 <a class="link" href="/en_us/productpage.0690449051.html">Skinny Jeans</a>,
 <a class="link" href="/en_us/productpage.0690449043.html">Skinny Jeans</a>,
 <a class="link" href="/en_us/productpage.0690449036.html">Skinny Jeans</a>,
 <a class="link" href="/en_us/productpage.0875105016.html">Relaxed Jeans</a>,
 <a class="link" href="/en_us/productpage.1013317001.html">Hybrid Regular Tapered Joggers</a>,
 <a class="link" href="/en_us/productpage.1004199005.html">Skinny

In [43]:
product_name = products.find_all('a', 'link')

In [44]:
product_name = [p.get_text() for p in product_name]

In [45]:
product_name

['Slim Jeans',
 'Slim Jeans',
 'Skinny Jeans',
 'Skinny Jeans',
 'Skinny Jeans',
 'Slim Jeans',
 'Slim Jeans',
 'Skinny Jeans',
 'Skinny Jeans',
 'Skinny Jeans',
 'Relaxed Jeans',
 'Hybrid Regular Tapered Joggers',
 'Skinny Cropped Jeans',
 'Skinny Jeans',
 'Relaxed Jeans',
 'Skinny Jeans',
 'Freefit® Slim Jeans',
 'Skinny Cropped Jeans',
 'Skinny Jeans',
 'Slim Jeans',
 'Relaxed Jeans',
 'Hybrid Regular Tapered Joggers',
 'Regular Denim Joggers',
 'Hybrid Regular Tapered Joggers',
 'Relaxed Jeans',
 'Slim Tapered Jeans',
 'Regular Jeans',
 'Slim Jeans',
 'Hybrid Regular Tapered Joggers',
 'Hybrid Regular Denim Joggers',
 'Hybrid Regular Tapered Joggers',
 'Skinny Jeans',
 'Relaxed Jeans with Embroidery Detail',
 'Slim Tapered Jeans',
 'Freefit® Slim Jeans',
 'Slim Jeans']

#### Filtrando preço

In [46]:
products.find_all('span', 'price regular')

[<span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 39.99</span>,
 <span class="price regular">$ 24.99</span>,
 <span class="price regular">$ 39.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 19.99</span>,
 <span class="price regular">$ 39.99</span>,
 <span class="price regular">$ 39.99</span>,
 <span class="price regular">$ 39.99</span>,
 <span class="price regular">$ 34.99</span>,
 <span class="price regular">$ 39.99</span>,
 <span class="price regular">$ 29.99</span>,
 <span class="price regular">$ 24.99</span>,
 <span class="price regular">$ 34.99</span>,
 <span class="price regular">$ 24.99</span>,
 <span class="price regular">$ 59.99</span>,
 <span class="price regular">$ 29.99</span>,
 <span class="price regular">$ 24.99</span>,
 <span class="price regular">$ 29.99</span>,
 <span class="price regular">$ 34.99</span>,
 <span class="price regular">$ 39.99</span>,
 <span cla

In [47]:
product_price = products.find_all('span', 'price regular')

In [48]:
product_price = [p.get_text() for p in product_price]

In [49]:
product_price

['$ 19.99',
 '$ 19.99',
 '$ 39.99',
 '$ 24.99',
 '$ 39.99',
 '$ 19.99',
 '$ 19.99',
 '$ 39.99',
 '$ 39.99',
 '$ 39.99',
 '$ 34.99',
 '$ 39.99',
 '$ 29.99',
 '$ 24.99',
 '$ 34.99',
 '$ 24.99',
 '$ 59.99',
 '$ 29.99',
 '$ 24.99',
 '$ 29.99',
 '$ 34.99',
 '$ 39.99',
 '$ 29.99',
 '$ 39.99',
 '$ 34.99',
 '$ 39.99',
 '$ 24.99',
 '$ 19.99',
 '$ 39.99',
 '$ 44.99',
 '$ 39.99',
 '$ 24.99',
 '$ 49.99',
 '$ 39.99',
 '$ 59.99',
 '$ 19.99']

#### Dataframe com os dados coletados até o momento

In [50]:
pd.DataFrame([products_id, product_name, products_category, product_price])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,1024256001,1024256003,0690449056,0985159001,0690449022,1024256004,1024256007,0690449051,0690449043,0690449036,...,1008549002,1024256006,1013317010,0993887004,1013317002,0985159002,1028865001,0938875007,1008110003,0985197001
1,Slim Jeans,Slim Jeans,Skinny Jeans,Skinny Jeans,Skinny Jeans,Slim Jeans,Slim Jeans,Skinny Jeans,Skinny Jeans,Skinny Jeans,...,Regular Jeans,Slim Jeans,Hybrid Regular Tapered Joggers,Hybrid Regular Denim Joggers,Hybrid Regular Tapered Joggers,Skinny Jeans,Relaxed Jeans with Embroidery Detail,Slim Tapered Jeans,Freefit® Slim Jeans,Slim Jeans
2,men_jeans_slim,men_jeans_slim,men_jeans_ripped,men_jeans_skinny,men_jeans_ripped,men_jeans_slim,men_jeans_slim,men_jeans_ripped,men_jeans_ripped,men_jeans_ripped,...,men_jeans_regular,men_jeans_slim,men_jeans_regular,men_jeans_regular,men_jeans_regular,men_jeans_skinny,men_jeans_relaxed,men_jeans_slim,men_jeans_slim,men_jeans_slim
3,$ 19.99,$ 19.99,$ 39.99,$ 24.99,$ 39.99,$ 19.99,$ 19.99,$ 39.99,$ 39.99,$ 39.99,...,$ 24.99,$ 19.99,$ 39.99,$ 44.99,$ 39.99,$ 24.99,$ 49.99,$ 39.99,$ 59.99,$ 19.99


In [51]:
pd.DataFrame([products_id, product_name, products_category, product_price]).T

Unnamed: 0,0,1,2,3
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99
1,1024256003,Slim Jeans,men_jeans_slim,$ 19.99
2,690449056,Skinny Jeans,men_jeans_ripped,$ 39.99
3,985159001,Skinny Jeans,men_jeans_skinny,$ 24.99
4,690449022,Skinny Jeans,men_jeans_ripped,$ 39.99
5,1024256004,Slim Jeans,men_jeans_slim,$ 19.99
6,1024256007,Slim Jeans,men_jeans_slim,$ 19.99
7,690449051,Skinny Jeans,men_jeans_ripped,$ 39.99
8,690449043,Skinny Jeans,men_jeans_ripped,$ 39.99
9,690449036,Skinny Jeans,men_jeans_ripped,$ 39.99


In [52]:
data = pd.DataFrame([products_id, product_name, products_category, product_price]).T
data.columns = ['id', 'product_name', 'product_type', 'price']

In [53]:
data

Unnamed: 0,id,product_name,product_type,price
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99
1,1024256003,Slim Jeans,men_jeans_slim,$ 19.99
2,690449056,Skinny Jeans,men_jeans_ripped,$ 39.99
3,985159001,Skinny Jeans,men_jeans_skinny,$ 24.99
4,690449022,Skinny Jeans,men_jeans_ripped,$ 39.99
5,1024256004,Slim Jeans,men_jeans_slim,$ 19.99
6,1024256007,Slim Jeans,men_jeans_slim,$ 19.99
7,690449051,Skinny Jeans,men_jeans_ripped,$ 39.99
8,690449043,Skinny Jeans,men_jeans_ripped,$ 39.99
9,690449036,Skinny Jeans,men_jeans_ripped,$ 39.99


In [54]:
datetime.now()

datetime.datetime(2022, 1, 24, 14, 43, 5, 573563)

In [55]:
datetime.now().strftime('%Y-%m-%d %H:%M:%S')

'2022-01-24 14:43:05'

In [56]:
data['datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [57]:
data

Unnamed: 0,id,product_name,product_type,price,datetime
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:05
1,1024256003,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:05
2,690449056,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:05
3,985159001,Skinny Jeans,men_jeans_skinny,$ 24.99,2022-01-24 14:43:05
4,690449022,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:05
5,1024256004,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:05
6,1024256007,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:05
7,690449051,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:05
8,690449043,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:05
9,690449036,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:05


## Pagination/Paginação

Precisamos identificar o número total de páginas.

In [58]:
soup.find_all('h2', 'load-more-heading')[0]

<h2 class="load-more-heading" data-items-shown="36" data-total="64">SHOWING 36 of 64 Items</h2>

In [59]:
total_itens = soup.find_all('h2', 'load-more-heading')[0].get('data-total')
total_itens

'64'

Note que o valor acima é da classe do tipo string. Por isso, vamos transformá-lo em inteiro, conforme o comando abaixo:

In [60]:
int(total_itens)

64

Cada vitrine da H&M para calças jeans masculina possui o valor de 36 itens. Então, vamos coletar os dados de 36 em 36 itens. Por isso, para saber o número de paginação, fazemos:

In [61]:
int(total_itens)/36

1.7777777777777777

In [62]:
np.round(int(total_itens)/36)

2.0

In [63]:
pagination_number = np.round(int(total_itens)/36)
pagination_number

2.0

Então, vamos precisar de duas paginações. Agora, vamos identificar o 'page-size' da nossa url para saber o tamanho da vitrine que temos que colocar na nossa url para trazer todos os itens.

In [64]:
url02 = url + '?page-size=' + str(int(pagination_number*36))
url02

'https://www2.hm.com/en_us/men/products/jeans.html?page-size=72'

Embora o número total de itens seja 57, para trazer todos os itens devemos fazer uma paginação tal qual o valor de page-size seja o acima. 

#### Restante da vitrine

In [65]:
url02 = "https://www2.hm.com/en_us/men/products/jeans.html?sort=stock&image-size=small&image=model&offset=0&page-size=72"

# conteúdo de headers é padrão
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url02, headers=headers)

soup = BeautifulSoup(page.text, 'html.parser')

products_list = products.find_all('article', 'hm-product-item')
products_id = [p.get('data-articlecode') for p in products_list]
products_category = [p.get('data-category') for p in products_list]

product_name = products.find_all('a', 'link')
product_name = [p.get_text() for p in product_name]

product_price = products.find_all('span', 'price regular')
product_price = [p.get_text() for p in product_price]

data = pd.DataFrame([products_id, product_name, products_category, product_price]).T
data.columns = ['id', 'product_name', 'product_type', 'price']

datetime.now()
datetime.now().strftime('%Y-%m-%d %H:%M:%S')
data['datetime'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [66]:
data

Unnamed: 0,id,product_name,product_type,price,datetime
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08
1,1024256003,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08
2,690449056,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08
3,985159001,Skinny Jeans,men_jeans_skinny,$ 24.99,2022-01-24 14:43:08
4,690449022,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08
5,1024256004,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08
6,1024256007,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08
7,690449051,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08
8,690449043,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08
9,690449036,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08


#### Filtrando por cor

A cor só pode ser coletada na página do produto e não na vitrine.

Inicialmente, vamos fazer só para um produto.

In [67]:
#API request
url = "https://www2.hm.com/en_us/productpage.1024256001.html"

# conteúdo de headers é padrão
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url, headers=headers)

In [68]:
#Beautiful Soup object
soup = BeautifulSoup(page.text, 'html.parser')

In [69]:
type(soup)

bs4.BeautifulSoup

In [70]:
soup.find_all('li', 'list-item')

[<li class="list-item">
 <a class="link" data-remodal-trigger="signin-account" href="/en_us/account">My Account</a>
 </li>,
 <li class="list-item">
 <a class="link" href="/en_us/member/info.html">Loyalty Program Info</a>
 </li>,
 <li class="list-item">
 <a class="link" data-remodal-trigger="join" href="#">Not a Member yet? Join here!</a>
 </li>,
 <li class="list-item">
 <a class="link" href="/en_us/account">My Account</a>
 </li>,
 <li class="list-item">
 <a class="link" href="/en_us/member/info.html">Loyalty Program Info</a>
 </li>,
 <li class="list-item">
 <a class="link" href="/en_us/logout" onclick="trackLogout()">Sign out</a>
 </li>,
 <li class="list-item">
 <a aria-checked="true" class="filter-option miniature active" data-articlecode="1024256001" data-color="Black" data-sizes="" href="/en_us/productpage.1024256001.html" id="filter-colour-1024256001" role="radio" title="Black">
 <noscript data-alt="Black" data-src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2F77%2Fb5%

In [71]:
soup.find_all('a', 'filter-option miniature')

[<a aria-checked="false" class="filter-option miniature" data-articlecode="1024256002" data-color="Light denim blue" data-sizes="" href="/en_us/productpage.1024256002.html" id="filter-colour-1024256002" role="radio" title="Light denim blue">
 <noscript data-alt="Light denim blue" data-src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2Fda%2Fba%2Fdabaf3f73477f46b068e2e02033aa0222206ef7f.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5B%5D%2Ctype%5BDESCRIPTIVESTILLLIFE%5D%2Cres%5Bm%5D%2Chmver%5B2%5D&amp;call=url[file:/product/miniature]">
 <img alt="Light denim blue" src="//lp2.hm.com/hmgoepprod?set=quality%5B79%5D%2Csource%5B%2Fda%2Fba%2Fdabaf3f73477f46b068e2e02033aa0222206ef7f.jpg%5D%2Corigin%5Bdam%5D%2Ccategory%5B%5D%2Ctype%5BDESCRIPTIVESTILLLIFE%5D%2Cres%5Bm%5D%2Chmver%5B2%5D&amp;call=url[file:/product/miniature]"/>
 </noscript>
 <span></span>
 </a>,
 <a aria-checked="false" class="filter-option miniature" data-articlecode="1024256003" data-color="Light denim blue" data-sizes="" href

In [72]:
soup.find_all('a', 'filter-option miniature')[0].get('data-color')

'Light denim blue'

In [73]:
product_list = soup.find_all('a', 'filter-option miniature')

#cor
product_color = [p.get('data-color') for p in product_list] 

#id
product_id = [p.get('data-articlecode') for p in product_list]

In [74]:
product_color

['Light denim blue',
 'Light denim blue',
 'Denim blue',
 'Dark blue',
 'Dark denim blue',
 'Dark gray']

In [75]:
product_id

['1024256002',
 '1024256003',
 '1024256004',
 '1024256005',
 '1024256006',
 '1024256007']

In [76]:
df_color = pd.DataFrame([product_id, product_color]).T
df_color.columns = ['id', 'color']
df_color

Unnamed: 0,id,color
0,1024256002,Light denim blue
1,1024256003,Light denim blue
2,1024256004,Denim blue
3,1024256005,Dark blue
4,1024256006,Dark denim blue
5,1024256007,Dark gray


In [77]:
#Generate style ID + color ID
df_color['style_id'] = df_color['id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['id'].apply(lambda x: x[-3:])
df_color

Unnamed: 0,id,color,style_id,color_id
0,1024256002,Light denim blue,1024256,2
1,1024256003,Light denim blue,1024256,3
2,1024256004,Denim blue,1024256,4
3,1024256005,Dark blue,1024256,5
4,1024256006,Dark denim blue,1024256,6
5,1024256007,Dark gray,1024256,7


#### Filtrando por material/composition

O material só pode ser filtrado na página do produto e não na vitrine.

In [78]:
url = "https://www2.hm.com/en_us/productpage.1024256001.html"

# conteúdo de headers é padrão
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url, headers=headers)

soup = BeautifulSoup(page.text, 'html.parser')

In [79]:
soup.find_all('div', 'content pdp-text pdp-content')

[<div class="content pdp-text pdp-content">
 <p class="pdp-description-text">5-pocket jeans in stretch cotton denim. Regular waist, zip fly with button, and slim legs.</p>
 <dl class="pdp-description-list">
 <div class="pdp-description-list-item">
 <dt>Size</dt>
 <dd>The model is 185cm/6'1" and wears a size 31/32</dd>
 </div>
 <div class="pdp-description-list-item">
 <dt>Fit</dt>
 <dd>
 <ul>
 <li>Slim fit</li>
 </ul>
 </dd>
 </div>
 <div class="pdp-description-list-item">
 <dt>Composition</dt>
 <dd>
 <ul>
 <li>Pocket lining: Polyester 65%, Cotton 35%</li>
 <li>Shell: Cotton 99%, Spandex 1%</li>
 </ul>
 </dd>
 </div>
 <!--ARTICLE NUMBER-->
 <div class="pdp-description-list-item">
 <dt>Art. No.</dt>
 <dd>1024256001</dd>
 </div>
 </dl>
 <!-- / pdp-description-list -->
 <section class="review-answers selected-rating" id="js-selected-rating"></section>
 <!-- / pdp-true-to-size -->
 </div>]

In [80]:
soup.find_all('div', 'pdp-description-list-item')

[<div class="pdp-description-list-item">
 <dt>Size</dt>
 <dd>The model is 185cm/6'1" and wears a size 31/32</dd>
 </div>,
 <div class="pdp-description-list-item">
 <dt>Fit</dt>
 <dd>
 <ul>
 <li>Slim fit</li>
 </ul>
 </dd>
 </div>,
 <div class="pdp-description-list-item">
 <dt>Composition</dt>
 <dd>
 <ul>
 <li>Pocket lining: Polyester 65%, Cotton 35%</li>
 <li>Shell: Cotton 99%, Spandex 1%</li>
 </ul>
 </dd>
 </div>,
 <div class="pdp-description-list-item">
 <dt>Art. No.</dt>
 <dd>1024256001</dd>
 </div>]

In [81]:
soup.find_all('div', 'pdp-description-list-item')[1]

<div class="pdp-description-list-item">
<dt>Fit</dt>
<dd>
<ul>
<li>Slim fit</li>
</ul>
</dd>
</div>

In [82]:
type(soup.find_all('div', 'pdp-description-list-item')[1])

bs4.element.Tag

In [83]:
soup.find_all('div', 'pdp-description-list-item')[1].get_text()

'\nFit\n\n\nSlim fit\n\n\n'

In [84]:
type(soup.find_all('div', 'pdp-description-list-item')[1].get_text())

str

In [85]:
soup.find_all('div', 'pdp-description-list-item')[1].get_text().split('\n')

['', 'Fit', '', '', 'Slim fit', '', '', '']

In [86]:
filter( None, soup.find_all('div', 'pdp-description-list-item')[1].get_text().split('\n'))

<filter at 0x2a3d229b310>

In [87]:
list(filter( None, soup.find_all('div', 'pdp-description-list-item')[1].get_text().split('\n')))

['Fit', 'Slim fit']

In [88]:
product_composition_list = soup.find_all('div', 'pdp-description-list-item')

product_composition = [list( filter( None, p.get_text().split('\n') ) ) for p in product_composition_list]

In [89]:
product_composition

[['Size', 'The model is 185cm/6\'1" and wears a size 31/32'],
 ['Fit', 'Slim fit'],
 ['Composition',
  'Pocket lining: Polyester 65%, Cotton 35%',
  'Shell: Cotton 99%, Spandex 1%'],
 ['Art. No.', '1024256001']]

In [90]:
product_composition.pop(0)
product_composition

[['Fit', 'Slim fit'],
 ['Composition',
  'Pocket lining: Polyester 65%, Cotton 35%',
  'Shell: Cotton 99%, Spandex 1%'],
 ['Art. No.', '1024256001']]

In [91]:
pd.DataFrame(product_composition).T

Unnamed: 0,0,1,2
0,Fit,Composition,Art. No.
1,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",1024256001
2,,"Shell: Cotton 99%, Spandex 1%",


In [92]:
df_aux = pd.DataFrame(product_composition).T
df_aux

Unnamed: 0,0,1,2
0,Fit,Composition,Art. No.
1,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",1024256001
2,,"Shell: Cotton 99%, Spandex 1%",


In [93]:
df_aux.columns = df_aux.iloc[0]
df_aux

Unnamed: 0,Fit,Composition,Art. No.
0,Fit,Composition,Art. No.
1,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",1024256001
2,,"Shell: Cotton 99%, Spandex 1%",


In [94]:
df_aux = df_aux.iloc[1:].fillna(method='ffill')
df_aux

Unnamed: 0,Fit,Composition,Art. No.
1,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",1024256001
2,Slim fit,"Shell: Cotton 99%, Spandex 1%",1024256001


Style ID

      -> Product ID
        -> Color ID
          -> SKU
      
      Style ID + Color ID -> Product ID
      Style ID + Color ID + Size ID => SKU

In [95]:
# Generate Style ID + Color ID
# All values, but the last three values
df_aux['style_id'] = df_aux['Art. No.'].apply(lambda x: x[:-3])

df_aux['color_id'] = df_aux['Art. No.'].apply(lambda x: x[-3:])

In [96]:
df_aux

Unnamed: 0,Fit,Composition,Art. No.,style_id,color_id
1,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",1024256001,1024256,1
2,Slim fit,"Shell: Cotton 99%, Spandex 1%",1024256001,1024256,1


#### Mesclando as células

In [97]:
pd.merge(df_color, df_aux[['style_id', 'Fit', 'Composition']], how='left', on='style_id')

Unnamed: 0,id,color,style_id,color_id,Fit,Composition
0,1024256002,Light denim blue,1024256,2,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
1,1024256002,Light denim blue,1024256,2,Slim fit,"Shell: Cotton 99%, Spandex 1%"
2,1024256003,Light denim blue,1024256,3,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
3,1024256003,Light denim blue,1024256,3,Slim fit,"Shell: Cotton 99%, Spandex 1%"
4,1024256004,Denim blue,1024256,4,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
5,1024256004,Denim blue,1024256,4,Slim fit,"Shell: Cotton 99%, Spandex 1%"
6,1024256005,Dark blue,1024256,5,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
7,1024256005,Dark blue,1024256,5,Slim fit,"Shell: Cotton 99%, Spandex 1%"
8,1024256006,Dark denim blue,1024256,6,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
9,1024256006,Dark denim blue,1024256,6,Slim fit,"Shell: Cotton 99%, Spandex 1%"


In [98]:
data_merge = pd.merge(df_color, df_aux[['style_id', 'Fit', 'Composition']], how='left', on='style_id')

# Único comando para coleta de cor e composição

In [99]:
#API request
# conteúdo de headers é padrão
url = "https://www2.hm.com/en_us/productpage.1024256001.html"

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
page = requests.get(url, headers=headers)

#Beautiful Soup object
soup = BeautifulSoup(page.text, 'html.parser')

# ============================= Color =========================

#product list
product_list = soup.find_all('a', 'filter-option miniature')

#color
product_color = [p.get('data-color') for p in product_list] 

#id
product_id = [p.get('data-articlecode') for p in product_list]

#dataframe
df_color = pd.DataFrame([product_id, product_color]).T
df_color.columns = ['id', 'color']

#generate style id + color id
df_color['style_id'] = df_color['id'].apply(lambda x: x[:-3])
df_color['color_id'] = df_color['id'].apply(lambda x: x[-3:])

# ============================ Composition =====================

# Product list
product_composition_list = soup.find_all('div', 'pdp-description-list-item')

# Composition
product_composition = [list( filter( None, p.get_text().split('\n') ) ) for p in product_composition_list]

# Deleting row
product_composition.pop(0)

# dataframe
df_composition = pd.DataFrame(product_composition).T

# Columns name
df_composition.columns = df_composition.iloc[0]

# Filling None/NA values
df_composition = df_composition.iloc[1:].fillna(method='ffill')

# Generate Style ID + Color ID
# All values, but the last three values
df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

# ======================= Merging color + composition ==========================
data_merge = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how='left', on='style_id')

In [100]:
data_merge

Unnamed: 0,id,color,style_id,color_id,Fit,Composition
0,1024256002,Light denim blue,1024256,2,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
1,1024256002,Light denim blue,1024256,2,Slim fit,"Shell: Cotton 99%, Spandex 1%"
2,1024256003,Light denim blue,1024256,3,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
3,1024256003,Light denim blue,1024256,3,Slim fit,"Shell: Cotton 99%, Spandex 1%"
4,1024256004,Denim blue,1024256,4,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
5,1024256004,Denim blue,1024256,4,Slim fit,"Shell: Cotton 99%, Spandex 1%"
6,1024256005,Dark blue,1024256,5,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
7,1024256005,Dark blue,1024256,5,Slim fit,"Shell: Cotton 99%, Spandex 1%"
8,1024256006,Dark denim blue,1024256,6,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
9,1024256006,Dark denim blue,1024256,6,Slim fit,"Shell: Cotton 99%, Spandex 1%"


# Multiple Products

Vamos coletar as cores e a composição de cada produto da vitrine, conforme cada id.

In [101]:
data

Unnamed: 0,id,product_name,product_type,price,datetime
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08
1,1024256003,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08
2,690449056,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08
3,985159001,Skinny Jeans,men_jeans_skinny,$ 24.99,2022-01-24 14:43:08
4,690449022,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08
5,1024256004,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08
6,1024256007,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08
7,690449051,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08
8,690449043,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08
9,690449036,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08


In [102]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

#empty dataframe
df_final = pd.DataFrame()

for i in range(len(data)):

    #API request
    # conteúdo de headers é padrão
    url = "https://www2.hm.com/en_us/productpage." + data.loc[i, 'id'] + ".html"

    page = requests.get(url, headers=headers)

    #Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # ============================= Color =========================

    #product list
    product_list = soup.find_all('a', 'filter-option miniature')

    #color
    product_color = [p.get('data-color') for p in product_list] 

    #id
    product_id = [p.get('data-articlecode') for p in product_list]

    #dataframe
    df_color = pd.DataFrame([product_id, product_color]).T
    df_color.columns = ['id', 'color']

    #generate style id + color id
    df_color['style_id'] = df_color['id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['id'].apply(lambda x: x[-3:])

    # ============================ Composition =====================

    # Product list
    product_composition_list = soup.find_all('div', 'pdp-description-list-item')

    # Composition
    product_composition = [list( filter( None, p.get_text().split('\n') ) ) for p in product_composition_list]

  
    # dataframe
    df_composition = pd.DataFrame(product_composition).T

    # Columns name
    df_composition.columns = df_composition.iloc[0]

    # Filling None/NA values
    df_composition = df_composition.iloc[1:].fillna(method='ffill')

    # Generate Style ID + Color ID
    # All values, but the last three values
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

    # ======================= Merging color + composition ==========================
    data_merge = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how='left', on='style_id')

    # ======================= Concatenate ==========================================
    df_final = pd.concat( [df_final, data_merge], axis=0 )

In [103]:
df_final

Unnamed: 0,id,color,style_id,color_id,Fit,Composition
0,1024256002,Light denim blue,1024256,002,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
1,1024256002,Light denim blue,1024256,002,Slim fit,"Shell: Cotton 99%, Spandex 1%"
2,1024256003,Light denim blue,1024256,003,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
3,1024256003,Light denim blue,1024256,003,Slim fit,"Shell: Cotton 99%, Spandex 1%"
4,1024256004,Denim blue,1024256,004,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%"
...,...,...,...,...,...,...
7,0985197005,Dark denim blue,0985197,005,Slim fit,"Shell: Cotton 98%, Spandex 2%"
8,0985197006,Light denim blue,0985197,006,Slim fit,Pocket lining: Cotton 100%
9,0985197006,Light denim blue,0985197,006,Slim fit,"Shell: Cotton 98%, Spandex 2%"
10,0985197007,Dark gray,0985197,007,Slim fit,Pocket lining: Cotton 100%


# Multiple products incluindo a coluna Size e outras que estiverem no catálogo

In [104]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

#empty dataframe
df_final = pd.DataFrame()

# Unique columns for all products
aux = []

for i in range(len(data)):

    #API request
    # conteúdo de headers é padrão
    url = "https://www2.hm.com/en_us/productpage." + data.loc[i, 'id'] + ".html"

    page = requests.get(url, headers=headers)

    #Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # ============================= Color =========================

    #product list
    product_list = soup.find_all('a', 'filter-option miniature')

    #color
    product_color = [p.get('data-color') for p in product_list] 

    #id
    product_id = [p.get('data-articlecode') for p in product_list]

    #dataframe
    df_color = pd.DataFrame([product_id, product_color]).T
    df_color.columns = ['id', 'color']

    #generate style id + color id
    df_color['style_id'] = df_color['id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['id'].apply(lambda x: x[-3:])

    # ============================ Composition =====================

    # Product list
    product_composition_list = soup.find_all('div', 'pdp-description-list-item')

    # Composition
    product_composition = [list( filter( None, p.get_text().split('\n') ) ) for p in product_composition_list]

  
    # dataframe
    df_composition = pd.DataFrame(product_composition).T

    # Columns name
    df_composition.columns = df_composition.iloc[0]

    # Filling None/NA values
    df_composition = df_composition.iloc[1:].fillna(method='ffill')

    # Generate Style ID + Color ID
    # All values, but the last three values
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])

    # ========== Columns we want
    aux = aux + df_composition.columns.tolist()
    
    # ======================= Merging color + composition ==========================
    data_merge = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition']], how='left', on='style_id')

    # ======================= Concatenate ==========================================
    df_final = pd.concat( [df_final, data_merge], axis=0 )

In [105]:
aux

['Size',
 'Fit',
 'Composition',
 'Art. No.',
 'style_id',
 'color_id',
 'Size',
 'Fit',
 'Composition',
 'Art. No.',
 'style_id',
 'color_id',
 'Size',
 'Fit',
 'Composition',
 'Art. No.',
 'style_id',
 'color_id',
 'Size',
 'Fit',
 'Composition',
 'More sustainable materials',
 'Art. No.',
 'style_id',
 'color_id',
 'Size',
 'Fit',
 'Composition',
 'Art. No.',
 'style_id',
 'color_id',
 'Fit',
 'Composition',
 'Art. No.',
 'style_id',
 'color_id',
 'Size',
 'Fit',
 'Composition',
 'Art. No.',
 'style_id',
 'color_id',
 'Fit',
 'Composition',
 'More sustainable materials',
 'Art. No.',
 'style_id',
 'color_id',
 'Size',
 'Fit',
 'Composition',
 'Art. No.',
 'style_id',
 'color_id',
 'Size',
 'Fit',
 'Composition',
 'Art. No.',
 'style_id',
 'color_id',
 'Fit',
 'Composition',
 'Art. No.',
 'style_id',
 'color_id',
 'Fit',
 'Composition',
 'More sustainable materials',
 'Art. No.',
 'style_id',
 'color_id',
 'Size',
 'Fit',
 'Composition',
 'Art. No.',
 'style_id',
 'color_id',
 'Size'

In [106]:
set(aux)

{'Art. No.',
 'Composition',
 'Fit',
 'More sustainable materials',
 'Size',
 'color_id',
 'style_id'}

Agora vamos criar uma lista com todas as possíveis colunas da vitrine: 

In [107]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

#empty dataframe
df_final = pd.DataFrame()

# All columns found on website
cols = ['Art. No.', 'Composition', 'Fit', 'More sustainable materials', 'Size']
df_pattern = pd.DataFrame(columns=cols)

for i in range(len(data)):

    #API request
    # conteúdo de headers é padrão
    url = "https://www2.hm.com/en_us/productpage." + data.loc[i, 'id'] + ".html"

    page = requests.get(url, headers=headers)

    #Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # ============================= Color =========================

    #product list
    product_list = soup.find_all('a', 'filter-option miniature')

    #color
    product_color = [p.get('data-color') for p in product_list] 

    #id
    product_id = [p.get('data-articlecode') for p in product_list]

    #dataframe
    df_color = pd.DataFrame([product_id, product_color]).T
    df_color.columns = ['id', 'color']

    #generate style id + color id
    df_color['style_id'] = df_color['id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['id'].apply(lambda x: x[-3:])

    # ============================ Composition =====================

    # Product list
    product_composition_list = soup.find_all('div', 'pdp-description-list-item')

    # Composition
    product_composition = [list( filter( None, p.get_text().split('\n') ) ) for p in product_composition_list]

  
    # dataframe
    df_composition = pd.DataFrame(product_composition).T

    # Columns name
    df_composition.columns = df_composition.iloc[0]

    # Filling None/NA values
    df_composition = df_composition.iloc[1:].fillna(method='ffill')

    # The same number of columns (pattern)
    df_composition = pd.concat( [df_pattern, df_composition] )
    
    # Generate Style ID + Color ID
    # All values, but the last three values
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])
    
    # ======================= Merging color + composition ==========================
    data_merge = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition', 'More sustainable materials', 'Size']], how='left', on='style_id')

    # ======================= Concatenate ==========================================
    df_final = pd.concat( [df_final, data_merge], axis=0 )

In [108]:
df_final

Unnamed: 0,id,color,style_id,color_id,Fit,Composition,More sustainable materials,Size
0,1024256002,Light denim blue,1024256,002,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
1,1024256002,Light denim blue,1024256,002,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
2,1024256003,Light denim blue,1024256,003,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
3,1024256003,Light denim blue,1024256,003,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
4,1024256004,Denim blue,1024256,004,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
...,...,...,...,...,...,...,...,...
7,0985197005,Dark denim blue,0985197,005,Slim fit,"Shell: Cotton 98%, Spandex 2%",,"The model is 189cm/6'2"" and wears a size 32/32"
8,0985197006,Light denim blue,0985197,006,Slim fit,Pocket lining: Cotton 100%,,"The model is 189cm/6'2"" and wears a size 32/32"
9,0985197006,Light denim blue,0985197,006,Slim fit,"Shell: Cotton 98%, Spandex 2%",,"The model is 189cm/6'2"" and wears a size 32/32"
10,0985197007,Dark gray,0985197,007,Slim fit,Pocket lining: Cotton 100%,,"The model is 189cm/6'2"" and wears a size 32/32"


# Juntando todos os dados

In [109]:
data

Unnamed: 0,id,product_name,product_type,price,datetime
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08
1,1024256003,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08
2,690449056,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08
3,985159001,Skinny Jeans,men_jeans_skinny,$ 24.99,2022-01-24 14:43:08
4,690449022,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08
5,1024256004,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08
6,1024256007,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08
7,690449051,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08
8,690449043,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08
9,690449036,Skinny Jeans,men_jeans_ripped,$ 39.99,2022-01-24 14:43:08


In [110]:
# Creating style_id + color_id
data['style_id'] = data['id'].apply(lambda x: x[:-3])
data['color_id'] = data['id'].apply(lambda x: x[-3:])

data_raw = pd.merge( data, df_final[['color', 'style_id', 'Fit', 'Composition', 'More sustainable materials', 'Size']], how='left', on='style_id')

In [111]:
data_raw.head()

Unnamed: 0,id,product_name,product_type,price,datetime,style_id,color_id,color,Fit,Composition,More sustainable materials,Size
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08,1024256,1,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
1,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08,1024256,1,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
2,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08,1024256,1,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
3,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08,1024256,1,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
4,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08,1024256,1,Denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"


In [112]:
data_raw

Unnamed: 0,id,product_name,product_type,price,datetime,style_id,color_id,color,Fit,Composition,More sustainable materials,Size
0,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08,1024256,001,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
1,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08,1024256,001,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
2,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08,1024256,001,Light denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
3,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08,1024256,001,Light denim blue,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
4,1024256001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08,1024256,001,Denim blue,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
...,...,...,...,...,...,...,...,...,...,...,...,...
1950,0985197001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08,0985197,001,Dark denim blue,Slim fit,"Shell: Cotton 98%, Spandex 2%",,"The model is 189cm/6'2"" and wears a size 32/32"
1951,0985197001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08,0985197,001,Light denim blue,Slim fit,Pocket lining: Cotton 100%,,"The model is 189cm/6'2"" and wears a size 32/32"
1952,0985197001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08,0985197,001,Light denim blue,Slim fit,"Shell: Cotton 98%, Spandex 2%",,"The model is 189cm/6'2"" and wears a size 32/32"
1953,0985197001,Slim Jeans,men_jeans_slim,$ 19.99,2022-01-24 14:43:08,0985197,001,Dark gray,Slim fit,Pocket lining: Cotton 100%,,"The model is 189cm/6'2"" and wears a size 32/32"


# Page size total (toda a vitrine/ todas as páginas/paginação)

In [113]:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}

#empty dataframe
df_final = pd.DataFrame()

# All columns found on website
cols = ['Art. No.', 'Composition', 'Fit', 'More sustainable materials', 'Size']
df_pattern = pd.DataFrame(columns=cols)

for i in range(len(data)):

    #API request
    # conteúdo de headers é padrão
    url = "https://www2.hm.com/en_us/productpage." + data.loc[i, 'id'] + ".html"+ "?page-size=" + str(int(pagination_number*36))

    page = requests.get(url, headers=headers)

    #Beautiful Soup object
    soup = BeautifulSoup(page.text, 'html.parser')

    # ============================= Color =========================

    #product list
    product_list = soup.find_all('a', 'filter-option miniature')

    #color
    product_color = [p.get('data-color') for p in product_list] 

    #id
    product_id = [p.get('data-articlecode') for p in product_list]

    #dataframe
    df_color = pd.DataFrame([product_id, product_color]).T
    df_color.columns = ['id', 'color']

    #generate style id + color id
    df_color['style_id'] = df_color['id'].apply(lambda x: x[:-3])
    df_color['color_id'] = df_color['id'].apply(lambda x: x[-3:])

    # ============================ Composition =====================

    # Product list
    product_composition_list = soup.find_all('div', 'pdp-description-list-item')

    # Composition
    product_composition = [list( filter( None, p.get_text().split('\n') ) ) for p in product_composition_list]

  
    # dataframe
    df_composition = pd.DataFrame(product_composition).T

    # Columns name
    df_composition.columns = df_composition.iloc[0]

    # Filling None/NA values
    df_composition = df_composition.iloc[1:].fillna(method='ffill')

    # The same number of columns (pattern)
    df_composition = pd.concat( [df_pattern, df_composition] )
    
    # Generate Style ID + Color ID
    # All values, but the last three values
    df_composition['style_id'] = df_composition['Art. No.'].apply(lambda x: x[:-3])
    df_composition['color_id'] = df_composition['Art. No.'].apply(lambda x: x[-3:])
    
    # ======================= Merging color + composition ==========================
    data_merge = pd.merge(df_color, df_composition[['style_id', 'Fit', 'Composition', 'More sustainable materials', 'Size']], how='left', on='style_id')

    # ======================= Concatenate ==========================================
    df_final = pd.concat( [df_final, data_merge], axis=0 )

In [114]:
df_final

Unnamed: 0,id,color,style_id,color_id,Fit,Composition,More sustainable materials,Size
0,1024256002,Light denim blue,1024256,002,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
1,1024256002,Light denim blue,1024256,002,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
2,1024256003,Light denim blue,1024256,003,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
3,1024256003,Light denim blue,1024256,003,Slim fit,"Shell: Cotton 99%, Spandex 1%",,"The model is 185cm/6'1"" and wears a size 31/32"
4,1024256004,Denim blue,1024256,004,Slim fit,"Pocket lining: Polyester 65%, Cotton 35%",,"The model is 185cm/6'1"" and wears a size 31/32"
...,...,...,...,...,...,...,...,...
7,0985197005,Dark denim blue,0985197,005,Slim fit,"Shell: Cotton 98%, Spandex 2%",,"The model is 189cm/6'2"" and wears a size 32/32"
8,0985197006,Light denim blue,0985197,006,Slim fit,Pocket lining: Cotton 100%,,"The model is 189cm/6'2"" and wears a size 32/32"
9,0985197006,Light denim blue,0985197,006,Slim fit,"Shell: Cotton 98%, Spandex 2%",,"The model is 189cm/6'2"" and wears a size 32/32"
10,0985197007,Dark gray,0985197,007,Slim fit,Pocket lining: Cotton 100%,,"The model is 189cm/6'2"" and wears a size 32/32"
