In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
import datetime

In [2]:
# Root URL of the Americanas storefront; the sitemap path and the category paths
# used in later cells are appended to this string.
americanas_baseurl = "https://www.americanas.com.br"

In [3]:
# Download the home page and parse it with the html5lib parser.
# NOTE(review): `soup` is reassigned by the sitemap cell that follows before it is
# read anywhere in the visible cells, so this request looks redundant — confirm
# nothing else relies on it before removing.
soup = BeautifulSoup(requests.get(americanas_baseurl).text, "html5lib")

In [4]:
# Download the sitemap page and build a two-level mapping from it:
#   categories[<category name>] = {'url': <href>, 'subcategories': {<name>: <href>}}
sitemap_url = "{}/mapa-do-site".format(americanas_baseurl)
soup = BeautifulSoup(requests.get(sitemap_url).text, "html5lib")

categories = {}

for top_item in soup.find_all("li", {"class": "child-level-1"}):
    top_link = top_item.find("a")
    subcategory_map = {}
    categories[top_link.text] = {'url': top_link['href'], 'subcategories': subcategory_map}

    # Each nested "child-level-2" entry contributes one name -> href pair.
    for nested_item in top_item.find_all("li", {"class": "child-level-2"}):
        nested_link = nested_item.find("a")
        subcategory_map[nested_link.text] = nested_link['href']

In [5]:
# Flatten the two-level mapping into a single {subcategory name: url} dict.
# Subcategory names repeated under different categories collapse to the last one seen.
subcategories = {
    name: url
    for category in categories.values()
    for name, url in category['subcategories'].items()
}

In [6]:
# Only the URLs are needed from here on; order follows the dict's insertion order.
subcategories_urls = list(subcategories.values())

In [7]:
# Spot-check an arbitrary slice of the collected URLs (they are site-relative paths).
subcategories_urls[100:105]

['/categoria/automotivo/tuning',
 '/categoria/automotivo/vestuario-para-motociclistas',
 '/categoria/bebes/alimentacao',
 '/categoria/bebes/banho-do-bebe',
 '/categoria/bebes/bercario']

In [8]:
def _tag_text(tag):
    """Return the text of a BeautifulSoup tag, or None when the tag was not found."""
    return tag.text if tag is not None else None


def _parse_product_item(grid_item, cat_url):
    """Convert one product-grid element into a product tuple.

    Returns None when the element has no link with an ``href``, so the caller
    can skip it instead of abandoning the rest of the page.
    """
    link = grid_item.find("a")
    if link is None or not link.get('href'):
        return None

    # The product code is the last path segment of the link, without the query string.
    item_code = link['href'].split("?")[0].split("/")[-1]
    item_title = link.get('title')
    item_price = ' '.join(span.text for span in link.find_all("span", {'class': "value"}))

    item_rating = _tag_text(link.find("span", {'class': "rating-star-average"}))

    # Keep every digit of the counter text; indexing a single character would
    # truncate counts of 10 or more.
    counter_text = _tag_text(link.find("div", {'class': "rating-star-counter"})) or ''
    item_num_ratings = ''.join(ch for ch in counter_text if ch.isdigit()) or None

    item_previous_price = _tag_text(link.find("del", {'class': "card-product-price-from-value"}))

    # Naive UTC timestamp (same value type as datetime.utcnow(), which is deprecated).
    collected_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

    return (item_code,
            item_title,
            item_price,
            item_previous_price,
            item_rating,
            item_num_ratings,
            cat_url,
            collected_at)


def getProductsData(cat_url):
    """Scrape every product listed under one category path.

    Parameters
    ----------
    cat_url : str
        Site-relative category path (e.g. ``/categoria/bebes/bercario``). A
        leading slash is accepted and does not produce a double slash in the
        request URL.

    Returns
    -------
    list of tuple
        One tuple per product:
        ``(code, title, price, previous_price, rating, num_ratings, cat_url,
        collected_at)``. Optional fields are None when the corresponding
        element is absent. An empty list is returned when the product count
        cannot be read from the category page.

    Raises
    ------
    requests.RequestException
        Network failures (including timeouts) are not caught here.
    """
    print("Retrieving products from "+cat_url)

    request_timeout = 30  # seconds; without it a stalled connection would hang the notebook
    category_url = americanas_baseurl.rstrip("/") + "/" + cat_url.lstrip("/")
    cat_page = BeautifulSoup(requests.get(category_url, timeout=request_timeout).text, "html5lib")

    # The first token of the sort bar's first <span> is the product count,
    # written with '.' as thousands separator.
    try:
        count_text = cat_page.find("aside", {'class': "sortbar"}).find("span").text
        num_products = int(count_text.split()[0].replace('.', ''))
    except (AttributeError, IndexError, ValueError):
        print("  Could not read the product count for {}; skipping\n".format(cat_url))
        return []

    # These are url params that should be used for loading items pages
    limit = 24  # This is apparently hardcoded on the site
    num_offsets = (num_products + limit - 1) // limit  # ceiling division

    products = []

    # Collects all products moving through offsets
    for offset in range(num_offsets):
        print("  Collecting items from offset {} of {}".format(offset+1, num_offsets))

        requestUrl = category_url+"?limite="+str(limit)+"&offset="+str(offset*limit)
        productsPage = BeautifulSoup(requests.get(requestUrl, timeout=request_timeout).text, "html5lib")
        print("  {}".format(requestUrl))

        grid = productsPage.find("div", {'data-component': 'productgrid'})
        if grid is None:
            # Best effort: report the page and move on instead of failing silently.
            print("  No product grid found at this offset; skipping")
            continue

        for grid_item in grid.find_all("div", {'class': 'product-grid-item'}):
            product = _parse_product_item(grid_item, cat_url)
            if product is not None:
                products.append(product)

    print("Done\n")
    return products

In [9]:
# Scrape only the first subcategory for now and gather its product tuples in one list.
prods = []
for category_path in subcategories_urls[:1]:
    prods.extend(getProductsData(category_path))

Retrieving products from /categoria/alimentos-e-bebidas/alimentacao-infantil
  Collecting items from offset 1 of 12
  https://www.americanas.com.br//categoria/alimentos-e-bebidas/alimentacao-infantil?limite=24&offset=0
  Collecting items from offset 2 of 12
  https://www.americanas.com.br//categoria/alimentos-e-bebidas/alimentacao-infantil?limite=24&offset=24
  Collecting items from offset 3 of 12
  https://www.americanas.com.br//categoria/alimentos-e-bebidas/alimentacao-infantil?limite=24&offset=48
  Collecting items from offset 4 of 12
  https://www.americanas.com.br//categoria/alimentos-e-bebidas/alimentacao-infantil?limite=24&offset=72
  Collecting items from offset 5 of 12
  https://www.americanas.com.br//categoria/alimentos-e-bebidas/alimentacao-infantil?limite=24&offset=96
  Collecting items from offset 6 of 12
  https://www.americanas.com.br//categoria/alimentos-e-bebidas/alimentacao-infantil?limite=24&offset=120
  Collecting items from offset 7 of 12
  https://www.americanas.c

In [10]:
# Build a frame from the product tuples and index it by tuple position 0 (the product code).
# No column names are assigned, so the remaining columns keep positional labels that follow
# the tuple built in getProductsData: 1=title, 2=price, 3=previous price, 4=rating,
# 5=number of ratings, 6=category URL, 7=collection timestamp.
df = pd.DataFrame.from_records(prods).set_index(0)
df

Unnamed: 0_level_0,1,2,3,4,5,6,7
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
15814452,Neocate Lcp Fórmula Infantil Em Pó Lata 400g,"R$ 146,00",,,,/categoria/alimentos-e-bebidas/alimentacao-inf...,2017-11-26 04:02:44.787545
15815100,Aptamil Profutura 1 Fórmula Infantil Lata 800g,"R$ 37,50",,5.0,1,/categoria/alimentos-e-bebidas/alimentacao-inf...,2017-11-26 04:02:44.789542
10731594,"Leite Em Pó Aptamil Pepti Com Dha, Ara E Prebi...","R$ 115,14","R$ 116,30",,,/categoria/alimentos-e-bebidas/alimentacao-inf...,2017-11-26 04:02:44.791368
15815004,Aptamil Profutura 2 Fórmula Infantil Lata 800g,"R$ 36,40",,,,/categoria/alimentos-e-bebidas/alimentacao-inf...,2017-11-26 04:02:44.793069
10654424,Ensure Em Pó Sabor Chocolate Com 900 Gramas,"R$ 79,90",,4.0,1,/categoria/alimentos-e-bebidas/alimentacao-inf...,2017-11-26 04:02:44.794997
122661624,Nan Comfor 1 800g - Nestlé,"R$ 41,70",,,,/categoria/alimentos-e-bebidas/alimentacao-inf...,2017-11-26 04:02:44.796522
15814954,Puramino Fórmula Infantil Lata 400g,"R$ 163,55",,,,/categoria/alimentos-e-bebidas/alimentacao-inf...,2017-11-26 04:02:44.797983
20138327,Kit: 3 Leite Em Po Nan Supreme 1 400g,"R$ 89,00","R$ 119,00",,,/categoria/alimentos-e-bebidas/alimentacao-inf...,2017-11-26 04:02:44.799730
10778593,Leite Em Pó Nan 2 Pro Com 800 Gramarelos,"R$ 55,99",,,,/categoria/alimentos-e-bebidas/alimentacao-inf...,2017-11-26 04:02:44.801250
10654094,Ensure Em Pó Sabor Banana Com 900 Gramas,"R$ 79,90",,,,/categoria/alimentos-e-bebidas/alimentacao-inf...,2017-11-26 04:02:44.802956
