## Pegando dados de preços dos sites: Mercado Livre, Submarino e Magalu

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from datetime import datetime

In [2]:
# Search terms submitted to each store's search page.
search_commands = [
    'celular',
    'tv',
    'geladeira',
    'microondas',
    'fogao',
    'iphone',
    'som',
    'luminaria',
    'notebook',
    'smartwatch',
]

### Mercado Livre

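Criarei um DataFrame que irá conter as informações (nome, preço, termo de busca, data da busca, site e URL) de todos os produtos encontrados para cada termo de `search_commands`. O mesmo DataFrame é reutilizado nas seções do Submarino e da Magazine Luiza.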

In [3]:
%%time
# Scrape the first Mercado Livre results page for every term in `search_commands`.
#
# This cell creates the state that the following scraping cells extend:
#   data   -- DataFrame with one row per scraped product
#   n      -- next free integer row label in `data`
#   errors -- maps a search term to the URL whose request or item parsing failed
COLUMNS = ['product_name', 'product_price', 'product_search', 'search_date', 'website', 'url']
REQUEST_TIMEOUT_S = 30  # seconds; without a timeout requests.get() may block indefinitely

records = []
errors = {}
for search in search_commands:

    url = 'https://lista.mercadolivre.com.br/{}#D[A:{}]'.format(search, search)
    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT_S)
        r.raise_for_status()
    except requests.RequestException:
        # A failed request for one term should not abort the remaining searches.
        errors[search] = url
        continue
    soup = BeautifulSoup(r.content, 'html.parser')
    items = soup.find_all('li', class_='ui-search-layout__item')

    # Collect one record per result item
    for item in items:
        # Two title markups are tried: the exact two-class attribute first,
        # then any <h2> carrying the title class.
        title_tag = item.find('h2', class_="ui-search-item__title ui-search-item__group__element")
        if title_tag is None:
            title_tag = item.find('h2', class_="ui-search-item__title")
        # NOTE(review): only the "price-tag-fraction" span is read, and the
        # stored prices for this site have no decimals -- confirm whether the
        # cents should be captured as well.
        price_tag = item.find('span', class_="price-tag-fraction")

        if title_tag is None or price_tag is None:
            # Item without the expected markup: remember where it happened and skip it.
            errors[search] = url
            continue

        records.append({
            'product_name': title_tag.text,
            'product_price': price_tag.text.replace('.', ''),  # drop thousands separator
            'product_search': search,
            'search_date': datetime.today(),
            'website': 'Mercado Livre',
            'url': url,
        })

# Build the frame once instead of enlarging it cell by cell; dtype=object keeps
# the columns untyped, as they are when the frame is created empty.
data = pd.DataFrame(records, columns=COLUMNS, dtype=object)
n = len(data)

Wall time: 17 s


### Submarino

In [4]:
%%time

# Scrape the first Submarino results page for every term in `search_commands`
# and append the products to `data`.
#
# Expects `data`, `n` (next free row label in `data`) and `errors` to already
# exist from the Mercado Livre cell.
REQUEST_TIMEOUT_S = 30  # seconds; without a timeout requests.get() may block indefinitely
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}
ITEM_CLASS = 'product-grid-item ProductGrid__GridColumn-sc-49j2r8-0 eZaEaE ColUI-gjy0oc-0 ifczFg ViewUI-sc-1ijittn-6 iXIDWU'
NAME_CLASS = 'TitleUI-sc-1f5n3tj-13 dTabgr TitleH2-sc-1wh9e1x-1 fINzxm'
# Two price markups are tried in order.
PRICE_CLASSES = (
    "PriceUI-sc-1f5n3tj-9 ebPdEH PriceUI-sc-1q8ynzz-0 inNBs TextUI-sc-12tokcy-0 CIZtP",
    "PriceUI-sc-1f5n3tj-9 RjuaG PriceUI-sc-1q8ynzz-0 inNBs TextUI-sc-12tokcy-0 CIZtP",
)

records = []
for search in search_commands:

    url = 'https://www.submarino.com.br/busca/{}'.format(search)
    try:
        r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_S)
        r.raise_for_status()
    except requests.RequestException:
        # A failed request for one term should not abort the remaining searches.
        errors[search] = url
        continue
    soup = BeautifulSoup(r.content, 'html.parser')
    items = soup.find_all('div', class_=ITEM_CLASS)

    # Collect one record per result item
    for item in items:
        # Title and price are looked up independently, so a missing title can
        # never be paired with a name left over from a previous item.
        name_tag = item.find('h2', class_=NAME_CLASS)

        # The price is searched inside the item's first <div><div> only.
        outer_div = item.div
        price_scope = outer_div.div if outer_div is not None else None
        price_tag = None
        if price_scope is not None:
            for price_class in PRICE_CLASSES:
                price_tag = price_scope.find('span', class_=price_class)
                if price_tag is not None:
                    break

        if name_tag is None or price_tag is None:
            # Item without the expected markup: remember where it happened and skip it.
            errors[search] = url
            continue

        records.append({
            'product_name': name_tag.text.strip(),
            # "R$ 1.234,56" -> "1234.56"
            'product_price': price_tag.text.strip().replace('R$', '').replace('.', '').replace(',', '.'),
            'product_search': search,
            'search_date': datetime.today(),
            'website': 'Submarino',
            'url': url,
        })

if records:
    # Append all new rows at once, labelled n, n+1, ... exactly as row-by-row
    # insertion would have labelled them.
    new_rows = pd.DataFrame(
        records,
        columns=data.columns,
        dtype=object,
        index=range(n, n + len(records)),
    )
    data = pd.concat([data, new_rows])
    n += len(records)

Wall time: 21.5 s


### Magazine Luiza

In [5]:
%%time

# Scrape the first Magazine Luiza results page for every term in
# `search_commands` and append the products to `data`.
#
# Expects `data`, `n` (next free row label in `data`) and `errors` to already
# exist from the Mercado Livre cell.
REQUEST_TIMEOUT_S = 30  # seconds; without a timeout requests.get() may block indefinitely
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0'}

records = []
for search in search_commands:

    url = 'https://www.magazineluiza.com.br/busca/{}/'.format(search)
    try:
        r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_S)
        r.raise_for_status()
    except requests.RequestException:
        # A failed request for one term should not abort the remaining searches.
        errors[search] = url
        continue
    soup = BeautifulSoup(r.content, 'html.parser')
    items = soup.find_all('li', class_='product')

    # Collect one record per result item
    for item in items:
        name_tag = item.h3
        price_tag = item.find('span', class_='price-value')

        if name_tag is None or price_tag is None:
            # Record the URL, as the other scraping cells do. The name/price
            # variables cannot be logged here: when a lookup fails they would
            # still hold values from a previously parsed item.
            errors[search] = url
            continue

        records.append({
            'product_name': name_tag.text.strip(),
            # "R$ 1.234,56" -> "1234.56"
            'product_price': price_tag.text.replace('R$', '').replace('.', '').strip().replace(',', '.'),
            'product_search': search,
            'search_date': datetime.today(),
            'website': 'Magazine Luiza',
            'url': url,
        })

if records:
    # Append all new rows at once, labelled n, n+1, ... exactly as row-by-row
    # insertion would have labelled them.
    new_rows = pd.DataFrame(
        records,
        columns=data.columns,
        dtype=object,
        index=range(n, n + len(records)),
    )
    data = pd.concat([data, new_rows])
    n += len(records)

Wall time: 17 s


In [6]:
# What the resulting dataset looks like:
data

Unnamed: 0,product_name,product_price,product_search,search_date,website,url
0,Samsung Galaxy A01 Core Dual SIM 32 GB azul 2 ...,689,celular,2020-12-11 10:32:59.461051,Mercado Livre,https://lista.mercadolivre.com.br/celular#D[A:...
1,LG K22 Dual SIM 32 GB red 2 GB RAM,789,celular,2020-12-11 10:32:59.463049,Mercado Livre,https://lista.mercadolivre.com.br/celular#D[A:...
2,LG K22 Dual SIM 32 GB blue 2 GB RAM,949,celular,2020-12-11 10:32:59.465047,Mercado Livre,https://lista.mercadolivre.com.br/celular#D[A:...
3,Samsung Galaxy A01 Core Dual SIM 32 GB preto 2...,689,celular,2020-12-11 10:32:59.467047,Mercado Livre,https://lista.mercadolivre.com.br/celular#D[A:...
4,LG K41S Dual SIM 32 GB titânio 3 GB RAM,1168,celular,2020-12-11 10:32:59.468046,Mercado Livre,https://lista.mercadolivre.com.br/celular#D[A:...
...,...,...,...,...,...,...
982,Notebook Asus ZenBook 14 UX431FA-AN203T - Inte...,5224.05,notebook,2020-12-11 10:33:51.100937,Magazine Luiza,https://www.magazineluiza.com.br/busca/notebook/
983,Notebook Asus ZenBook 14 UX434FAC-A6339T - Int...,5699.05,notebook,2020-12-11 10:33:51.102937,Magazine Luiza,https://www.magazineluiza.com.br/busca/notebook/
984,Notebook Gamer Acer Predator Helios 300 - PH31...,7789.05,notebook,2020-12-11 10:33:51.104936,Magazine Luiza,https://www.magazineluiza.com.br/busca/notebook/
985,Notebook Vaio FE14 VJFE41F11X-B0911H Intel Cor...,3039.05,notebook,2020-12-11 10:33:51.106935,Magazine Luiza,https://www.magazineluiza.com.br/busca/notebook/


### Curiosidades

Aqui comparo os preços médios dos resultados das buscas por "celular" e "iphone", considerando os produtos de todos os sites em conjunto.

In [7]:
# Mean listed price of the rows returned for each of the two search terms
# (rows from every website are averaged together).
is_celular_search = data['product_search'] == 'celular'
is_iphone_search = data['product_search'] == 'iphone'

cel = data.loc[is_celular_search, 'product_price'].astype('float').mean()
iphone = data.loc[is_iphone_search, 'product_price'].astype('float').mean()

In [8]:
# Report both averages with two decimal places.
summary_template = 'Ticket médio por busca: \n-Iphone: R$ {:.2f}\n-Celular: R$ {:.2f}'
print(summary_template.format(iphone, cel))

Ticket médio por busca: 
-Iphone: R$ 5346.12
-Celular: R$ 1496.66
