## Requests - scraping

In [50]:
import requests

In [51]:
# Product page used as the scraping example in the cells below.
page = "https://www.beerwulf.com/fr-fr/p/bieres/desperados-fut-2l"
# Always pass a timeout: without one, requests.get can wait forever on an unresponsive server.
response = requests.get(page, timeout=10)

In [52]:
# Preview the beginning of the page's raw HTML (the full document is far too long to display usefully)
response.text[:1000]

'\r\n<!doctype html>\r\n<html class="no-js" lang="fr-FR"\r\n      data-original-lang="fr-FR"\r\n      data-rendered-at="Thu, 26 Sep 2019 13:56:20 GMT"\r\n      data-sku-ui-url="/fr-FR/api/product/get/ui"\r\n      data-release="1.7.15-906-1"\r\n      data-datalayer="{&quot;page&quot;:{&quot;type&quot;:&quot;product&quot;,&quot;category1&quot;:&quot;Beers&quot;,&quot;category2&quot;:null,&quot;category3&quot;:null,&quot;language&quot;:&quot;fr-FR&quot;,&quot;country&quot;:&quot;FR&quot;,&quot;currency&quot;:null},&quot;user&quot;:null}"\r\n      data-datalayer-url="/fr-FR/api/gtm/dataLayer">\r\n<head>\r\n    \r\n    <meta charset="utf-8" />\r\n\r\n    <title>Desperados TORP - F&#251;t de 2L  | Achat bi&#232;re en ligne | Beerwulf</title>\r\n\r\n    <!-- Website Meta information -->\r\n    <meta property="og:url" content="https://www.beerwulf.com/fr-fr/p/bieres/desperados-fut-2l" />\r\n    <meta name="twitter:card" content="summary_large_image">\r\n    <meta name="twitter:site" content="@

In [53]:
# Trial-and-error extraction: locate the opening price tag in the raw HTML,
# then keep whatever text sits between it and the next '<'.
begin = '<span class="price">'
idx = response.text.index(begin)
response.text[idx + len(begin):idx + 100].partition('<')[0]

'€ 10,70'

## BeautifulSoup - mise en forme des données

In [54]:
from bs4 import BeautifulSoup

In [55]:
# Minimal HTML snippet (a title and a two-item list) used to try out BeautifulSoup
html = """
<title>Helloooo</title>
<ul>
<li>Item 1</li>
<li>Item 2</li>
</ul>
"""

In [56]:
# soup holds the whole parsed document
# (as the output shows, the lxml parser wraps the snippet in <html>, <head> and <body>)
soup = BeautifulSoup(html, "lxml")
soup

<html><head><title>Helloooo</title>
</head><body><ul>
<li>Item 1</li>
<li>Item 2</li>
</ul>
</body></html>

In [57]:
# find returns the first matching tag
titre = soup.find('title')
titre

<title>Helloooo</title>

In [58]:
# .contents lists the tag's children (here, a single string)
titre.contents

['Helloooo']

In [60]:
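# Filtering by CSS class uses `class_` (with a trailing underscore) because `class` is a reserved word in Python
#soup.find_all('li', class_="italic")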

## Application au site scrapé

In [61]:
# Parse the HTML of the product page downloaded earlier
beersoup = BeautifulSoup(response.text, 'lxml')

In [62]:
# Select every <span> element that has the class "price" (CSS selector syntax)
beersoup.select('span.price')

[<span class="price">€ 10,70</span>]

In [63]:
# [0] picks the first price found; .text gives the text content of the tag
beersoup.select('span.price')[0].text

'€ 10,70'

In [66]:
# Keep the raw price string (currency symbol included) in a variable and print it
price = beersoup.select('span.price')[0].text
print(price)

€ 10,70


In [67]:
# Find the <dt> whose text is "Contenu", then take the element right after it (the <dd> holding the value).
# `string=` is the current name of the filter formerly called `text=`.
beersoup.find('dt', string="Contenu").find_next_sibling()

<dd class="small-6 medium-9 columns">200cl</dd>

In [68]:
def get_soup_from_url(url, timeout=10):
    """Download a page and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up
            (forwarded to ``requests.get``).

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        ``lxml`` parser.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx status.
        requests.exceptions.Timeout: If the server does not answer within ``timeout``.
    """
    # Without a timeout, requests.get can block forever on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Fail early on error pages instead of parsing them as if they were the requested page.
    response.raise_for_status()
    return BeautifulSoup(response.text, "lxml")

def extract_beer_infos(url):
    """Scrape the price and volume of a beer from its product page.

    Args:
        url: Address of the product page to scrape.

    Returns:
        A dict with two keys:
            'price': float parsed from the first ``span.price`` element
                (e.g. '€ 10,70' -> 10.7).
            'volume': int parsed from the value that follows the "Contenu"
                label (e.g. '200cl' -> 200).

    Raises:
        IndexError: If the page has no ``span.price`` element.
        AttributeError: If the "Contenu" label or the ``div.stars`` element
            is not found.
        ValueError: If an extracted text cannot be converted to a number.
    """
    soup = get_soup_from_url(url)

    # Extract price.
    # Read from the soup fetched for *this* url: relying on a notebook-level
    # soup would return the same price for every url and fail on a fresh kernel.
    price = soup.select('span.price')[0].text
    # Drop the currency symbol and surrounding whitespace, then switch to a
    # decimal point: '€ 7,29' => '7.29'
    price = float(price.replace('€', '').strip().replace(',', '.'))

    # Extract volume
    volume = soup.find('dt', string='Contenu').find_next_sibling().text
    volume = int(volume[:-2])  # strip the two-character unit suffix: '200cl' => 200

    # Extract note
    # NOTE(review): this value is computed but not included in the returned
    # dict yet; add it to `infos` or remove the extraction.
    note = soup.find('div', class_="stars")
    note = int(note.attrs['data-percent'])

    # Extract EBC (draft, currently disabled)
    #ebc = soup.find('div', class_='ebc')
    #children = ebc.find_all('div', class_="active")
    #position = ebc.find_all('div').index(active)

    # Structure infos
    infos = {
        'price': price,
        'volume': volume
    }
    return infos

# Run the extractor on the example product page
extract_beer_infos(page)

{'price': 10.7, 'volume': 200}

In [None]:
# requests only retrieves the HTML sent by the server: tags that appear after the page has loaded (added by JavaScript or similar) cannot be found this way
# in that case, a tool such as selenium can be used, for example