In [288]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import timeit

import pandas as pd

# Scrape online data

In [3]:
URL = 'https://www.beeradvocate.com/beer/profile/2958/87869/'
page = requests.get(URL)

soup = BeautifulSoup(page.content, 'html.parser')


In [50]:
res = soup.find(id='info_box')

In [16]:
beerstats = soup.find_all(class_='beerstats')

In [32]:
bold = res.find_all('b')

In [33]:
bold

[<b>Beer Geek Stats:</b>,
 <b>Style:</b>,
 <b>Belgian Saison</b>,
 <b>ABV:</b>,
 <b>5%</b>,
 <b>Score:</b>,
 <b>94</b>]

In [57]:
soup.find(title="Learn more about this style.").find('b').text

'Belgian Saison'

In [65]:
soup.find_all(title="Percentage of alcohol by volume.")[-1].find('b').text[:-1]

'5'

In [55]:
soup.find(class_="ba-score Tooltip").text

'94'

In [54]:
soup.find(class_="ba-ravg Tooltip").text

'4.26'

In [60]:
soup.find(title="View more beers and info from this brewery.").text

'BFM (Brasserie des Franches-Montagnes)'

In [64]:
soup.find_all(href=lambda x: x and x.startswith('/place/directory/'))[-1].text

'Switzerland'

In [75]:
soup.find(class_="titleBar")

<div class="titleBar">
<h1>XV (√225 Saison)<br/><span style="color:#999999; font-size:0.75em;">BFM (Brasserie des Franches-Montagnes)</span></h1>
</div>

In [81]:
soup.find('title').text.split('|')[0].strip()

'XV (√225 Saison)'

In [94]:
def parse_beer_page(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    name = soup.find('title').text.split('|')[0].strip()
    style = soup.find(title="Learn more about this style.").find('b').text
    alc_perc = float(soup.find_all(title="Percentage of alcohol by volume.")[-1].find('b').text[:-1])
    
    score_temp = soup.find(class_="ba-score Tooltip")
    if score_temp is not None:
        score = float(score_temp.text)
    else: score = np.NaN
    rating = float(soup.find(class_="ba-ravg Tooltip").text)
    brewery = soup.find(title="View more beers and info from this brewery.").text
    country = soup.find_all(href=lambda x: x and x.startswith('/place/directory/'))[-1].text
    
    return dict(name=name, style=style, alc_perc=alc_perc, 
                score=score, rating=rating, brewery=brewery, country=country)
    

In [165]:
def get_beer_country(url, session=requests):
    try:
        page = session.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
        country = soup.find_all(href=lambda x: x and x.startswith('/place/directory/'))[-1].text
    except:
        country = ''
    return country

In [99]:
parse_beer_page('https://www.beeradvocate.com/beer/profile/2385/87878/')

{'name': 'Dark Ale',
 'style': 'American Brown Ale',
 'alc_perc': 4.0,
 'score': nan,
 'rating': 3.44,
 'brewery': 'Black Duck Brewery',
 'country': 'Australia'}

In [106]:
get_beer_country('https://www.beeradvocate.com/beer/profile/2/3/')

'United States'

In [109]:
countries = []
url = 'https://www.beeradvocate.com/beer/profile/2/{}/'
for i in range(100):
    try:
        countries.append(get_beer_country(url.format(i)))
    except:
        print(i)
    

1
2
22
25
53
57
60
64
70
77
91
98


In [110]:
countries

['United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'Belgium',
 'United States',
 'United States',
 'United States',
 'United States',
 'United Kingdom',
 'United Kingdom',
 'United Kingdom',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'Netherlands',
 'Canada',
 'Canada',
 'Canada',
 'Canada',
 'Canada',
 'United States',
 'United Kingdom',
 'United Kingdom',
 'United States',
 'United States',
 'United States',
 'Canada',
 'Canada',
 'Canada',
 'Canada',
 'Canada',
 'Canada',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United Kingdom',
 'United States',
 'United States',
 'United Kingdom',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States',
 'United States'

In [167]:
s = requests.Session()
timeit.timeit(lambda: get_beer_country(url.format(15002),s), number=5)

3.777793766999821

In [146]:
timeit.timeit(lambda: requests.head(url.format(19)), number=1)

0.8501878259994555

In [147]:
requests.head(url.format(19))

<Response [301]>

In [149]:
1.5* 100000 / 3600

41.666666666666664

## Check parallel requests

In [160]:
import eventlet

urls = [url.format(i) for i in range(50)]
pool = eventlet.GreenPool(50)

countries = []
def pool_work():
    for country in pool.imap(get_beer_country, urls):
        countries.append(country)
        print('one is done')

timeit.timeit(pool_work, number=1)

one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done
one is done


60.20601609700043

In [154]:
urls

['https://www.beeradvocate.com/beer/profile/2/3/',
 'https://www.beeradvocate.com/beer/profile/2/4/',
 'https://www.beeradvocate.com/beer/profile/2/5/',
 'https://www.beeradvocate.com/beer/profile/2/6/',
 'https://www.beeradvocate.com/beer/profile/2/7/',
 'https://www.beeradvocate.com/beer/profile/2/8/',
 'https://www.beeradvocate.com/beer/profile/2/9/',
 'https://www.beeradvocate.com/beer/profile/2/10/',
 'https://www.beeradvocate.com/beer/profile/2/11/',
 'https://www.beeradvocate.com/beer/profile/2/12/',
 'https://www.beeradvocate.com/beer/profile/2/13/',
 'https://www.beeradvocate.com/beer/profile/2/14/',
 'https://www.beeradvocate.com/beer/profile/2/15/',
 'https://www.beeradvocate.com/beer/profile/2/16/',
 'https://www.beeradvocate.com/beer/profile/2/17/',
 'https://www.beeradvocate.com/beer/profile/2/18/',
 'https://www.beeradvocate.com/beer/profile/2/19/',
 'https://www.beeradvocate.com/beer/profile/2/20/',
 'https://www.beeradvocate.com/beer/profile/2/21/',
 'https://www.beera

In [169]:
from concurrent.futures import ThreadPoolExecutor
from requests_futures.sessions import FuturesSession

session = FuturesSession(max_workers=5)
cs = []
timeit.timeit(lambda: cs.append(get_beer_country(url.format(15002),session)), number=5)

0.007156055999985256

In [172]:
cs

['', '', '', '', '']

In [175]:
from concurrent.futures import as_completed
from requests_futures.sessions import FuturesSession

def get_beer_countries_parall(urls):
    countries = []
    session = FuturesSession(max_workers=5)
    
    futures = [session.get(url) for url in urls]
    for future in as_completed(futures):
        try:
            resp = future.result()
            soup = BeautifulSoup(resp.content, 'html.parser')
            country = soup.find_all(href=lambda x: x and x.startswith('/place/directory/'))[-1].text
            countries.append(country)
        except:
            pass
    
    return countries

In [187]:
def get_swiss_beers_parall(urls):
    swissbeers = []
    session = FuturesSession(max_workers=5)
    
    futures = [session.get(url) for url in urls]
    for future in as_completed(futures):
        try:
            resp = future.result()
            soup = BeautifulSoup(resp.content, 'html.parser')
            country = soup.find_all(href=lambda x: x and x.startswith('/place/directory/'))[-1].text
            if country == 'Switzerland':
                swissbeers.append(resp.url)
        except:
            pass
    
    return swissbeers

In [176]:
urls = [url.format(i) for i in range(50)]

timeit.timeit(lambda: get_beer_countries_parall(urls), number=1)

6.967507653000212

In [184]:
session = FuturesSession(max_workers=5)

future = session.get(url.format(19))

In [185]:
resp = future.result()

In [186]:
resp.url

'https://www.beeradvocate.com/beer/profile/9/19/'

In [188]:
swiss_beers_urls = get_swiss_beers_parall([url.format(i) for i in range(1000)])

In [189]:
swiss_beers_urls

[]

In [191]:
460000 * (7/50)

64400.00000000001

In [194]:
30*60/(7/50)

12857.142857142855

In [195]:
swiss_beers_urls_to12k = get_swiss_beers_parall([url.format(i) for i in range(12000)])

KeyboardInterrupt: 

## Login to BeerAdvocate

In [275]:
s = requests.Session()
url_login = 'https://www.beeradvocate.com/community/login/login'
with open('ba_credentials','r') as f:
    login, password = f.read().splitlines()

In [276]:
 
payload = {'login': login, 'password': password}
page = s.post(url_login, data=payload)

In [277]:
with open('test.html','bw') as f:
    f.write(page.content)

In [10]:
url_ch = 'https://www.beeradvocate.com/place/list/?c_id=CH&s_id=0&brewery=Y'
url_ch2 = 'https://www.beeradvocate.com/place/list/?start=20&&c_id=CH&brewery=Y&sort=name'
ch_page1 = s.get(url_ch)

In [278]:
url_ch_gen = 'https://www.beeradvocate.com/place/list/?start={}&&c_id=CH&brewery=Y&sort=name'

In [281]:
ch_end = s.get(url_ch_gen.format(140))
soup_ch_end = BeautifulSoup(ch_end.content)
list_ch_end = soup_ch_end.find_all(href=lambda x: x and x.startswith('/beer/profile/'))
list_ch_end

[]

In [274]:
with open('test2.html','bw') as f:
    f.write(ch_end.content)

In [258]:
len(list_ch_end)

0

## List all CH breweries

In [298]:
base_url = 'https://www.beeradvocate.com'
def brewery_extract(row):
    cols = row.find_all('td')
    first_el = cols[0].find('a')
    link = base_url +first_el['href']
    brewery_name = first_el.text
    def treat_dash(text):
        if text=='-':
            return 'nan'
        else:
            return text
    ratings, nbeers = [treat_dash(x.text) for x in cols[-2:]]
    ratings = float(ratings)
    nbeers = int(nbeers)
    dic = dict(
        name = brewery_name, link = link,
        ratings = ratings, n_beers = nbeers
        )
    return dic
    

In [299]:
def treat_address(list_ad):
    ad = ''
    for el in list_ad:
        if el.name == 'br':
            ad += '\n'
        elif el.name == 'a':
            ad += el.get_text()
        else:
            ad += str(el).strip()
    ad = ad.split('\n\n')
    ad = list(map(lambda x: x.strip(), ad))
    if len(ad) == 2:
        address, tel = ad
    else:
        address = ad[0]
        tel = ''
    return dict(address=address, telephone=tel)

In [302]:
def breweries_from_page(page):
    table_rows = iter(page.find('table').find_all('tr'))
    breweries = []
    for row in table_rows:
        #columns = row.find_all('td')
        #table.append([el.text.rstrip() for el in columns])
        if row.find(href=lambda x: x and x.startswith('/beer/profile/')):
            brew_dic = brewery_extract(row)
            address_row = next(table_rows).find('td')
            list_ad = list(address_row.children)
            brew_dic.update(treat_address(list_ad))
            breweries.append(brew_dic)
    return breweries

In [303]:
s = requests.Session()
url_login = 'https://www.beeradvocate.com/community/login/login'

with open('ba_credentials','r') as f:
    login, password = f.read().splitlines() 
payload = {'login': login, 'password': password}
page = s.post(url_login, data=payload)

url_ch_gen = 'https://www.beeradvocate.com/place/list/?start={}&&c_id=CH&brewery=Y&sort=name'
start_n = 0
search = True
breweries_ch = []
while search:
    brew_page = BeautifulSoup(s.get(url_ch_gen.format(start_n)).content)
    more_breweries = brew_page.find_all(href=lambda x: x and x.startswith('/beer/profile/'))
    if len(more_breweries) >0 :
        breweries_ch += breweries_from_page(brew_page)
    else:
        search=False
    
    start_n += 20
    print(start_n)
    


20
40
60
80
100
120
140
160


In [304]:
breweries_ch_df = pd.DataFrame(breweries_ch)

In [305]:
breweries_ch_df

Unnamed: 0,name,link,ratings,n_beers,address,telephone
0,4655 Brewing Company,https://www.beeradvocate.com/beer/profile/41488/,,4,"Stüsslingen, 4655\nSwitzerland",
1,523,https://www.beeradvocate.com/beer/profile/44206/,3.79,2,"Sonnenweg 30\nKöniz, 3098\nSwitzerland",+41 77 450 7655
2,7Peaks Brasserie,https://www.beeradvocate.com/beer/profile/39040/,2.72,9,"Route de la Patinoire 15a\nMorgins, 1875\nSwit...",+41 78 797 1875
3,À tue-tête,https://www.beeradvocate.com/beer/profile/52843/,4.14,9,"Chemin de la Biole 10\nAigle, 1860\nSwitzerland",+41 79 235 56 49
4,Ahoi Bier,https://www.beeradvocate.com/beer/profile/42305/,3.67,4,"Schlieren, 8952,Switzerland",
...,...,...,...,...,...,...
135,Volta Bräu,https://www.beeradvocate.com/beer/profile/39036/,,2,"Voltastrasse 30\nBasel, 4056\nSwitzerland",+41 61 690 91 29
136,Wadi-Brau-Huss AG,https://www.beeradvocate.com/beer/profile/9378/,,0,"Florhofstrasse 13\nWadenswill, 8820\nSwitzerland",01 783 9392
137,Wartauer Seegal Bräu,https://www.beeradvocate.com/beer/profile/39074/,,8,"Hauptstrasse 33/35\nTrübbach, 9477\nSwitzerland",+41 78 878 19 52
138,Weedbeer,https://www.beeradvocate.com/beer/profile/32639/,3.00,1,"Chemin de l'Etraz 14\nOrbe, 1350\nSwitzerland",+41 21 312 68 21


In [241]:
with open('test2.html','bw') as f:
    f.write(ch_page1.content)

In [11]:
soup_ch1 = BeautifulSoup(ch_page1.content)

In [12]:
list_ch1 = soup_ch1.find_all(href=lambda x: x and x.startswith('/beer/profile/'))
list_ch1[0]['href']

'/beer/profile/41488/'

In [254]:
breweries

[{'name': '4655 Brewing Company',
  'link': 'https://www.beeradvocate.com/beer/profile/41488/',
  'ratings': 3.79,
  'n_beers': 2,
  'address': 'Stüsslingen, 4655\nSwitzerland',
  'telephone': ''},
 {'name': '523',
  'link': 'https://www.beeradvocate.com/beer/profile/44206/',
  'ratings': 3.79,
  'n_beers': 2,
  'address': 'Sonnenweg 30\nKöniz, 3098\nSwitzerland',
  'telephone': '+41 77 450 7655'},
 {'name': '7Peaks Brasserie',
  'link': 'https://www.beeradvocate.com/beer/profile/39040/',
  'ratings': 3.79,
  'n_beers': 2,
  'address': 'Route de la Patinoire 15a\nMorgins, 1875\nSwitzerland',
  'telephone': '+41 78 797 1875'},
 {'name': 'À tue-tête',
  'link': 'https://www.beeradvocate.com/beer/profile/52843/',
  'ratings': 3.79,
  'n_beers': 2,
  'address': 'Chemin de la Biole 10\nAigle, 1860\nSwitzerland',
  'telephone': '+41 79 235 56 49'},
 {'name': 'Ahoi Bier',
  'link': 'https://www.beeradvocate.com/beer/profile/42305/',
  'ratings': 3.79,
  'n_beers': 2,
  'address': 'Schlieren, 

In [167]:
table_rows[6].get_text(separator='\n')

'Sonnenweg 30\nKöniz\n, 3098\nSwitzerland\n+41 77 450 7655\nBrewery'

In [190]:
table_rows[6].find('td').get_text(separator='\n').split('\n')

['\xa0', 'Stüsslingen', ', 4655', 'Switzerland']

In [112]:
brew_row = table_rows[3].find_all('td')
brew = brew_row[0].find('a')
link = brew['href']
name = brew.text
(link, name)

('/beer/profile/41488/', '4655 Brewing Company')

In [231]:
from bs4 import NavigableString
table_rows = soup_ch1.find('table').find_all('tr')


address_row = table_rows[10].find('td')

list_ad = list(address_row.children)
treat_address(list_ad)


{'address': 'Chemin de la Biole 10\nAigle, 1860\nSwitzerland',
 'telephone': '+41 79 235 56 49'}

In [239]:
[x.text for x in soup_ch1.find('table').find_all('tr')[5].find_all('td')[-2:]]

['3.79', '2']

In [252]:
brewery_extract(soup_ch1.find('table').find_all('tr')[5])

{'name': '523',
 'link': 'https://www.beeradvocate.com/beer/profile/44206/',
 'ratings': 3.79,
 'n_beers': 2}

In [246]:
soup_ch1.find('table').find_all('tr')[5].find_all('td')[0]

<td align="left" colspan="2" valign="top"><a href="/beer/profile/44206/"><b>523</b></a></td>

In [39]:
table[5]

['523', '-', '-', '3.79', '2']

In [257]:
list_ch1_conv = list(map(brewery_extract, list_ch1))

In [260]:
url_base = 'https://www.beeradvocate.com'
link_brewery = url_base + list_ch1_conv[0][1]

In [263]:
brew_1 = s.get(link_brewery)
brew_1_s = BeautifulSoup(brew_1.content)

In [264]:
list_beers_brew_1 = brew_1_s.find_all(href=lambda x: x and x.startswith('/beer/profile/'))

In [265]:
list_beers_brew_1

[<a href="/beer/profile/41488/?view=beers&amp;show=arc">Archived (5)</a>,
 <a href="/beer/profile/41488/?view=beers&amp;show=all">All</a>,
 <a href="/beer/profile/41488/?view=beers&amp;show=feed">Feed</a>,
 <a href="/beer/profile/41488/219290/"><b>Black Swan Coffee Stout</b></a>,
 <a href="/beer/profile/41488/183565/"><b>Rain Drop Pale Ale</b></a>,
 <a href="/beer/profile/41488/196255/"><b>Retired Bear Amber Ale</b></a>,
 <a href="/beer/profile/41488/187713/"><b>White Swan Amarillo Wheat</b></a>,
 <a href="/beer/profile/41488/#XenForo"><i class="uix_icon uix_icon-jumpToTop"></i> <span class="uix_hide">Top</span></a>]

In [297]:
float('nan') + 3

nan