In [1]:
from bs4 import BeautifulSoup
from collections import namedtuple
import requests
import re
import pandas as pd
import threading

In [2]:
# We define a namedtuple for the beer instead of a class
Beer = namedtuple('Beer', ['beer_name', 'brewery', 'ba_score', 'ranking', 'reviews', 'ratings', 'pdev',
                           'wants', 'gots', 'trade', 'location', 'style', 'alcohol', 'availability',
                           'note_desc'])

In [3]:
def scrape_beer_page(page_url):
    """
    THe function scrapes the beer information from a beer page. 
    
    :param page_url: beer page to scrape
    :return: beer object which contains the scraped information
    """

    page_response = requests.get(page_url, timeout=5)

    full_soup = BeautifulSoup(page_response.content, 'lxml')

    title = full_soup.find('div', class_='titleBar')

    beer_name = list(title.h1.strings)[0].strip()
    brewery = title.span.string.replace('|', '').strip()

    ba_content = full_soup.find(id='ba-content')
    ba_score = ba_content.find(id='score_box').find('span', class_='ba-ravg').string

    item_stats_content = ba_content.find(id='item_stats')

    stats = ['Ranking', 'Reviews', 'Ratings', 'pDev', 'Wants', 'Gots', 'Trade']

    for elem in item_stats_content.find_all('dt'):
        stat_name = elem.next_element.string.replace(':', '')
        stat_value = elem.find_next('dd').string.strip()
        if stat_name == 'Ranking':
            ranking = stat_value
        elif stat_name == 'Reviews':
            reviews = stat_value
        elif stat_name == 'Ratings':
            ratings = stat_value
        elif stat_name == 'pDev':
            pdev = stat_value
        elif stat_name == 'Wants':
            wants = stat_value
        elif stat_name == 'Gots':
            gots = stat_value
        elif stat_name == 'Trade':
            trade = stat_value

    # Info-box
    info_box = full_soup.find(id='info_box')

    location_marker = info_box.find(string=re.compile("Brewed by"))
    location = location_marker.find_next('a').find_next('a').string

    style_marker = info_box.find('b', string='Style:')
    style = style_marker.find_next('a').string.strip()

    alc_marker = info_box.find('b', string=re.compile("Alcohol by"))
    alcohol = alc_marker.next_sibling.strip()

    avail_marker = info_box.find('b', string=re.compile("Availability"))
    availability = avail_marker.next_sibling.strip()

    note_marker = info_box.find('b', string=re.compile("Notes / Commercial"))
    note_desc = note_marker.find_next('br').next_element
    
    return Beer(beer_name, brewery, ba_score, ranking, reviews, ratings, pdev,
                wants, gots, trade, location, style, alcohol, availability,
                note_desc)

In [6]:
page_url = 'https://www.beeradvocate.com/beer/profile/16043/112329/'
beer = scrape_beer_page(page_url)
print(beer)

AttributeError: 'NoneType' object has no attribute 'string'

In [7]:
def scrape_beer_urls(page_url):
    """
    This function gets the urls from a list of beer urls and puts them in a list.
    """
    page_response = requests.get(page_url, timeout=5)

    full_soup = BeautifulSoup(page_response.content, 'lxml')

    beer_table = full_soup.find('div', id='ba-content').table

    def is_a_and_parent_is_td(tag):
        return tag.parent.name == 'td' and tag.name == 'a'

    all_beers = beer_table.find_all(is_a_and_parent_is_td)
    beer_urls = []

    for beer_tag in all_beers:
        beer_urls.append('https://www.beeradvocate.com' + beer_tag.get('href'))
        print('https://www.beeradvocate.com' + beer_tag.get('href'))

    return beer_urls

In [8]:
url = 'https://www.beeradvocate.com/lists/top/'
beer_urls = scrape_beer_urls(url)

https://www.beeradvocate.com/beer/profile/23222/78820/
https://www.beeradvocate.com/beer/profile/26/42349/
https://www.beeradvocate.com/beer/profile/17981/110635/
https://www.beeradvocate.com/beer/profile/25888/87246/
https://www.beeradvocate.com/beer/profile/46317/16814/
https://www.beeradvocate.com/beer/profile/863/21690/
https://www.beeradvocate.com/beer/profile/23222/76421/
https://www.beeradvocate.com/beer/profile/28743/87846/
https://www.beeradvocate.com/beer/profile/28743/237238/
https://www.beeradvocate.com/beer/profile/23222/162502/
https://www.beeradvocate.com/beer/profile/28743/146770/
https://www.beeradvocate.com/beer/profile/1199/47658/
https://www.beeradvocate.com/beer/profile/33824/172669/
https://www.beeradvocate.com/beer/profile/863/7971/
https://www.beeradvocate.com/beer/profile/17981/56764/
https://www.beeradvocate.com/beer/profile/23222/78660/
https://www.beeradvocate.com/beer/profile/388/5281/
https://www.beeradvocate.com/beer/profile/28743/207976/
https://www.beer

In [10]:
all_beers = []
unique_beers = []
beers = list(unique_beers)

# For each beer page within start and end index pull the info and append it
def scrape_all_beers(beers, start, end):
    for beer in beers[start:end]:
        try:
            my_beer = scrape_beer_page(beer)
            all_beers.append(my_beer)
        except Exception:
            print('error with item')

            
# This utilizes multi-threading to start scraping all the beer pages from the list we got earlier
def split_processing(beers, num_splits=40):
    split_size = len(beers) // num_splits
    threads = []
    for i in range(num_splits):
        # determine the indices of the list this thread will handle
        start = i * split_size
        # special case on the last chunk to account for uneven splits
        end = None if i+1 == num_splits else (i+1) * split_size
        # create the thread
        threads.append(
            threading.Thread(target=scrape_all_beers, args=(beers, start, end))
        )
        threads[-1].start() # start the thread we just created

    # wait for all threads to finish                                            
    for t in threads:
        t.join()

split_processing(beers)

In [11]:
# This page writes the beer information into a csv file
col_names = Beer._fields
beer_data = pd.DataFrame.from_records(all_beers, columns = col_names)
beer_data.to_csv('beer_advocate.csv', index=False)