In [1]:
from os import path
from datetime import datetime
import string
import re
import warnings
warnings.filterwarnings('ignore')
import urllib.request
from itertools import compress
import time
import random

import lxml.html as LH
import requests
from bs4 import BeautifulSoup as bs

import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist

def splitDftoDict(df, split_col):
    dict_to_return = {}
    for element in df[split_col].unique():
        dict_to_return[element] = df[df[split_col]==element]
    return dict_to_return

In [2]:
url = "https://www.beeradvocate.com/beer/styles/"
r = requests.get(url)
all_tags = bs(r.content, "html.parser")

In [3]:
beer_style_links = [x.get('href') for x in all_tags.findAll('a', attrs={'href': re.compile("/beer/styles/\d{1,}/")})]
beer_style_names = [x.text for x in all_tags.findAll('a', attrs={'href': re.compile("/beer/styles/\d{1,}/")})]

In [7]:
def get_beer_style_info(beer_style, beer_link, page=0):
    print(beer_style, end=' ')
    time.sleep(abs(random.normalvariate(2,0.5)))
    beer_style_url = "https://www.beeradvocate.com{beer_style}?sort=revsD&start={page}".format(beer_style=beer_link, page=0)
    r = requests.get(beer_style_url)
    all_tags = bs(r.content, "html.parser")
    
    beer_links = [x.get('href') for x in all_tags.findAll('a', attrs={'href': re.compile("/beer/profile/\d{1,}/")})][::2]
    tbl = [x.text.strip() for x in all_tags.findAll('td', attrs={'class' : re.compile("hr_bottom_light")})]
    beers = tbl[::6]
    brewery = tbl[1:][::6]
    abv = tbl[2:][::6]
    ratings = [x.replace(",", "") for x in tbl[3:][::6]]
    score = tbl[4:][::6]
    beer_style_name = [beer_style] * len(beers)
    beer_style_name_clean = [re.sub(string=beer_style, 
                                    pattern="[(]|[)]", 
                                    repl="").strip().replace(" / ", "_").replace(" ", "_").lower().strip()] * len(beers)
    
    df = pd.DataFrame.from_dict({"beer":beers,
                   "brewery":brewery, 
                   "abv":abv, 
                   "ratings":ratings, 
                   "score":score,
                   "link" : beer_links,
                   "beer_style" : beer_style_name,
                   "beer_style_clean" : beer_style_name_clean})
    return df

In [8]:
beer_style_info_lists = [get_beer_style_info(n, l) for n,l in zip(beer_style_names, beer_style_links)]
beer_df = pd.concat(beer_style_info_lists, axis=0)

German Bock German Doppelbock German Eisbock German Maibock German Weizenbock American Brown Ale English Brown Ale English Dark Mild Ale German Altbier American Black Ale Belgian Dark Ale Belgian Dubbel German Roggenbier Scottish Ale Winter Warmer American Amber / Red Lager European Dark Lager German Märzen / Oktoberfest German Rauchbier German Schwarzbier Munich Dunkel Lager Vienna Lager American Cream Ale Bière de Champagne / Bière Brut Braggot California Common / Steam Beer American Brut IPA American Imperial IPA American IPA Belgian IPA English India Pale Ale (IPA) New England IPA American Amber / Red Ale American Blonde Ale American Pale Ale (APA) Belgian Blonde Ale  Belgian Pale Ale Belgian Saison English Bitter English Extra Special / Strong Bitter (ESB) English Pale Ale English Pale Mild Ale French Bière de Garde German Kölsch Irish Red Ale American Adjunct Lager American Imperial Pilsner American Lager American Light Lager American Malt Liquor Bohemian Pilsener European Export

In [9]:
beer_df.to_gbq(project_id='scarlet-labs', destination_table="beer.beer_info_master_table", if_exists="replace", verbose=False)

In [2]:
beer_df = pd.read_gbq(project_id='scarlet-labs', 
                      query="select * from `scarlet-labs.beer.beer_info_master_table` order by ratings desc", 
                      dialect='standard')

Requesting query... ok.
Query running...
Query done.
Cache hit.

Retrieving results...
Got 5516 rows.

Total time taken 1.34 s.
Finished at 2018-12-11 21:45:52.


In [3]:
beer_df.head()

Unnamed: 0,abv,beer,beer_style,brewery,link,ratings,score
0,6.7,Hazy Little Thing IPA,New England IPA,Sierra Nevada Brewing Co.,/beer/profile/140/317146/,999,3.97
1,5.2,Karma Belgian Style Pale Ale,Belgian Pale Ale,Avery Brewing Company,/beer/profile/30/23042/,997,3.4
2,5.1,Tire Bite Golden Ale,American Blonde Ale,Flying Dog Brewery,/beer/profile/68/2151/,997,3.0
3,7.5,Agnus Dei (Abbey Pale Ale),Belgian Tripel,Brouwerij Corsendonk,/beer/profile/41/139/,996,3.97
4,4.7,Sea Dog Blueberry Wheat Ale,Fruit and Field Beer,Sea Dog Brewing Company,/beer/profile/137/345/,996,3.44


In [4]:
def get_beer_vector(beer_link, ratings):
    total_pages = min([12, round(int(ratings)/25)-1])
    df_list = []
    for page in range(0,total_pages):
        time.sleep(abs(random.normalvariate(1,0.25)))
        url = "https://www.beeradvocate.com{beer_link}?view=beer&sort=&start={page}".format(beer_link=beer_link, page=page)
        r = requests.get(url)
        all_tags = bs(r.content, "html.parser")
        taste_list = [x for i,x in enumerate([x.text.strip() 
                                   for x in all_tags.findAll('span', attrs={'class' : re.compile("muted")})]) 
           if re.search(pattern="[|]", string=x)]
        
        df_page = pd.DataFrame([[float(x.split(":")[1]) for x in vector.split("|")] 
                       for vector in taste_list], columns=["look", "smell", "taste", "feel", "overall"])
        df_list.append(df_page)
        page += 25
    
    vectors_to_matrix = pd.concat(df_list, axis=0)
    beer_vector = vectors_to_matrix.mean().to_frame().transpose()
    
    beer_vector['records'] = vectors_to_matrix.shape[0]
    beer_vector['link'] = beer_link
    
    return beer_vector

In [21]:
beer_style_dict = splitDftoDict(df=beer_df, split_col="beer_style")

In [23]:
def get_beer_style_recommendations(df):
    beer_vector_list = [get_beer_vector(beer_link=x, ratings=y) for x,y in df[['link', 'ratings']].values.tolist()]
    beer_vector_df = pd.concat(beer_vector_list, axis=0)
    
    a = beer_vector_df[["look", "smell", "taste", "feel", "overall", "records"]]
    d = pd.DataFrame(cdist(a, a), columns=beer_vector_df['link'])
    d_ranked = d.rank(axis=1).transpose()
    rec_df = pd.DataFrame([{"link":d_ranked.index[d_ranked.iloc[:,col] == 1].values[0], 
                   "recommendations":list(d_ranked.index[(d_ranked.iloc[:,col] > 1) & (d_ranked.iloc[:,col] < 5)].values)} 
     for col in list(range(len(d.columns)))])

    rec_df_wide = rec_df.recommendations.apply(pd.Series).merge(rec_df, left_index=True, right_index=True).drop(["recommendations"], axis=1)
    rec_df_wide.columns = ["rec1", "rec2", "rec3", "link"]
    return rec_df_wide

In [24]:
tmp_df = get_beer_style_recommendations(beer_style_dict['American Barleywine'])

In [5]:
beer_vector_list = [get_beer_vector(beer_link=x, ratings=y) for x,y in beer_df[['link', 'ratings']].iloc[0:10,:].values.tolist()]

In [6]:
beer_vector_df = pd.concat(beer_vector_list, axis=0)

In [7]:
a = beer_vector_df[["look", "smell", "taste", "feel", "overall", "records"]]
d = pd.DataFrame(cdist(a, a), columns=beer_vector_df['link'])
d_ranked = d.rank(axis=1).transpose()
rec_df = pd.DataFrame([{"link":d_ranked.index[d_ranked.iloc[:,col] == 1].values[0], 
               "recommendations":list(d_ranked.index[(d_ranked.iloc[:,col] > 1) & (d_ranked.iloc[:,col] < 5)].values)} 
 for col in list(range(len(d.columns)))])

rec_df_wide = rec_df.recommendations.apply(pd.Series).merge(rec_df, left_index=True, right_index=True).drop(["recommendations"], axis=1)
rec_df_wide.columns = ["rec1", "rec2", "rec3", "link"]

In [11]:
rec_df_wide.to_gbq(project_id='scarlet-labs', destination_table="beer.beer_recommendations", if_exists="replace", verbose=False)