In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [None]:
data_path = './../../../data'
advocate_dir = os.path.join(data_path, 'BeerAdvocate')
stems_ba = pd.read_pickle(os.path.join(advocate_dir, 'reviews_with_exp_stems.pkl'))

In [2]:
expert_terms = {
    "flavor": ['caramel', 'toffee', 'bready', 'biscuity', 'nutty', 'roasted', 'chocolate', 'coffee', 'mocha','molasses','syrupy','smokey','toasted','vanilla','cocoa','brown sugar',
              'hoppy', 'citrusy', 'tropical', 'floral', 'piney', 'resinous', 'herbal', 'grassy', 'earthy', 'dank', 'spicy', 'peppery', 'juicy', 'zesty', 'tangy', 'sharp', 'resin', 'bitterness',
               'estery',' fruity', 'berry',' cherry',' apple',' banana', 'pear', 'stone fruit', 'plum', 'fig', 'raisin', 'peach',' apricot', 'dark fruit', 'citrus zest', 'lemon', 'orange peel', 'pineapple', 'mango', 'lychee',
               'oaky', 'woody', 'bourbon', 'whiskey', 'rye', 'tequila', 'brandy','vinous', 'sour', 'tart', 'acidic', 'lacto', 'brett', 'farmhouse', 'honeyed', 'clove', 'bubblegum', 'funky', 'barnyard'], 
    "aroma": ['bouquet', 'nose', 'aromatic', 'fragrant','perfumed', 'faint', 'musky', 'dank', 'subtle', 'fruity', 'floral', 'yeasty', 'clean', 'malty', 'crisp', 'pungent', 'spicy', 'smokey', 'earthy'],  
    "mouthfeel": [ 'body','full-bodied', 'medium-bodied', 'light-bodied', 'thick', 'thin', 'mouthfeel', 'creamy', 'smooth', 'velvety', 'oily', 'astringent','tannic', 'chalky', 'drying', 'slick', 'watery', 'effervescent', 'fizzy', 'tingly', 'prickly', 'carbonation', 'viscosity', 'warming', 'coating', 'biting', 'sharp'],
    "brewing": [ 'dry hopping', 'double dry hopping', 'barrel-aged', 'bottle-conditioned', 'open fermentation', 'secondary fermentation', 'wort', 'mash, sparging', 'cold crashing', 'decoction', 'conditioning', 'lacto', 'yeast strain', 'adjuncts', 'grains', 'malt', 'specialty grains'],
    "technical": [ 'balance', 'complexity', 'depth', 'layers', 'nuanced', 'refined', 'structured', 'profile', 'round', 'harmonious', 'clean', 'crisp', 'finish', 'lingering', 'evolving', 'sharp', 'clarity', 'purity', 'dense', 'robust', 'powerful', 'light', 'restrained', 'vibrant', 'subdued', 'heavy', 'integrity', 'layered', 'exemplar', 'benchmark', 'classic', 'signature style', 'finesse', 'elegance'],
    "appearance": ['hazy', 'cloudy', 'opaque', 'translucent', 'clear', 'bright', 'unfiltered', 'filtered', 'lacing', 'foam', 'frothy', 'rocky head', 'stable head', 'head retention', 'appearance', 'golden', 'amber', 'ruby', 'copper', 'dark', 'black', 'mahogany', 'light', 'straw', 'yellow', 'chestnut', 'tan', 'off-white head' ],
    "judgment": [ 'exemplar', 'well-integrated', 'restrained', 'harmonious', 'round', 'classic example', 'benchmark', 'flawless', 'exceptional', 'outstanding', 'world-class', 'traditional', 'innovative', 'unconventional', 'unique', 'reference point', 'nuanced' ],
    "off_flavors": [ 'oxidized', 'metallic', 'cardboard', 'stale', 'sulfur', 'skunky', 'diacetyl', 'astringent', 'phenolic', 'acetaldehyde', 'DMS', 'cloying', 'solvent-like', 'overly bitter', 'thin', 'harsh', 'vegetal', 'buttery', 'musty, moldy', 'medicinal' ],
    "miscellaneous": [ 'initial impression', 'mid-palate', 'aftertaste', 'finish', 'lingering', 'mouth-coating', 'evolving flavor', 'balanced start', 'unfolding', 'developing', 'peak', 'bright finish', 'dry finish', 'clean ending', 'reminiscent', 'similar to', 'akin to', 'comparable', 'surpasses', 'diverges from', 'evokes', 'hints of', 'resembles', 'distinct from', 'notes of', 'echoes' ]
}

In [3]:
expert_terms_stemmed = {}
for category, terms in expert_terms.items():
    tokens = [word_tokenize(term.lower()) for term in terms]
    
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word[0]) for word in tokens]
    expert_terms_stemmed[category] =  stemmed_tokens

In [4]:
def exp_term_score(text_tokens):
    scores = {category: sum(1 for term in text_tokens if term in terms)
              for category, terms in expert_terms_stemmed.items()}
    scores["expertness_score"] = sum(scores.values())
    return scores

def score_df(df):
    scores_df = df["stems"].map(exp_term_score).apply(pd.Series)
    
    # Drop intermediate column and merge results
    return pd.concat([df, scores_df], axis=1)


In [24]:
reviews_Stout = stems_ba[stems_ba['beer_id']==11757]

In [25]:
import time

start_time = time.time()

df_stout = score_df(reviews_Stout)
print("--- %s seconds ---" % (time.time() - start_time))

--- 0.21785283088684082 seconds ---


In [30]:
import time

start_time = time.time()

df_ba = score_df(stems_ba)
print("--- %s seconds ---" % (time.time() - start_time))

--- 515.0353217124939 seconds ---


In [31]:
df_ba.to_pickle(os.path.join(advocate_dir, 'rev_w_scores.pkl'))
df_ba.head()

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,date,user_name,user_id,appearance,...,flavor,aroma,mouthfeel,brewing,technical,appearance.1,judgment,off_flavors,miscellaneous,expertness_score
0,Régab,142544,Societe des Brasseries du Gabon (SOBRAGA),37262,Euro Pale Lager,4.5,2015-08-20,nmann08,nmann08.184925,3.25,...,2,3,2,2,0,3,0,0,1,13
1,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2009-02-20,StJamesGate,stjamesgate.163714,3.0,...,3,1,2,2,0,4,0,1,1,14
2,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2006-03-13,mdagnew,mdagnew.19527,4.0,...,10,3,6,1,3,4,0,1,3,31
3,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-12-01,helloloser12345,helloloser12345.10867,4.0,...,3,1,0,0,1,5,0,0,2,12
4,Barelegs Brew,19590,Strangford Lough Brewing Company Ltd,10093,English Pale Ale,4.5,2004-08-30,cypressbob,cypressbob.3708,4.0,...,2,1,1,1,2,6,0,0,1,14


In [32]:
del df_ba

In [None]:
data_path = './../../../data'
rb_dir = os.path.join(data_path, 'RateBeer')
stems_rb = pd.read_pickle(os.path.join(rb_dir, 'reviews_with_exp_stems.pkl'))

In [6]:
import time
start_time = time.time()

df_rb = score_df(stems_rb)
print("--- %s seconds ---" % (time.time() - start_time))

--- 1682.2388100624084 seconds ---


In [7]:
df_rb.to_pickle(os.path.join(rb_dir, 'rev_w_scores.pkl'))
df_rb.head()

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,date,user_name,user_id,appearance,...,flavor,aroma,mouthfeel,brewing,technical,appearance.1,judgment,off_flavors,miscellaneous,expertness_score
4,Castel Beer (Gabon),105273,Sobraga,3198,Pale Lager,5.2,2015-10-23,kevnic2008,122778,2.0,...,0,1,2,0,3,3,0,0,1,10
5,Castel Beer (Gabon),105273,Sobraga,3198,Pale Lager,5.2,2015-10-12,Beerhunter111,227834,2.0,...,0,0,1,2,1,3,0,0,1,8
6,Castel Beer (Gabon),105273,Sobraga,3198,Pale Lager,5.2,2015-10-07,Erzengel,83106,4.0,...,1,1,2,0,0,0,0,0,1,5
8,Castel Beer (Gabon),105273,Sobraga,3198,Pale Lager,5.2,2014-05-29,Lowenbrau,37316,3.0,...,0,1,2,1,1,5,0,1,1,12
9,Castel Beer (Gabon),105273,Sobraga,3198,Pale Lager,5.2,2013-07-18,seynie,75452,2.0,...,0,1,1,0,1,3,0,0,0,6


In [8]:
df_rb

Unnamed: 0,beer_name,beer_id,brewery_name,brewery_id,style,abv,date,user_name,user_id,appearance,...,flavor,aroma,mouthfeel,brewing,technical,appearance.1,judgment,off_flavors,miscellaneous,expertness_score
4,Castel Beer (Gabon),105273,Sobraga,3198,Pale Lager,5.2,2015-10-23,kevnic2008,122778,2.0,...,0,1,2,0,3,3,0,0,1,10
5,Castel Beer (Gabon),105273,Sobraga,3198,Pale Lager,5.2,2015-10-12,Beerhunter111,227834,2.0,...,0,0,1,2,1,3,0,0,1,8
6,Castel Beer (Gabon),105273,Sobraga,3198,Pale Lager,5.2,2015-10-07,Erzengel,83106,4.0,...,1,1,2,0,0,0,0,0,1,5
8,Castel Beer (Gabon),105273,Sobraga,3198,Pale Lager,5.2,2014-05-29,Lowenbrau,37316,3.0,...,0,1,2,1,1,5,0,1,1,12
9,Castel Beer (Gabon),105273,Sobraga,3198,Pale Lager,5.2,2013-07-18,seynie,75452,2.0,...,0,1,1,0,1,3,0,0,0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7122044,Svejk Blonde,220897,Svejk Beer Garden,17155,Pale Lager,,2014-09-18,Travlr,83882,3.0,...,0,1,0,0,0,5,0,0,0,6
7122045,Svejk Blonde,220897,Svejk Beer Garden,17155,Pale Lager,,2013-12-01,TBone,10233,2.0,...,0,2,1,0,3,3,0,0,2,11
7122046,Svejk Dark,220898,Svejk Beer Garden,17155,Dunkel/Tmavý,,2014-11-04,Rob_D_UK,257161,3.0,...,4,0,0,0,0,2,0,0,0,6
7122047,Svejk Dark,220898,Svejk Beer Garden,17155,Dunkel/Tmavý,,2014-09-16,Travlr,83882,3.0,...,3,0,2,0,1,4,0,1,0,11
