In [19]:
import pandas as pd
import numpy as np
import string
import re
import jellyfish
import warnings
import json
from scipy.stats import rankdata
file_path = "C:/Users/mnest/Google Drive/RWD/Beer_Advocate/"
warnings.filterwarnings('ignore')

In [9]:
#a little spell checker never hurt anyone
def amatch(a, b):
    best_match_list = [jellyfish.jaro_distance(a.lower(),c.lower()) for c in b]
    return(b[np.argmax(best_match_list)])

In [10]:
beer_brew_info_df = pd.read_csv(file_path + "beer_brew_df.csv", header = 0)
beer_brew_info_df = beer_brew_info_df.loc[beer_brew_info_df['beer_ratings_tot'] > 200, ]

beer_styles = [re.sub(string=beer_style.lower(), 
                      pattern="[^\w]|[^a-z]", repl="") for beer_style in beer_brew_info_df['beer_style'].unique()]

In [11]:
def get_beer_review_info(beer_style):
    x = pd.read_csv(file_path + "beer_review_data_1000/beer_ratings_%s.csv" % beer_style, 
                    names= ['look', 'smell', 'taste', 'feel', 'overall', 'beer_review', 'username', 'review_date', 'beer_id'])
    x = x.merge(beer_brew_info_df[['beer_id', 'beer', 'brewery', 'beer_abv', 'beer_style_new','beer_bros']], on = "beer_id", how = "inner")
    return(x)

In [12]:
beer_review_list = [get_beer_review_info(beer_style) for beer_style in beer_styles]

In [31]:
#if you prefer to work with the dictionary...
beer_style_dict = {}
tmp = zip(beer_styles, beer_review_list)
for beer_style, df in tmp:
    beer_style_dict[beer_style] = df

In [13]:
#if you prefer to work with everything in one nice data.frame
beer_review_df = pd.concat(beer_review_list)

In [14]:
def common_reviews_by_beer_name(beername1, beername2):
    beer1_id = beer_review_df.loc[beer_review_df['beer'] == beername1, ]['username']
    beer2_id = beer_review_df.loc[beer_review_df['beer'] == beername2, ]['username']
    same_reviewers = set(beer1_id).intersection(set(beer2_id))
    if len(same_reviewers) <= 0:
        same_reviewers = np.NAN
    return(same_reviewers)

In [15]:
def get_review_stats(beername, usernames):
    beername = [beername]
    #metrics = ["look", "smell", "taste", "feel", "overall"]
    beer_df = beer_review_df.loc[(beer_review_df['beer'].isin(beername)) & (beer_review_df['username'].isin(usernames)), ]
    beer_df = beer_df.sort_values("username").drop_duplicates()
    return(beer_df)

In [16]:
def beer_similarity(beer1, beer2):
    metrics = ["look", "smell", "taste", "feel", "overall"]
    common_users = common_reviews_by_beer_name(beer1, beer2)
    #if(any(np.isnan(common_users))):
        #return np.NAN
    try:
        beer1_reviews = get_review_stats(beer1, common_users)
        beer2_reviews = get_review_stats(beer2, common_users)
    except:
        return(np.NAN)
    beer_relationships = []
    for metric in metrics:
        tmp = pd.concat([beer1_reviews[metric].to_frame().reset_index(drop=True), 
                         beer2_reviews[metric].to_frame().reset_index(drop=True)], axis=1)
        
        tmp.columns = [metric + "1", metric + "2"]
        beer_relationships.append(tmp[metric+"1"].corr(tmp[metric+"2"]))
        
    weights = np.array([1.06, 1.24, 1.40, 1.10, 1.20])
    return(np.sum(weights * beer_relationships))

In [17]:
def beer_recommend(beer_you_tried, beer_style_to_try):
    beer_you_tried = amatch(beer_you_tried, beer_review_df['beer'].unique())
    beer_style_to_try = amatch(beer_style_to_try, beer_review_df['beer_style_new'].unique())
    #print(beer_you_tried)
    beer_you_tried_df = beer_brew_info_df.loc[beer_brew_info_df['beer'] == beer_you_tried, ]
    
    beer_style_df = beer_review_df.loc[(beer_review_df['beer_style_new'] == beer_style_to_try)
                                       & (beer_review_df['beer'] != beer_you_tried)
                                       & (beer_review_df['beer_bros'] >= float(85))
                                       & (np.abs(beer_review_df['beer_abv'] - float(beer_you_tried_df['beer_abv'])) <= 2), ]
    #print("total beers being scored is: %d" % len(beer_style_df.index))
    if(len(beer_style_df.index) < 1):
        return(np.NAN)
    beer_scores = np.array([beer_similarity(beer_you_tried, b) for b in beer_style_df['beer'].unique()])
    beer_scores_top_5_bool = rankdata(-beer_scores, method = "ordinal") < 6
    if(any(np.isnan(beer_scores[beer_scores_top_5_bool])) or len(beer_scores[beer_scores_top_5_bool]) < 5):
        return(np.NAN)
    best_beer = np.array(beer_style_df['beer_id'].unique())[beer_scores_top_5_bool]
    best_beer_df = beer_brew_info_df.loc[beer_brew_info_df['beer_id'].isin(best_beer), ]
    best_beer_df.insert(0, "beer_sim", beer_scores[beer_scores_top_5_bool])
    best_beer_df = best_beer_df.sort_values("beer_sim", ascending = False)
    return(dict(zip(list(best_beer_df['beer_id']), list(best_beer_df['beer_sim']))))

In [14]:
beer_recommend("Bell's Amber Ale", "stout")

{'113/571/': 1.0088234228291328,
 '119/179874/': 2.2583762898011903,
 '209/752/': 1.0617765550688236,
 '402/1192/': 1.9464140313675113,
 '590/2810/': 0.87299082021945185}

In [18]:
beers_to_score = list(beer_brew_info_df.sort("beer_ratings_tot", ascending=False)['beer'])[1001:3000]
beer_style_distinct = np.array(beer_brew_info_df['beer_style_new'].unique())
beer_style_list = list(set(beer_style_distinct).intersection(set(["IPA", "Light Beer", "Lager", "Stout", "Flavored", "Pilsner", "Malt Liquor", "Porter", "Red Ale", "Blonde Ale", "Black Ale", "Brown Ale", "Wheat Ale", "Pale Ale", "Wild Ale"]))) 
beer_dict = {}
i=0
for beer in beers_to_score:
    i+=1
    if(i%100 == 0):
        print(i, end = ' ')
    beer_dict[beer] = {}
    for beer_style in beer_style_list:
        try:
            beer_dict[beer][beer_style] = beer_recommend(beer, beer_style)
        except:
            pass
        
        

100 200 300 400 500 600 700 800 900 1000 

In [22]:
with open(file_path + 'beer_json.json', 'w') as fp:
    json.dump(beer_dict, fp)

In [13]:
np.array(beer_brew_info_df['beer'])[57]

'Pursuit Of Hoppiness'