### Beer Recommender

This notebook uses a dataset of beer reviews to recommend similar beers according to a user's taste.

#### Imports

In [2]:
import numpy as np 
import pandas as pd

#### Loading the data and selecting the 300 most-reviewed beers
There are more than 1M reviews in this dataset. For this reason, we limit the number of beers; otherwise the notebook can easily run out of memory.

In [3]:
# Reviews are read from a local CSV. The full dataset is available at
# https://drive.google.com/file/d/1LatWjs4ghvv9xATkDeaSeDRoSObTwiAv/view?usp=sharing
# (alternative local path: 'data/beer_reviews/beer_reviews.csv').
df = pd.read_csv('data/beer_reviews.csv')

# Keep only the reviews of the beer names with the most reviews.
number_of_beers = 300
review_counts = df['beer_name'].value_counts()  # most-reviewed names first
top_n = review_counts.head(number_of_beers).index
is_top_beer = df['beer_name'].isin(top_n)
df = df.loc[is_top_beer]

#### Visualizing the top 5 rows

In [4]:
# Preview the first five rows of the filtered reviews.
df.head(5)

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
798,1075,Caldera Brewing Company,1212201268,4.5,4.5,4.0,grumpy,American Double / Imperial Stout,4.0,4.5,Imperial Stout,,42964
1228,9020,Yazoo Brewing Company,1224350360,4.0,4.0,3.0,Likeburning,Hefeweizen,4.0,3.5,Hefeweizen,5.0,20575
1559,11715,Destiny Brewing Company,1137124057,4.0,3.5,4.0,blitheringidiot,American Pale Ale (APA),3.5,3.5,Pale Ale,4.5,26420
1560,11715,Destiny Brewing Company,1129504403,4.0,2.5,4.0,NeroFiddled,American Pale Ale (APA),4.0,3.5,Pale Ale,4.5,26420
1563,11715,Destiny Brewing Company,1137125989,3.5,3.0,4.0,blitheringidiot,American IPA,4.0,4.0,IPA,,26132


#### Size of dataset with the specified number of beers

In [5]:
# Number of review rows remaining after the filter.
df.shape[0]

397448

#### Showing the unique beers of the dataset

In [6]:
# Distinct beer names present in the filtered reviews.
beer_names = df['beer_name']
beer_names.unique()

array(['Imperial Stout', 'Hefeweizen', 'Pale Ale', 'IPA', 'Oktoberfest',
       'Porter', 'Oatmeal Stout', 'Nut Brown Ale', "Founders Red's Rye PA",
       'B.O.R.I.S. The Crusher Oatmeal-Imperial Stout',
       'Founders Breakfast Stout', 'Founders Double Trouble',
       'Harvest Ale', 'Imperial IPA', 'Mocha Porter', 'India Pale Ale',
       'Christmas Ale', 'Founders Centennial IPA', 'Chocolate Stout',
       'Founders Devil Dancer', 'Founders Backwoods Bastard',
       'Founders Imperial Stout', 'Founders Dirty Bastard',
       'Founders KBS (Kentucky Breakfast Stout)', 'Founders Porter',
       'Heineken Lager Beer', 'Pilsner Urquell', 'Sierra Nevada Stout',
       'India Pale Ale (IPA)', 'Aecht Schlenkerla Rauchbier Märzen',
       'Sierra Nevada Kellerweis Hefeweizen',
       'Sierra Nevada Anniversary Ale (2007-2009)',
       'Sierra Nevada Southern Hemisphere Harvest Fresh Hop Ale',
       'Vanilla Porter', 'Sierra Nevada Summerfest Lager',
       'Sierra Nevada Celebration Al

In [7]:
# Optional check (disabled): uncomment to display the review rows of a single beer.
#df[df['beer_name'] == 'Budweiser']

#### Computing the pivot table: one row per reviewer and one column per beer, holding the reviewer's overall rating. A value of 0 means the reviewer has not rated that beer

In [8]:
# Reviewer-by-beer table of `review_overall` values; reviewer/beer
# combinations without a review are filled with 0.
# NOTE(review): the zero fill means later statistics treat "not rated" as a
# rating of 0 - confirm this is intended.
matrix = pd.pivot_table(
    df,
    index=['review_profilename'],
    columns=['beer_name'],
    values='review_overall',
).fillna(0)

In [9]:
# Preview the first five reviewers of the rating table.
matrix.head(5)

beer_name,#9,120 Minute IPA,1554 Enlightened Black Ale,60 Minute IPA,90 Minute IPA,A Little Sumpin' Sumpin' Ale,Adam,Aecht Schlenkerla Rauchbier Märzen,Alaskan Smoked Porter,AleSmith IPA,...,Weihenstephaner Korbinian,Westmalle Trappist Dubbel,Westmalle Trappist Tripel,Wisconsin Belgian Red,World Wide Stout,Yeti Imperial Stout,Young's Double Chocolate Stout,Yuengling Traditional Lager,YuleSmith (Summer),Éphémère (Apple)
review_profilename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0110x011,0.0,0.0,0.0,0.0,5.0,0.0,4.0,0.0,0.0,5.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,5.0,0.0
02maxima,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
03SVTCobra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
05Harley,0.0,4.0,0.0,0.0,4.0,4.5,4.0,0.0,0.0,4.0,...,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
0Naught0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


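#### Creating a correlation matrix that compares the ratings of each pair of beers. `min_periods=100` requires at least 100 paired observations for a pair of beers to receive a correlation value, to create a more consistent model (note: because unrated beers were filled with 0 above, every reviewer counts as an observation for every pair)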

In [10]:
# Pairwise Pearson correlation between the columns of `matrix`.
# NOTE(review): `min_periods` counts non-null observation pairs; if `matrix`
# contains no nulls (e.g. it was zero-filled), this threshold only has an
# effect when there are fewer than 100 rows - confirm the intended filter.
corr_matrix = matrix.corr(
    method='pearson',
    min_periods=100,
)

In [11]:
# Display the correlation table computed from `matrix`.
corr_matrix

beer_name,#9,120 Minute IPA,1554 Enlightened Black Ale,60 Minute IPA,90 Minute IPA,A Little Sumpin' Sumpin' Ale,Adam,Aecht Schlenkerla Rauchbier Märzen,Alaskan Smoked Porter,AleSmith IPA,...,Weihenstephaner Korbinian,Westmalle Trappist Dubbel,Westmalle Trappist Tripel,Wisconsin Belgian Red,World Wide Stout,Yeti Imperial Stout,Young's Double Chocolate Stout,Yuengling Traditional Lager,YuleSmith (Summer),Éphémère (Apple)
beer_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
#9,1.000000,0.224955,0.232418,0.337037,0.304790,0.199823,0.182306,0.217224,0.189856,0.188755,...,0.239105,0.192240,0.229970,0.173393,0.241162,0.253820,0.275878,0.306044,0.183323,0.277918
120 Minute IPA,0.224955,1.000000,0.204487,0.320038,0.347839,0.205198,0.247972,0.223051,0.260810,0.277803,...,0.241968,0.227943,0.251516,0.250448,0.377679,0.291127,0.222390,0.178749,0.272105,0.240785
1554 Enlightened Black Ale,0.232418,0.204487,1.000000,0.268608,0.255553,0.208299,0.234874,0.219023,0.281868,0.232516,...,0.259078,0.229446,0.229045,0.245552,0.220349,0.253920,0.267540,0.180040,0.222751,0.240387
60 Minute IPA,0.337037,0.320038,0.268608,1.000000,0.469774,0.268452,0.225423,0.274024,0.255407,0.266635,...,0.285391,0.260255,0.300449,0.236057,0.304056,0.338965,0.334076,0.269049,0.267225,0.296772
90 Minute IPA,0.304790,0.347839,0.255553,0.469774,1.000000,0.261886,0.254780,0.264697,0.275016,0.296233,...,0.295423,0.281009,0.327920,0.262839,0.349010,0.358259,0.318596,0.233231,0.306845,0.284356
A Little Sumpin' Sumpin' Ale,0.199823,0.205198,0.208299,0.268452,0.261886,1.000000,0.224383,0.181367,0.190139,0.233695,...,0.206367,0.168468,0.199286,0.184925,0.204591,0.268875,0.199376,0.167491,0.203457,0.173494
Adam,0.182306,0.247972,0.234874,0.225423,0.254780,0.224383,1.000000,0.238534,0.366440,0.389596,...,0.276316,0.244325,0.261756,0.346362,0.320427,0.324237,0.202086,0.148750,0.378360,0.213385
Aecht Schlenkerla Rauchbier Märzen,0.217224,0.223051,0.219023,0.274024,0.264697,0.181367,0.238534,1.000000,0.255350,0.213940,...,0.326400,0.298382,0.301717,0.194143,0.262137,0.264879,0.291931,0.179928,0.211977,0.283525
Alaskan Smoked Porter,0.189856,0.260810,0.281868,0.255407,0.275016,0.190139,0.366440,0.255350,1.000000,0.376811,...,0.277075,0.261927,0.255297,0.347153,0.295533,0.338121,0.253543,0.165412,0.373734,0.252864
AleSmith IPA,0.188755,0.277803,0.232516,0.266635,0.296233,0.233695,0.389596,0.213940,0.376811,1.000000,...,0.262169,0.237790,0.260506,0.362657,0.275771,0.332926,0.215738,0.150347,0.552436,0.215437


#### Compute the similarity of the selected beers to the rest of the entries, excluding beers that the user already likes

In [12]:
def get_similar_beers(arr_favorite_beers, corr=None, n=10):
    """Recommend the beers whose ratings correlate most with the favorites.

    Every beer is scored by summing its correlation with each favorite beer.
    Beers are ranked by that score, highest first, and the favorites
    themselves are removed from the result.

    Parameters
    ----------
    arr_favorite_beers : str or iterable of str
        Name(s) of the beers the user likes. Each name must be a column of
        the correlation table.
    corr : pandas.DataFrame, optional
        Beer-by-beer correlation table (same labels on rows and columns).
        Defaults to the notebook-level ``corr_matrix``.
    n : int, optional
        Maximum number of recommendations to return (default 10).

    Returns
    -------
    list
        Up to ``n`` beer names, most similar first. Empty when no favorite
        beers are given.

    Raises
    ------
    KeyError
        If a favorite beer is not a column of the correlation table.
    """
    if corr is None:
        # Fall back to the table built earlier in the notebook.
        corr = corr_matrix

    # Accept a single name as a convenience; a bare string would otherwise
    # select a Series, which has no axis=1 to sum over.
    if isinstance(arr_favorite_beers, str):
        arr_favorite_beers = [arr_favorite_beers]
    favorites = list(arr_favorite_beers)

    # Without favorites there is nothing to base a recommendation on.
    if not favorites:
        return []

    unknown = [beer for beer in favorites if beer not in corr.columns]
    if unknown:
        raise KeyError(
            "Beers not found in the correlation matrix: {}".format(unknown)
        )

    scores = corr[favorites].sum(axis=1).sort_values(ascending=False)

    # Filter on this call's own favorites rather than on a notebook-level
    # variable, so the result depends only on the arguments.
    candidates = scores[~scores.index.isin(favorites)]

    return candidates.index[:n].tolist()

#### List of beers I like and for which I want similar recommendations

In [13]:
# Beers used as the seed for the recommendation.
# NOTE(review): names presumably must match the `beer_name` labels of the correlation table exactly - confirm.
favorite_beers = ["Budweiser"]

In [14]:
# Rank the remaining beers by their similarity to the favorites.
ranked_beers = get_similar_beers(arr_favorite_beers=favorite_beers)

#### Printing the most similar beers to the ones I selected, based on the reviews of the other items

In [15]:
# Print the recommendations as a 1-based ranked list. enumerate() supplies the
# rank directly, which avoids using `_` as a loop index (IPython uses `_` for
# the last cell output).
for rank, beer in enumerate(ranked_beers, start=1):
    print("{}. {}".format(rank, beer))

1. Bud Light
2. Heineken Lager Beer
3. Samuel Adams Boston Lager
4. Miller High Life
5. Corona Extra
6. Pabst Blue Ribbon (PBR)
7. Coors Light
8. Guinness Draught
9. Red Stripe Jamaican Lager
10. Miller Lite
