In [1]:
# import dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the beer reviews CSV, then discard every row that has at least one
# missing value and every row that is an exact duplicate of an earlier one.
beer_df = (
    pd.read_csv('https://query.data.world/s/nuub3qupegsd33g3nimifjpajqeq2o')
    .dropna()
    .drop_duplicates()
)


In [3]:
# Sanity check: True if any cell of beer_df is still missing
beer_df.isna().to_numpy().any()

False

In [4]:
# Add a categorical beer_strength feature by binning beer_abv.
# Bins: [0, 1.9] Light, (1.9, 3.9] Mid, (3.9, 6] Full, (6, 9] Heavy,
# (9, 58] Very Strong.
# include_lowest=True closes the first bin on the left, so a beer_abv of
# exactly 0 is labelled "Light" instead of falling outside every bin and
# turning into NaN (pd.cut bins are right-closed / left-open by default).
# Values above the last edge (58) still fall outside the bins and become NaN.
abv_bin_edges = [0, 1.9, 3.9, 6, 9, 58]
strength_labels = ["Light", "Mid", "Full", "Heavy", "Very Strong"]
beer_df["beer_strength"] = pd.cut(
    beer_df["beer_abv"],
    bins=abv_bin_edges,
    labels=strength_labels,
    include_lowest=True,
)

In [5]:
# Condense the many beer_style values into a small set of meta styles.
# NOTE: mod_beer_df is an alias of beer_df, not a copy, so the meta_style
# column added below appears on beer_df as well.
mod_beer_df = beer_df

# (pattern, meta style) rules, matched case-insensitively against beer_style.
# np.select keeps the FIRST rule that matches a row, so narrower patterns
# must be listed before broader ones:
#   * 'dark ale' and 'ipa' come before the generic ale rule; with the generic
#     rule first, 'dark ale' could never be selected and a style mentioning
#     both "Ale" and "IPA" was tagged 'ale'.
#   * the generic ale rule is anchored at a word start (\b) so it no longer
#     fires on the "ale" inside "pale" (a pale lager would otherwise be
#     tagged 'ale' instead of 'lager').
# NOTE(review): 'pel' -> 'trappist' matches names such as "Tripel" or
# "Quadrupel" but not "Dubbel" -- confirm that is the intended coverage.
style_rules = [
    ('dark ale', 'dark ale'),
    ('ipa', 'ipa'),
    (r'\bale', 'ale'),
    ("pilsener", "pilsener"),
    ("pilsner", "pilsener"),  # alternate spelling, folded into 'pilsener'
    ("barleywine", "barleywine"),
    ("bitter", "bitter"),
    ("porter", "porter"),
    ("kölsch", "kölsch"),
    ('pel', 'trappist'),
    ('lager', 'lager'),
    ('stout', 'stout'),
]

# One boolean mask per rule, in rule order, plus the parallel list of labels.
conditions = [
    mod_beer_df.beer_style.str.contains(pattern, case=False)
    for pattern, _ in style_rules
]
styles = [label for _, label in style_rules]

# Rows that match no rule are labelled 'exotic'.
mod_beer_df["meta_style"] = np.select(conditions, styles, default='exotic')


In [7]:
# Display the frame to inspect the derived beer_strength and meta_style columns
mod_beer_df

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,beer_strength,meta_style
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986,Full,exotic
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213,Heavy,ale
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215,Heavy,stout
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969,Full,pilsener
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883,Heavy,ipa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1586609,14359,The Defiant Brewing Company,1162684892,5.0,4.0,3.5,maddogruss,Pumpkin Ale,4.0,4.0,The Horseman's Ale,5.2,33061,Full,ale
1586610,14359,The Defiant Brewing Company,1161048566,4.0,5.0,2.5,yelterdow,Pumpkin Ale,2.0,4.0,The Horseman's Ale,5.2,33061,Full,ale
1586611,14359,The Defiant Brewing Company,1160702513,4.5,3.5,3.0,TongoRad,Pumpkin Ale,3.5,4.0,The Horseman's Ale,5.2,33061,Full,ale
1586612,14359,The Defiant Brewing Company,1160023044,4.0,4.5,4.5,dherling,Pumpkin Ale,4.5,4.5,The Horseman's Ale,5.2,33061,Full,ale


In [9]:
# Narrow the frame to beer_name plus the five review score columns so the
# later groupby calculations only touch the review numbers, then summarise
# the numeric columns.
review_score_cols = [
    "review_overall",
    "review_aroma",
    "review_appearance",
    "review_palate",
    "review_taste",
]
beer_sub_grp = mod_beer_df.loc[:, ["beer_name", *review_score_cols]]
beer_sub_grp.describe()

Unnamed: 0,review_overall,review_aroma,review_appearance,review_palate,review_taste
count,1518478.0,1518478.0,1518478.0,1518478.0,1518478.0
mean,3.823938,3.746218,3.850383,3.753735,3.804082
std,0.7172663,0.695344,0.6143106,0.679335,0.7286079
min,0.0,1.0,0.0,1.0,1.0
25%,3.5,3.5,3.5,3.5,3.5
50%,4.0,4.0,4.0,4.0,4.0
75%,4.5,4.0,4.0,4.0,4.5
max,5.0,5.0,5.0,5.0,5.0


In [10]:
# Row-wise composite score: the unweighted mean of the five review columns
# for each individual review.
val_info = mod_beer_df.loc[
    :,
    ["review_overall", "review_aroma", "review_appearance", "review_palate", "review_taste"],
]
mod_beer_df["sub_review_score"] = val_info.mean(axis="columns")

In [11]:
# Per-beer mean and variance of every review column, indexed by beer_name.
# NOTE(review): the source frame also carries a beer_beerid column; if two
# different beers can share a name, grouping on beer_name pools their
# reviews -- confirm beer_name is the intended key.
reviews_by_name = beer_sub_grp.groupby(["beer_name"])
grp_beers_mean = reviews_by_name.mean()
grp_beers_var = reviews_by_name.var()

In [13]:
# Attach each beer's mean review scores to every one of its review rows.
# Overlapping column names get '_OG' (per-review value) and '_MEAN'
# (per-beer average) suffixes.
# A left join is used instead of an outer join: grp_beers_mean is built from
# these same rows, so every beer_name already has a match and the row set is
# unchanged, while a left join keeps the reviews in their existing order
# (recent pandas versions sort an outer join lexicographically by key).
# NOTE: this rebinds mod_beer_df, so re-running the cell merges a second time;
# restart and run all cells instead of re-executing it in place.
mod_beer_df = pd.merge(
    mod_beer_df,
    grp_beers_mean,
    on='beer_name',
    how='left',
    suffixes=('_OG', '_MEAN'),
)

In [14]:
# Display the merged frame: per-review '_OG' scores next to per-beer '_MEAN' scores
mod_beer_df

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall_OG,review_aroma_OG,review_appearance_OG,review_profilename,beer_style,review_palate_OG,review_taste_OG,...,beer_abv,beer_beerid,beer_strength,meta_style,sub_review_score,review_overall_MEAN,review_aroma_MEAN,review_appearance_MEAN,review_palate_MEAN,review_taste_MEAN
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,...,5.0,47986,Full,exotic,1.8,1.500000,2.000000,2.500000,1.5,1.500000
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,...,6.2,48213,Heavy,ale,2.9,3.000000,2.500000,3.000000,3.0,3.000000
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,...,6.5,48215,Heavy,stout,2.9,3.000000,2.500000,3.000000,3.0,3.000000
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,...,5.0,47969,Full,pilsener,3.0,3.000000,3.000000,3.500000,2.5,3.000000
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,...,7.7,64883,Heavy,ipa,4.2,4.000000,4.500000,4.000000,4.0,4.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518473,14359,The Defiant Brewing Company,1162684892,5.0,4.0,3.5,maddogruss,Pumpkin Ale,4.0,4.0,...,5.2,33061,Full,ale,4.1,4.357143,4.357143,3.964286,4.0,4.107143
1518474,14359,The Defiant Brewing Company,1161048566,4.0,5.0,2.5,yelterdow,Pumpkin Ale,2.0,4.0,...,5.2,33061,Full,ale,3.5,4.357143,4.357143,3.964286,4.0,4.107143
1518475,14359,The Defiant Brewing Company,1160702513,4.5,3.5,3.0,TongoRad,Pumpkin Ale,3.5,4.0,...,5.2,33061,Full,ale,3.7,4.357143,4.357143,3.964286,4.0,4.107143
1518476,14359,The Defiant Brewing Company,1160023044,4.0,4.5,4.5,dherling,Pumpkin Ale,4.5,4.5,...,5.2,33061,Full,ale,4.4,4.357143,4.357143,3.964286,4.0,4.107143
