In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.preprocessing import LabelBinarizer

In [2]:
beer = pd.read_csv('data/beer_reviews.csv')
beer.head()

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall,review_aroma,review_appearance,review_profilename,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,Red Moon,6.2,48213
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883


In [3]:
clean_beer = beer.drop(["brewery_name", "review_time", "review_profilename"], axis=1)
clean_beer.head(2)

Unnamed: 0,brewery_id,review_overall,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid
0,10325,1.5,2.0,2.5,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986
1,10325,3.0,2.5,3.0,English Strong Ale,3.0,3.0,Red Moon,6.2,48213


In [4]:
style_cols = pd.get_dummies(beer.beer_style)

In [5]:
beer_df = pd.concat([clean_beer, style_cols], axis=1)

In [6]:
beer_df.head(2)

Unnamed: 0,brewery_id,review_overall,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,...,Scotch Ale / Wee Heavy,Scottish Ale,Scottish Gruit / Ancient Herbed Ale,Smoked Beer,Tripel,Vienna Lager,Weizenbock,Wheatwine,Winter Warmer,Witbier
0,10325,1.5,2.0,2.5,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986,...,0,0,0,0,0,0,0,0,0,0
1,10325,3.0,2.5,3.0,English Strong Ale,3.0,3.0,Red Moon,6.2,48213,...,0,0,0,0,0,0,0,0,0,0


In [7]:
dropped_df = beer_df.drop(["beer_name", "beer_style"], axis=1)

In [8]:
dropped_df.head()

Unnamed: 0,brewery_id,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_beerid,Altbier,American Adjunct Lager,...,Scotch Ale / Wee Heavy,Scottish Ale,Scottish Gruit / Ancient Herbed Ale,Smoked Beer,Tripel,Vienna Lager,Weizenbock,Wheatwine,Winter Warmer,Witbier
0,10325,1.5,2.0,2.5,1.5,1.5,5.0,47986,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10325,3.0,2.5,3.0,3.0,3.0,6.2,48213,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10325,3.0,2.5,3.0,3.0,3.0,6.5,48215,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10325,3.0,3.0,3.5,2.5,3.0,5.0,47969,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1075,4.0,4.5,4.0,4.0,4.5,7.7,64883,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
new_beer_df = dropped_df.fillna(0)

In [10]:
new_beer_df.head()

Unnamed: 0,brewery_id,review_overall,review_aroma,review_appearance,review_palate,review_taste,beer_abv,beer_beerid,Altbier,American Adjunct Lager,...,Scotch Ale / Wee Heavy,Scottish Ale,Scottish Gruit / Ancient Herbed Ale,Smoked Beer,Tripel,Vienna Lager,Weizenbock,Wheatwine,Winter Warmer,Witbier
0,10325,1.5,2.0,2.5,1.5,1.5,5.0,47986,0,0,...,0,0,0,0,0,0,0,0,0,0
1,10325,3.0,2.5,3.0,3.0,3.0,6.2,48213,0,0,...,0,0,0,0,0,0,0,0,0,0
2,10325,3.0,2.5,3.0,3.0,3.0,6.5,48215,0,0,...,0,0,0,0,0,0,0,0,0,0
3,10325,3.0,3.0,3.5,2.5,3.0,5.0,47969,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1075,4.0,4.5,4.0,4.0,4.5,7.7,64883,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
from sklearn.cluster import KMeans

In [12]:
kmeans = KMeans(n_clusters=8)

In [13]:
# Fit the model to the data
kmeans.fit(new_beer_df)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=8, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [14]:
# Predict the clusters
predicted_clusters = kmeans.predict(new_beer_df)
predicted_clusters

array([0, 0, 0, ..., 0, 0, 0])

In [15]:
beer_data = pd.concat([beer_df, pd.DataFrame(predicted_clusters)], axis=1)

In [16]:
beer_data.head(2)

Unnamed: 0,brewery_id,review_overall,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_beerid,...,Scottish Ale,Scottish Gruit / Ancient Herbed Ale,Smoked Beer,Tripel,Vienna Lager,Weizenbock,Wheatwine,Winter Warmer,Witbier,0
0,10325,1.5,2.0,2.5,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986,...,0,0,0,0,0,0,0,0,0,0
1,10325,3.0,2.5,3.0,English Strong Ale,3.0,3.0,Red Moon,6.2,48213,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# beer_data.rename(columns={"0":"cluster"})
final_beer = beer_data.rename(columns={"0": "cluster", "beer_beerid": "beer_id"})
final_beer.head()

Unnamed: 0,brewery_id,review_overall,review_aroma,review_appearance,beer_style,review_palate,review_taste,beer_name,beer_abv,beer_id,...,Scottish Ale,Scottish Gruit / Ancient Herbed Ale,Smoked Beer,Tripel,Vienna Lager,Weizenbock,Wheatwine,Winter Warmer,Witbier,0
0,10325,1.5,2.0,2.5,Hefeweizen,1.5,1.5,Sausa Weizen,5.0,47986,...,0,0,0,0,0,0,0,0,0,0
1,10325,3.0,2.5,3.0,English Strong Ale,3.0,3.0,Red Moon,6.2,48213,...,0,0,0,0,0,0,0,0,0,0
2,10325,3.0,2.5,3.0,Foreign / Export Stout,3.0,3.0,Black Horse Black Beer,6.5,48215,...,0,0,0,0,0,0,0,0,0,0
3,10325,3.0,3.0,3.5,German Pilsener,2.5,3.0,Sausa Pils,5.0,47969,...,0,0,0,0,0,0,0,0,0,0
4,1075,4.0,4.5,4.0,American Double / Imperial IPA,4.0,4.5,Cauldron DIPA,7.7,64883,...,0,0,0,0,0,0,0,0,0,7
