In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA 

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
#Data import and feature additions
beer_df = pd.read_csv('https://query.data.world/s/nuub3qupegsd33g3nimifjpajqeq2o')
beer_df.dropna(inplace=True)
beer_df.drop_duplicates(inplace=True)
beer_df["beer_strength"] = pd.cut(beer_df["beer_abv"], [0,1.9,3.9,6,9,58], 
                                labels=["Light", "Mid", "Full", "Heavy", "Very Strong"])
mod_beer_df =beer_df
conditions =[
    (mod_beer_df.beer_style.str.contains('ale', case=False)),
    (mod_beer_df.beer_style.str.contains('dark ale', case=False)),
    (mod_beer_df.beer_style.str.contains('ipa', case=False)),
    (mod_beer_df.beer_style.str.contains("pilsener", case=False)),
    (mod_beer_df.beer_style.str.contains("pilsner", case=False)),
    (mod_beer_df.beer_style.str.contains("barleywine", case=False)),
    (mod_beer_df.beer_style.str.contains("bitter", case=False)),
    (mod_beer_df.beer_style.str.contains("porter", case=False)),
    (mod_beer_df.beer_style.str.contains("kölsch", case=False)),
    (mod_beer_df.beer_style.str.contains('pel', case=False)),
    (mod_beer_df.beer_style.str.contains('lager', case=False)),
    (mod_beer_df.beer_style.str.contains('stout', case=False)),
]

styles = ['ale','dark ale','ipa',"pilsener","pilsener","barleywine","bitter","porter","kölsch","trappist",'lager','stout']
mod_beer_df["meta_style"] = np.select(conditions, styles, default='exotic')
beer_sub_grp = mod_beer_df[["beer_name", "review_overall", "review_aroma", "review_appearance", "review_palate", "review_taste"]]
grp_beers_mean= pd.DataFrame(beer_sub_grp.groupby(["beer_name"]).mean())
mod_beer_df = pd.merge(mod_beer_df, grp_beers_mean, on='beer_name',how='outer',suffixes=('_OG','_MEAN'))
val_info = pd.DataFrame(mod_beer_df[["review_overall_OG", "review_aroma_OG", "review_appearance_OG", "review_palate_OG", "review_taste_OG"]])
mod_beer_df["sub_review_score"] = val_info.mean(axis=1)

In [3]:
mod_beer_df

Unnamed: 0,brewery_id,brewery_name,review_time,review_overall_OG,review_aroma_OG,review_appearance_OG,review_profilename,beer_style,review_palate_OG,review_taste_OG,...,beer_abv,beer_beerid,beer_strength,meta_style,review_overall_MEAN,review_aroma_MEAN,review_appearance_MEAN,review_palate_MEAN,review_taste_MEAN,sub_review_score
0,10325,Vecchio Birraio,1234817823,1.5,2.0,2.5,stcules,Hefeweizen,1.5,1.5,...,5.0,47986,Full,exotic,1.500000,2.000000,2.500000,1.5,1.500000,1.8
1,10325,Vecchio Birraio,1235915097,3.0,2.5,3.0,stcules,English Strong Ale,3.0,3.0,...,6.2,48213,Heavy,ale,3.000000,2.500000,3.000000,3.0,3.000000,2.9
2,10325,Vecchio Birraio,1235916604,3.0,2.5,3.0,stcules,Foreign / Export Stout,3.0,3.0,...,6.5,48215,Heavy,stout,3.000000,2.500000,3.000000,3.0,3.000000,2.9
3,10325,Vecchio Birraio,1234725145,3.0,3.0,3.5,stcules,German Pilsener,2.5,3.0,...,5.0,47969,Full,pilsener,3.000000,3.000000,3.500000,2.5,3.000000,3.0
4,1075,Caldera Brewing Company,1293735206,4.0,4.5,4.0,johnmichaelsen,American Double / Imperial IPA,4.0,4.5,...,7.7,64883,Heavy,ipa,4.000000,4.500000,4.000000,4.0,4.500000,4.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1518473,14359,The Defiant Brewing Company,1162684892,5.0,4.0,3.5,maddogruss,Pumpkin Ale,4.0,4.0,...,5.2,33061,Full,ale,4.357143,4.357143,3.964286,4.0,4.107143,4.1
1518474,14359,The Defiant Brewing Company,1161048566,4.0,5.0,2.5,yelterdow,Pumpkin Ale,2.0,4.0,...,5.2,33061,Full,ale,4.357143,4.357143,3.964286,4.0,4.107143,3.5
1518475,14359,The Defiant Brewing Company,1160702513,4.5,3.5,3.0,TongoRad,Pumpkin Ale,3.5,4.0,...,5.2,33061,Full,ale,4.357143,4.357143,3.964286,4.0,4.107143,3.7
1518476,14359,The Defiant Brewing Company,1160023044,4.0,4.5,4.5,dherling,Pumpkin Ale,4.5,4.5,...,5.2,33061,Full,ale,4.357143,4.357143,3.964286,4.0,4.107143,4.4


In [4]:
#Scaling Overall rating 
scaler = StandardScaler()
sub_score_info = mod_beer_df["sub_review_score"].values.reshape(-1, 1)
sub_score_scaled = scaler.fit_transform(sub_score_info)
mod_beer_df["scaled_score"] = sub_score_scaled

In [5]:
# Encoding Categorical Beer Style Labels
y_sytle = mod_beer_df["meta_style"]
label_encoder = LabelEncoder()
label_encoder.fit(y_sytle)
encoded_style = label_encoder.transform(y_sytle)

In [6]:
encoded_style

array([3, 0, 9, ..., 0, 0, 0])

In [7]:
# Encoding Categorical Beer strength labels
y_abvstr = mod_beer_df["beer_strength"]
label_encoder.fit(y_abvstr)
encoded_strength = label_encoder.transform(y_abvstr)

In [15]:
#Minimise DF for testing purpose of Kmeans
kmean_df = mod_beer_df[["review_overall_OG","review_palate_OG", "review_taste_OG"]]

In [16]:
#Adding encoded labels information back to DF 
kmean_df["coded_style"] = encoded_style.tolist()
kmean_df["coded_abvstr"] = encoded_strength.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
style_checker = kmean_df[["meta_style","coded_style",]]
style_checker.drop_duplicates()

Unnamed: 0,meta_style,coded_style
0,exotic,3
1,ale,0
2,stout,9
3,pilsener,7
4,ipa,4
14,lager,6
687,porter,8
2113,bitter,2
2134,barleywine,1
2155,trappist,10


In [17]:
#Copy Dataframe 
dataset = kmean_df

In [18]:
#Convert DF to Numpy Array
dataset = dataset.to_numpy()

In [19]:
#Split Array 
dataset_train, dataset_test = train_test_split(dataset, random_state=11)

In [20]:
kmeans = KMeans(n_clusters=11)

In [21]:
model = kmeans.fit(dataset)

In [22]:
kmean_df["classes"] = model.predict(dataset)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [23]:
predicted_clusters = model.predict(dataset_train)

In [24]:
dataset_train[0]

array([4.5, 5. , 4. , 9. , 0. ])

In [25]:
predict = model.predict(dataset_test[0].reshape(1,-1))

In [None]:
usrinput = 

In [26]:
predict2 = model.predict

array([2], dtype=int32)

In [27]:
model.cluster_centers_

array([[4.00562197e+00, 4.09689345e+00, 4.16062517e+00, 9.21302964e+00,
        3.99249235e+00],
       [3.00422860e+00, 3.03276533e+00, 2.97212895e+00, 3.14935734e+00,
        3.84375131e-01],
       [3.88851203e+00, 3.96199064e+00, 4.03419918e+00, 3.84687254e-01,
        3.95292503e+00],
       [3.09038577e+00, 3.06738643e+00, 3.02436866e+00, 5.67918015e-03,
        2.92060587e-01],
       [3.97870774e+00, 3.88870985e+00, 3.99149288e+00, 8.88845245e+00,
        5.78839531e-01],
       [4.15842621e+00, 4.05194498e+00, 4.14293343e+00, 4.01744393e+00,
        8.75263576e-01],
       [2.32295848e+00, 2.26154840e+00, 2.13251468e+00, 6.43749008e+00,
        2.49291335e-01],
       [4.14563032e+00, 4.00699498e+00, 4.08954643e+00, 8.07138498e-03,
        5.23583735e-01],
       [4.14120205e+00, 3.99658863e+00, 4.08059646e+00, 2.90782824e+00,
        3.68126680e-01],
       [3.87534271e+00, 3.61650389e+00, 3.65006441e+00, 6.31911122e+00,
        9.87657040e-02],
       [3.89913314e+00, 3.9572

In [31]:
style_checker["km_class"]= kmean_df[["classes" ]]


In [33]:
mod_beer_df["km_class"] = kmean_df[["classes" ]]

In [43]:
predict_analysis6 = mod_beer_df.loc[(mod_beer_df["km_class"] == 6)]

In [44]:
predict_analysis6["meta_style"].unique()

array(['pilsener', 'lager', 'stout', 'porter', 'kölsch', 'ipa',
       'trappist'], dtype=object)

In [48]:
predict_analysis2 = mod_beer_df.loc[(mod_beer_df["km_class"] == 2)]
predict_analysis2["meta_style"].unique()

array(['barleywine', 'ale', 'bitter'], dtype=object)