In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA 

%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
#Data import and feature additions
beer_df = pd.read_csv('https://query.data.world/s/nuub3qupegsd33g3nimifjpajqeq2o')
beer_df.dropna(inplace=True)
beer_df.drop_duplicates(inplace=True)
beer_df["beer_strength"] = pd.cut(beer_df["beer_abv"], [0,1.9,3.9,6,9,58], 
                                labels=["Light", "Mid", "Full", "Heavy", "Very Strong"])
mod_beer_df =beer_df
conditions =[
    (mod_beer_df.beer_style.str.contains('ale', case=False)),
    (mod_beer_df.beer_style.str.contains('dark ale', case=False)),
    (mod_beer_df.beer_style.str.contains('ipa', case=False)),
    (mod_beer_df.beer_style.str.contains("pilsener", case=False)),
    (mod_beer_df.beer_style.str.contains("pilsner", case=False)),
    (mod_beer_df.beer_style.str.contains("barleywine", case=False)),
    (mod_beer_df.beer_style.str.contains("bitter", case=False)),
    (mod_beer_df.beer_style.str.contains("porter", case=False)),
    (mod_beer_df.beer_style.str.contains("kölsch", case=False)),
    (mod_beer_df.beer_style.str.contains('pel', case=False)),
    (mod_beer_df.beer_style.str.contains('lager', case=False)),
    (mod_beer_df.beer_style.str.contains('stout', case=False)),
]

styles = ['ale','dark ale','ipa',"pilsener","pilsener","barleywine","bitter","porter","kölsch","trappist",'lager','stout']
mod_beer_df["meta_style"] = np.select(conditions, styles, default='exotic')
beer_sub_grp = mod_beer_df[["beer_name", "review_overall", "review_aroma", "review_appearance", "review_palate", "review_taste"]]
grp_beers_mean= pd.DataFrame(beer_sub_grp.groupby(["beer_name"]).mean())
mod_beer_df = pd.merge(mod_beer_df, grp_beers_mean, on='beer_name',how='outer',suffixes=('_OG','_MEAN'))
val_info = pd.DataFrame(mod_beer_df[["review_overall_OG", "review_aroma_OG", "review_appearance_OG", "review_palate_OG", "review_taste_OG"]])
mod_beer_df["sub_review_score"] = val_info.mean(axis=1)

In [3]:
#Scaling Overall rating 
scaler = StandardScaler()
sub_score_info = mod_beer_df["sub_review_score"].values.reshape(-1, 1)
sub_score_scaled = scaler.fit_transform(sub_score_info)
mod_beer_df["scaled_score"] = sub_score_scaled

In [6]:
# Encoding Categorical Beer Style Labels
y_sytle = mod_beer_df["meta_style"]
label_encoder = LabelEncoder()
label_encoder.fit(y_sytle)
encoded_style = label_encoder.transform(y_sytle)

In [7]:
encoded_style

array([3, 0, 9, ..., 0, 0, 0])

In [8]:
# Encoding Categorical Beer strength labels
y_abvstr = mod_beer_df["beer_strength"]
label_encoder.fit(y_abvstr)
encoded_strength = label_encoder.transform(y_abvstr)

In [9]:
#Minimise DF for testing purpose of Kmeans
kmean_df = mod_beer_df[["review_overall_OG", "review_overall_MEAN", "scaled_score","beer_beerid"]]

In [10]:
#Adding encoded labels information back to DF 
kmean_df["coded_style"] = encoded_style.tolist()
kmean_df["coded_abvstr"] = encoded_strength.tolist()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [11]:
#Copy Dataframe 
dataset = kmean_df

In [12]:
#Convert DF to Numpy Array
dataset = dataset.to_numpy()

In [13]:
#Split Array 
dataset_train, dataset_test = train_test_split(dataset, random_state=11)

In [14]:
kmeans = KMeans(n_clusters=13)

In [15]:
kmeans.fit(dataset_train)

KMeans(n_clusters=13)

In [16]:
predicted_clusters = kmeans.predict(dataset_train)

In [17]:
predicted_clusters

array([11, 12,  0, ...,  8,  9,  2], dtype=int32)