## Preprocessing

In [None]:
#Data handling
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Clustering
from sklearn.cluster import KMeans, DBSCAN
from sklearn import preprocessing
from sklearn.metrics import silhouette_score

# Dimensionality reduction
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Visualization
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
from matplotlib import animation
%matplotlib inline


In [None]:
def load_preprocess_data(path='beer_data_cleaned8-3_with_nums.csv'):
    """Load the beer CSV and one-hot encode its categorical columns.

    Parameters
    ----------
    path : str or path-like, optional
        CSV file to read. Defaults to the file name this notebook has always
        used, so existing no-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The columns that are not listed in ``categoricals`` (original order),
        followed by one indicator column per observed value of each
        categorical column, named ``<column>_<value>``. The categorical
        columns themselves are not present in the result.

    Raises
    ------
    KeyError
        If any of the listed categorical columns is missing from the CSV.
    """
    # load data
    beer_df = pd.read_csv(path)
    # columns to one-hot encode
    categoricals = ['Batch_Style','Category','Base Malt','SpecialtyMalt1Name', 'SpecialtyMalt2Name', 'SpecialtyMalt3Name','hop1name','hop1type','hop1timing',
                   'hop2name','hop2type','hop2timing','hop3name','hop3type','hop3timing','hop4name','hop4type','hop4timing','hop5name','hop5type','hop5timing',
                   'YeastStrain', 'Flocculation','Flag']
    # A single get_dummies call encodes every listed column, prefixes the new
    # columns with the source column name, and drops the source columns. This
    # yields the same columns in the same order as joining each column's
    # dummies in a loop and dropping afterwards, without copying the frame
    # once per column or mutating it in place.
    return pd.get_dummies(beer_df, columns=categoricals)
def plot_correlations(beer_df):
    """Draw a lower-triangle heatmap of the pairwise correlations in ``beer_df``.

    Parameters
    ----------
    beer_df : pandas.DataFrame
        Frame whose column-by-column correlation matrix (``beer_df.corr()``)
        is plotted.

    Returns
    -------
    None
        The heatmap is drawn on a newly created 11x9 inch matplotlib figure.
    """
    corr = beer_df.corr()
    # Boolean mask that is True on the upper triangle (diagonal included) so
    # each pair of columns is drawn only once. The builtin ``bool`` is used
    # because the ``np.bool`` alias was deprecated in NumPy 1.20 and removed
    # in NumPy 1.24, where it raises AttributeError.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    f, ax = plt.subplots(figsize=(11, 9))
    cmap = sns.diverging_palette(220, 10, as_cmap=True)
    # Colour scale is centred on 0 and capped at 0.3.
    sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5})


In [None]:
# Read the CSV and one-hot encode its categorical columns (see load_preprocess_data).
beer_df = load_preprocess_data()

In [None]:
#plot_correlations(beer_df)

In [None]:
# k-means elbow plot: inertia for k = 2..11 on the un-normalized features
cluster_counts = list(range(2, 12))
# Fit on beer_df: this cell previously referenced `df`, which no visible cell
# defines, so it failed on a fresh kernel. random_state pins the centroid
# initialisation so the curve is the same on every run.
scores = [KMeans(n_clusters=k, random_state=42).fit(beer_df).inertia_ for k in cluster_counts]
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.lineplot(x=cluster_counts, y=scores)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of k-Means versus number of clusters");

In [None]:
# Fit k-means with 4 clusters on the un-normalized features.
# NOTE(review): 4 is presumably read off the elbow plot - confirm.
# random_state fixes the centroid initialisation so cluster assignments are
# reproducible under Restart & Run All.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(beer_df)

In [None]:
# normalized k-means elbow plot: rescale the rows with preprocessing.normalize,
# then compute inertia for k = 2..11 on the rescaled vectors
normalized_vectors = preprocessing.normalize(beer_df)
cluster_counts = list(range(2, 12))
# random_state pins the centroid initialisation so the curve is the same on every run.
scores = [KMeans(n_clusters=k, random_state=42).fit(normalized_vectors).inertia_ for k in cluster_counts]
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.lineplot(x=cluster_counts, y=scores)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of Cosine k-Means versus number of clusters");

In [None]:
# Fit k-means with 4 clusters on the normalized vectors.
# NOTE(review): 4 is presumably read off the elbow plot - confirm.
# random_state fixes the centroid initialisation so cluster assignments are
# reproducible under Restart & Run All.
normalized_kmeans = KMeans(n_clusters=4, random_state=42)
normalized_kmeans.fit(normalized_vectors)