## Preprocessing

In [None]:
#Data handling
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Clustering
from sklearn.cluster import KMeans, DBSCAN
from sklearn import preprocessing
from sklearn.metrics import silhouette_score

# Dimensionality reduction
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Visualization
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
from matplotlib import animation
%matplotlib inline


In [None]:
#this section splits the unit from the amount into seperate columns for our adjuncts
#it then converts them to real numbers instead of strings for conversion
pd.set_option('display.max_columns', 500)
beer_df= pd.read_csv('beer_data_cleaned8-23_with_nums.csv')
beer_df['Adjunct1Num'],beer_df['Adjunct1Unit'] = beer_df['Adjunct1Amount'].str.split().str
beer_df['Adjunct2Num'],beer_df['Adjunct2Unit'] = beer_df['Adjunct2Amount'].str.split().str
beer_df['Adjunct3Num'],beer_df['Adjunct3Unit'] = beer_df['Adjunct3Amount'].str.split().str
beer_df['Adjunct4Num'],beer_df['Adjunct4Unit'] = beer_df['Adjunct4Amount'].str.split().str
beer_df['Adjunct5Num'],beer_df['Adjunct5Unit'] = beer_df['Adjunct5Amount'].str.split().str
beer_df.head(40)


In [None]:
beer_df.dtypes #check dytpes to be sure they will work for clustering

In [None]:
pd.set_option('display.max_rows', 500)
objects = ['Adjunct1Num','Adjunct2Num','Adjunct3Num','Adjunct4Num','Adjunct5Num'] #need to convert to numeric here
def nums():
    for i in objects:
        beer_df[i] = beer_df[i].astype(float).round(2)
    return beer_df
nums()


In [None]:
#this section finds a line to divide the dataset into imperial vs metric
beer_df['HighTemp'].hist(bins=50)
plt.axvline(47) #assign cutoff and create new variable to distinguish
beer_df.loc[beer_df['HighTemp'] <=47, 'Flag'] = 'Metric'
beer_df.loc[beer_df['HighTemp'] >47, 'Flag'] = 'Imperial'
beer_df['Flag'].value_counts()

In [6]:
#imports for the pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [7]:
#this section creates a dictionary of the unique values in the 5 adjunct unit columns to be used for a lookup later
adj_cols = ['Adjunct1Unit','Adjunct2Unit','Adjunct3Unit','Adjunct4Unit','Adjunct5Unit']
column_values = beer_df[adj_cols].values.ravel()
unique_values = pd.unique(column_values)
unique_values
convert_keys = [' ',.035724, .033814, 0.00,0.00,0.00,0.00,0.00,0.00,.033814,0.00,2.2,0.00,.219969,0.00,.033814,0.00,.033814,0.00,0.00]

adjdict = dict(zip(unique_values, convert_keys))
adjdict

{nan: ' ',
 'g': 0.035724,
 'each': 0.033814,
 'tsp': 0.0,
 'ml': 0.0,
 'tbsp': 0.0,
 'oz': 0.0,
 'lb': 0.0,
 'L': 0.0,
 'g/gal': 0.033814,
 'qt': 0.0,
 'g/l': 2.2,
 'ml/gal': 0.0,
 'days': 0.219969,
 'hr.': 0.0,
 'gal': 0.033814,
 'kg': 0.0,
 'ml/l': 0.033814,
 'mg/l': 0.0,
 'min.': 0.0}

In [None]:
kg_cols = ['Base Malt Amount','SpecialtyMalt1Amount','SpecialtyMalt2Amount','SpecialtyMalt1Amount']
gram_cols = ['hop1amount','hop2amount','hop3amount','hop4amount','hop5amount']
celcius_cols= ['LowTemp','HighTemp']
adj_cols = ['Adjunct1Num','Adjunct2Num','Adjunct3Num','Adjunct4Num','Adjunct5Num']
lookup_cols = beer_df.iloc[0:,-11:]
def lookup(lookup_cols):
    """Looks up and returns the Adjunct Num column multiplied by the conversion rate"""
    for i in lookup_cols:
        if type(i) == object
    return ((row[-1].value))

In [None]:
lookup_cols = beer_df.iloc[0:,-11:]
print(lookup_cols[0:1])

In [None]:
#a class that converts metric to imperial
class convert(object):
    """A class that converts rows in the dataframe from metric to imperial based on their unit labels"""
    def __init__(self,beer_df):
        self.beer_df = beer_df
    def kgtolb(self,row):
        for i in kg_cols:
           if (beer_df['Flag'].equals("Metric")):
               beer_df[i] = beer_df[i]*2.2
    def gram2oz(self,row):
        for i in gram_cols:
           if (beer_df['Flag'].equals("Metric")):
               beer_df[i] = beer_df[i]*.035274
    def celc2fah(self,row):
        for i in celcius_cols:
           if (beer_df['Flag'].equals("Metric")):
               beer_df[i] = (beer_df[i]*9/5)+32
    def met2imp(self,iterrow):
        for i in adj_cols:
            lookup(i)

In [None]:
#intended to call the class on the dataframe and apply the transformations
def InitConv(row):
    return convert(row)
new_df = beer_df.apply(InitConv,axis=1)

In [8]:
cat_attribs = beer_df.select_dtypes(include=['object']).columns
num_attribs = beer_df.select_dtypes(include=['int64','float64']).columns
num_attribs

Index(['Batch_size_liters', 'og', 'fg', 'abv', 'ibu', 'color_levibonds',
       'mashph', 'Base Malt Amount', 'BasePPG', 'BaseColor', 'BasePercentage',
       'SpecialtyMalt1Amount', 'SpecialtyMalt1PPG', 'SpecialtyMalt1Color',
       'SpecialtyMalt1Percentage', 'SpecialtyMalt2Amount', 'SpecialtyMalt2PPG',
       'SpecialtyMalt2Color', 'SpecialtyMalt2Percentage',
       'SpecialtyMalt3Amount', 'SpecialtyMalt3PPG', 'SpecialtyMalt3Color',
       'SpecialtyMalt3Percentage', 'hop1amount', 'hop1alpha', 'hop1time',
       'hop1ibu', 'hop1percent', 'hop2amount', 'hop2alpha', 'hop2time',
       'hop2ibu', 'hop2percent', 'hop3amount', 'hop3alpha', 'hop3time',
       'hop3ibu', 'hop3percent', 'hop4amount', 'hop4alpha', 'hop4time',
       'hop4ibu', 'hop4percent', 'hop5amount', 'hop5alpha', 'hop5time',
       'hop5ibu', 'hop5percent', 'Attenuation', 'LowTemp', 'HighTemp',
       'Adjunct1Num', 'Adjunct2Num', 'Adjunct3Num', 'Adjunct4Num',
       'Adjunct5Num'],
      dtype='object')

In [9]:
#handles numerical values, imputing the mean and scaling (standard instead of minmax since it handles outliers better)
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="mean")),
        ('std_scaler', StandardScaler()),
    ])

#handles categorical values, imputing the most frequent and onehot encoding
cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('one_hot', OneHotEncoder()),
    ])
#pulls together two pipelines
pre_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

In [17]:
#sampledf = beer_df.sample(weights = beer_prepared.groupby('Category')['Category'].transform('count'))
#beer_sample = beer_prepared.sample(frac=.25, replace=False, random_state=33)
#beer_sample = beer_prepared.groupby["Category"].sample(frac=0.25, random_state=33)
#randomly generated sample of 25% of each beer Category to shrink the size of the dataframe
number_of_clusters = beer_df.Category.unique()
len(number_of_clusters)
from sklearn.cluster import MiniBatchKMeans


In [18]:
cluster = Pipeline(
 [
     (
        "kmeans",
        MiniBatchKMeans(
            n_clusters=167,
            batch_size=1000,
            compute_labels=True,
            max_iter=100,
            random_state=33,
            max_no_improvement=50,
            reassignment_ratio=0.01,
            ),
      ),
 ]
)
full_pipeline = Pipeline(
[
    ("preprocessor", pre_pipeline),
    ("clusterer",cluster)
    
])

In [19]:
full_pipeline.fit(beer_df)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='mean',
                                                               