## Conversion of Units and Preprocessing

In [3]:
#Data handling
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
import category_encoders as ce
from category_encoders.target_encoder import TargetEncoder
# Classification
import sklearn.linear_model

# Dimensionality reduction
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Visualization
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
from matplotlib import animation
%matplotlib inline
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone


In [4]:
#read in data and drop index
beer_df = pd.read_csv('beer_df_for_classification.csv',index_col=[0])
beer_df.info()
beer_df.head()
beer_df.dropna(subset=['Category'])

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 96540 entries, 0 to 96539
Columns: 106 entries, Batch_Style to Flag
dtypes: float64(55), int64(1), object(50)
memory usage: 78.8+ MB


Unnamed: 0,Batch_Style,Category,Batch_size_liters,og,fg,abv,ibu,color_levibonds,mashph,Base Malt Amount,...,Adjunct1Unit,Adjunct2Num,Adjunct2Unit,Adjunct3Num,Adjunct3Unit,Adjunct4Num,Adjunct4Unit,Adjunct5Num,Adjunct5Unit,Flag
0,All Grain,Standard/Ordinary Bitter,480.0,1.041,1.008,4.31,25.98,3.00,,75.000,...,,,,,,,,,,Metric
1,All Grain,Belgian Dubbel,1800.0,1.117,1.027,11.83,13.47,9.20,5.35,300.000,...,g,,,,,,,,,Metric
2,Partial Mash,Robust Porter,20.8,1.077,1.015,8.11,14.06,25.81,,0.454,...,,,,,,,,,,Imperial
3,All Grain,American IPA,200.0,1.064,1.012,6.76,33.79,13.53,,43.000,...,,,,,,,,,,Metric
4,All Grain,Blonde Ale,1589.9,1.053,1.012,5.28,17.47,4.08,,249.476,...,,,,,,,,,,Imperial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96535,Extract,Mixed-Fermentation Sour Beer,20.8,1.050,1.011,5.11,9.59,5.62,,2.268,...,,,,,,,,,,Imperial
96536,All Grain,Weissbier,19.9,1.051,1.010,5.46,1.30,3.67,5.35,2.268,...,tsp,,,,,,,,,Imperial
96537,BIAB,Berliner Weisse,20.8,1.053,1.013,5.24,6.83,3.81,,2.722,...,,,,,,,,,,Imperial
96538,BIAB,Weissbier,50.0,1.051,1.013,5.06,6.49,3.22,,4.000,...,,,,,,,,,,Metric


## Train/Test/Validation Split

In [5]:
#Doing it at this time helps to avoid overfitting or picking the wrong architecture based on bias
#function for test set and validation set creation:
def split_train_val_test(beer_df,validation_ratio, test_ratio):
    np.random.seed(33)
    shuffled_indices = np.random.permutation(len(beer_df))  #shuffles the dataset
    validation_set_size = int(len(beer_df) * validation_ratio) #calculates validation set size based on ratio   
    test_set_size = int(len(beer_df) * test_ratio) #calculates test size based on ratio
    val_indices = shuffled_indices[:validation_set_size]
    test_indices = shuffled_indices[:test_set_size] #selects test set and from incdices
    train_indices = shuffled_indices[(test_set_size+validation_set_size):] #assigns the rest to training
    return beer_df.iloc[train_indices], beer_df.iloc[val_indices], beer_df.iloc[test_indices] #returns two different dfs for test and train
    
#Using the function
train_set, validation_set, test_set = split_train_val_test(beer_df, 0.15,0.10) #75% used for training

In [6]:
#check that it worked
print(len(train_set), len(validation_set),len(test_set))

72405 14481 9654


In [7]:
#save to csv
validation_set.to_csv('beer_data_val.csv')
train_set.to_csv('beer_data_train.csv')
test_set.to_csv('beer_data_test.csv')

In [8]:
#imports for the pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,normalize, Normalizer,LabelEncoder, OrdinalEncoder, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from category_encoders.binary import BinaryEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.count import CountEncoder

In [9]:
train_set = train_set.dropna(subset=['Category'])
X = train_set.drop(['Category'],axis=1)
y = train_set["Category"].copy()
print(len(X))

70057


In [10]:
#encoding is the same as it was for clustering, except the target variables (Category) will be label encoded this time
#variables with over 50% NaN/Missing values are categorized as "highna"
num_highna=['mashph','hop2amount','hop2alpha','hop2time','hop2ibu',
            'hop2percent','hop3amount','hop3alpha','hop3time',
            'hop3ibu','hop3percent','hop4amount','hop4alpha',
            'hop4time','hop4ibu','hop4percent','hop5amount',
            'hop5alpha','hop5time','hop5ibu','hop5percent',
            'Adjunct1Num','Adjunct2Num','Adjunct3Num','Adjunct4Num','Adjunct5Num']
num_lowna = ['Batch_size_liters', 'og', 'fg', 'abv', 'ibu', 'color_levibonds',
       'Base Malt Amount', 'BasePPG', 'BaseColor', 'BasePercentage',
       'SpecialtyMalt1Amount', 'SpecialtyMalt1PPG', 'SpecialtyMalt1Color',
       'SpecialtyMalt1Percentage', 'SpecialtyMalt2Amount', 'SpecialtyMalt2PPG',
       'SpecialtyMalt2Color', 'SpecialtyMalt2Percentage',
       'SpecialtyMalt3Amount', 'SpecialtyMalt3PPG', 'SpecialtyMalt3Color',
       'SpecialtyMalt3Percentage','hop1amount','hop1time','hop1ibu','hop1percent','hop1alpha','Attenuation', 'LowTemp', 'HighTemp']
cat_highna = ['hop2name', 'hop2type', 'hop2timing', 'hop3name',
       'hop3type', 'hop3timing', 'hop4name', 'hop4type', 'hop4timing',
       'hop5name', 'hop5type', 'hop5timing', 'Adjunct1Amount', 'Adjunct1Name', 'Adjunct1Type',
       'Adjunct1Timing', 'Adjunct2Amount', 'Adjunct2Name', 'Adjunct2Type',
       'Adjunct2Timing', 'Adjunct3Amount', 'Adjunct3Name', 'Adjunct3Type',
       'Adjunct3Timing', 'Adjunct4Amount', 'Adjunct4Name', 'Adjunct4Type',
       'Adjunct4Timing', 'Adjunct5Amount', 'Adjunct5Name', 'Adjunct5Type',
       'Adjunct5Timing', 'Adjunct1Unit', 'Adjunct2Unit', 'Adjunct3Unit',
       'Adjunct4Unit', 'Adjunct5Unit']
cat_lowna = ['Batch_Style', 'Base Malt', 'SpecialtyMalt1Name',
       'SpecialtyMalt2Name', 'SpecialtyMalt3Name', 'hop1name', 'hop1type',
       'hop1timing', 'YeastStrain', 'Flocculation',
       'Starter?', 'Flag']
target = ['Category']

In [11]:
#handles numerical values, imputing the mean and scaling (standard instead of minmax since it handles outliers better)
num_pipeline_lowna = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan,strategy="mean")),
        ('standardizer', RobustScaler(with_centering=False))
    ])
num_pipeline_highna = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan,strategy="constant",fill_value=0)),
        ('standardizer', RobustScaler(with_centering=False))
    ])
#handles categorical values, imputing the most frequent and onehot encoding
cat_pipeline_lowna = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan,strategy="most_frequent")),
        ('encoder', TargetEncoder(handle_missing='return_nan',min_samples_leaf=100,smoothing=10)),
        ('standardizer', RobustScaler())
       
    ])
cat_pipeline_highna = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="most_frequent")),
        ('encoder', TargetEncoder(handle_missing='return_nan',min_samples_leaf=100,smoothing=10)),
        ('standardizer', RobustScaler())
    ])
#pulls together two pipelines
pre_pipeline = ColumnTransformer(transformers=[
        ("num_lowna", num_pipeline_lowna, num_lowna),
        ("num_highna", num_pipeline_highna,num_highna),
        ("cat_lowna", cat_pipeline_lowna, cat_lowna),
        ("cat_highna", cat_pipeline_highna, cat_highna),
    ])

In [12]:
#label encodes the target variable
le = LabelEncoder()
label_encoder = le.fit(y)
y = label_encoder.transform(y)

In [13]:
beer_prepared = pre_pipeline.fit_transform(X,y=y)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


## Baseline Models

In [14]:
len(y)

70057

In [17]:
#trying out a few base models before we get to hyperparameter tuning
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(beer_prepared, y)

LinearRegression()

In [18]:
#evaluation
from sklearn.metrics import mean_squared_error
style_predictions = lin_reg.predict(beer_prepared)
lin_mse = mean_squared_error(y, style_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

41.442771480228174

In [19]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(beer_prepared, y)

DecisionTreeRegressor()

In [20]:
style_predictions = tree_reg.predict(beer_prepared)
tree_mse = mean_squared_error(y, style_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.22149503629031086

In [21]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, beer_prepared, y,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [22]:
#function to display scores of various models using cross validation
def display_scores(scores):
     print("Scores:", scores)
     print("Mean:", scores.mean())
     print("Standard deviation:", scores.std())
display_scores(-scores)
#this one is massively overfit

Scores: [2882.03786041 2904.907829   2718.16575079 2812.75977733 2942.99414787
 2860.86168998 2947.34727377 2823.39700214 2745.92633833 2866.76745182]
Mean: 2850.5165121437667
Standard deviation: 72.710592993121


In [23]:
lin_scores = cross_val_score(lin_reg, beer_prepared, y,
                              scoring="neg_mean_squared_error", cv=10)

lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)
#not great but not as overfit as the decision tree

Scores: [ 41.25313864  41.68320675  41.43610998  41.63630872  41.78240718
  41.96375986  41.39615871 135.04085334  40.70116613  41.45458071]
Mean: 50.83476900221565
Standard deviation: 28.070580854932803


In [24]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(beer_prepared, y)

RandomForestRegressor()

In [25]:
forest_scores = cross_val_score(forest_reg, beer_prepared, y,
                              scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [37.1862683  37.58172514 37.48675662 37.3825078  38.18165039 37.82251424
 37.8917524  38.03902301 36.70879563 37.56284486]
Mean: 37.58438383949072
Standard deviation: 0.4111584030818157


In [26]:
#save the base models for now in case I need em
import joblib
joblib.dump(lin_reg, "base_lin_reg.pkl")
joblib.dump(tree_reg, "base_decision_tree.pkl")
joblib.dump(forest_reg, "base_random_forest.pkl")

['base_random_forest.pkl']

## Fine Tuning Models

In [26]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [20, 40, 60, 80]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [20, 30, 40]},
  ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(beer_prepared, y)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [20, 40, 60, 80],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [20, 30, 40],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [27]:
grid_search.best_estimator_

RandomForestRegressor(max_features=60, n_estimators=30)

In [28]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([1.02155604e-02, 2.23697723e-02, 1.62397973e-02, 3.37825604e-02,
       4.14059720e-02, 5.98098967e-02, 1.29447237e-02, 6.99118075e-03,
       1.07268040e-02, 1.74293673e-02, 1.17202696e-02, 9.29929712e-03,
       1.42525095e-02, 1.67876484e-02, 9.31572802e-03, 8.87325606e-03,
       1.25584937e-02, 1.32524497e-02, 7.95099064e-03, 7.56034474e-03,
       9.44153697e-03, 1.11053021e-02, 1.13729516e-02, 6.40677246e-03,
       1.93145805e-02, 1.51994116e-02, 1.35893076e-02, 1.08057369e-02,
       1.18082729e-02, 1.11250883e-02, 5.21349620e-03, 7.16020408e-03,
       1.30459836e-02, 1.66427293e-03, 2.44338950e-03, 3.39147846e-03,
       1.26153243e-03, 3.02940034e-03, 1.02452402e-03, 1.11290586e-03,
       1.68712982e-03, 6.21774280e-04, 9.53388170e-04, 5.74459886e-04,
       5.72254281e-04, 8.11625913e-04, 4.82184303e-04, 4.83098146e-04,
       4.82423848e-04, 3.28703031e-04, 5.57937033e-04, 1.44892684e-03,
       1.19037660e-03, 1.37006533e-03, 5.91859549e-04, 6.80824036e-04,
      

In [37]:
#cat_high = pre_pipeline.named_transformers_["cat_highna"]
#cat_high_attribs = list(cat_high.components_)
#cat_low = pre_pipeline.named_transformers_["cat_lowna"]
#Acat_low_attribs = list(cat_low.get_feature_names(input_features=cat_lowna))
#attributes = num_highna + num_lowna + cat_low_attribs + cat_high_attribs
#sorted(zip(feature_importances, attributes), reverse=True)
best_model = grid_search.best_estimator_

test_set = test_set.dropna(subset=['Category'])
X_test = test_set.drop('Category',axis=1)
y = test_set['Category'].copy()
y_test = le.transform(y)

X_test_prepared = pre_pipeline.transform(X_test)

model_preds = best_model.predict(X_test_prepared)

model_mse = mean_squared_error(y_test, model_preds)
model_rmse = np.sqrt(model_mse)

print(model_mse,model_rmse) #rmse = 38.17, mse = 1457.21

1457.2149536057655 38.17348495494963


In [42]:
#match up feature importances with actual names of features
targetencode_cols_low = list(pre_pipeline.named_transformers_['cat_lowna'].named_steps['encoder'].get_feature_names())
targetencode_cols_high = list(pre_pipeline.named_transformers_['cat_highna'].named_steps['encoder'].get_feature_names())
numeric_list = list(num_lowna)
numeric_highna_list = list(num_highna)
numeric_list.extend(num_highna)
numeric_list.extend(targetencode_cols_high)
numeric_list.extend(targetencode_cols_low)

In [43]:
numeric_list

['Batch_size_liters',
 'og',
 'fg',
 'abv',
 'ibu',
 'color_levibonds',
 'Base Malt Amount',
 'BasePPG',
 'BaseColor',
 'BasePercentage',
 'SpecialtyMalt1Amount',
 'SpecialtyMalt1PPG',
 'SpecialtyMalt1Color',
 'SpecialtyMalt1Percentage',
 'SpecialtyMalt2Amount',
 'SpecialtyMalt2PPG',
 'SpecialtyMalt2Color',
 'SpecialtyMalt2Percentage',
 'SpecialtyMalt3Amount',
 'SpecialtyMalt3PPG',
 'SpecialtyMalt3Color',
 'SpecialtyMalt3Percentage',
 'hop1amount',
 'hop1time',
 'hop1ibu',
 'hop1percent',
 'hop1alpha',
 'Attenuation',
 'LowTemp',
 'HighTemp',
 'mashph',
 'hop2amount',
 'hop2alpha',
 'hop2time',
 'hop2ibu',
 'hop2percent',
 'hop3amount',
 'hop3alpha',
 'hop3time',
 'hop3ibu',
 'hop3percent',
 'hop4amount',
 'hop4alpha',
 'hop4time',
 'hop4ibu',
 'hop4percent',
 'hop5amount',
 'hop5alpha',
 'hop5time',
 'hop5ibu',
 'hop5percent',
 'Adjunct1Num',
 'Adjunct2Num',
 'Adjunct3Num',
 'Adjunct4Num',
 'Adjunct5Num',
 0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
