## Conversion of Units and Preprocessing

In [51]:
#Data handling
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
import category_encoders as ce
from category_encoders.target_encoder import TargetEncoder
# Classification
import sklearn.linear_model

# Dimensionality reduction
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Visualization
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
from matplotlib import animation
%matplotlib inline
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone


In [16]:
#read in data and drop index
beer_df = pd.read_csv('beer_df_for_classification.csv',index_col=[0])
beer_df.info()
beer_df.head()
beer_df.dropna(subset=['Category'])

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 96540 entries, 0 to 96539
Columns: 106 entries, Batch_Style to Flag
dtypes: float64(55), int64(1), object(50)
memory usage: 78.8+ MB


Unnamed: 0,Batch_Style,Category,Batch_size_liters,og,fg,abv,ibu,color_levibonds,mashph,Base Malt Amount,...,Adjunct1Unit,Adjunct2Num,Adjunct2Unit,Adjunct3Num,Adjunct3Unit,Adjunct4Num,Adjunct4Unit,Adjunct5Num,Adjunct5Unit,Flag
0,All Grain,Standard/Ordinary Bitter,480.0,1.041,1.008,4.31,25.98,3.00,,75.000,...,,,,,,,,,,Metric
1,All Grain,Belgian Dubbel,1800.0,1.117,1.027,11.83,13.47,9.20,5.35,300.000,...,g,,,,,,,,,Metric
2,Partial Mash,Robust Porter,20.8,1.077,1.015,8.11,14.06,25.81,,0.454,...,,,,,,,,,,Imperial
3,All Grain,American IPA,200.0,1.064,1.012,6.76,33.79,13.53,,43.000,...,,,,,,,,,,Metric
4,All Grain,Blonde Ale,1589.9,1.053,1.012,5.28,17.47,4.08,,249.476,...,,,,,,,,,,Imperial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96535,Extract,Mixed-Fermentation Sour Beer,20.8,1.050,1.011,5.11,9.59,5.62,,2.268,...,,,,,,,,,,Imperial
96536,All Grain,Weissbier,19.9,1.051,1.010,5.46,1.30,3.67,5.35,2.268,...,tsp,,,,,,,,,Imperial
96537,BIAB,Berliner Weisse,20.8,1.053,1.013,5.24,6.83,3.81,,2.722,...,,,,,,,,,,Imperial
96538,BIAB,Weissbier,50.0,1.051,1.013,5.06,6.49,3.22,,4.000,...,,,,,,,,,,Metric


## Train/Test/Validation Split

In [52]:
#Doing it at this time helps to avoid overfitting or picking the wrong architecture based on bias
#function for test set and validation set creation:
def split_train_val_test(beer_df,validation_ratio, test_ratio):
    np.random.seed(33)
    shuffled_indices = np.random.permutation(len(beer_df))  #shuffles the dataset
    validation_set_size = int(len(beer_df) * validation_ratio) #calculates validation set size based on ratio   
    test_set_size = int(len(beer_df) * test_ratio) #calculates test size based on ratio
    val_indices = shuffled_indices[:validation_set_size]
    test_indices = shuffled_indices[:test_set_size] #selects test set and from incdices
    train_indices = shuffled_indices[(test_set_size+validation_set_size):] #assigns the rest to training
    return beer_df.iloc[train_indices], beer_df.iloc[val_indices], beer_df.iloc[test_indices] #returns two different dfs for test and train
    
#Using the function
train_set, validation_set, test_set = split_train_val_test(beer_df, 0.15,0.10) #75% used for training

In [53]:
#check that it worked
print(len(train_set), len(validation_set),len(test_set))

72405 14481 9654


In [54]:
#save to csv
validation_set.to_csv('beer_data_val.csv')
train_set.to_csv('beer_data_train.csv')
test_set.to_csv('beer_data_test.csv')

In [58]:
#imports for the pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,normalize, Normalizer,LabelEncoder, OrdinalEncoder, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from category_encoders.binary import BinaryEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.count import CountEncoder

In [56]:
train_set = train_set.dropna(subset=['Category'])
X = train_set.drop(['Category'],axis=1)
y = train_set["Category"].copy()
print(len(X))

70057


In [57]:
#encoding is the same as it was for clustering, except the target variables (Category) will be label encoded this time
#variables with over 50% NaN/Missing values are categorized as "highna"
num_highna=['mashph','hop2amount','hop2alpha','hop2time','hop2ibu',
            'hop2percent','hop3amount','hop3alpha','hop3time',
            'hop3ibu','hop3percent','hop4amount','hop4alpha',
            'hop4time','hop4ibu','hop4percent','hop5amount',
            'hop5alpha','hop5time','hop5ibu','hop5percent',
            'Adjunct1Num','Adjunct2Num','Adjunct3Num','Adjunct4Num','Adjunct5Num']
num_lowna = ['Batch_size_liters', 'og', 'fg', 'abv', 'ibu', 'color_levibonds',
       'Base Malt Amount', 'BasePPG', 'BaseColor', 'BasePercentage',
       'SpecialtyMalt1Amount', 'SpecialtyMalt1PPG', 'SpecialtyMalt1Color',
       'SpecialtyMalt1Percentage', 'SpecialtyMalt2Amount', 'SpecialtyMalt2PPG',
       'SpecialtyMalt2Color', 'SpecialtyMalt2Percentage',
       'SpecialtyMalt3Amount', 'SpecialtyMalt3PPG', 'SpecialtyMalt3Color',
       'SpecialtyMalt3Percentage','hop1amount','hop1time','hop1ibu','hop1percent','hop1alpha','Attenuation', 'LowTemp', 'HighTemp']
cat_highna = ['hop2name', 'hop2type', 'hop2timing', 'hop3name',
       'hop3type', 'hop3timing', 'hop4name', 'hop4type', 'hop4timing',
       'hop5name', 'hop5type', 'hop5timing', 'Adjunct1Amount', 'Adjunct1Name', 'Adjunct1Type',
       'Adjunct1Timing', 'Adjunct2Amount', 'Adjunct2Name', 'Adjunct2Type',
       'Adjunct2Timing', 'Adjunct3Amount', 'Adjunct3Name', 'Adjunct3Type',
       'Adjunct3Timing', 'Adjunct4Amount', 'Adjunct4Name', 'Adjunct4Type',
       'Adjunct4Timing', 'Adjunct5Amount', 'Adjunct5Name', 'Adjunct5Type',
       'Adjunct5Timing', 'Adjunct1Unit', 'Adjunct2Unit', 'Adjunct3Unit',
       'Adjunct4Unit', 'Adjunct5Unit']
cat_lowna = ['Batch_Style', 'Base Malt', 'SpecialtyMalt1Name',
       'SpecialtyMalt2Name', 'SpecialtyMalt3Name', 'hop1name', 'hop1type',
       'hop1timing', 'YeastStrain', 'Flocculation',
       'Starter?', 'Flag']
target = ['Category']

In [62]:
#handles numerical values, imputing the mean and scaling (standard instead of minmax since it handles outliers better)
num_pipeline_lowna = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan,strategy="mean")),
        ('standardizer', RobustScaler(with_centering=False))
    ])
num_pipeline_highna = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan,strategy="constant",fill_value=0)),
        ('standardizer', RobustScaler(with_centering=False))
    ])
#handles categorical values, imputing the most frequent and onehot encoding
cat_pipeline_lowna = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan,strategy="most_frequent")),
        ('encoder', CountEncoder(handle_missing='return_nan',min_group_size=.01,combine_min_nan_groups=True))        
    ])
cat_pipeline_highna = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy="constant", fill_value="most_frequent")),
        ('encoder', CountEncoder(handle_missing='return_nan',min_group_size=0.01,combine_min_nan_groups=True))        
    ])
#pulls together two pipelines
pre_pipeline = ColumnTransformer(transformers=[
        ("num_lowna", num_pipeline_lowna, num_lowna),
        ("num_highna", num_pipeline_highna,num_highna),
        ("cat_lowna", cat_pipeline_lowna, cat_lowna),
        ("cat_highna", cat_pipeline_highna, cat_highna),
    ])

In [63]:
beer_prepared = pre_pipeline.fit_transform(X,y=None)

## Baseline Models

In [64]:
len(y)

70057

In [65]:
#label encodes the target variable
le = LabelEncoder()
label_encoder = le.fit(y)
y = label_encoder.transform(y)

In [66]:
#trying out a few base models before we get to hyperparameter tuning
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(beer_prepared, y)

LinearRegression()

In [67]:
#evaluation
from sklearn.metrics import mean_squared_error
style_predictions = lin_reg.predict(beer_prepared)
lin_mse = mean_squared_error(y, style_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

45.679339335426235

In [68]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(beer_prepared, y)

DecisionTreeRegressor()

In [69]:
style_predictions = tree_reg.predict(beer_prepared)
tree_mse = mean_squared_error(y, style_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.22149503629031086

In [70]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, beer_prepared, y,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [71]:
#function to display scores of various models using cross validation
def display_scores(scores):
     print("Scores:", scores)
     print("Mean:", scores.mean())
     print("Standard deviation:", scores.std())
display_scores(-scores)
#this one is massively overfit

Scores: [3102.95849986 3101.64130745 3135.84170711 3082.49657436 3137.79346275
 3060.6427348  3157.6328861  3217.63286938 3040.40885082 3143.33747323]
Mean: 3118.0386365856893
Standard deviation: 48.90208963402913


In [72]:
lin_scores = cross_val_score(lin_reg, beer_prepared, y,
                              scoring="neg_mean_squared_error", cv=10)

lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)
#not great but not as overfit as the decision tree

Scores: [45.14907883 45.90436669 45.74295962 46.43475091 45.93237839 46.28127798
 45.56010737 57.49578184 45.32224708 45.73563177]
Mean: 46.955858049424464
Standard deviation: 3.5328611612597003


In [73]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(beer_prepared, y)

RandomForestRegressor()

In [74]:
forest_scores = cross_val_score(forest_reg, beer_prepared, y,
                              scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [38.55063278 38.93790681 39.04571738 39.03800903 39.60860038 39.23259552
 39.31792234 39.16232581 38.31573292 39.15573902]
Mean: 39.0365182004156
Standard deviation: 0.35219041651964755


In [75]:
#save the base models for now in case I need em
import joblib
joblib.dump(lin_reg, "base_lin_reg.pkl")
joblib.dump(tree_reg, "base_decision_tree.pkl")
joblib.dump(forest_reg, "base_random_forest.pkl")

['base_random_forest.pkl']

## Fine Tuning Models

In [76]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [20, 40, 60, 80]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [20, 30, 40]},
  ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(beer_prepared, y)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [20, 40, 60, 80],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [20, 30, 40],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [77]:
grid_search.best_estimator_

RandomForestRegressor(max_features=80, n_estimators=30)

In [78]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([0.01129865, 0.02393912, 0.0217164 , 0.04209988, 0.06725438,
       0.06782862, 0.01359407, 0.00802323, 0.01583586, 0.01981394,
       0.0125456 , 0.01092339, 0.01801823, 0.02884332, 0.00992036,
       0.01100057, 0.0146859 , 0.01540337, 0.0084272 , 0.00825411,
       0.01132119, 0.01252496, 0.01176747, 0.00778917, 0.02191702,
       0.01823223, 0.03617183, 0.01953898, 0.01694921, 0.01621279,
       0.00555174, 0.02427809, 0.12160741, 0.0018533 , 0.00252076,
       0.00767189, 0.00142348, 0.00268808, 0.00114114, 0.0011023 ,
       0.00163192, 0.00082827, 0.00089569, 0.00066122, 0.00054225,
       0.00088379, 0.00042221, 0.00070634, 0.00045296, 0.00032163,
       0.00068141, 0.00164373, 0.00121495, 0.00139566, 0.00057512,
       0.00054608, 0.0033616 , 0.01922439, 0.01656111, 0.01338476,
       0.01031598, 0.01869007, 0.00217378, 0.00128852, 0.0672018 ,
       0.01840479, 0.00209656, 0.00092139, 0.00372015, 0.00065366,
       0.0095333 , 0.00155102, 0.0004307 , 0.00058883, 0.00110

In [83]:
#match up feature importances with actual names of features
cat_lowna_encoder = pre_pipeline.named_transformers_["cat_lowna"]
cat_lowna_attribs = list(cat_pipeline_lowna.get_feature_names())
cat_higna_encoder = pre_pipeline.named_transformers_["cat_highna"]
cat_highna_attribs = list(cat_highna_encoder.categories_[0])
attributes = num_highna + num_lowna + cat_lowna_attribs + cat_highna_attribs
sorted(zip(feature_importances, attributes), reverse=True)

AttributeError: 'Pipeline' object has no attribute 'get_feature_names'