## Conversion of Units and Preprocessing

In [1]:
#Data handling
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
import category_encoders as ce
from category_encoders.target_encoder import TargetEncoder
# Classification
import sklearn.linear_model

# Dimensionality reduction
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Visualization
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
from matplotlib import animation
%matplotlib inline
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone


  import pandas.util.testing as tm


In [2]:
#read in data and drop index
beer_df = pd.read_csv('beer_df_for_classification.csv',index_col=[0])
beer_df.info()
beer_df.head()
beer_df.dropna(subset=['Category'])

  interactivity=interactivity, compiler=compiler, result=result)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 96540 entries, 0 to 96539
Columns: 106 entries, Batch_Style to Flag
dtypes: float64(55), int64(1), object(50)
memory usage: 78.8+ MB


Unnamed: 0,Batch_Style,Category,Batch_size_liters,og,fg,abv,ibu,color_levibonds,mashph,Base Malt Amount,...,Adjunct1Unit,Adjunct2Num,Adjunct2Unit,Adjunct3Num,Adjunct3Unit,Adjunct4Num,Adjunct4Unit,Adjunct5Num,Adjunct5Unit,Flag
0,All Grain,Standard/Ordinary Bitter,480.0,1.041,1.008,4.31,25.98,3.00,,75.000,...,,,,,,,,,,Metric
1,All Grain,Belgian Dubbel,1800.0,1.117,1.027,11.83,13.47,9.20,5.35,300.000,...,g,,,,,,,,,Metric
2,Partial Mash,Robust Porter,20.8,1.077,1.015,8.11,14.06,25.81,,0.454,...,,,,,,,,,,Imperial
3,All Grain,American IPA,200.0,1.064,1.012,6.76,33.79,13.53,,43.000,...,,,,,,,,,,Metric
4,All Grain,Blonde Ale,1589.9,1.053,1.012,5.28,17.47,4.08,,249.476,...,,,,,,,,,,Imperial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96535,Extract,Mixed-Fermentation Sour Beer,20.8,1.050,1.011,5.11,9.59,5.62,,2.268,...,,,,,,,,,,Imperial
96536,All Grain,Weissbier,19.9,1.051,1.010,5.46,1.30,3.67,5.35,2.268,...,tsp,,,,,,,,,Imperial
96537,BIAB,Berliner Weisse,20.8,1.053,1.013,5.24,6.83,3.81,,2.722,...,,,,,,,,,,Imperial
96538,BIAB,Weissbier,50.0,1.051,1.013,5.06,6.49,3.22,,4.000,...,,,,,,,,,,Metric


## Train/Test/Validation Split

In [3]:
#Doing it at this time helps to avoid overfitting or picking the wrong architecture based on bias
#function for test set and validation set creation:
def split_train_val_test(beer_df,validation_ratio, test_ratio):
    np.random.seed(33)
    shuffled_indices = np.random.permutation(len(beer_df))  #shuffles the dataset
    validation_set_size = int(len(beer_df) * validation_ratio) #calculates validation set size based on ratio   
    test_set_size = int(len(beer_df) * test_ratio) #calculates test size based on ratio
    val_indices = shuffled_indices[:validation_set_size]
    test_indices = shuffled_indices[:test_set_size] #selects test set and from incdices
    train_indices = shuffled_indices[(test_set_size+validation_set_size):] #assigns the rest to training
    return beer_df.iloc[train_indices], beer_df.iloc[val_indices], beer_df.iloc[test_indices] #returns two different dfs for test and train
    
#Using the function
train_set, validation_set, test_set = split_train_val_test(beer_df, 0.15,0.10) #75% used for training

In [4]:
#check that it worked
print(len(train_set), len(validation_set),len(test_set))

72405 14481 9654


In [36]:
#save to csv
validation_set.to_csv('beer_data_val.csv')
train_set.to_csv('beer_data_train.csv')
test_set.to_csv('beer_data_test.csv')

In [5]:
#imports for the pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,normalize, Normalizer,LabelEncoder, OrdinalEncoder, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from category_encoders.binary import BinaryEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder
from category_encoders.count import CountEncoder

In [6]:
train_set = train_set.dropna(subset=['Category'])
X = train_set.drop(['Category'],axis=1)
y = train_set["Category"].copy()
print(len(X))

70057


In [7]:
#encoding is the same as it was for clustering, except the target variables (Category) will be label encoded this time
#variables with over 50% NaN/Missing values are categorized as "highna"
num_highna=['mashph','hop2amount','hop2alpha','hop2time','hop2ibu',
            'hop2percent','hop3amount','hop3alpha','hop3time',
            'hop3ibu','hop3percent','hop4amount','hop4alpha',
            'hop4time','hop4ibu','hop4percent','hop5amount',
            'hop5alpha','hop5time','hop5ibu','hop5percent',
            'Adjunct1Num','Adjunct2Num','Adjunct3Num','Adjunct4Num','Adjunct5Num']
num_lowna = ['Batch_size_liters', 'og', 'fg', 'abv', 'ibu', 'color_levibonds',
       'Base Malt Amount', 'BasePPG', 'BaseColor', 'BasePercentage',
       'SpecialtyMalt1Amount', 'SpecialtyMalt1PPG', 'SpecialtyMalt1Color',
       'SpecialtyMalt1Percentage', 'SpecialtyMalt2Amount', 'SpecialtyMalt2PPG',
       'SpecialtyMalt2Color', 'SpecialtyMalt2Percentage',
       'SpecialtyMalt3Amount', 'SpecialtyMalt3PPG', 'SpecialtyMalt3Color',
       'SpecialtyMalt3Percentage','hop1amount','hop1time','hop1ibu','hop1percent','hop1alpha','Attenuation', 'LowTemp', 'HighTemp']
cat_highna = ['hop2name', 'hop2type', 'hop2timing', 'hop3name',
       'hop3type', 'hop3timing', 'hop4name', 'hop4type', 'hop4timing',
       'hop5name', 'hop5type', 'hop5timing', 'Adjunct1Amount', 'Adjunct1Name', 'Adjunct1Type',
       'Adjunct1Timing', 'Adjunct2Amount', 'Adjunct2Name', 'Adjunct2Type',
       'Adjunct2Timing', 'Adjunct3Amount', 'Adjunct3Name', 'Adjunct3Type',
       'Adjunct3Timing', 'Adjunct4Amount', 'Adjunct4Name', 'Adjunct4Type',
       'Adjunct4Timing', 'Adjunct5Amount', 'Adjunct5Name', 'Adjunct5Type',
       'Adjunct5Timing', 'Adjunct1Unit', 'Adjunct2Unit', 'Adjunct3Unit',
       'Adjunct4Unit', 'Adjunct5Unit']
cat_lowna = ['Batch_Style', 'Base Malt', 'SpecialtyMalt1Name',
       'SpecialtyMalt2Name', 'SpecialtyMalt3Name', 'hop1name', 'hop1type',
       'hop1timing', 'YeastStrain', 'Flocculation',
       'Starter?', 'Flag']
target = ['Category']

In [26]:
#handles numerical values, imputing the mean and scaling (standard instead of minmax since it handles outliers better)
num_pipeline_lowna = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan,strategy="mean")),
        ('standardizer', RobustScaler())
    ])
num_pipeline_highna = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan,strategy="constant",fill_value=0)),
        ('standardizer', RobustScaler())
    ])
#handles categorical values, imputing the most frequent and onehot encoding
cat_pipeline_lowna = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan,strategy="most_frequent")),
        ('encoder', TargetEncoder(handle_missing='return_nan',min_samples_leaf=100,smoothing=10)),
        ('standardize', RobustScaler())
       
    ])
cat_pipeline_highna = Pipeline(steps=[
        ('imputer', SimpleImputer(missing_values=np.nan, strategy="most_frequent")),
        ('encoder', TargetEncoder(handle_missing='return_nan',min_samples_leaf=100,smoothing=10)),
        ('standardizer', RobustScaler())
        
    ])
#pulls together two pipelines
pre_pipeline = ColumnTransformer(transformers=[
        ("num_lowna", num_pipeline_lowna, num_lowna),
        ("num_highna", num_pipeline_highna,num_highna),
        ("cat_lowna", cat_pipeline_lowna, cat_lowna),
        ("cat_highna", cat_pipeline_highna, cat_highna),
    ])

In [27]:
#label encodes the target variable
le = LabelEncoder()
label_encoder = le.fit(y)
y = label_encoder.transform(y)

In [28]:
beer_prepared = pre_pipeline.fit_transform(X,y=y)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


## Baseline Models

In [29]:
beer_prepared

array([[-0.37254902, -0.5625    , -0.16666667, ...,  0.        ,
         0.        ,  0.        ],
       [-2.03921569,  0.25      ,  0.        , ...,  7.28329148,
         0.        ,  0.        ],
       [ 0.        , -0.375     ,  0.83333333, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.03921569,  1.5625    ,  0.5       , ...,  0.        ,
         0.        ,  0.        ],
       [-0.35294118, -0.3125    , -0.16666667, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.62745098,  1.5       ,  0.66666667, ...,  0.        ,
         0.        ,  0.        ]])

In [None]:
from sklearn.svm import SVC
poly_kernel_svm_clf = Pipeline([
        ("svm_clf", SVC(kernel="poly", degree=3, coef0=1, C=5))
    ])
poly_kernel_svm_clf.fit(beer_prepared, y)

In [None]:
rbf_kernel_svm_clf = Pipeline([
        ("svm_clf", SVC(kernel="rbf", gamma=5, C=0.001))
    ])
rbf_kernel_svm_clf.fit(X, y)

In [None]:
from sklearn.model_selection import GridSearchCV 
  
# defining parameter range 
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['poly']}  
  
grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 
  
# fitting the model for grid search 
grid.fit(X_train, y_train) 

In [26]:
#save the base models for now in case I need em
import joblib
joblib.dump(svm, "base_svm_reg.pkl")

['base_random_forest.pkl']

## Fine Tuning Models

In [27]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [20, 40, 60, 80]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [20, 30, 40]},
  ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(beer_prepared, y)

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid=[{'max_features': [20, 40, 60, 80],
                          'n_estimators': [3, 10, 30]},
                         {'bootstrap': [False], 'max_features': [20, 30, 40],
                          'n_estimators': [3, 10]}],
             return_train_score=True, scoring='neg_mean_squared_error')

In [28]:
grid_search.best_estimator_

RandomForestRegressor(max_features=60, n_estimators=30)

In [29]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

array([1.00767721e-02, 2.21007137e-02, 1.59470632e-02, 3.38760840e-02,
       4.19695001e-02, 5.96859210e-02, 1.30573963e-02, 6.93352137e-03,
       1.06734567e-02, 1.74899932e-02, 1.19544929e-02, 9.29486780e-03,
       1.42806515e-02, 1.66406093e-02, 9.48777140e-03, 8.84341879e-03,
       1.24467914e-02, 1.32491503e-02, 7.94976399e-03, 7.66876140e-03,
       9.66456971e-03, 1.11454432e-02, 1.15422880e-02, 6.54450449e-03,
       1.95048533e-02, 1.51624063e-02, 1.37191246e-02, 1.09407673e-02,
       1.19810584e-02, 1.10537482e-02, 5.25442552e-03, 7.21804660e-03,
       1.38889270e-02, 1.69578167e-03, 2.59992031e-03, 3.46424018e-03,
       1.26334769e-03, 2.90516567e-03, 1.00119238e-03, 1.12259822e-03,
       1.65468141e-03, 5.95748868e-04, 9.06323424e-04, 5.78888301e-04,
       5.70112805e-04, 7.85166645e-04, 5.00988737e-04, 4.53068898e-04,
       4.96892060e-04, 2.92276532e-04, 5.87375745e-04, 1.49272550e-03,
       1.22479337e-03, 1.40556499e-03, 5.63575260e-04, 6.61178583e-04,
      

In [None]:
#match up feature importances with actual names of features
cat_lowna_encoder = pre_pipeline.named_transformers_["cat_lowna"]
cat_lowna_attribs = cat_lowna.get_feature_names()
cat_higna_encoder = pre_pipeline.named_transformers_["cat_highna"]
cat_highna_attribs = cat_pipeline_highna.get_feature_names()
attributes = num_highna + num_lowna + cat_lowna_attribs + cat_highna_attribs
sorted(zip(feature_importances, attributes), reverse=True)