## Conversion of Units and Preprocessing

In [77]:
#Data handling
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler
import category_encoders as ce
from category_encoders.target_encoder import TargetEncoder
# Classification
import sklearn.linear_model

# Dimensionality reduction
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Visualization
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d.axes3d as p3
from matplotlib import animation
%matplotlib inline
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone


In [78]:
#read in data and drop index
beer_df = pd.read_csv('beer_data_9-13_for_classification.csv')
beer_df.info()
beer_df.head()
beer_df.drop(['Unnamed: 0'],axis=1)

FileNotFoundError: [Errno 2] No such file or directory: 'beer_data_9-13_for_classification.csv'

## Train/Test/Validation Split

In [12]:
#Doing it at this time helps to avoid overfitting or picking the wrong architecture based on bias
#function for test set and validation set creation:
def split_train_val_test(beer_df,validation_ratio, test_ratio):
    np.random.seed(33)
    shuffled_indices = np.random.permutation(len(beer_df))  #shuffles the dataset
    validation_set_size = int(len(beer_df) * validation_ratio) #calculates validation set size based on ratio   
    test_set_size = int(len(beer_df) * test_ratio) #calculates test size based on ratio
    val_indices = shuffled_indices[:validation_set_size]
    test_indices = shuffled_indices[:test_set_size] #selects test set and from incdices
    train_indices = shuffled_indices[(test_set_size+validation_set_size):] #assigns the rest to training
    return beer_df.iloc[train_indices], beer_df.iloc[val_indices], beer_df.iloc[test_indices] #returns two different dfs for test and train
    
#Using the function
train_set, validation_set, test_set = split_train_val_test(beer_df, 0.15,0.10) #75% used for training

In [13]:
#check that it worked
print(len(train_set), len(validation_set),len(test_set))

70105 14020 9347


In [14]:
#save to csv
validation_set.to_csv('beer_data_val.csv')
train_set.to_csv('beer_data_train.csv')
test_set.to_csv('beer_data_test.csv')

In [16]:
#imports for the pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,normalize, Normalizer,LabelEncoder, OrdinalEncoder, OneHotEncoder, RobustScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.compose import ColumnTransformer
import category_encoders as ce
from category_encoders.binary import BinaryEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder
from category_encoders.helmert import HelmertEncoder

In [46]:
#encoding is the same as it was for clustering, except the target variables (Category) will be label encoded this time
X = train_set.drop(['Unnamed: 0','Category'],axis=1)
y = train_set["Category"].copy()
#variables with over 50% NaN/Missing values are categorized as "highna"
num_highna=['mashph','hop2amount','hop2alpha','hop2time','hop2ibu',
            'hop2percent','hop3amount','hop3alpha','hop3time',
            'hop3ibu','hop3percent','hop4amount','hop4alpha',
            'hop4time','hop4ibu','hop4percent','hop5amount',
            'hop5alpha','hop5time','hop5ibu','hop5percent',
            'Adjunct1Num','Adjunct2Num','Adjunct3Num','Adjunct4Num','Adjunct5Num']
num_lowna = ['Batch_size_liters', 'og', 'fg', 'abv', 'ibu', 'color_levibonds',
       'Base Malt Amount', 'BasePPG', 'BaseColor', 'BasePercentage',
       'SpecialtyMalt1Amount', 'SpecialtyMalt1PPG', 'SpecialtyMalt1Color',
       'SpecialtyMalt1Percentage', 'SpecialtyMalt2Amount', 'SpecialtyMalt2PPG',
       'SpecialtyMalt2Color', 'SpecialtyMalt2Percentage',
       'SpecialtyMalt3Amount', 'SpecialtyMalt3PPG', 'SpecialtyMalt3Color',
       'SpecialtyMalt3Percentage','hop1amount','hop1time','hop1ibu','hop1percent','hop1alpha','Attenuation', 'LowTemp', 'HighTemp']
cat_highna = ['hop2name', 'hop2type', 'hop2timing', 'hop3name',
       'hop3type', 'hop3timing', 'hop4name', 'hop4type', 'hop4timing',
       'hop5name', 'hop5type', 'hop5timing', 'Adjunct1Amount', 'Adjunct1Name', 'Adjunct1Type',
       'Adjunct1Timing', 'Adjunct2Amount', 'Adjunct2Name', 'Adjunct2Type',
       'Adjunct2Timing', 'Adjunct3Amount', 'Adjunct3Name', 'Adjunct3Type',
       'Adjunct3Timing', 'Adjunct4Amount', 'Adjunct4Name', 'Adjunct4Type',
       'Adjunct4Timing', 'Adjunct5Amount', 'Adjunct5Name', 'Adjunct5Type',
       'Adjunct5Timing', 'Adjunct1Unit', 'Adjunct2Unit', 'Adjunct3Unit',
       'Adjunct4Unit', 'Adjunct5Unit']
cat_lowna = ['Batch_Style', 'Base Malt', 'SpecialtyMalt1Name',
       'SpecialtyMalt2Name', 'SpecialtyMalt3Name', 'hop1name', 'hop1type',
       'hop1timing', 'YeastStrain', 'Flocculation',
       'Starter?', 'Flag']
target = ['Category']

In [51]:
#handles numerical values, imputing the mean and scaling (standard instead of minmax since it handles outliers better)
num_pipeline_lowna = Pipeline(steps=[
        ('imputer', KNNImputer(weights='distance')),
        ('standardizer', RobustScaler())
    ])
num_pipeline_highna = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy="constant",fill_value=0)),
        ('standardizer', RobustScaler())
    ])
#handles categorical values, imputing the most frequent and onehot encoding
cat_pipeline_lowna = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('encoder', OneHotEncoder())        
    ])
cat_pipeline_highna = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy="constant", fill_value=0)),
        ('encoder', OneHotEncoder())
    ])
#pulls together two pipelines
pre_pipeline = ColumnTransformer(transformers=[
        ("num_lowna", num_pipeline_lowna, num_lowna),
        ("num_highna", num_pipeline_highna,num_highna),
        ("cat_lowna", cat_pipeline_lowna, cat_lowna),
        ("cat_highna", cat_pipeline_highna, cat_highna),
    ])

In [52]:
beer_prepared = pre_pipeline.fit_transform(X)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


## Baseline Models

In [57]:
#label encodes the target variable
le = LabelEncoder()
label_encoder = le.fit(y)
y = label_encoder.transform(y)

In [59]:
#trying out a few base models before we get to hyperparameter tuning
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(beer_prepared, y)

LinearRegression()

In [60]:
#evaluation
from sklearn.metrics import mean_squared_error
style_predictions = lin_reg.predict(beer_prepared)
lin_mse = mean_squared_error(y, style_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

43.987678565218786

In [61]:
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor()
tree_reg.fit(beer_prepared, y)

DecisionTreeRegressor()

In [62]:
style_predictions = tree_reg.predict(beer_prepared)
tree_mse = mean_squared_error(y, style_predictions)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

0.3167573384603923

In [63]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, beer_prepared, y,
                         scoring="neg_mean_squared_error", cv=10)
tree_rmse_scores = np.sqrt(-scores)

In [75]:
#function to display scores of various models using cross validation
def display_scores(scores):
     print("Scores:", scores)
     print("Mean:", scores.mean())
     print("Standard deviation:", scores.std())
display_scores(-scores)
#this one is massively overfit

Scores: [2876.51219512 3050.17861218 2878.86924119 3087.35572672 3065.09000143
 2941.30263909 2926.54197575 2989.0125535  2995.37788873 2980.76733238]
Mean: 2979.1008166080364
Standard deviation: 70.20912285195098


In [68]:
lin_scores = cross_val_score(lin_reg, beer_prepared, y,
                              scoring="neg_mean_squared_error", cv=10)

lin_rmse_scores = np.sqrt(-lin_scores)
display_scores(lin_rmse_scores)
#not great but not as overfit as the decision tree

Scores: [43.96113548 44.40221072 44.02796973 44.53668452 44.33212676 44.81514152
 44.02195562 44.43758537 43.86106308 44.47989823]
Mean: 44.28757710244766
Standard deviation: 0.2900729655918983


In [70]:
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(beer_prepared, y)

RandomForestRegressor()

In [74]:
forest_scores = cross_val_score(forest_reg, beer_prepared, y,
                              scoring="neg_mean_squared_error", cv=10)
forest_rmse_scores = np.sqrt(-forest_scores)
display_scores(forest_rmse_scores)

Scores: [38.13264348 38.61054983 37.99524247 38.95985994 38.37499595 38.52207717
 37.88220666 37.95206306 38.03420559 38.16596019]
Mean: 38.26298043475292
Standard deviation: 0.3283602600797024


In [76]:
#save the base models for now in case I need em
import joblib
joblib.dump(lin_reg, "base_lin_reg.pkl")
joblib.dump(tree_reg, "base_decision_tree.pkl")
joblib.dump(forest_reg, "base_random_forest.pkl")

['base_random_forest.pkl']

## Fine Tuning Models

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [20, 40, 60, 80]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [20, 30, 40]},
  ]

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error',
                           return_train_score=True)

grid_search.fit(beer_prepared, y)

In [None]:
grid_search.best_estimator_

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_
feature_importances

In [None]:
#match up feature importances with actual names of features
cat_lowna_encoder = pre_pipeline.named_transformers_["cat_lowna"]
cat_lowna_attribs = list(cat_lowna_encoder.categories_[0])
cat_higna_encoder = pre_pipeline.named_transformers_["cat_highna"]
cat_highna_attribs = list(cat_highna_encoder.categories_[0])
attributes = num_highna + num_lowna + cat_lowna_attribs + cat_highna_attribs
sorted(zip(feature_importances, attributes), reverse=True)