In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.linear_model      import LinearRegression
from sklearn.model_selection   import train_test_split
from sklearn.impute            import SimpleImputer
from sklearn.preprocessing     import FunctionTransformer
from sklearn.preprocessing     import OneHotEncoder
from sklearn.preprocessing     import OrdinalEncoder
from sklearn.compose           import ColumnTransformer
from sklearn.preprocessing     import StandardScaler
from sklearn.pipeline          import Pipeline
from sklearn.decomposition     import PCA
from sklearn.tree              import DecisionTreeRegressor
from sklearn.neighbors         import KNeighborsRegressor
from sklearn.neighbors         import KNeighborsClassifier
from sklearn.model_selection   import RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, SelectFromModel, RFE
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import RidgeCV

In [2]:
# Load the training data and the separate validation file (paths are relative
# to the notebook's working directory).
cookies, cookies_val = map(pd.read_csv, ("cookies.csv", "cookies_validate.csv"))

In [3]:
# Remove the two columns that are not used as features, from both frames.
cookies.drop(["aesthetic appeal", "diameter"], axis="columns", inplace=True)
cookies_val.drop(["aesthetic appeal", "diameter"], axis="columns", inplace=True)

In [4]:
# Expand the comma-separated `mixins` text column into one 0/1 indicator column
# per known mix-in, then drop the original text column.
mixins = ['chocolate', 'raisins', 'oats', 'nuts', 'peanut butter']

# Missing entries are treated as "no mix-ins". The filled values are used
# directly instead of calling `fillna(..., inplace=True)` on a column selection,
# which does not reliably write back to the frame (and leaves NaN that breaks
# the string split).
mixin_sets = (
    cookies['mixins']
    .fillna('')
    .str.split(',')
    .map(lambda parts: {part.strip() for part in parts})
)

# One vectorised pass per mix-in; this does not depend on the frame having a
# default 0..n-1 index. int64 keeps the indicators among the numeric columns.
for mix in mixins:
    cookies[mix] = mixin_sets.map(lambda present: mix in present).astype('int64')

cookies = cookies.drop(columns=["mixins"])

In [5]:
# Expand the comma-separated `mixins` text column of the validation frame into
# one 0/1 indicator column per known mix-in, then drop the original text column.
mixins = ['chocolate', 'raisins', 'oats', 'nuts', 'peanut butter']

# Missing entries are treated as "no mix-ins". The filled values are used
# directly instead of calling `fillna(..., inplace=True)` on a column selection,
# which does not reliably write back to the frame (and leaves NaN that breaks
# the string split).
mixin_sets_val = (
    cookies_val['mixins']
    .fillna('')
    .str.split(',')
    .map(lambda parts: {part.strip() for part in parts})
)

# One vectorised pass per mix-in; this does not depend on the frame having a
# default 0..n-1 index. int64 keeps the indicators among the numeric columns.
for mix in mixins:
    cookies_val[mix] = mixin_sets_val.map(lambda present: mix in present).astype('int64')

cookies_val = cookies_val.drop(columns=["mixins"])

In [6]:
# Separate the target column (`quality`) from the features in both data sets.
y, X = cookies["quality"], cookies.drop(columns="quality")
y_val, X_val = cookies_val["quality"], cookies_val.drop(columns="quality")

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)
# The validation file is split the same way.
X_train_val, X_test_val, y_train_val, y_test_val = train_test_split(
    X_val, y_val, test_size=0.2, random_state=1
)

In [7]:
# Partition the training feature columns by dtype: int64/float64 columns are
# treated as numeric, everything else as categorical.
num_cols = X_train.select_dtypes(["int64", "float64"]).columns
cat_cols = X_train.select_dtypes(include=None, exclude=["int64", "float64"]).columns

In [8]:
def fill_numeric_nulls(df):
    """Return `df` with missing values replaced by per-column medians.

    The medians are computed from `df` itself on every call, so this helper is
    only suitable for one-off imputation of a single data set. It is kept for
    backward compatibility and is no longer used as the pipeline step.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Numeric data that may contain missing values.

    Returns
    -------
    The imputed data as returned by `SimpleImputer.fit_transform`.
    """
    imputer = SimpleImputer(strategy="median")
    return imputer.fit_transform(df)


# Use the estimator itself as the pipeline step: it learns the medians during
# `fit` and re-applies those same medians in `transform`. Wrapping
# `fill_numeric_nulls` in a (stateless) FunctionTransformer refit the imputer on
# whatever data was passed to `transform`, so held-out data was imputed with its
# own medians rather than the training medians.
numeric_nulls_imputer = SimpleImputer(strategy="median")

In [9]:
# One-hot encode `butter type`; categories not seen during fit are encoded as
# all zeros instead of raising.
categ_encode_pipeline = ColumnTransformer(
    transformers=[
        ("onehot_enc", OneHotEncoder(handle_unknown="ignore"), ["butter type"]),
    ]
)

In [10]:
# Route numeric columns through the null imputer and the non-numeric columns
# through the categorical encoder.
intermediate_pipeline = ColumnTransformer(
    transformers=[
        ("num_nulls_imp", numeric_nulls_imputer, num_cols),
        ("cat_fill_pipe", categ_encode_pipeline, cat_cols),
    ]
)

In [11]:
# Full preprocessing: impute/encode the columns, then standardise every
# resulting feature.
process_pipeline = Pipeline(
    steps=[
        ("intermediate_pipe", intermediate_pipeline),
        ("standard_scal", StandardScaler()),
    ]
)

In [17]:
# Drop the `id` column from the validation features. Rebinding with
# errors="ignore" keeps this cell safe to re-run; the in-place drop raised
# KeyError on a second execution.
# NOTE(review): X_train_val / X_test_val were split from X_val before this
# point, so they still carry `id` — confirm whether that is intended.
X_val = X_val.drop(columns=["id"], errors="ignore")

In [18]:
# Summary statistics for the training features.
# NOTE(review): in the output below, `calories` and `weight` have a minimum of
# -99, which looks like a missing-value sentinel rather than a real measurement;
# `sugar index` and `bake time` have fewer non-null rows than the other columns;
# and the maxima of `density` (5.0), `pH` (25.0) and `bake temp` (6110) sit far
# above their 75th percentiles — confirm how these should be handled.
X.describe()

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,weight,crunch factor,chocolate,raisins,oats,nuts,peanut butter
count,5198.0,5193.0,5198.0,5198.0,5198.0,5198.0,5198.0,5198.0,5188.0,5198.0,5198.0,5198.0,5198.0,5198.0,5198.0,5198.0
mean,0.318049,5.402465,559.638322,30.390246,115.015294,0.995819,8.22202,0.530864,10.494758,14.381935,1.499367,0.675644,0.299731,0.227203,0.241631,0.013467
std,0.150036,4.668342,353.274062,17.268403,56.506171,0.062193,0.283323,0.150886,1.194584,3.02374,0.289205,0.468179,0.458184,0.419065,0.428113,0.115273
min,0.0,0.6,90.0,0.0,-99.0,0.98711,7.72,0.22,8.0,-99.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,0.24,1.8,380.0,17.0,76.0,0.9923,8.11,0.43,9.5,12.8,1.25,0.0,0.0,0.0,0.0,0.0
50%,0.31,3.0,470.0,29.0,118.0,0.9948,8.21,0.505,10.3,14.0,1.5,1.0,0.0,0.0,0.0,0.0
75%,0.39,8.0,640.0,41.0,155.0,0.996907,8.32,0.6,11.3,15.4,1.75,1.0,1.0,0.0,0.0,0.0
max,3.0,31.6,6110.0,146.5,366.5,5.0,25.0,2.0,14.9,31.8,2.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# Summary statistics for the validation features (after `id` was dropped).
# In the output below every column has the same non-null count and none of the
# minima is -99.
X_val.describe()

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,weight,crunch factor,chocolate,raisins,oats,nuts,peanut butter
count,779.0,779.0,779.0,779.0,779.0,779.0,779.0,779.0,779.0,779.0,779.0,779.0,779.0,779.0,779.0,779.0
mean,0.322555,6.384531,480.42362,34.296534,129.188062,0.994293,8.196264,0.513504,10.614925,14.015404,1.49113,0.695764,0.293967,0.189987,0.154044,0.006418
std,0.120214,5.520385,225.928988,20.170179,50.67934,0.003502,0.153041,0.131211,1.234559,2.269722,0.282746,0.460379,0.45587,0.392543,0.361223,0.079909
min,0.0,0.6,120.0,2.0,8.0,0.98746,7.79,0.23,8.5,9.4,1.0,0.0,0.0,0.0,0.0,0.0
25%,0.26,1.9,360.0,21.0,98.0,0.99171,8.09,0.42,9.5,12.6,1.255,0.0,0.0,0.0,0.0,0.0
50%,0.31,4.8,430.0,33.0,128.0,0.9942,8.18,0.5,10.5,13.6,1.49,1.0,0.0,0.0,0.0,0.0
75%,0.37,10.0,520.0,45.0,162.0,0.996645,8.29,0.58,11.5,14.8,1.73,1.0,1.0,0.0,0.0,0.0
max,1.0,65.8,2140.0,289.0,440.0,1.03898,8.76,1.18,14.2,27.6,2.0,1.0,1.0,1.0,1.0,1.0
