In [16]:
# Data manipulation & vizualisation tools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import RobustScaler, MinMaxScaler, Normalizer, StandardScaler, OneHotEncoder
from sklearn.model_selection import RepeatedKFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score




In [47]:
cookies = pd.read_excel('cookies_comp_students.xlsx')

In [48]:
cookies.dropna(inplace=True)
cookies.reset_index(drop=True, inplace=True)

In [49]:
cookies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5181 entries, 0 to 5180
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   sugar to flour ratio  5181 non-null   float64
 1   sugar index           5181 non-null   float64
 2   bake temp             5181 non-null   int64  
 3   chill time            5181 non-null   float64
 4   calories              5181 non-null   float64
 5   density               5181 non-null   float64
 6   pH                    5181 non-null   float64
 7   grams baking soda     5181 non-null   float64
 8   bake time             5181 non-null   float64
 9   quality               5181 non-null   int64  
 10  butter type           5181 non-null   object 
 11  weight                5181 non-null   float64
 12  diameter              5181 non-null   int64  
 13  mixins                5181 non-null   object 
 14  crunch factor         5181 non-null   float64
 15  aesthetic appeal     

In [50]:
y = cookies['quality']
X = cookies.drop(['quality'], axis=1)

In [51]:
X_cat = X.select_dtypes(object)
X_num = X.select_dtypes(np.number)

In [52]:
encoder = OneHotEncoder(drop='first').fit(X_cat)
X_cat_encoded = encoder.transform(X_cat).toarray()
X_encoded_cols = encoder.get_feature_names_out(X_cat.columns) # recovering names
X_cat_en_df = pd.DataFrame(X_cat_encoded, columns=X_encoded_cols)

In [53]:
X_cat_en_df.shape

(5181, 13)

In [54]:
X_num.shape

(5181, 13)

In [55]:
X_full = pd.concat([X_num, X_cat_en_df], axis=1)

In [56]:
X_full.shape

(5181, 26)

In [45]:
X_full.reset_index(drop=True, inplace=True)

In [57]:
X_train,X_test,y_train,y_test = train_test_split(X_full, y, test_size=0.3, random_state=42)

In [58]:
rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)

RandomForestRegressor()

In [59]:
cross_val_scores = cross_val_score(rfr, X_train, y_train)
cross_val_scores

array([0.74888032, 0.73497518, 0.75993596, 0.7523594 , 0.76807101])

In [75]:
feature_importances = rfr.feature_importances_
feature_names = X_full.columns
forest_importances = pd.Series(feature_importances, index=feature_names)
forest_importances.sort_values(ascending=False).head(26)

ValueError: Length of values (12) does not match length of index (26)

In [85]:
best_model = ()
for i in range(1, 26):
    a = forest_importances.sort_values(ascending=False).head(i).index
    X_full_1 = X_full[a]
    X_train,X_test,y_train,y_test = train_test_split(X_full_1, y, test_size=0.3, random_state=42)
    rfr = RandomForestRegressor()
    rfr.fit(X_train, y_train)
    cross_val_scores = cross_val_score(rfr, X_train, y_train)
    if len(best_model) == 0:
        best_model = (i, cross_val_scores.mean(), cross_val_scores.std())
    else:
        if cross_val_scores.mean() > best_model[1]:
            best_model = (i, cross_val_scores.mean(), cross_val_scores.std())

In [86]:
best_model

(18, 0.7537652341285106, 0.012501854743379875)

In [89]:
select = forest_importances.sort_values(ascending=False).head(18).index

In [90]:
X_full_select = X_full[select]

In [91]:
X_full_select.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5181 entries, 0 to 5180
Data columns (total 18 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   butter type_melted            5181 non-null   float64
 1   bake time                     5181 non-null   float64
 2   chill time                    5181 non-null   float64
 3   grams baking soda             5181 non-null   float64
 4   calories                      5181 non-null   float64
 5   pH                            5181 non-null   float64
 6   sugar index                   5181 non-null   float64
 7   bake temp                     5181 non-null   int64  
 8   sugar to flour ratio          5181 non-null   float64
 9   density                       5181 non-null   float64
 10  crunch factor                 5181 non-null   float64
 11  weight                        5181 non-null   float64
 12  mixins_chocolate, oats        5181 non-null   float64
 13  mix

In [92]:
params = {'bootstrap': True,
 'max_depth': 70,
 'max_features': 'auto',
 'min_samples_leaf': 4,
 'min_samples_split': 10,
 'n_estimators': 400}

In [98]:
X_train,X_test,y_train,y_test = train_test_split(X_full_select, y, test_size=0.3, random_state=42)
rfr = RandomForestRegressor(bootstrap= True,
 max_depth= 2,
 max_features= 'auto',
 min_samples_leaf= 4,
 min_samples_split= 10,
 n_estimators= 400)
rfr.fit(X_train, y_train)
cross_val_scores = cross_val_score(rfr, X_train, y_train)
cross_val_scores.mean()

0.6371656541734378

In [100]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

In [102]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(estimator=rfr, param_grid=param_grid, 
                      cv = 3, n_jobs = -1, verbose = 2)

In [105]:
search.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=100; total time=   0.5s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=300; total time=   1.4s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=8, n_estimators=1000; total time=   4.6s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=10, n_estimators=300; total time=   1.4s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=10, n_estimators=1000; total time=   4.5s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   4.3s
[CV] END bootstrap=True, max_depth=80, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   0.9s


[CV] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=300; total time=   1.9s
[CV] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.8s
[CV] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=100; total time=   0.6s
[CV] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   1.2s
[CV] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   1.7s
[CV] END bootstrap=True, max_depth=90, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=200; total time=   1.5s
[CV] END bootstrap=True, max_depth=90, max_features=2, min_samples_l

[CV] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=3, min_samples_split=10, n_estimators=300; total time=   1.7s
[CV] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=3, min_samples_split=10, n_estimators=1000; total time=   6.8s
[CV] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   6.5s
[CV] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=300; total time=   1.9s
[CV] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time=   5.9s
[CV] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=   1.8s
[CV] END bootstrap=True, max_depth=90, max_features=3, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   6.0s
[CV] END bootstrap=True, max_depth=90, max_features=3, min_s

[CV] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=3, min_samples_split=10, n_estimators=300; total time=   1.7s
[CV] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=3, min_samples_split=10, n_estimators=1000; total time=   5.7s
[CV] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=3, min_samples_split=12, n_estimators=1000; total time=   6.9s
[CV] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=300; total time=   1.5s
[CV] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=4, min_samples_split=8, n_estimators=1000; total time=   8.1s
[CV] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=4, min_samples_split=10, n_estimators=300; total time=   1.9s
[CV] END bootstrap=True, max_depth=100, max_features=2, min_samples_leaf=4, min_samples_split=10, n_estimators=1000; total time=   5.4s
[CV] END bootstrap=True, max_depth=100, max_features=

GridSearchCV(cv=3,
             estimator=RandomForestRegressor(max_depth=2, min_samples_leaf=4,
                                             min_samples_split=10,
                                             n_estimators=400),
             n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [80, 90, 100, 110],
                         'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             verbose=2)

In [106]:
search.best_params_

{'bootstrap': True,
 'max_depth': 90,
 'max_features': 3,
 'min_samples_leaf': 3,
 'min_samples_split': 8,
 'n_estimators': 1000}

In [115]:
X_train,X_test,y_train,y_test = train_test_split(X_full_select, y, test_size=0.3, random_state=42)
rfr = RandomForestRegressor(bootstrap= True,
 max_depth= 90,
 max_features= 'auto',
 min_samples_leaf= 2,
 min_samples_split= 3,
 n_estimators= 1000)
rfr.fit(X_train, y_train)
cross_val_scores = cross_val_score(rfr, X_train, y_train)
cross_val_scores.mean()

0.7526462338156236

In [111]:
cookies_test = pd.read_excel('cookies_comp_students.xlsx', 'cookies_test')

In [118]:
cookies_test_num = cookies_test.select_dtypes(np.number)
cookies_test_cat = cookies_test.select_dtypes(object)

In [119]:
cookies_test_cat_en = encoder.transform(cookies_test_cat).toarray()
X_encoded_cols = encoder.get_feature_names_out(cookies_test_cat.columns) # recovering names
X_cat_en_df = pd.DataFrame(cookies_test_cat_en, columns=X_encoded_cols)

In [120]:
cookies_full = pd.concat([cookies_test_num, X_cat_en_df], axis=1)

In [121]:
cookies_full.drop(['quality'], inplace=True, axis=1)
cookies_full = cookies_full[select]

In [126]:
preds = rfr.predict(cookies_full)

In [128]:
np.savetxt("preds.csv", preds, delimiter=",")