In [39]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import matplotlib as plt

%matplotlib inline

In [2]:
#Read csv
url = '../data/cookies_target_binned.csv'
df = pd.read_csv(url)
target = 'quality_binned'

In [3]:
#Separating train set into X and y
X = df.drop(target, axis=1)
y = df[target]

In [33]:
#Split DF into train and test (15% test):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,stratify = y, random_state=42)

In [26]:
#Grid search for parameter selection for a Random Forest Classifier model
param_grid = {
    'n_estimators': [300, 500, 1000],
    'max_features': ['auto','sqrt','log2'],
    'max_depth': [10, 8, 6]
}

In [6]:
#Model creation:
RFC = RandomForestClassifier(n_jobs=-1)
GS = GridSearchCV(RFC, param_grid, cv=5, verbose = 3)

In [7]:
#LAUNCH MODEL:
GS.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits
[CV] max_depth=10, max_features=auto, n_estimators=300 ...............


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  max_depth=10, max_features=auto, n_estimators=300, score=0.814, total=   2.2s
[CV] max_depth=10, max_features=auto, n_estimators=300 ...............


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.2s remaining:    0.0s


[CV]  max_depth=10, max_features=auto, n_estimators=300, score=0.818, total=   2.7s
[CV] max_depth=10, max_features=auto, n_estimators=300 ...............


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    4.9s remaining:    0.0s


[CV]  max_depth=10, max_features=auto, n_estimators=300, score=0.826, total=   2.3s
[CV] max_depth=10, max_features=auto, n_estimators=300 ...............
[CV]  max_depth=10, max_features=auto, n_estimators=300, score=0.841, total=   2.3s
[CV] max_depth=10, max_features=auto, n_estimators=300 ...............
[CV]  max_depth=10, max_features=auto, n_estimators=300, score=0.821, total=   2.6s
[CV] max_depth=10, max_features=auto, n_estimators=500 ...............
[CV]  max_depth=10, max_features=auto, n_estimators=500, score=0.807, total=   2.7s
[CV] max_depth=10, max_features=auto, n_estimators=500 ...............
[CV]  max_depth=10, max_features=auto, n_estimators=500, score=0.820, total=   2.8s
[CV] max_depth=10, max_features=auto, n_estimators=500 ...............
[CV]  max_depth=10, max_features=auto, n_estimators=500, score=0.823, total=   3.3s
[CV] max_depth=10, max_features=auto, n_estimators=500 ...............
[CV]  max_depth=10, max_features=auto, n_estimators=500, score=0.835, 

[CV]  max_depth=8, max_features=auto, n_estimators=1000, score=0.791, total=   3.3s
[CV] max_depth=8, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=8, max_features=auto, n_estimators=1000, score=0.802, total=   3.7s
[CV] max_depth=8, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=8, max_features=auto, n_estimators=1000, score=0.802, total=   4.0s
[CV] max_depth=8, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=8, max_features=auto, n_estimators=1000, score=0.821, total=   3.9s
[CV] max_depth=8, max_features=auto, n_estimators=1000 ...............
[CV]  max_depth=8, max_features=auto, n_estimators=1000, score=0.795, total=   3.1s
[CV] max_depth=8, max_features=sqrt, n_estimators=300 ................
[CV]  max_depth=8, max_features=sqrt, n_estimators=300, score=0.790, total=   2.2s
[CV] max_depth=8, max_features=sqrt, n_estimators=300 ................
[CV]  max_depth=8, max_features=sqrt, n_estimators=300, score=0.795, to

[CV]  max_depth=6, max_features=sqrt, n_estimators=300, score=0.780, total=   2.4s
[CV] max_depth=6, max_features=sqrt, n_estimators=500 ................
[CV]  max_depth=6, max_features=sqrt, n_estimators=500, score=0.767, total=   2.7s
[CV] max_depth=6, max_features=sqrt, n_estimators=500 ................
[CV]  max_depth=6, max_features=sqrt, n_estimators=500, score=0.783, total=   2.7s
[CV] max_depth=6, max_features=sqrt, n_estimators=500 ................
[CV]  max_depth=6, max_features=sqrt, n_estimators=500, score=0.786, total=   3.3s
[CV] max_depth=6, max_features=sqrt, n_estimators=500 ................
[CV]  max_depth=6, max_features=sqrt, n_estimators=500, score=0.808, total=   3.4s
[CV] max_depth=6, max_features=sqrt, n_estimators=500 ................
[CV]  max_depth=6, max_features=sqrt, n_estimators=500, score=0.775, total=   3.5s
[CV] max_depth=6, max_features=sqrt, n_estimators=1000 ...............
[CV]  max_depth=6, max_features=sqrt, n_estimators=1000, score=0.766, total=

[Parallel(n_jobs=1)]: Done 135 out of 135 | elapsed:  6.5min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=-1,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid='

In [16]:
#We look for the best parameters of the model:
GS.best_params_

{'max_depth': 10, 'max_features': 'auto', 'n_estimators': 300}

In [34]:
#With the best parameters, we create the final model with the train dataset:
RFC = RandomForestClassifier(max_depth = 10, max_features='auto', n_estimators=300)
RFC.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [35]:
#Prediction of the test dataset:
y_train_pred = RFC.predict(X_train)
y_pred = RFC.predict(X_test)

In [40]:
#We check scores with train:
f1 = f1_score(y_train, y_train_pred, labels=None, pos_label=1, average='weighted')
accuracy = accuracy_score(y_train, y_train_pred)
conf = confusion_matrix(y_train, y_train_pred)

print ('TRAIN MODEL METRICS:')
print('The F1 score is: ' + str(f1))
print('The accuracy is: ' + str(accuracy))
print('Confusion matrix:')
conf

TRAIN MODEL METRICS:
The F1 score is: 0.9318299376636487
The accuracy is: 0.9354338842975206
Confusion matrix:


array([[ 426,   14,    0],
       [  17, 2733,   15],
       [   0,  204,  463]])

In [41]:
#We check scores with test:
f1 = f1_score(y_test, y_pred, labels=None, pos_label=1, average='weighted')
accuracy = accuracy_score(y_test, y_pred)
conf = confusion_matrix(y_test, y_pred)

print ('TEST MODEL METRICS:')
print('The F1 score is: ' + str(f1))
print('The accuracy is: ' + str(accuracy))
print('Confusion matrix:')
conf

TEST MODEL METRICS:
The F1 score is: 0.8061611810717865
The accuracy is: 0.8181818181818182
Confusion matrix:


array([[ 82,  28,   0],
       [ 27, 638,  26],
       [  0,  95,  72]])

In [31]:
features_list = RFC.feature_importances_

In [32]:
features = pd.DataFrame(columns=['features','importance'])

In [23]:
for i in range ((X.shape[1])):
    features.loc[i] = [X.columns[i],features_list[i]]

features.sort_values('importance', ascending=False)

Unnamed: 0,features,importance
8,bake_time,0.15204
2,bake_temp,0.122079
5,density,0.119949
4,calories,0.093532
9,butter_type,0.090902
7,grams_baking_soda,0.075597
1,sugar_index,0.071242
6,pH,0.063906
3,chill_time,0.059827
0,sugar_to_flour_ratio,0.058803
