In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import matplotlib as plt

%matplotlib inline

In [3]:
#Read csv
url = '../data/cookies_clean.csv'
df = pd.read_csv(url, sep='\t')
target = 'quality'

In [4]:
#Create a random column to define which parameters a relevant and which not:
df['random'] = np.random.randint(1, 101, df.shape[0])

In [5]:
#Separating train set into X and y
X = df.drop(target, axis=1)
y = df[target]

In [1]:
#Split DF into train and test (20% test):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

NameError: name 'train_test_split' is not defined

In [20]:
#Grid search for parameter selection for a Random Forest Classifier model
param_grid = {
    'n_estimators': [5, 10, 20],
    'max_features': ['auto','sqrt','log2'],
    'max_depth': [10, 5, 3]
}

In [21]:
#Model creation:
RFC = RandomForestClassifier(n_jobs=-1)
GS = GridSearchCV(RFC, param_grid, cv=5, verbose = 3)

In [None]:
#LAUNCH MODEL:
GS.fit(X_train, y_train)

In [None]:
#We look for the best parameters of the model:
GS.best_params_

In [22]:
#With the best parameters, we create the final model with the train dataset:
RFC = RandomForestClassifier(max_depth = 5, max_features='auto', n_estimators=3)
RFC.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=3,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [23]:
#Prediction of the test dataset:
y_train_pred = RFC.predict(X_train)
y_pred = RFC.predict(X_test)

In [24]:
#We check scores with train:
f1 = f1_score(y_train, y_train_pred, labels=None, pos_label=1, average='weighted')
score = RFC.score(X_train, y_train)

print ('TRAIN MODEL METRICS:')
print('The F1 score is: ' + str(f1))
print('The model score is: ' + str(score))

TRAIN MODEL METRICS:
The F1 score is: 0.4941300628663377
The model score is: 0.5438977327544622


  'precision', 'predicted', average, warn_for)


In [25]:
#We check scores with test:
f1 = f1_score(y_test, y_pred, labels=None, pos_label=1, average='weighted')
score = RFC.score(X_test, y_test)

print ('TEST MODEL METRICS:')
print('The F1 score is: ' + str(f1))
print('The model score is: ' + str(score))


TEST MODEL METRICS:
The F1 score is: 0.4294203283292783
The model score is: 0.48601735776277727


  'precision', 'predicted', average, warn_for)


In [26]:
features_list = RFC.feature_importances_

In [27]:
features = pd.DataFrame(columns=['features','importance'])

In [28]:
for i in range ((X.shape[1])):
    features.loc[i] = [X.columns[i],features_list[i]]
    
features.sort_values('importance', ascending=False)

Unnamed: 0,features,importance
4,calories,0.244199
15,nuts,0.203479
2,bake_temp,0.109025
9,butter_type,0.108947
8,bake_time,0.078077
5,density,0.072908
1,sugar_index,0.062458
6,pH,0.028523
0,sugar_to_flour_ratio,0.022712
10,weight,0.022678
