In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split

import matplotlib as plt

%matplotlib inline

In [2]:
#Read csv
url = '../data/cookies_eda.csv'
df = pd.read_csv(url)
target = 'quality'

In [3]:
#Create a random column to define which parameters a relevant and which not:
df['random'] = np.random.randint(1, 101, df.shape[0])

In [4]:
#Separating train set into X and y
X = df.drop(target, axis=1)
y = df[target]

In [5]:
#Split DF into train and test (20% test):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
#With the best parameters, we create the final model with the train dataset:
RFC = RandomForestClassifier(max_depth = 10, max_features='auto', n_estimators=10, n_jobs=-1)
RFC.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=10, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
                       oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [13]:
#Prediction of the test dataset:
y_train_pred = RFC.predict(X_train)
y_pred = RFC.predict(X_test)

In [14]:
#We check the features importance:
features_list = RFC.feature_importances_
features = pd.DataFrame(columns=['features','importance'])

for i in range ((X.shape[1])):
    features.loc[i] = [X.columns[i],features_list[i]]
    
features.sort_values('importance', ascending=False)

Unnamed: 0,features,importance
9,butter_type,0.114717
8,bake_time,0.107788
2,bake_temp,0.105433
5,density,0.100842
4,calories,0.080274
1,sugar_index,0.067187
0,sugar_to_flour_ratio,0.062731
6,pH,0.057142
7,grams_baking_soda,0.057069
10,weight,0.054246


In [15]:
#We check with features are less important than the random feature:

margin_value = float(features[features['features']=='random']['importance'])
drop_columns = list(features['features'][features['importance'] <= margin_value])
drop_columns

['aesthetic_appeal',
 'raisins',
 'nuts',
 'chocolate',
 'oats',
 'peanut butter',
 'random']

In [16]:
#Delete columns with lower importance than random and random itself.

df_selected = df.drop(drop_columns, axis = 1)

In [17]:
df_selected.to_csv('../data/cookies_FS.csv', index=False)