In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split


In [2]:
# Load the raw cookie dataset from the project's data directory.
raw_csv_path = '../data/cookies.csv'
df = pd.read_csv(raw_csv_path)

### Data Cleaning

In [3]:
# Build one 0/1 indicator column per mix-in: a row gets 1 when its "mixins"
# text mentions the ingredient, and 0 otherwise (missing "mixins" count as 0).
columns = ['raisins','nuts','chocolate','oats','peanut butter']
for col in columns:
    has_mixin = df["mixins"].str.contains(col, na=False)
    df[col] = has_mixin.astype('int64')
# The raw "mixins" column is dropped once it has been encoded.
df = df.drop(columns="mixins")

In [4]:
# Encode "butter type" as a flag: 1 when the text mentions 'melted', 0 when it
# mentions 'cubed'. Both masks are taken from the column before any value is
# overwritten; rows already matched as melted are excluded from the cubed mask
# so the melted label takes precedence. Any other value (including missing
# ones) is left untouched.
butter_text = df["butter type"]
is_melted = butter_text.str.contains('melted', na=False)
is_cubed = butter_text.str.contains('cubed', na=False) & ~is_melted
df.loc[is_melted, 'butter type'] = 1
df.loc[is_cubed, 'butter type'] = 0

In [5]:
# New column labels, applied positionally. The last five entries are the
# mix-in indicator columns and keep the names they were created with.
clean_column_names = [
    'sugar_to_flour_ratio', 'sugar_index', 'bake_temp', 'chill_time',
    'calories', 'density', 'pH', 'grams_baking_soda', 'bake_time',
    'quality', 'butter_type', 'weight', 'diameter',
    'crunch_factor', 'aesthetic_appeal',
    'raisins', 'nuts', 'chocolate', 'oats', 'peanut butter',
]

# Discard every row that still has a missing value, then relabel the columns.
df = df.dropna()
df.columns = clean_column_names

### EDA

In [6]:
def _drop_iqr_outliers(frame, column, whisker=1.5):
    """Return the rows of ``frame`` whose ``column`` lies strictly inside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where Q1 and
    Q3 are the 25th and 75th percentiles of ``column`` and ``IQR = Q3 - Q1``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to filter; it is not modified.
    column : str
        Name of the numeric column used to compute the fences.
    whisker : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        The subset of ``frame`` with ``lower < frame[column] < upper``.
    """
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1

    lower = q1 - whisker * iqr
    upper = q3 + whisker * iqr

    return frame.loc[(frame[column] > lower) & (frame[column] < upper)]


# Sugar to flour: keep rows with a ratio below 2.
df = df[df.sugar_to_flour_ratio < 2]

# Sugar index, then bake temp: drop IQR outliers one column at a time, so the
# bake_temp fences are computed on the rows that survived the sugar_index filter.
for outlier_column in ('sugar_index', 'bake_temp'):
    df = _drop_iqr_outliers(df, outlier_column)

# Calories: keep strictly positive values.
df = df.loc[df.calories > 0]

# Density: keep values below 4.
df = df[df.density < 4]

# pH: keep values below 14.
df = df[df.pH < 14]

# Weight: keep strictly positive values.
df = df[df.weight > 0]

# Diameter: the column is removed entirely. Reassigning instead of using
# inplace=True avoids mutating a frame that was produced by boolean filtering.
df = df.drop(columns='diameter')

### Feature Selection

In [7]:
target = 'quality'

# Add a pure-noise column to serve as a baseline for feature importance.
# The values are integers in [1, 100] drawn from a dedicated, seeded generator
# so the feature selection (and the exported file) is reproducible across runs
# without touching NumPy's global random state. `assign` returns a new frame
# instead of writing into one that was produced by earlier filtering.
noise = np.random.RandomState(42).randint(1, 101, df.shape[0])
df = df.assign(random=noise)

# Every column except the target is a candidate feature.
X = df.drop(target, axis=1)
y = df[target]

# Hold out 20% of the rows; the split itself is seeded as well.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Small random forest used only to rank feature importances.
# max_features='sqrt' replaces the original 'auto', which was deprecated in
# scikit-learn 1.1 and removed in 1.3; for classifiers 'auto' meant
# sqrt(n_features), so the model is unchanged. random_state pins the forest so
# the importances are reproducible from run to run.
RFC = RandomForestClassifier(
    max_depth=10,
    max_features='sqrt',
    n_estimators=10,
    n_jobs=-1,
    random_state=42,
)
RFC.fit(X_train, y_train)

# Predictions on both splits, kept available for inspection.
y_train_pred = RFC.predict(X_train)
y_pred = RFC.predict(X_test)

In [9]:
# Pair every feature name with the importance the fitted forest assigned to it.
# The table is built in a single constructor call instead of being grown one
# row at a time; row i still corresponds to column i of X.
features_list = RFC.feature_importances_
features = pd.DataFrame({
    'features': list(X.columns),
    'importance': features_list,
})

In [10]:
# Importance of the noise column: any feature scoring at or below it is treated
# as uninformative. The scalar is extracted with .iloc[0] because calling
# float() directly on a one-element Series is deprecated in recent pandas.
noise_importance = features.loc[features['features'] == 'random', 'importance']
margin_value = float(noise_importance.iloc[0])

# The comparison is inclusive, so the 'random' column itself is also listed.
drop_columns = list(features.loc[features['importance'] <= margin_value, 'features'])

df = df.drop(columns=drop_columns)

### Binning Target Variable

In [11]:
# Quality scores belonging to each bin.
low = [3,4,5]
medium = [6,7,8]
high = [9,10,11]

def bins (x):
    """Map a quality score to its bin label.

    Returns 0 if ``x`` is in ``low``, 1 if it is in ``medium``, 2 if it is in
    ``high``, and ``None`` when it belongs to none of the three lists.
    """
    # The lists are checked in order, so the position of the first match is the label.
    for label, scores in enumerate((low, medium, high)):
        if x in scores:
            return label
    return None

# Replace the raw quality score with its bin label; scores that match no bin
# come back from `bins` as None.
binned_quality = df['quality'].apply(bins)
df = df.assign(quality_binned=binned_quality).drop(columns='quality')

### Export CSV

In [12]:
# Write the processed dataset to disk, leaving the index out of the file.
output_csv_path = '../data/cookies_target_binned.csv'
df.to_csv(output_csv_path, index=False)