## Importar librerías

In [1]:
import pandas as pd
import numpy as np

## Importar dataset

In [2]:
# Load the cookies dataset from a relative CSV path into the raw dataframe `df`.
df = pd.read_csv('../data/cookies.csv')
# Preview the first rows.
df.head()

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,quality,butter type,weight,diameter,mixins,crunch factor,aesthetic appeal
0,0.25,9.5,300,15.0,136.0,0.99367,8.1,0.44,12.1,8,melted,15.2,7,raisins,1.3,3
1,0.23,3.3,520,34.0,113.0,0.99429,8.16,0.48,8.4,7,melted,12.4,7,raisins,1.71,3
2,0.18,1.9,360,33.0,106.0,0.98746,8.21,0.83,14.0,9,melted,9.4,7,"nuts, chocolate",1.78,3
3,0.18,10.5,490,41.0,124.0,0.9963,8.14,0.35,10.5,7,melted,12.2,7,chocolate,1.59,3
4,0.24,2.4,770,6.0,33.0,0.9974,8.09,0.57,9.4,5,cubed,19.8,7,"nuts, oats, chocolate",1.3,3


## Previsualización de datos

In [3]:
# Dimensions of the raw dataframe as (rows, columns).
df.shape

(5198, 16)

In [4]:
# Per-column dtype and non-null count; used here to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5198 entries, 0 to 5197
Data columns (total 16 columns):
sugar to flour ratio    5198 non-null float64
sugar index             5193 non-null float64
bake temp               5198 non-null int64
chill time              5198 non-null float64
calories                5198 non-null float64
density                 5198 non-null float64
pH                      5198 non-null float64
grams baking soda       5198 non-null float64
bake time               5188 non-null float64
quality                 5198 non-null int64
butter type             5198 non-null object
weight                  5198 non-null float64
diameter                5198 non-null int64
mixins                  5196 non-null object
crunch factor           5198 non-null float64
aesthetic appeal        5198 non-null int64
dtypes: float64(10), int64(4), object(2)
memory usage: 649.9+ KB


## Variables categóricas

In [5]:
# `mixins` is a categorical variable and contains null values.
# NOTE: value_counts() leaves NaN out by default, so the null rows are not listed.

df['mixins'].value_counts()

chocolate                         1893
raisins                           1200
chocolate, oats                    749
nuts, chocolate                    521
nuts,raisins                       333
nuts, oats, chocolate              295
nuts, oats                         107
chocolate, peanut butter            52
raisins, oats                       24
peanut butter                       15
oats                                 4
chocolate, oats, peanut butter       2
peanut butter, raisins               1
Name: mixins, dtype: int64

In [6]:
# Work on a copy so the raw dataframe `df` stays untouched.
df_clean = df.copy()

# One 0/1 indicator column per mix-in: 1 when the ingredient name appears in the
# raw `mixins` string. Rows whose `mixins` is null get 0 in every indicator,
# i.e. they are treated as cookies without add-ins.
mixin_names = ['raisins', 'nuts', 'chocolate', 'oats', 'peanut butter']
for name in mixin_names:
    has_mixin = df["mixins"].str.contains(name, na=False)
    df_clean[name] = has_mixin.astype('int64')

# The indicator columns replace the original free-text column.
df_clean = df_clean.drop(columns="mixins")


In [7]:
# Preview the frame after replacing `mixins` with the indicator columns.
df_clean.head()

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,quality,butter type,weight,diameter,crunch factor,aesthetic appeal,raisins,nuts,chocolate,oats,peanut butter
0,0.25,9.5,300,15.0,136.0,0.99367,8.1,0.44,12.1,8,melted,15.2,7,1.3,3,1,0,0,0,0
1,0.23,3.3,520,34.0,113.0,0.99429,8.16,0.48,8.4,7,melted,12.4,7,1.71,3,1,0,0,0,0
2,0.18,1.9,360,33.0,106.0,0.98746,8.21,0.83,14.0,9,melted,9.4,7,1.78,3,0,1,1,0,0
3,0.18,10.5,490,41.0,124.0,0.9963,8.14,0.35,10.5,7,melted,12.2,7,1.59,3,0,0,1,0,0
4,0.24,2.4,770,6.0,33.0,0.9974,8.09,0.57,9.4,5,cubed,19.8,7,1.3,3,0,1,1,1,0


In [8]:
# `butter type` is a categorical variable with two possible values.

df_clean['butter type'].value_counts()

melted    3920
cubed     1278
Name: butter type, dtype: int64

In [9]:
# Encode 'butter type' as binary: melted -> 1, cubed -> 0.
# A single explicit mapping replaces two regex substring scans and yields a real
# int64 column (masked .loc writes into an object column can leave dtype object).
# Reading from the raw `df` keeps the cell safe to re-run; astype fails loudly
# if a null or a value outside the mapping shows up instead of leaving it unencoded.
butter_type_codes = {'melted': 1, 'cubed': 0}
df_clean['butter type'] = df['butter type'].map(butter_type_codes).astype('int64')

In [10]:
# Re-check dtypes and non-null counts after encoding the categorical columns.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5198 entries, 0 to 5197
Data columns (total 20 columns):
sugar to flour ratio    5198 non-null float64
sugar index             5193 non-null float64
bake temp               5198 non-null int64
chill time              5198 non-null float64
calories                5198 non-null float64
density                 5198 non-null float64
pH                      5198 non-null float64
grams baking soda       5198 non-null float64
bake time               5188 non-null float64
quality                 5198 non-null int64
butter type             5198 non-null int64
weight                  5198 non-null float64
diameter                5198 non-null int64
crunch factor           5198 non-null float64
aesthetic appeal        5198 non-null int64
raisins                 5198 non-null int64
nuts                    5198 non-null int64
chocolate               5198 non-null int64
oats                    5198 non-null int64
peanut butter           5198 non-null

## Variables numéricas - valores nulos

In [11]:
# Show the rows where `sugar index` is missing.
missing_sugar_index = df_clean['sugar index'].isna()
df_clean.loc[missing_sugar_index]

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,quality,butter type,weight,diameter,crunch factor,aesthetic appeal,raisins,nuts,chocolate,oats,peanut butter
16,0.02,,670,9.0,23.0,3.0,8.47,0.67,9.4,6,0,14.0,7,1.95,3,0,1,1,1,0
77,0.14,,360,38.0,155.0,0.99622,8.27,0.5,9.4,7,1,12.4,7,1.82,3,1,0,0,0,0
1209,0.12,,820,7.0,28.0,0.997,8.37,0.5,9.4,6,0,15.6,7,1.74,3,1,1,0,0,0
4331,0.52,,470,63.0,186.0,0.99481,8.18,0.44,9.6,7,1,12.6,7,1.35,3,1,0,0,0,0
4490,0.07,,430,34.0,149.0,0.9944,8.34,0.57,9.7,7,1,10.6,7,1.8,3,1,1,0,0,0


In [12]:
# Show the rows where `bake time` is missing.
missing_bake_time = df_clean['bake time'].isna()
df_clean.loc[missing_bake_time]

Unnamed: 0,sugar to flour ratio,sugar index,bake temp,chill time,calories,density,pH,grams baking soda,bake time,quality,butter type,weight,diameter,crunch factor,aesthetic appeal,raisins,nuts,chocolate,oats,peanut butter
13,0.48,4.0,1860,64.0,150.0,0.9945,8.06,0.4,,7,1,12.8,7,1.05,3,0,0,1,1,0
173,0.25,9.8,490,59.5,137.0,0.995,8.16,0.38,,8,1,13.2,7,1.46,3,0,0,1,1,0
484,0.33,2.9,440,21.0,73.0,0.98896,8.17,0.32,,10,1,11.2,7,1.57,3,0,0,1,1,0
816,0.49,1.1,480,11.0,138.0,0.9929,8.01,0.42,,7,1,14.4,7,1.53,3,0,1,1,0,0
1562,0.74,1.8,330,33.0,156.0,0.991,8.29,0.52,,8,1,15.6,7,1.54,3,1,0,0,0,0
1867,0.24,12.1,560,68.0,210.0,0.99718,8.05,0.5,,7,1,14.0,7,1.57,3,0,1,1,0,0
1933,0.23,8.6,560,56.0,215.0,0.9967,8.17,0.44,,7,1,13.8,7,1.74,3,0,0,1,0,0
2215,0.38,10.2,280,20.0,82.0,0.99274,8.1,0.43,,9,1,13.0,7,1.3,3,1,0,0,0,0
2910,0.08,2.1,450,19.0,48.0,0.9949,8.49,0.49,,6,0,12.8,7,1.22,3,0,1,1,1,0
3746,0.46,12.6,420,41.0,143.0,1.0,8.24,0.76,,10,1,15.6,7,1.77,3,0,0,1,1,0


In [13]:
# The rows with a missing `sugar index` do not have a 0 `sugar to flour ratio`, so
# the NaN is treated as a data error rather than "no sugar". Likewise, the rows with
# a missing `bake time` do have a `bake temp`, so the time cannot be 0.
# Both groups are discarded: drop every row that contains a null.

df_clean = df_clean.dropna(axis=0, how='any')

In [14]:
# The dataframe is now free of null values: the per-column null counts are all 0.

df_clean.isna().sum()

sugar to flour ratio    0
sugar index             0
bake temp               0
chill time              0
calories                0
density                 0
pH                      0
grams baking soda       0
bake time               0
quality                 0
butter type             0
weight                  0
diameter                0
crunch factor           0
aesthetic appeal        0
raisins                 0
nuts                    0
chocolate               0
oats                    0
peanut butter           0
dtype: int64

## Formato de los nombres de variables

In [15]:
# Finally, rename the columns by replacing spaces with '_' (snake_case style;
# existing capitalisation such as 'pH' is kept). Deriving each new name from the
# old one, instead of assigning a hard-coded positional list, cannot mislabel
# columns if their order or number changes upstream.
df_clean = df_clean.rename(columns=lambda name: name.replace(' ', '_'))

In [16]:
# Preview the cleaned frame with the renamed columns.
df_clean.head()

Unnamed: 0,sugar_to_flour_ratio,sugar_index,bake_temp,chill_time,calories,density,pH,grams_baking_soda,bake_time,quality,butter_type,weight,diameter,crunch_factor,aesthetic_appeal,raisins,nuts,chocolate,oats,peanut_butter
0,0.25,9.5,300,15.0,136.0,0.99367,8.1,0.44,12.1,8,1,15.2,7,1.3,3,1,0,0,0,0
1,0.23,3.3,520,34.0,113.0,0.99429,8.16,0.48,8.4,7,1,12.4,7,1.71,3,1,0,0,0,0
2,0.18,1.9,360,33.0,106.0,0.98746,8.21,0.83,14.0,9,1,9.4,7,1.78,3,0,1,1,0,0
3,0.18,10.5,490,41.0,124.0,0.9963,8.14,0.35,10.5,7,1,12.2,7,1.59,3,0,0,1,0,0
4,0.24,2.4,770,6.0,33.0,0.9974,8.09,0.57,9.4,5,0,19.8,7,1.3,3,0,1,1,1,0


In [19]:
# Save the cleaned dataframe (the export below is currently commented out, so nothing is written).

# df_clean.to_csv('../data/cookies_clean.csv', sep='\t', index = False)