In [None]:
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import numpy as np

## Método para estimación de datos faltantes

In [None]:
# Load the dataset; header=0 takes the column names from the first CSV row.
data = pd.read_csv("./dataset.csv", header=0)


# Sentinel that estimate_missing_values treats as "this cell has no value".
MISSING_VALUE = 999.99


def estimate_missing_values(data, k):
    """Impute sentinel-marked missing values with a k-nearest-neighbours mean.

    A row is "incomplete" when any of its cells equals ``MISSING_VALUE``.
    For every incomplete row, the Euclidean distance to each complete row is
    computed on standardized features, restricted to the columns that the
    incomplete row actually has. Each missing cell is then replaced by the
    mean of that column over the ``k`` closest complete rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to impute. Every column must be convertible to float, because
        the whole frame is passed to ``StandardScaler``.
    k : int
        Number of nearest complete rows to average. When ``k`` exceeds the
        number of complete rows, all complete rows are used.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the missing cells filled in. ``data`` itself
        is not modified.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or if there are rows to impute but no
        complete row to use as a neighbour.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Split the rows into complete ones and ones with at least one sentinel.
    is_incomplete = data.isin([MISSING_VALUE]).any(axis=1)
    complete_df = data[~is_incomplete]
    incomplete_df = data[is_incomplete]

    estimated_data = data.copy()
    if incomplete_df.empty:
        # Nothing to impute (and the scaler cannot transform an empty frame).
        return estimated_data
    if complete_df.empty:
        raise ValueError("no complete rows available to estimate missing values")

    # Fit the scaler on the complete rows only and reuse it for the incomplete
    # ones, so both sets share one scale and the sentinel values do not
    # distort the mean/variance. The sentinel cells themselves are dropped
    # from the distance computation below.
    scaler = StandardScaler().fit(complete_df)
    std_complete = scaler.transform(complete_df)
    std_incomplete = scaler.transform(incomplete_df)

    # Boolean matrix: True where a cell of an incomplete row is missing.
    missing_mask = incomplete_df.isin([MISSING_VALUE]).to_numpy()

    for position, row_label in enumerate(incomplete_df.index):
        row_missing = missing_mask[position]
        row_known = ~row_missing

        # Plain NumPy arrays are used on purpose: subtracting DataFrames would
        # align on index labels and yield NaN for rows with different labels.
        differences = std_complete[:, row_known] - std_incomplete[position, row_known]
        distances = np.linalg.norm(differences, axis=1)

        # Stable sort keeps the earliest row first when distances tie.
        nearest_positions = np.argsort(distances, kind="stable")[:k]

        # The positions refer to rows of complete_df, hence iloc (not labels).
        k_nearest_neighbors_df = complete_df.iloc[nearest_positions]

        for missing_column_name in incomplete_df.columns[row_missing]:
            estimated_data.at[row_label, missing_column_name] = (
                k_nearest_neighbors_df[missing_column_name].mean()
            )
    return estimated_data


# Encode the categorical "Sexo" column as numbers (F -> 1, M -> -1) before
# handing the frame to estimate_missing_values, which standardizes every column.
for sex_label, sex_code in (("F", 1), ("M", -1)):
    data.loc[data["Sexo"] == sex_label, "Sexo"] = sex_code

estimated = estimate_missing_values(data, k=5)
display(estimated)


### Boxplots de cada variable

In [None]:

# Box plot of each unscaled variable, one panel per column.
# NOTE(review): `data` (not `estimated`) is plotted, so any MISSING_VALUE
# sentinel cells are drawn as regular observations -- confirm this is intended.
boxplot_columns = ("Calorías", "Alcohol", "Grasas_sat")
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(10, 5))
for panel, column_name in zip(axes, boxplot_columns):
    panel.boxplot(data[column_name])
    panel.set_title(column_name)

### Comparación de datos sin estandarizar vs estandarizados mediante boxplots

In [None]:
# Side-by-side box plots of the non-"Sexo" columns, before and after
# standardization.
data_without_genre = data.drop("Sexo", axis=1)

# Box positions are 1-based; derive them from the column count instead of
# hard-coding [1, 2, 3] so the tick labels always match the plotted boxes.
tick_positions = list(range(1, len(data_without_genre.columns) + 1))

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# Left panel: raw values.
axes[0].boxplot(data_without_genre)
axes[0].set_xticks(tick_positions, data_without_genre.columns)
axes[0].set_title("Boxplot de datos sin estandarizar")


scaler = StandardScaler().fit(data_without_genre)
scaled_data = scaler.transform(data_without_genre)
# Right panel: the same columns after standardization.
axes[1].boxplot(pd.DataFrame(scaled_data))
axes[1].set_xticks(tick_positions, data_without_genre.columns)
axes[1].set_title("Boxplot de datos estandarizados")

### Análisis de la variable Alcohol en base a las Calorías por categorías

In [None]:
# Re-read the CSV so this section works on the values as stored in the file.
data = pd.read_csv("./dataset.csv", header=0)

calorie_classification = data.copy()

# Calorie bins as half-open intervals [floor, ceiling). The last bin's
# ceiling of -1 is never read: that bin is treated as having no upper bound.
CALORIES_CATEGORY = [
    {"floor": 0, "ceiling": 1100},
    {"floor": 1100, "ceiling": 1700},
    {"floor": 1700, "ceiling": -1},
]

# Overwrite each calorie value with the floor of the bin it falls into, so the
# "Calorías" column ends up holding the category label.
*bounded_categories, open_category = CALORIES_CATEGORY

for calorie_category in bounded_categories:
    lower_bound = calorie_category["floor"]
    upper_bound = calorie_category["ceiling"]
    in_category = (
        (calorie_classification["Calorías"] >= lower_bound)
        & (calorie_classification["Calorías"] < upper_bound)
    )
    calorie_classification.loc[in_category, 'Calorías'] = lower_bound

in_open_category = calorie_classification["Calorías"] >= open_category["floor"]
calorie_classification.loc[in_open_category, 'Calorías'] = open_category["floor"]

# Keep only the two columns this analysis needs.
df_alcohol = calorie_classification.loc[:, ["Calorías", "Alcohol"]]
# Group the rows by calorie category. The `axis` keyword is omitted on
# purpose: it is deprecated in pandas 2.1+ and grouping rows is the default.
df_calorie_groupby = df_alcohol.groupby("Calorías")

# Collect the alcohol values of each category, plus the category label used
# as the tick label of the corresponding box.
alcohol_classification = []
labels = []
for category_classification, category_rows in df_calorie_groupby:
    alcohol_classification.append(category_rows["Alcohol"].values)
    labels.append(category_classification)

fig, axes = plt.subplots(figsize=(10, 5))
axes.boxplot(alcohol_classification)
# Box positions are 1-based, one per category.
axes.set_xticks(range(1, len(labels) + 1), labels)
axes.set_title("Boxplots Alcohol por categoría de Calorías")

# TODO: maybe add mean to every boxplot?

    
