# PRÁCTICA 1

Lucía Pérez González, Manuel Ramallo Blanco, Alexandre Lorenzo Martínez

## 1 - Preprocesado

### 1.1 - Eliminación de duplicados

In [3]:
# Abrir datasets
import pandas as pd

df_vino = pd.read_csv("data/train.csv")

# Eliminación de duplicados, ignorado quality
cols = df_vino.columns.drop('quality')
df_vino = df_vino.drop_duplicates(subset=cols)

### 1.2 - Binarización de la calidad

In [4]:
def clasificar_vino(valor): 
    if valor < 7: 
        return 0
    elif valor >= 7: 
        return 1 
df_vino['calidad'] = df_vino['quality'].apply(clasificar_vino) 
df_vino = df_vino.drop(columns=['quality'])

### 1.3 - Gestión de valores atípicos

In [5]:
# Deteccion de valores atipicos

def detectar_atipicos(df_train, df_val, cols_diana):
    for col in cols_diana:

        # Calculamos IQR y límites
        Q1 = df_train[col].quantile(0.25)
        Q3 = df_train[col].quantile(0.75)
        IQR = Q3 - Q1
        lower = Q1 - 6 * IQR
        upper = Q3 + 6 * IQR
        
        # Declaramos condición de atípico y registramos sus posiciones
        cond_iqr = (df_train[col] < lower) | (df_train[col] > upper)    #OLLO! Nico recomendara ver os casos 1 a 1 
        cond_neg = df_train[col] < 0
        cond_atipico = cond_iqr | cond_neg

        cond_iqr_val = (df_val[col] < lower) | (df_val[col] > upper)    #OLLO! Nico recomendara ver os casos 1 a 1 
        cond_neg_val = df_val[col] < 0
        cond_atipico_val = cond_iqr_val | cond_neg_val

        # Incrementamos el contador de atípicos por fila en ambos conjuntos
        df_train.loc[cond_atipico, col] = pd.NA
        df_val.loc[cond_atipico_val, col] = pd.NA


    return df_train, df_val

#### 1.3.11 - Tratamiento de datos atípicos

In [6]:
def eliminar_o_imputacion(df_train, df_val, cols_diana, max_atipicos=3, umbral_col=20,  target='calidad'):
    # Eliminar filas con 4 o más valores atípicos
    # Contamos el número de valores atípicos por fila (NA) y filtramos 
    df_train = df_train[df_train.isnull().sum(axis=1) < max_atipicos]

    # Analisis de valores faltantes por columna
    for col in cols_diana:
        # Contamos el numero de valores faltantes (NA) en la columna(
        num_faltantes = df_train[col].isna().sum()
        if num_faltantes/len(df_train) > umbral_col:
            df_train = df_train.drop(columns=[col])
            df_val = df_val.drop(columns=[col])

    # Imputacion de valores: mediana para cada nulo de cada columna
    for col in cols_diana:
        med = df_train[col].median()
        df_train.loc[:, col] = df_train[col].fillna(med)
        df_val.loc[:, col] = df_val[col].fillna(med)

    return df_train, df_val


#### Normalización y selección de características

In [7]:
# Normalización de los datos
from sklearn.preprocessing import StandardScaler

def estandarizar_train_test(train_df, test_df, target):
    columnas = train_df.drop(columns=[target]).select_dtypes(include="number").columns

    scaler = StandardScaler()


    train_df[columnas] = scaler.fit_transform(train_df[columnas])
    test_df[columnas] = scaler.transform(test_df[columnas])

    return train_df, test_df


In [8]:
# Selección de características con SelectKBest
from sklearn.feature_selection import SelectKBest, f_regression

def seleccion_caracteristicas(df, k, target='calidad'):
    X = df.drop(columns=[target])
    y = df[target]

    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X, y)

    selected_features = X.columns[selector.get_support()]

    df_fs = df[selected_features.tolist() + [target]]
    return df_fs


#### Función que empaqueta el preprocesado

In [9]:
def preprocesado(train_df_pre, valid_df_pre, target='calidad'):
    df_train = train_df_pre.copy()
    df_val = valid_df_pre.copy()
    cols_diana = [c for c in train_df_pre.columns if c not in [target]]
    
    # Detección de atípicos en train y valid
    df_train, df_val = detectar_atipicos(df_train, df_val, cols_diana)

    # Tratamiento de atípicos: eliminación o imputación
    df_train, df_val = eliminar_o_imputacion(df_train, df_val, cols_diana)

    # Normalización de los datos
    df_train, df_val = estandarizar_train_test(df_train, df_val, target)

    # Selección de características con SelectKBest
    df_train = seleccion_caracteristicas(df_train, k=10, target=target)
    df_val = df_val[df_train.columns]
    return df_train, df_val 


## Entrenamiento del modelo de predicción

In [23]:
import pickle
import os

def entrenar_o_cargar(modelo, X_train, y_train, nombre_modelo_archivo):
    #Cargar el modelo si existe
    if os.path.exists(nombre_modelo_archivo):
        with open(nombre_modelo_archivo, "rb") as f:
            modelo = pickle.load(f)
    #Si no existe, entrenar el modelo y guardarlo
    else:
        modelo.fit(X_train, y_train)
        with open(nombre_modelo_archivo, "wb") as f:
            pickle.dump(modelo, f)
    
    return modelo

In [24]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

model_configs = [

    # ======================
    # kNN
    # ======================
    {"name": "knn", "model": KNeighborsClassifier, "params": {"n_neighbors": 3}},
    {"name": "knn", "model": KNeighborsClassifier, "params": {"n_neighbors": 5}},
    {"name": "knn", "model": KNeighborsClassifier, "params": {"n_neighbors": 7}},
    {"name": "knn", "model": KNeighborsClassifier, "params": {"n_neighbors": 9}},

    # ======================
    # Árbol de Decisión
    # ======================
    
    {"name": "decision_tree", "model": DecisionTreeClassifier,
    "params": {"max_depth": 5, "min_samples_split": 2, "min_samples_leaf": 1}},

    {"name": "decision_tree", "model": DecisionTreeClassifier,
    "params": {"max_depth": 5, "min_samples_split": 2, "min_samples_leaf": 5}},

    {"name": "decision_tree", "model": DecisionTreeClassifier,
    "params": {"max_depth": 5, "min_samples_split": 10, "min_samples_leaf": 1}},

    {"name": "decision_tree", "model": DecisionTreeClassifier,
    "params": {"max_depth": 5, "min_samples_split": 10, "min_samples_leaf": 5}},

    {"name": "decision_tree", "model": DecisionTreeClassifier,
    "params": {"max_depth": 10, "min_samples_split": 2, "min_samples_leaf": 1}},

    {"name": "decision_tree", "model": DecisionTreeClassifier,
    "params": {"max_depth": 10, "min_samples_split": 2, "min_samples_leaf": 5}},

    {"name": "decision_tree", "model": DecisionTreeClassifier,
    "params": {"max_depth": 10, "min_samples_split": 10, "min_samples_leaf": 1}},

    {"name": "decision_tree", "model": DecisionTreeClassifier,
    "params": {"max_depth": 10, "min_samples_split": 10, "min_samples_leaf": 5}},

    # ======================
     # Regresión loxística
     # ======================
    {"name": "RegrsionLoxistica", "model": LogisticRegression, "params": {"penalty": "l1", "C": 0.1, "solver": "liblinear","random_state": 0}},
    {"name": "RegrsionLoxistica", "model": LogisticRegression, "params": {"penalty": "l1", "C": 1.0, "solver": "liblinear","random_state": 0}},
    {"name": "RegrsionLoxistica", "model": LogisticRegression, "params": {"penalty": "l1", "C": 10.0, "solver": "liblinear","random_state": 0}},
    {"name": "RegrsionLoxistica", "model": LogisticRegression, "params": {"penalty": "l2", "C": 0.1, "solver": "liblinear","random_state": 0}},
    {"name": "RegrsionLoxistica", "model": LogisticRegression, "params": {"penalty": "l2", "C": 1.0, "solver": "liblinear","random_state": 0}},
    {"name": "RegrsionLoxistica", "model": LogisticRegression, "params": {"penalty": "l2", "C": 10.0, "solver": "liblinear","random_state": 0}},
    
    # ======================
    # Perceptrón multicapa
    # ======================
    {"name": "PerceptronMulticapa", "model": MLPClassifier, "params": {"hidden_layer_sizes": (50,), 
                                                                       "learning_rate_init": 0.001, "max_iter": 300, "random_state": 0, "solver": "lbfgs"}},
    {"name": "PerceptronMulticapa", "model": MLPClassifier, "params": {"hidden_layer_sizes": (50,), 
                                                                       "learning_rate_init": 0.01, "max_iter": 300, "random_state": 0, "solver": "lbfgs"}},
    {"name": "PerceptronMulticapa", "model": MLPClassifier, "params": {"hidden_layer_sizes": (100,), 
                                                                       "learning_rate_init": 0.001, "max_iter": 300, "random_state": 0, "solver": "lbfgs"}},
    {"name": "PerceptronMulticapa", "model": MLPClassifier, "params": {"hidden_layer_sizes": (100,), 
                                                                       "learning_rate_init": 0.01, "max_iter": 300, "random_state": 0, "solver": "lbfgs"}},
    {"name": "PerceptronMulticapa", "model": MLPClassifier, "params": {"hidden_layer_sizes": (50, 50), 
                                                                       "learning_rate_init": 0.001, "max_iter": 300, "random_state": 0, "solver": "lbfgs"}},
    {"name": "PerceptronMulticapa", "model": MLPClassifier, "params": {"hidden_layer_sizes": (50, 50), 
                                                                       "learning_rate_init": 0.01, "max_iter": 300, "random_state": 0, "solver": "lbfgs"}}
]   

In [39]:
# Modelo de entrenamiento
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def nested_cv(df, target='calidad'):
    """
    Validación cruzada anidada:
    - exterior: 3 iteraciones (6 pedazos, 4 train, 2 val)
    - interior: 4 pedazos (3 train, 1 val)
    
    Devuelve: diccionario con información de folds
    """
    
    # Mezclar el dataset para aleatoriedad
    df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
    n = len(df_shuffled)
    
    # Dividir en 6 pedazos iguales (outer)
    outer_splits = [ df_shuffled.iloc[i::6] for i in range(6) ]
    
    results = []

    # Outer CV
    for outer_iter in range(3):
        #Se construye el diccionario para cada iteración externa -> se almacena un array de puntuaciones para cada modelo evaluado en la VC interna
        #El que tenga mejor media será el usado para entrenar con el conjunto completo del fold externo y evaluar en su validación externa
        inner_models = []
        # Inicializar estructura para acumular scores
        for config in model_configs:
            inner_models.append({
                "name": config["name"],
                "model": config["model"],
                "params": config["params"],
                "acc": [],
                "prec": [],
                "rec": [],
                "f1": []
            })
        # Elegimos 4 pedazos para train, 2 para val
        outer_train_idx = [(outer_iter + i) % 6 for i in range(4)]
        outer_valid_idx = [(outer_iter + 4 + i) % 6 for i in range(2)]
        
        train_outer = pd.concat([outer_splits[i] for i in outer_train_idx])
        valid_outer = pd.concat([outer_splits[i] for i in outer_valid_idx])
        
        # Dividir train_outer en 4 pedazos para inner CV
        train_shuffled = train_outer.sample(frac=1, random_state=outer_iter).reset_index(drop=True)
        indices_inner = np.array_split(train_shuffled.index, 4)
        inner_splits = [train_shuffled.loc[idx] for idx in indices_inner]

        
        # Inner CV
        for inner_iter in range(4):
            inner_valid_pre = inner_splits[inner_iter]
            inner_train_pre = pd.concat([s for j, s in enumerate(inner_splits) if j != inner_iter])
            
            # Preprocesar
            df_train, df_val = preprocesado(inner_train_pre, inner_valid_pre, target)
            #Se separan características y etiquetas para entrenamiento y validación
            X_train = df_train.drop(columns=[target])
            y_train = df_train[target]
            X_val = df_val.drop(columns=[target])
            y_val = df_val[target]


            # Bucle que recorre los modelos con sus configuraciones guardadas en model_configs
            for i, config in enumerate(model_configs):
                # Inicializar el modelo con los parámetros de la configuración
                modelo = config["model"](**config["params"])
                
                # Entrenar el modelo y evaluar en validación interna
                modelo = entrenar_o_cargar(modelo, X_train, y_train, f"results/modelo_{config['name']}_outer{outer_iter}_inner{inner_iter}_params{tuple(config['params'].values())}.pkl")
                preds = modelo.predict(X_val)
                acc = accuracy_score(y_val, preds)
                prec = precision_score(y_val, preds)
                rec = recall_score(y_val, preds)
                f1 = f1_score(y_val, preds)
                
                # Guardar las puntuaciones obtenidas para esta configuración
                inner_models[i]["acc"].append(acc)
                inner_models[i]["prec"].append(prec)
                inner_models[i]["rec"].append(rec)
                inner_models[i]["f1"].append(f1)

        for model in inner_models:
            model["mean_acc"] = np.mean(model["acc"])
            model["mean_prec"] = np.mean(model["prec"])
            model["mean_rec"] = np.mean(model["rec"])
            model["mean_f1"] = np.mean(model["f1"])
        best_model = max(inner_models, key=lambda x: x["mean_f1"])

        # Preprocesar outer validation usando train outer
        train_outer_proc, valid_outer_proc = preprocesado(train_outer, valid_outer, target)
        # Entrenar el mejor modelo en todo el outer train y evaluar en outer valid
        X_train_outer = train_outer_proc.drop(columns=[target])
        y_train_outer = train_outer_proc[target]
        X_valid_outer = valid_outer_proc.drop(columns=[target])
        y_valid_outer = valid_outer_proc[target]

        # Entrenar el mejor modelo con los datos del outer train
        modelo_outer = best_model["model"](**best_model["params"])
        modelo_outer = entrenar_o_cargar(modelo_outer, X_train_outer, y_train_outer, f"results/modelo_{best_model['name']}_outer{outer_iter}_params{tuple(best_model['params'].values())}.pkl")
        preds = modelo_outer.predict(X_valid_outer)
        acc = accuracy_score(y_valid_outer, preds)
        prec = precision_score(y_valid_outer, preds)
        rec = recall_score(y_valid_outer, preds)
        f1 = f1_score(y_valid_outer, preds)
        results.append({
            "outer_iter": outer_iter,
            "best_model": best_model["name"]+str(best_model["params"]),
            "acc": acc,
            "prec": prec,
            "rec": rec,
            "f1": f1
        })
    best_overall_model = max(results, key=lambda x: x["f1"])
    return best_overall_model, results


In [40]:
mejor_resultado,resultados = nested_cv(df_vino, target='calidad')

In [41]:
mejor_resultado

{'outer_iter': 1,
 'best_model': "PerceptronMulticapa{'hidden_layer_sizes': (50, 50), 'learning_rate_init': 0.001, 'max_iter': 300, 'random_state': 0, 'solver': 'lbfgs'}",
 'acc': 0.7773311897106109,
 'prec': 0.5230263157894737,
 'rec': 0.5463917525773195,
 'f1': 0.534453781512605}