# PRÁCTICA 1

Lucía Pérez González, Manuel Ramallo Blanco, Alexandre Lorenzo Martínez

## 1 - Preprocesado

### 1.1 - Eliminación de duplicados

In [1]:
# Load the training dataset
import pandas as pd

df_vino = pd.read_csv("data/train.csv")

# Drop duplicated rows, comparing every column except 'quality':
# rows that differ only in 'quality' are treated as duplicates.
cols = df_vino.columns.drop('quality')
df_vino = df_vino.drop_duplicates(subset=cols)

### 1.2 - Binarización de la calidad

In [None]:
def clasificar_vino(valor):
    """Binarise a wine quality score.

    Returns 1 when ``valor`` is 7 or higher and 0 when it is below 7.
    A value that satisfies neither comparison (e.g. NaN) falls through
    and yields ``None``.
    """
    if valor >= 7:
        return 1
    if valor < 7:
        return 0
# Replace the numeric 'quality' column with its binary version 'calidad'.
# Note: this cell consumes 'quality', so it can only be run once per load.
df_vino = (
    df_vino
    .assign(calidad=df_vino['quality'].map(clasificar_vino))
    .drop(columns=['quality'])
)

### 1.3 - Gestión de valores atípicos

In [None]:
# Deteccion de valores atipicos

def detectar_atipicos(df_train, df_val, cols_diana, factor_iqr=6):
    """Mark outliers in the selected columns as missing values (NaN).

    For every column in ``cols_diana`` the quartiles are computed on
    ``df_train`` only, and the same limits are then applied to both frames,
    so no statistic is taken from the validation data.

    A value is treated as an outlier when it lies outside
    ``[Q1 - factor_iqr * IQR, Q3 + factor_iqr * IQR]`` or when it is negative.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Training and validation frames. Both are modified in place
        (and also returned).
    cols_diana : iterable of str
        Names of the columns to inspect.
    factor_iqr : float, default 6
        Multiplier applied to the inter-quartile range to build the limits.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The same ``df_train`` and ``df_val`` objects, with outliers set to NaN.
    """
    for col in cols_diana:
        # Limits come from the training data only.
        q1 = df_train[col].quantile(0.25)
        q3 = df_train[col].quantile(0.75)
        iqr = q3 - q1
        lower = q1 - factor_iqr * iqr
        upper = q3 + factor_iqr * iqr

        for df in (df_train, df_val):
            valores = df[col]
            es_atipico = (valores < lower) | (valores > upper) | (valores < 0)
            # Series.mask writes NaN (upcasting integer columns to float when
            # needed) instead of assigning pd.NA into a NumPy-backed column,
            # so the column stays numeric for the later median/scaling steps.
            df[col] = valores.mask(es_atipico)

    return df_train, df_val

#### 1.3.1 - Tratamiento de datos atípicos

In [None]:
def eliminar_o_imputacion(df_train, df_val, cols_diana, max_atipicos=3, umbral_col=20,  target='calidad'):
    """Drop or impute the missing values left by the outlier detection.

    Steps, in order:

    1. Remove training rows that have more than ``max_atipicos`` missing
       values (with the default of 3, rows with 4 or more are removed).
    2. Drop, from both frames, every column of ``cols_diana`` whose
       percentage of missing values in the filtered training data is
       greater than ``umbral_col``.
    3. Fill the remaining missing values of the surviving ``cols_diana``
       columns with the training median of each column, in both frames.

    Parameters
    ----------
    df_train, df_val : pandas.DataFrame
        Training and validation frames. They are not modified; new frames
        are returned.
    cols_diana : iterable of str
        Columns to analyse and impute.
    max_atipicos : int, default 3
        Maximum number of missing values a training row may have to be kept.
    umbral_col : float, default 20
        Percentage (0-100) of missing values above which a column is dropped.
    target : str, default 'calidad'
        Currently unused; kept so existing calls remain valid.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The processed training and validation frames.
    """
    # Keep rows with at most max_atipicos missing values. Copies are taken so
    # the later column assignments never write into the caller's frames.
    filas_validas = df_train.isna().sum(axis=1) <= max_atipicos
    df_train = df_train.loc[filas_validas].copy()
    df_val = df_val.copy()

    cols_diana = list(cols_diana)

    # Percentage of missing values per column, measured on the training data.
    pct_faltantes = df_train[cols_diana].isna().mean() * 100
    cols_descartadas = [col for col in cols_diana if pct_faltantes[col] > umbral_col]
    df_train = df_train.drop(columns=cols_descartadas)
    df_val = df_val.drop(columns=cols_descartadas, errors="ignore")

    # Impute only the columns that survived the previous step, always using
    # the training median so nothing is learned from the validation data.
    for col in cols_diana:
        if col in cols_descartadas:
            continue
        mediana = df_train[col].median()
        df_train[col] = df_train[col].fillna(mediana)
        df_val[col] = df_val[col].fillna(mediana)

    return df_train, df_val


#### Normalización y selección de características

In [None]:
# Normalización de los datos
from sklearn.preprocessing import StandardScaler

def estandarizar_train_test(train_df, test_df, target):
    """Standardise the numeric feature columns of both frames.

    The scaler is fitted on the numeric, non-target columns of ``train_df``
    and then applied to the same columns of ``train_df`` and ``test_df``.
    Both frames are overwritten in place and returned.
    """
    features = train_df.drop(columns=[target])
    columnas = features.select_dtypes(include="number").columns

    # Fit on the training data only, then reuse the fitted scaler for both frames.
    escalador = StandardScaler().fit(train_df[columnas])
    train_df[columnas] = escalador.transform(train_df[columnas])
    test_df[columnas] = escalador.transform(test_df[columnas])

    return train_df, test_df


In [None]:
# Selección de características con SelectKBest
from sklearn.feature_selection import SelectKBest, f_regression

def seleccion_caracteristicas(df, k, target='calidad'):
    """Keep the ``k`` best-scoring feature columns plus the target.

    Features are scored with ``SelectKBest`` using ``f_regression`` against
    the ``target`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the target column.
    k : int or "all"
        Number of features to keep. An integer larger than the number of
        available features is reduced to that number.
    target : str, default 'calidad'
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the selected feature columns (in their original
        order) followed by the target column.
    """
    X = df.drop(columns=[target])
    y = df[target]

    # Earlier preprocessing may have dropped columns, leaving fewer than k
    # features; depending on the scikit-learn version that either raises or
    # warns, so clamp k to keep the behaviour uniform.
    if not isinstance(k, str):
        k = min(k, X.shape[1])

    # NOTE(review): f_regression is a regression score function; for a
    # categorical target scikit-learn also offers f_classif - confirm which
    # one is intended here.
    selector = SelectKBest(score_func=f_regression, k=k)
    selector.fit(X, y)

    selected_features = X.columns[selector.get_support()]

    # Return an independent frame so later writes do not touch `df`.
    return df[selected_features.tolist() + [target]].copy()


#### Función que empaqueta el preprocesado

In [None]:
def preprocesado(train_df_pre, valid_df_pre, target='calidad'):
    """Run the whole preprocessing pipeline on a train/validation pair.

    Works on copies of the inputs and applies, in order: outlier detection,
    outlier removal/imputation, standardisation, and selection of the 10 best
    features. The validation frame is reduced to the columns kept for the
    training frame. Returns ``(train, validation)``.
    """
    columnas_features = [col for col in train_df_pre.columns if col not in (target,)]

    # 1) Flag outliers in both sets
    train, valid = detectar_atipicos(train_df_pre.copy(), valid_df_pre.copy(), columnas_features)

    # 2) Remove rows / columns with too many outliers and impute the rest
    train, valid = eliminar_o_imputacion(train, valid, columnas_features)

    # 3) Standardise the features
    train, valid = estandarizar_train_test(train, valid, target)

    # 4) Feature selection on train, then align validation to the same columns
    train = seleccion_caracteristicas(train, k=10, target=target)
    return train, valid[train.columns]


## Entrenamiento del modelo de predicción

In [None]:
from sklearn.tree import DecisionTreeClassifier

def arbore_decision(df_train, max_depth, min_samples_split, min_samples_leaf, target='calidad',):
    """Fit a decision tree classifier on ``df_train`` and return it.

    Every column except ``target`` is used as a feature; ``target`` holds the
    labels. The three hyperparameters are passed straight to
    ``DecisionTreeClassifier`` and ``random_state`` is fixed to 42.
    """
    features = df_train.drop(columns=[target])
    etiquetas = df_train[target]

    hiperparametros = {
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "min_samples_leaf": min_samples_leaf,
        "random_state": 42,  # fixed seed for reproducible results
    }

    arbol = DecisionTreeClassifier(**hiperparametros)
    arbol.fit(features, etiquetas)
    return arbol

In [3]:
import pickle
import os

def entrenar_o_cargar(modelo, X_train, y_train, nombre_modelo_archivo):
    """Load a previously saved model, or train ``modelo`` and save it.

    If ``nombre_modelo_archivo`` exists, the object pickled in it is returned
    and ``modelo``, ``X_train`` and ``y_train`` are ignored. Otherwise
    ``modelo.fit(X_train, y_train)`` is called, the fitted model is pickled to
    ``nombre_modelo_archivo`` and returned.

    Parameters
    ----------
    modelo : object
        Estimator exposing ``fit(X, y)``; only used when no file exists.
    X_train, y_train
        Training features and labels passed to ``fit``.
    nombre_modelo_archivo : str or path-like
        Location of the pickle file used as cache.

    Returns
    -------
    object
        The loaded or freshly trained model.
    """
    # NOTE(review): the cache key is only the file name. An existing file is
    # reused even if the data or hyperparameters have changed since it was
    # written; delete stale .pkl files when that happens.
    if os.path.exists(nombre_modelo_archivo):
        # SECURITY: pickle.load can execute arbitrary code; only load files
        # produced by this notebook.
        with open(nombre_modelo_archivo, "rb") as f:
            return pickle.load(f)

    modelo.fit(X_train, y_train)

    # Write to a temporary file and move it into place, so a failure while
    # pickling never leaves a truncated file that later calls would try to load.
    ruta_tmp = f"{nombre_modelo_archivo}.tmp"
    try:
        with open(ruta_tmp, "wb") as f:
            pickle.dump(modelo, f)
        os.replace(ruta_tmp, nombre_modelo_archivo)
    except BaseException:
        if os.path.exists(ruta_tmp):
            os.remove(ruta_tmp)
        raise

    return modelo

In [5]:
# Modelo de entrenamiento
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier

# List of candidate configurations evaluated by nested_cv. Each entry is a
# dict with the keys "name", "model" (a callable that builds the estimator)
# and "params" (keyword arguments for that callable).
model_configs = []


def _dividir_en_partes(df, n_partes):
    """Split ``df`` row-wise into ``n_partes`` consecutive, nearly equal frames."""
    bloques = np.array_split(np.arange(len(df)), n_partes)
    return [df.iloc[posiciones] for posiciones in bloques]


def nested_cv(df, target='calidad'):
    """
    Nested cross-validation over the configurations in ``model_configs``.

    - outer loop: 3 iterations over 6 chunks (4 for training, 2 for
      validation); the validation chunks of the three iterations are disjoint
      and together cover the whole dataset
    - inner loop: 4 chunks of the outer training set (3 train, 1 validation)

    For every outer iteration each configuration is scored (accuracy) on the
    4 inner folds; the configuration with the best mean inner score is then
    trained on the full outer training set and scored on the outer validation
    set. Preprocessing is always fitted on the training part of each split.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset containing the features and the ``target`` column.
    target : str, default 'calidad'
        Name of the label column.

    Returns
    -------
    tuple
        ``(best_overall_model, results)`` where ``results`` is a list with one
        dict per outer iteration (keys ``"outer_iter"``, ``"best_model"``,
        ``"score"``) and ``best_overall_model`` is the entry of ``results``
        with the highest outer score.

    Raises
    ------
    ValueError
        If ``model_configs`` is empty.
    """
    # Imported here because no visible cell of the notebook imports it.
    from sklearn.metrics import accuracy_score

    if not model_configs:
        raise ValueError("model_configs is empty: add at least one configuration before calling nested_cv")

    # Shuffle once so the chunks are random but reproducible.
    df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

    # Split by row position (avoids calling np.array_split on a DataFrame).
    outer_splits = _dividir_en_partes(df_shuffled, 6)

    results = []

    # NOTE(review): entrenar_o_cargar reuses any pickle that already exists
    # under the file names built below. Remove old .pkl files whenever the
    # data, the fold layout or model_configs change.
    for outer_iter in range(3):
        # Advance two chunks per iteration so each chunk is used for outer
        # validation exactly once.
        desplazamiento = 2 * outer_iter
        outer_train_idx = [(desplazamiento + i) % 6 for i in range(4)]
        outer_valid_idx = [(desplazamiento + 4 + i) % 6 for i in range(2)]

        train_outer = pd.concat([outer_splits[i] for i in outer_train_idx])
        valid_outer = pd.concat([outer_splits[i] for i in outer_valid_idx])

        # Re-shuffle the outer training set and split it into 4 inner chunks.
        inner_splits = _dividir_en_partes(
            train_outer.sample(frac=1, random_state=outer_iter), 4
        )

        # Fresh score accumulators for this outer iteration: scores from other
        # outer folds must not influence the model selection of this one.
        inner_models = [
            {
                "name": config["name"],
                "model": config["model"],
                "params": config["params"],
                "scores": [],
            }
            for config in model_configs
        ]

        # Inner CV
        for inner_iter in range(4):
            inner_valid_pre = inner_splits[inner_iter]
            inner_train_pre = pd.concat(
                [s for j, s in enumerate(inner_splits) if j != inner_iter]
            )

            # Preprocess using statistics from the inner training part only.
            df_train, df_val = preprocesado(inner_train_pre, inner_valid_pre, target)
            X_train = df_train.drop(columns=[target])
            y_train = df_train[target]
            X_val = df_val.drop(columns=[target])
            y_val = df_val[target]

            # Evaluate every configuration on this inner fold.
            for candidato, config in zip(inner_models, model_configs):
                modelo = config["model"](**config["params"])
                modelo = entrenar_o_cargar(modelo, X_train, y_train, f"modelo_{config['name']}_outer{outer_iter}_inner{inner_iter}.pkl")
                preds = modelo.predict(X_val)
                candidato["scores"].append(accuracy_score(y_val, preds))

        for candidato in inner_models:
            candidato["mean_score"] = np.mean(candidato["scores"])
        best_model = max(inner_models, key=lambda x: x["mean_score"])

        # Preprocess the outer validation set with statistics from outer train.
        train_outer_proc, valid_outer_proc = preprocesado(train_outer, valid_outer, target)
        X_train_outer = train_outer_proc.drop(columns=[target])
        y_train_outer = train_outer_proc[target]
        X_valid_outer = valid_outer_proc.drop(columns=[target])
        y_valid_outer = valid_outer_proc[target]

        # Train the selected configuration on the whole outer training set
        # and score it on the outer validation set.
        modelo_outer = best_model["model"](**best_model["params"])
        modelo_outer = entrenar_o_cargar(modelo_outer, X_train_outer, y_train_outer, f"best_model_outer{outer_iter}.pkl")
        preds = modelo_outer.predict(X_valid_outer)
        score = accuracy_score(y_valid_outer, preds)
        results.append({
            "outer_iter": outer_iter,
            "best_model": best_model,
            "score": score
        })

    best_overall_model = max(results, key=lambda x: x["score"])
    return best_overall_model, results


In [None]:
# Split the dataset into two halves by row position (first half gets the
# extra row when the length is odd, as np.array_split would do). Slicing with
# iloc avoids passing a DataFrame to np.array_split; the saved output of this
# cell shows a warning raised from inside NumPy for that call.
mitad = (len(df_vino) + 1) // 2
splits = [df_vino.iloc[:mitad], df_vino.iloc[mitad:]]
df_1, df_2 = preprocesado(splits[0], splits[1], target='calidad')

  return bound(*args, **kwds)


In [None]:
# Show up to the first 30 rows of the preprocessed first half
df_1.head(30)

Unnamed: 0,fixed acidity,volatile acidity,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,calidad
0,-0.543169,0.332873,1.012177,0.184756,1.371916,1.596886,1.222638,-0.129759,-0.730507,-0.742707,0
1,-0.666384,-0.605378,-0.857789,-0.625554,-0.20826,-1.006913,-1.56686,-0.888426,2.280356,-0.065215,0
2,-1.159245,1.427499,-0.826623,-0.30143,-1.535609,-0.099166,-0.651169,0.697878,0.273114,0.019472,0
3,-1.03603,0.541373,0.46677,-0.30143,-0.397882,-0.338047,0.480186,1.939334,-0.09184,0.442904,0
4,1.181845,0.437123,0.575851,-1.030708,0.739845,1.334117,1.01758,-0.819456,0.820543,-0.742707,0
5,-0.666384,-0.084127,0.786222,-0.544523,1.624744,1.764103,1.081219,0.421999,0.364352,-0.065215,0
6,-0.296738,2.939125,-0.561711,0.103725,-0.461089,-1.102465,-0.014782,-0.474607,-1.277937,-1.166139,0
7,-0.296738,-0.084127,0.560268,0.265787,1.182295,1.931319,1.194354,-1.026365,-0.365554,-1.420198,0
8,0.812199,0.437123,-0.779874,-1.030708,0.550224,-0.696368,-0.085491,-0.819456,-0.274316,1.713202,0
9,-1.652106,1.479624,-0.795457,-0.058337,-1.093159,-1.675778,-1.216847,0.973757,0.181875,0.78165,1
