In [9]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error as mae, mean_squared_error as mse, r2_score as r2
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_california_housing
import model
from model import read_data_model, add_laplace_noise

# Seed NumPy's global random number generator.
np.random.seed(42)


# Load the California housing dataset and split it with the project helper.
# NOTE(review): read_data_model comes from the local `model` module; how it
# splits the data and what the 'MedInc' argument selects is not visible
# here -- confirm against that module.
all_data = fetch_california_housing()
(
    X_train_val,
    X_val,
    X_test,
    Y_train_val,
    Y_val,
    Y_test,
) = read_data_model(all_data, 'MedInc')

# Standardize the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to every split.
scaler = StandardScaler().fit(X_train_val)
X_train_val_scaled = scaler.transform(X_train_val)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Hyperparameter grid explored for the DecisionTreeRegressor
# (4 * 3 * 3 * 3 = 108 candidate combinations).
param_grid = {
    'max_depth': [None, 10, 20, 30],        # maximum depth of the tree
    'min_samples_split': [2, 10, 20],       # minimum samples required to split a node
    'min_samples_leaf': [1, 5, 10],         # minimum samples required in a leaf
    'max_features': [None, 'sqrt', 'log2'], # features considered at each split
}

# Base estimator whose hyperparameters are tuned.
tree_model = DecisionTreeRegressor(random_state=42)

# Exhaustive 5-fold cross-validated search scored by negative MSE,
# using all available cores and logging every fit.
grid_search = GridSearchCV(
    tree_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# fit() returns the fitted search object, so the winning combination can be
# read straight from it.
best_params = grid_search.fit(X_train_val_scaled, Y_train_val).best_params_

# Report the best hyperparameters found.
print("Best parameters found:", best_params)

# Baseline model trained on the noise-free data with the tuned
# hyperparameters. best_params holds exactly the four keys of param_grid
# (max_depth, min_samples_split, min_samples_leaf, max_features), so it can
# be unpacked directly into the constructor.
model_original = DecisionTreeRegressor(random_state=42, **best_params)
model_original.fit(X_train_val_scaled, Y_train_val)
Y_predict = model_original.predict(X_test_scaled)

# Score the baseline model on the held-out test split.
MSE_original = mse(Y_test, Y_predict)
RMSE_original = np.sqrt(MSE_original)
MAE_original = mae(Y_test, Y_predict)
R2_original = r2(Y_test, Y_predict)

print("\nPredicción de precios de casas sin ruido:")
print(f"MSE: {MSE_original}")
print(f"MAE: {MAE_original}")
print(f"RMSE: {RMSE_original}")
print(f"R2: {R2_original}")

# Step 2: build new input matrices in which only the income column is
# replaced by the baseline model's predictions.
# NOTE(review): column_index is looked up in all_data.feature_names; this
# assumes the matrices returned by read_data_model keep the dataset's
# original column order -- confirm.
column_index = list(all_data.feature_names).index('MedInc')

# Baseline model's predictions on its own training data.
Y_predict_train = model_original.predict(X_train_val_scaled)

# Training split: keep the true income column as the new target, then
# overwrite that column with the predictions.
X_train_val_new = np.copy(X_train_val)
Y_train_val_new = X_train_val_new[:, column_index].copy()
X_train_val_new[:, column_index] = Y_predict_train

# Test split: same substitution, using the baseline test predictions.
X_test_new = np.copy(X_test)
Y_test_new = X_test_new[:, column_index].copy()
X_test_new[:, column_index] = Y_predict

# A fresh scaler is fitted because the income column now holds different
# values than the ones the first scaler saw.
scaler_reconstruct = StandardScaler().fit(X_train_val_new)
X_train_val_new_scaled = scaler_reconstruct.transform(X_train_val_new)
X_test_new_scaled = scaler_reconstruct.transform(X_test_new)

# Reconstruct the income column from the substituted, noise-free inputs.
# best_params holds exactly the four tuned constructor arguments.
model_reconstruct = DecisionTreeRegressor(random_state=42, **best_params)
model_reconstruct.fit(X_train_val_new_scaled, Y_train_val_new)
income_predict = model_reconstruct.predict(X_test_new_scaled)

# Score the reconstruction against the true test income. Y_test_new is the
# copy of X_test[:, column_index] saved before the column was overwritten.
MSE_income_reconstruct = mse(Y_test_new, income_predict)
RMSE_income_reconstruct = np.sqrt(MSE_income_reconstruct)
MAE_income_reconstruct = mae(Y_test_new, income_predict)
R2_income_reconstruct = r2(Y_test_new, income_predict)

print("\nReconstrucción de ingresos (sin ruido):")
print(f"MSE: {MSE_income_reconstruct}")
print(f"MAE: {MAE_income_reconstruct}")
print(f"RMSE: {RMSE_income_reconstruct}")
print(f"R2: {R2_income_reconstruct}")

# Step 4: add Laplace noise to the training features and repeat the
# procedure.
# NOTE(review): the whole training matrix is passed to add_laplace_noise;
# which columns it perturbs and how epsilon sets the noise scale is decided
# inside the local `model` module -- confirm there.
epsilon = 1  # previously selected epsilon value
X_train_val_noisy = add_laplace_noise(X_train_val, epsilon)

# The scaler for the noisy pipeline is fitted on the noisy training data.
scalerRuido = StandardScaler().fit(X_train_val_noisy)
X_train_val_noisy_scaled = scalerRuido.transform(X_train_val_noisy)
# NOTE(review): this rebinds X_test_scaled, replacing the matrix that was
# scaled for the noise-free model; from here on X_test_scaled is only valid
# as input to model_noisy.
X_test_scaled = scalerRuido.transform(X_test)

# Train on the noisy data with the tuned hyperparameters (best_params holds
# exactly the four tuned constructor arguments).
model_noisy = DecisionTreeRegressor(random_state=42, **best_params)
model_noisy.fit(X_train_val_noisy_scaled, Y_train_val)
Y_predict_noisy = model_noisy.predict(X_test_scaled)

# Score the noisy-data model on the test split.
MSE_noisy = mse(Y_test, Y_predict_noisy)
RMSE_noisy = np.sqrt(MSE_noisy)
MAE_noisy = mae(Y_test, Y_predict_noisy)
R2_noisy = r2(Y_test, Y_predict_noisy)

print("\nPredicción de precios con ruido:")
print(f"MSE: {MSE_noisy}")
print(f"MAE: {MAE_noisy}")
print(f"RMSE: {RMSE_noisy}")
print(f"R2: {R2_noisy}")

# Step 5: build new input matrices in which the income column is replaced by
# the predictions of the model trained on noisy data.
X_train_val_new_noisy = np.copy(X_train_val_noisy)
X_test_new = np.copy(X_test)

# Predictions of the noisy-data model on its own training data.
Y_predict_train_noisy = model_noisy.predict(X_train_val_noisy_scaled)

# Save the income columns before overwriting them: the noisy training income
# becomes the reconstruction target, the test income is the ground truth.
Y_train_val_new = X_train_val_new_noisy[:, column_index].copy()
Y_test_new = X_test_new[:, column_index].copy()
X_train_val_new_noisy[:, column_index] = Y_predict_train_noisy
# The test column must come from the same model as the training column.
# The previous code used Y_predict (the noise-free model's output) here, so
# the "with noise" reconstruction was evaluated on inputs produced by the
# wrong model.
X_test_new[:, column_index] = Y_predict_noisy

# Fit a scaler on the substituted noisy training matrix and reuse it for the
# substituted test matrix.
scaler_reconstruct = StandardScaler()
X_train_val_new_scaled_noisy = scaler_reconstruct.fit_transform(X_train_val_new_noisy)
X_test_new_scaled = scaler_reconstruct.transform(X_test_new)

# Reconstruct the income column from the noisy pipeline's data, using the
# same tuned hyperparameters as every other tree in this experiment.
model_reconstruct_noisy = DecisionTreeRegressor(
    max_depth=best_params['max_depth'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    random_state=42
)
model_reconstruct_noisy.fit(X_train_val_new_scaled_noisy, Y_train_val_new)
income_predict_noisy = model_reconstruct_noisy.predict(X_test_new_scaled)

# Score the reconstruction against the income column of the original test
# matrix X_test.
MSE_income_reconstruct_noisy = mse(X_test[:, column_index], income_predict_noisy)
MAE_income_reconstruct_noisy = mae(X_test[:, column_index], income_predict_noisy)
RMSE_income_reconstruct_noisy = np.sqrt(MSE_income_reconstruct_noisy)
R2_income_reconstruct_noisy = r2(X_test[:, column_index], income_predict_noisy)

print("\nReconstrucción de ingresos (con ruido):")
print(f"MSE: {MSE_income_reconstruct_noisy}")
print(f"MAE: {MAE_income_reconstruct_noisy}")
print(f"RMSE: {RMSE_income_reconstruct_noisy}")
print(f"R2: {R2_income_reconstruct_noisy}")


Fitting 5 folds for each of 108 candidates, totalling 540 fits
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=10; total time=   0.2s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=10; total time=   0.2s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=10; total time=   0.2s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=2; total time=   0.3s
[CV] END max_depth=None, max_features=None, min_samples_leaf=1, min_samples_split=20; total time=   0.