In [30]:
# Importación de librerías
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, root_mean_squared_error, mean_absolute_percentage_error
from utils.transformations import MyTransformation
from utils.filters import ClusteringFilter


In [14]:
# Load the already-preprocessed training split from its CSV file.
df_train = pd.read_csv("train_data/preprocessed/train_data.csv")

# Split the frame into the target and the predictors.
# The double brackets keep the target as a one-column DataFrame (not a Series).
y_train = df_train[['Price']]
# Predictors: every column except 'Price'.
X_train = df_train.drop(columns=['Price'])

In [15]:
# Preprocessor that applies the project's custom imputation, scaling and encoding steps.
# n_neighbors=5 is presumably the K used by the KNN imputer — TODO confirm in utils.transformations.
preprocessor = MyTransformation(n_neighbors=5)

# DBSCAN-based clustering filter used to remove outliers.
# NOTE(review): the name `filter` shadows Python's builtin `filter` for the rest of the notebook.
filter = ClusteringFilter(eps=0.5, min_samples=5)

In [16]:
# Fit the preprocessor on the training features and target only.
preprocessor.fit(X_train, y_train)

In [17]:
# Apply every transformation defined in MyTransformation to the training features and target.
X_train_processed, y_train_processed = preprocessor.transform(X_train, y_train)

In [18]:
# Preview the first rows of the transformed training features.
X_train_processed.head()

Unnamed: 0,Area,No. of Bedrooms,city_sim_0,city_sim_1,city_sim_2,city_sim_3,city_sim_4,city_sim_5,Location_sim_0,Location_sim_1,...,Location_sim_1448,Location_sim_1449,Location_sim_1450,Location_sim_1451,Location_sim_1452,Location_sim_1453,Location_sim_1454,Location_sim_1455,city_freq,Location_freq
0,-1.327592,-1.140858,0.022727,0.083333,0.03125,0.022727,0.0,1.0,0.032787,0.025316,...,0.0,0.0,0.038462,0.040816,0.0625,0.076923,0.076923,0.071429,5050,95
1,0.04174,0.0,0.022727,0.083333,0.03125,0.022727,0.0,1.0,0.028571,0.022727,...,0.0,0.013514,0.016129,0.027778,0.071429,0.081395,0.0,0.014706,5050,454
2,-0.22526,0.0,0.022727,0.083333,0.03125,0.022727,0.0,1.0,0.011236,0.009346,...,0.0,0.044944,0.173913,0.30303,0.0,0.14433,0.029851,0.16,5050,86
3,0.145998,1.0,0.0,1.0,0.028571,0.0,0.0,0.083333,0.012987,0.010526,...,0.014706,0.0,0.045455,0.026316,0.0,0.010204,0.017857,0.0,3185,159
4,-0.462938,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.015385,0.012048,...,0.0,0.078125,0.075472,0.019417,0.0,0.023529,0.0,0.0,4152,223


In [19]:
# Fit the filter on the processed training data so it can detect and remove outliers.
filter.fit(X_train_processed, y_train_processed)

In [20]:
# Apply the fitted outlier filter to the processed training features and target.
X_train_filtered, y_train_filtered = filter.transform(X_train_processed, y_train_processed)

In [21]:
# Preview the first rows of the filtered training features.
X_train_filtered.head()

Unnamed: 0,Area,No. of Bedrooms,city_sim_0,city_sim_1,city_sim_2,city_sim_3,city_sim_4,city_sim_5,Location_sim_0,Location_sim_1,...,Location_sim_1448,Location_sim_1449,Location_sim_1450,Location_sim_1451,Location_sim_1452,Location_sim_1453,Location_sim_1454,Location_sim_1455,city_freq,Location_freq
0,-1.327592,-1.140858,0.022727,0.083333,0.03125,0.022727,0.0,1.0,0.032787,0.025316,...,0.0,0.0,0.038462,0.040816,0.0625,0.076923,0.076923,0.071429,5050,95
1,0.04174,0.0,0.022727,0.083333,0.03125,0.022727,0.0,1.0,0.028571,0.022727,...,0.0,0.013514,0.016129,0.027778,0.071429,0.081395,0.0,0.014706,5050,454
2,-0.22526,0.0,0.022727,0.083333,0.03125,0.022727,0.0,1.0,0.011236,0.009346,...,0.0,0.044944,0.173913,0.30303,0.0,0.14433,0.029851,0.16,5050,86
3,0.145998,1.0,0.0,1.0,0.028571,0.0,0.0,0.083333,0.012987,0.010526,...,0.014706,0.0,0.045455,0.026316,0.0,0.010204,0.017857,0.0,3185,159
4,-0.462938,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.015385,0.012048,...,0.0,0.078125,0.075472,0.019417,0.0,0.023529,0.0,0.0,4152,223


In [None]:
# Train a Gradient Boosting regressor on the filtered training data.
model = GradientBoostingRegressor(
    n_estimators=200,
    max_depth=4,
    learning_rate=0.1,
    random_state=42,  # fixed seed so repeated runs build the same ensemble
)
# The target is flattened before fitting.
# assumes y_train_filtered exposes .ravel() (e.g. a NumPy array) — TODO confirm
model.fit(X_train_filtered, y_train_filtered.ravel())

In [25]:
# Load the test split from its CSV file.
df_test = pd.read_csv("train_data/preprocessed/test_data.csv")

# Split the frame into the target and the predictors.
# The double brackets keep the target as a one-column DataFrame (not a Series).
y_test = df_test[['Price']]
# Predictors: every column except 'Price'.
X_test = df_test.drop(columns=['Price'])

In [26]:
# Apply the same transformations used for training; the preprocessor is not refitted on test data.
X_test_processed, y_test_processed = preprocessor.transform(X_test, y_test)

In [27]:
# Predict with the trained model. These predictions are still on the transformed
# target scale; they are mapped back to prices by the inverse transform that follows.
y_pred_scaled = model.predict(X_test_processed)

In [28]:
# Invert the target transformation to bring the predictions back to the original price scale.
# reshape(-1, 1) gives the predictions a single-column 2-D shape before inverting.
y_pred = preprocessor.inverse_transform(y_pred_scaled.reshape(-1, 1))



In [31]:
# Compute the evaluation metrics, comparing the raw test prices with the
# inverse-transformed predictions (both on the original price scale).
y_true = y_test.values
rmse = root_mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
# MAPE is formatted below with the '%' format spec, i.e. it is treated as a fraction.
mape = mean_absolute_percentage_error(y_true, y_pred)

# Pre-format each metric with its unit (rupees) or as a percentage.
metrics = {
    "RMSE (₹)": f"{rmse:,.2f}",
    "MAE (₹)": f"{mae:,.2f}",
    "MAPE (%)": f"{mape:.2%}"
}

# Print the metrics as a fixed-width two-column table.
print("\nModel Performance Metrics:\n")
print(f"{'Metric':<15} {'Value':>15}")
print("-" * 30)
for metric, value in metrics.items():
    print(f"{metric:<15} {value:>15}")

# Print a short interpretation of each metric.
# RMSE is deliberately NOT described as an "average" error: it is the square root
# of the mean squared error and is dominated by large misses, so it can be far
# above the mean absolute error (which is what MAE reports).
print("\nInterpretation:")
print(f"- RMSE: The root-mean-square error is ₹{rmse:,.2f}; it penalizes large errors more heavily than MAE")
print(f"- MAE: The average absolute error is ₹{mae:,.2f}")
print(f"- MAPE: The predictions are off by {mape:.1%} on average")


Model Performance Metrics:

Metric                    Value
------------------------------
RMSE (₹)          23,157,504.40
MAE (₹)            6,107,623.26
MAPE (%)                 43.74%

Interpretation:
- RMSE: The model's predictions are typically off by ₹23,157,504.40 on average
- MAE: The average absolute error is ₹6,107,623.26
- MAPE: The predictions are off by 43.7% on average


## Conclusiones de la práctica

### Transformación de datos

He creado una clase personalizada llamada `MyTransformation` que realiza las siguientes tareas:

- Imputa los valores que faltan:
  - Con **KNN** para `"No. of Bedrooms"`, ya que es una variable discreta y puede completarse bien usando ejemplos parecidos.
  - Con **la mediana** para `"Area"`, así no se ve tan afectada por valores muy grandes o muy pequeños.

- Aplica transformaciones numéricas:
  - Usa **`PowerTransformer` (método `"yeo-johnson"`)** sobre `"Area"` y `"No. of Bedrooms"`. Este método mejora la forma de la distribución y **funciona incluso si hay ceros o negativos**, a diferencia de `Box-Cox`.
  - Escala esas variables con **`RobustScaler`**, que pone todos los valores en una escala parecida y no se ve afectado por valores extremos.

- Codifica variables categóricas:
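  - Usa **`SimilarityEncoder`** para `"city"` y `"Location"`. Este encoder **puede capturar similitudes entre nombres** (por ejemplo, errores de escritura o nombres parecidos), lo cual podría beneficiar al modelo, sobre todo en `"Location"`, que tiene **muchas categorías distintas**. Hay que tener en cuenta que **no reduce la dimensionalidad**: en la salida mostrada arriba genera 6 columnas `city_sim_*` y 1.456 columnas `Location_sim_*`.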
  - Añade también una **columna con la frecuencia** de cada categoría, lo que puede ayudar al modelo a **diferenciar entre valores comunes y raros**, y tratarlos de forma distinta.

- También transforma y escala `"Price"` (la variable objetivo), y después hace el **desescalado** para que las predicciones vuelvan a su escala original.

### Filtrado de datos

He creado la clase `ClusteringFilter` que usa **DBSCAN**, una técnica de clustering que:
- Detecta outliers automáticamente sin necesidad de especificar el número de grupos.
- Se basa en las variables `"Area"` y `"No. of Bedrooms"` para encontrar observaciones extrañas y eliminarlas del conjunto de entrenamiento.

### Modelo y resultados

- He entrenado un modelo `GradientBoostingRegressor`, que permite capturar relaciones complejas entre las variables. He ajustado algunos parámetros como el número de árboles o la profundidad.  

#### Resultados obtenidos:
- **RMSE**: ₹23,157,504.40  
- **MAE**: ₹6,107,623.26  
- **MAPE**: 43.74%

### Comentario final

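- El modelo ha funcionado bien, aunque **el MAPE ha salido un poco más alto (1-2%)** que en otras prácticas. Esto puede deberse a los datos eliminados o a las nuevas transformaciones aplicadas. Además, el RMSE (≈ ₹23,2 M) es casi cuatro veces el MAE (≈ ₹6,1 M), lo que indica que unos pocos errores muy grandes dominan el RMSE.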
- He elegido `GradientBoostingRegressor` porque es un modelo que **se adapta bien a problemas con muchas variables**, permite capturar relaciones no lineales, y **es muy flexible** para ajustar parámetros. Además, tiene buen rendimiento incluso en datasets complejos como este.

