# 02_feature_engineering.ipynb

# ---------------------------------------------
# Feature Engineering - Open Banking Challenge
# Autor: Pablo Flores
# ---------------------------------------------

In [None]:
#1. Librerías necesarias
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import os


In [None]:
#2. Load the unified dataset
# Read the client table from disk, then preview its first rows.
input_path = "data/processed/clientes_unificados.csv"
df = pd.read_csv(input_path)
df.head()

In [None]:
#3. Create the target variable (has insurance?)
# Fall back to an all-zero target when the source data has no such column.
if "has_insurance" not in df.columns:
    df["has_insurance"] = 0
# Missing labels are filled with 0 (the same default used when the column is
# absent) so the integer cast below cannot fail on NaN values.
# NOTE(review): confirm that a missing label should mean "no insurance".
df["has_insurance"] = df["has_insurance"].fillna(0).astype(int)


In [None]:
#4. Create new features
# product_count: row-wise sum of every column whose name contains "has_",
# leaving out the target column "has_insurance".
is_product_flag = [
    ("has_" in str(col)) and col != "has_insurance" for col in df.columns
]
df["product_count"] = df.loc[:, is_product_flag].sum(axis=1)

In [None]:
# Encode the favorite category (top N kept, everything else = 'other')
# value_counts() orders by frequency, so the first five index entries are the
# five most frequent categories.
top_cats = df["favorite_category"].value_counts().index[:5]
in_top_cats = df["favorite_category"].isin(top_cats)
# Keep the value where it is one of the top categories; otherwise use "other".
df["favorite_category_enc"] = df["favorite_category"].where(in_top_cats, "other")


In [None]:

#5. Encode the categorical variables (one-hot encoding)
# drop_first=True omits the first level of each encoded variable.
categorical_vars = [
    "income_range",
    "risk_profile",
    "favorite_category_enc",
]
df_encoded = pd.get_dummies(data=df, columns=categorical_vars, drop_first=True)

In [None]:
#6. Scale the numeric variables (optional)
# The listed columns are overwritten in place with the output of a
# StandardScaler (default settings) fitted on those same columns.
numeric_cols = ["age", "total_spent", "avg_spent", "txn_count", "product_count"]
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded[numeric_cols])
df_encoded[numeric_cols] = scaled_values

In [None]:
#7. Remove columns that are not useful
# errors="ignore" skips any listed column that is not present in the frame.
drop_cols = ["user_id", "occupation", "favorite_category"]
df_encoded = df_encoded.drop(columns=drop_cols, errors="ignore")

In [None]:
#8. Save the final dataset for modeling
output_dir = "data/processed"
output_path = "data/processed/final_dataset.csv"

# Make sure the target directory exists before writing the CSV.
os.makedirs(output_dir, exist_ok=True)
df_encoded.to_csv(output_path, index=False)

print("✅ Dataset final preparado y guardado.")
# Preview the first rows of the saved frame.
df_encoded.head()