# 01_eda.ipynb

# ---------------------------------------------
# Exploración de Datos - Open Banking Challenge
# Autor: Pablo Flores
# ---------------------------------------------

In [None]:
# 🔧 1. Importación de librerías
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from src.data_preparation import load_datasets, preprocess_transactions, merge_datasets

In [None]:
# Plot styling
# Seaborn theme with a white background and grid lines.
sns.set(style="whitegrid")
# Default size (width, height in inches) for figures created afterwards.
plt.rc("figure", figsize=(12, 6))

In [None]:
# 2. Load the original (raw) datasets
# Relative paths of the three raw CSV inputs.
demo_path, products_path, trans_path = (
    "data/raw/demographics.csv",
    "data/raw/products.csv",
    "data/raw/transactions.csv",
)

# The project helper's result is unpacked as (demographics, products, transactions).
demographics, products, transactions = load_datasets(
    demo_path,
    products_path,
    trans_path,
)

In [None]:
# 3. Basic preprocessing
# Run the project's preprocessing helper over the loaded transactions.
# NOTE(review): the result is rebound to the same name, so re-running this cell
# feeds already-preprocessed data back into the helper — confirm the helper is
# idempotent, or restart the kernel before re-running.
transactions = preprocess_transactions(transactions)

In [None]:

# 4. Merge the datasets
# Combine demographics, transactions and products into the single frame
# (df_model) that every later cell in this notebook reads from.
# NOTE(review): the argument order here (demographics, transactions, products)
# differs from the load order (demographics, products, transactions) —
# presumably this matches merge_datasets' signature; verify.
df_model = merge_datasets(demographics, transactions, products)


In [None]:

# Persist the processed dataset
processed_dir = "data/processed"
output_csv = "data/processed/clientes_unificados.csv"

# exist_ok=True keeps re-runs from failing when the folder is already there.
os.makedirs(processed_dir, exist_ok=True)
# index=False leaves the frame's index out of the written file.
df_model.to_csv(output_csv, index=False)


In [None]:

# 5. Overview of the unified dataset
# Show the first rows, then summary statistics for every column
# (include="all" also covers the non-numeric ones).
display(
    df_model.head(),
    df_model.describe(include="all"),
)


In [None]:

# 6. Exploratory analysis of the numeric variables
# num_cols is also read by the correlation cell later in the notebook.
num_cols = ["age", "total_spent", "avg_spent", "txn_count"]

# One histogram per numeric column, all drawn on a single figure.
df_model[num_cols].hist(bins=20, figsize=(12, 8))

# Grab the current figure (the one the histograms were just drawn on)
# to add an overall title and tighten the layout.
hist_fig = plt.gcf()
hist_fig.suptitle("Distribuciones numéricas", fontsize=16)
hist_fig.tight_layout()
plt.show()

In [None]:

# 7. Average spend per income range
# Explicit left-to-right order for the income brackets on the x axis.
income_order = ["<30k", "30k-50k", "50k-100k", "100k-150k", ">150k"]

# ci=None turns off the error bars.
# NOTE(review): `ci` is deprecated since seaborn 0.12 in favour of
# `errorbar=None`; switch once the minimum seaborn version is confirmed.
bar_ax = sns.barplot(
    data=df_model,
    x="income_range",
    y="avg_spent",
    ci=None,
    order=income_order,
)
bar_ax.set_title("Gasto promedio mensual vs. ingreso declarado")
# Tilt the bracket labels so they do not overlap.
plt.xticks(rotation=45)
plt.show()


In [None]:
# 8. Does age influence the total amount spent?
# One point per row of df_model, coloured by its risk_profile value.
scatter_ax = sns.scatterplot(
    data=df_model,
    x="age",
    y="total_spent",
    hue="risk_profile",
)
scatter_ax.set_title("Edad vs. gasto total - Segmentado por perfil de riesgo")
plt.show()

In [None]:

# 9. Distribution of favourite categories
# value_counts() sorts by frequency (descending), so the first ten index
# entries are the ten most common categories.
top_categories = df_model["favorite_category"].value_counts().index[:10]

# Horizontal bars (category on the y axis), restricted to and ordered by that top ten.
count_ax = sns.countplot(
    data=df_model,
    y="favorite_category",
    order=top_categories,
)
count_ax.set_title("Top 10 categorías favoritas de consumo")
plt.show()

In [None]:

# 10. Correlations
# Pairwise correlation between the numeric columns (num_cols is defined in the
# numeric-histograms cell earlier in the notebook).
corr = df_model[num_cols].corr()

# Pin the colour scale to the full correlation range [-1, 1]. Without this the
# diverging "coolwarm" palette is stretched over the data's own min/max, so its
# neutral midpoint does not sit at 0 and weak positive correlations can be
# drawn in the same deep blue a strong negative one would get.
sns.heatmap(
    corr,
    annot=True,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
)
plt.title("Correlación entre variables numéricas")
plt.show()