In [2]:
pip install pandas numpy scikit-learn

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Applications/Xcode.app/Contents/Developer/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings("ignore")

# === 1. Cargar datos ===
transacciones = pd.read_csv("datos/base_transacciones_final.csv", parse_dates=["fecha"])
clientes = pd.read_csv("datos/base_clientes_final.csv", parse_dates=["fecha_nacimiento", "fecha_alta"])

# === 2. Renombrar id a id_cliente para merge
transacciones.rename(columns={"id": "id_cliente"}, inplace=True)
clientes.rename(columns={"id": "id_cliente"}, inplace=True)

# === 3. Unir las bases ===
df = transacciones.merge(clientes, on="id_cliente", how="left")

# === 4. Ordenar por cliente y fecha ===
df = df.sort_values(by=["id_cliente", "fecha"])

# === 5. Calcular días hasta próxima transacción ===
df["fecha_siguiente"] = df.groupby("id_cliente")["fecha"].shift(-1)
df["dias_hasta_proximo"] = (df["fecha_siguiente"] - df["fecha"]).dt.days
df = df.dropna(subset=["dias_hasta_proximo"])

# === 6. Features adicionales ===
df["edad"] = ((df["fecha"] - df["fecha_nacimiento"]).dt.days / 365).astype(int)
df["antiguedad"] = (df["fecha"] - df["fecha_alta"]).dt.days
df["mes"] = df["fecha"].dt.month
df["dia_semana"] = df["fecha"].dt.weekday
df["dias_desde_ultimo"] = df.groupby("id_cliente")["fecha"].diff().dt.days
df["dias_desde_ultimo"] = df["dias_desde_ultimo"].fillna(df["dias_desde_ultimo"].median())

# === 7. Variables y target ===
X = df[[
    "mes", "dia_semana", "edad", "antiguedad", "dias_desde_ultimo",
    "giro_comercio", "tipo_venta", "genero", "actividad_empresarial", "tipo_persona"
]]
y = df["dias_hasta_proximo"]

# === 8. Preprocesamiento ===
numeric_features = ["mes", "dia_semana", "edad", "antiguedad", "dias_desde_ultimo"]
categorical_features = ["giro_comercio", "tipo_venta", "genero", "actividad_empresarial", "tipo_persona"]

preprocessor = ColumnTransformer(transformers=[
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_features)
])

# === 9. Pipeline y modelo ===
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("regressor", RandomForestRegressor(n_estimators=200, max_depth=15, random_state=42))
])

# === 10. Entrenamiento ===
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=False, test_size=0.2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# === 11. Evaluación ===
mae = mean_absolute_error(y_test, y_pred)
print(f"MAE (Error absoluto medio): {mae:.2f} días")


MAE (Error absoluto medio): 1.17 días
