In [1]:
# =========================================
# 1) Cargar datos y objetivo (REGRESIÓN)
# =========================================
import os, warnings, datetime, platform, json
import numpy as np
import pandas as pd
warnings.filterwarnings("ignore")

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

TS = "v1" #datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

DATA_FILE = "credito_cleandata_4model.csv"     # <-- tu archivo limpio
assert os.path.exists(DATA_FILE), f"No se encuentra {DATA_FILE}"
df = pd.read_csv(DATA_FILE)

target_clf = "Aprobado"
target_reg = "Line credito"
TARGET     = target_reg
df = df.drop(columns=[target_clf], errors="ignore")

y = df[TARGET].astype(float)
X = df.drop(columns=[TARGET]).copy()

print("Shape:", X.shape)
print("Target:", TARGET)
print("Resumen (y):")
print(y.describe())

# (opcional) export meta del paso 1
# pd.DataFrame({"columns": X.columns, "dtype": [str(X[c].dtype) for c in X.columns]}).to_csv(
#     f"reports/step1_overview_{TS}.csv", index=False
# )


Shape: (60, 7)
Target: Line credito
Resumen (y):
count      60.000000
mean     5250.300000
std      1793.361547
min      2152.000000
25%      3673.250000
50%      5560.000000
75%      6885.250000
max      8453.000000
Name: Line credito, dtype: float64


In [2]:
df

Unnamed: 0,Sexo,Trabajo dependiente,Funcion principal,Nivel estudio,Edad,Antiguedad ultimo trabajo,Ingreso mes,Line credito
0,Femenino,Sí,Administrativo,Secundaria,25,3,2798,2588
1,Femenino,Sí,Producción,Técnico,35,5,4044,3932
2,Femenino,Sí,Servicio al cliente,Bachiller,32,2,3210,3577
3,Masculino,Sí,Administrativo,Bachiller,44,5,4527,4871
4,Masculino,Sí,Servicio al cliente,Magíster,45,6,5018,5657
5,Masculino,Sí,Servicio al cliente,Técnico,40,4,4327,4216
6,Masculino,Sí,Administrativo,Secundaria,42,5,4402,3963
7,Masculino,Sí,Producción,Técnico,32,1,3464,3635
8,Masculino,No,Consultor,Secundaria,29,4,3253,2776
9,Masculino,Sí,Ventas,Secundaria,26,2,2344,2292


In [3]:
# =========================================
# 2) Split temprano (holdout 80/20)
# =========================================
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE
)
print("TRAIN:", X_train.shape, "TEST:", X_test.shape)


TRAIN: (48, 7) TEST: (12, 7)


In [4]:
# =========================================
# 3) Preprocesamiento en pipeline (REGRESIÓN)
#     - OHE para nominales
#     - OrdinalEncoder para 'Nivel estudio' con el orden real
#     - StandardScaler para numéricas
# =========================================
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
import pandas as pd

# Listas base (ajústalas si cambias columnas)
cat_nominal = ["Sexo", "Trabajo dependiente", "Funcion principal"]
cat_ordinal = ["Nivel estudio"]
num_cols    = ["Edad", "Antiguedad ultimo trabajo", "Ingreso mes"]

# Orden explícito para 'Nivel estudio' (coincide con tu CSV)
orden_estudio = [["Secundaria", "Técnico", "Bachiller", "Magíster", "Doctor"]]

# Filtrar solo columnas existentes (por si el CSV cambia)
num_features = [c for c in num_cols if c in X.columns]
cat_nominal_ = [c for c in cat_nominal if c in X.columns]
cat_ordinal_ = [c for c in cat_ordinal if c in X.columns]
cat_features = cat_nominal_ + cat_ordinal_

# Comprobación rápida de niveles fuera de catálogo (te avisa pero no rompe)
if "Nivel estudio" in cat_ordinal_:
    niveles_en_datos = pd.Series(X["Nivel estudio"].astype("string").str.strip().unique()).sort_values().tolist()
    fuera = sorted(set(niveles_en_datos) - set(orden_estudio[0]))
    if fuera:
        print("[aviso] 'Nivel estudio' tiene niveles fuera del orden declarado:", fuera)

preprocessor = ColumnTransformer(
    transformers=[
        # Numéricas
        ("num", StandardScaler(), num_features),

        # Nominales (one-hot). Ignora niveles no vistos; usa drop='first' para evitar dummy trap.
        ("nominal", OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"), cat_nominal_),

        # Ordinal (orden declarado). Desconocidos -> -1 (queda claro para el modelo).
        ("ordinal", OrdinalEncoder(
            categories=orden_estudio[:len(cat_ordinal_)],
            handle_unknown="use_encoded_value", unknown_value=-1
        ), cat_ordinal_),
    ],
    remainder="drop",
    verbose_feature_names_out=True
)

def build_pipe(model):
    return Pipeline(steps=[
        ("prep", preprocessor),
        ("model", model),
    ])

print("num_features:", num_features)
print("cat_nominal:", cat_nominal_)
print("cat_ordinal:", cat_ordinal_)


num_features: ['Edad', 'Antiguedad ultimo trabajo', 'Ingreso mes']
cat_nominal: ['Sexo', 'Trabajo dependiente', 'Funcion principal']
cat_ordinal: ['Nivel estudio']


In [5]:
# =========================================
# 4) Modelos candidatos (Regresión)
# =========================================
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor

# Boosting (con guardas)
try:
    from xgboost import XGBRegressor
    _HAS_XGB = True
except Exception:
    XGBRegressor = None; _HAS_XGB = False; print("Aviso: xgboost no disponible -> omitimos XGBR")

try:
    from lightgbm import LGBMRegressor
    _HAS_LGB = True
except Exception:
    LGBMRegressor = None; _HAS_LGB = False; print("Aviso: lightgbm no disponible -> omitimos LGBR")

try:
    from catboost import CatBoostRegressor
    _HAS_CAT = True
except Exception:
    CatBoostRegressor = None; _HAS_CAT = False; print("Aviso: catboost no disponible -> omitimos CATR")

candidates = [
    ("LIN", LinearRegression()),
    ("RID", Ridge(random_state=RANDOM_STATE)),
    ("LAS", Lasso(random_state=RANDOM_STATE, max_iter=10000)),
    ("ENR", ElasticNet(random_state=RANDOM_STATE, max_iter=10000)),
    ("KNR", KNeighborsRegressor()),
    ("DTR", DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ("RFR", RandomForestRegressor(n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1)),
    ("MLP", MLPRegressor(hidden_layer_sizes=(64,), max_iter=1000, random_state=RANDOM_STATE)),
]
if _HAS_XGB:
    candidates.append(("XGBR", XGBRegressor(
        tree_method="hist", random_state=RANDOM_STATE, n_estimators=400,
        learning_rate=0.05, max_depth=6, subsample=0.9, colsample_bytree=0.9, n_jobs=-1
    )))
if _HAS_LGB:
    candidates.append(("LGBR", LGBMRegressor(
        n_estimators=500, learning_rate=0.05, max_depth=-1,
        subsample=0.9, colsample_bytree=0.9, random_state=RANDOM_STATE, n_jobs=-1, verbosity=-1
    )))
if _HAS_CAT:
    candidates.append(("CATR", CatBoostRegressor(
        iterations=600, learning_rate=0.05, depth=6, random_state=RANDOM_STATE,
        l2_leaf_reg=3.0, verbose=False, allow_writing_files=False, thread_count=-1
    )))

print("Candidatos:", [n for n,_ in candidates])


Candidatos: ['LIN', 'RID', 'LAS', 'ENR', 'KNR', 'DTR', 'RFR', 'MLP', 'XGBR', 'LGBR', 'CATR']


In [6]:
# =========================================
# 5) Baseline con CV (sin tuning) — métricas regresión
#    Ranking por RMSE asc, desempate por MAE y R2
# =========================================
from sklearn.model_selection import KFold, cross_validate
import numpy as np, pandas as pd

cv = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

scoring = {
    "neg_mae": "neg_mean_absolute_error",
    "neg_rmse": "neg_root_mean_squared_error",
    "r2": "r2",
    "evs": "explained_variance",
    "neg_mape": "neg_mean_absolute_percentage_error"
}

baseline_rows = []
for name, model in candidates:
    pipe = build_pipe(model)
    s = cross_validate(pipe, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1, error_score=np.nan)
    row = {
        "model": name,
        "MAE":  -s["test_neg_mae"].mean(),
        "RMSE": -s["test_neg_rmse"].mean(),
        "MAPE": -s["test_neg_mape"].mean(),
        "R2":    s["test_r2"].mean(),
        "EVS":   s["test_evs"].mean(),
    }
    baseline_rows.append(row)
    print(f"{name:>4} | MAE {row['MAE']:.3f} | RMSE {row['RMSE']:.3f} | R2 {row['R2']:.3f} | EVS {row['EVS']:.3f}")

baseline_df = (pd.DataFrame(baseline_rows)
                 .sort_values(["RMSE","MAE","R2"], ascending=[True, True, False])
                 .reset_index(drop=True))

baseline_best_name  = baseline_df.iloc[0]["model"]
baseline_best_model = dict(candidates)[baseline_best_name]
print(f">>> Baseline ganador: {baseline_best_name}")




 LIN | MAE 85.288 | RMSE 102.171 | R2 0.995 | EVS 0.996




 RID | MAE 104.332 | RMSE 126.006 | R2 0.994 | EVS 0.994




 LAS | MAE 76.811 | RMSE 91.436 | R2 0.996 | EVS 0.997




 ENR | MAE 439.875 | RMSE 498.860 | R2 0.905 | EVS 0.912




 KNR | MAE 509.247 | RMSE 576.546 | R2 0.863 | EVS 0.872




 DTR | MAE 391.704 | RMSE 489.537 | R2 0.905 | EVS 0.913




 RFR | MAE 331.460 | RMSE 417.979 | R2 0.928 | EVS 0.934




 MLP | MAE 4533.705 | RMSE 4751.315 | R2 -7.555 | EVS 0.270




XGBR | MAE 300.172 | RMSE 383.156 | R2 0.945 | EVS 0.952




LGBR | MAE 1488.404 | RMSE 1675.764 | R2 -0.022 | EVS 0.000
CATR | MAE 330.161 | RMSE 418.255 | R2 0.934 | EVS 0.939
>>> Baseline ganador: LAS




In [7]:
# =========================================
# 5.x) Exportar métricas baseline a CSV
# =========================================
import os
REPORTS_DIR = "reports"; os.makedirs(REPORTS_DIR, exist_ok=True)
metrics_csv = os.path.join(REPORTS_DIR, f"step5_baseline_metrics_{TS}.csv")
baseline_df.to_csv(metrics_csv, index=False)
print("✔ CSV exportado:", metrics_csv)


✔ CSV exportado: reports/step5_baseline_metrics_v1.csv


In [8]:
# =========================================
# 6) Tuning con CV y elección del ganador (rápido)
#     - Refit por RMSE (neg_root_mean_squared_error)
# =========================================
from sklearn.model_selection import RandomizedSearchCV, KFold
from scipy.stats import randint, uniform
try:
    from scipy.stats import loguniform
except Exception:
    from sklearn.utils.fixes import loguniform

cv_light = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
cv_heavy = KFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

param_spaces = {
    "RID": {"model__alpha": loguniform(1e-3, 1e2)},
    "LAS": {"model__alpha": loguniform(1e-4, 1e1)},
    "ENR": {"model__alpha": loguniform(1e-4, 1e1), "model__l1_ratio": uniform(0.0,1.0)},
    "KNR": {"model__n_neighbors": randint(3, 40), "model__weights": ["uniform","distance"]},
    "DTR": {"model__max_depth": randint(3, 20), "model__min_samples_leaf": randint(1, 8)},
    "RFR": {"model__n_estimators": randint(200, 600), "model__max_depth": randint(4, 20),
            "model__min_samples_split": randint(2, 16), "model__min_samples_leaf": randint(1, 8),
            "model__max_features": ["sqrt","log2", None], "model__bootstrap": [True, False]},
    "XGBR": {"model__n_estimators": randint(250, 600), "model__learning_rate": loguniform(5e-3, 2e-1),
             "model__max_depth": randint(3, 10), "model__subsample": uniform(0.6, 0.4),
             "model__colsample_bytree": uniform(0.6, 0.4), "model__min_child_weight": randint(1, 8)},
    "LGBR": {"model__n_estimators": randint(300, 800), "model__learning_rate": loguniform(5e-3, 2e-1),
             "model__num_leaves": randint(16, 256), "model__max_depth": randint(-1, 16),
             "model__min_child_samples": randint(10, 80), "model__subsample": uniform(0.6, 0.4),
             "model__colsample_bytree": uniform(0.6, 0.4), "model__reg_lambda": loguniform(1e-3, 10)},
    "CATR": {"model__iterations": randint(300, 700), "model__learning_rate": loguniform(5e-3, 2e-1),
             "model__depth": randint(4, 10), "model__l2_leaf_reg": loguniform(1e-2, 30),
             "model__border_count": randint(32, 255)},
    "MLP":  {"model__hidden_layer_sizes": [(64,), (128,), (64,32)], "model__alpha": loguniform(1e-5, 1e-1)},
}

to_tune = [
    ("RID", Ridge(random_state=RANDOM_STATE)),
    ("LAS", Lasso(random_state=RANDOM_STATE, max_iter=10000)),
    ("ENR", ElasticNet(random_state=RANDOM_STATE, max_iter=10000)),
    ("KNR", KNeighborsRegressor()),
    ("DTR", DecisionTreeRegressor(random_state=RANDOM_STATE)),
    ("RFR", RandomForestRegressor(random_state=RANDOM_STATE, n_jobs=1)),
    ("MLP", MLPRegressor(random_state=RANDOM_STATE, max_iter=1000)),
]
if _HAS_XGB: to_tune.append(("XGBR", XGBRegressor(tree_method="hist", random_state=RANDOM_STATE, n_jobs=1)))
if _HAS_LGB: to_tune.append(("LGBR", LGBMRegressor(random_state=RANDOM_STATE, n_jobs=1, verbosity=-1)))
if _HAS_CAT: to_tune.append(("CATR", CatBoostRegressor(random_state=RANDOM_STATE, verbose=False,
                                                       allow_writing_files=False, thread_count=1)))

scoring_opt = {
    "neg_rmse": "neg_root_mean_squared_error",
    "neg_mae":  "neg_mean_absolute_error",
    "neg_mape": "neg_mean_absolute_percentage_error",
    "r2": "r2",
    "evs": "explained_variance",
}
refit_metric = "neg_rmse"

opt_rows, best_models = [], []
for name, base_model in to_tune:
    pipe = build_pipe(base_model)
    heavy = name in ["XGBR","LGBR","CATR","RFR","MLP"]

    search = RandomizedSearchCV(
        pipe, param_spaces[name], n_iter=(15 if heavy else 12),
        cv=(cv_heavy if heavy else cv_light), scoring=scoring_opt,
        refit=refit_metric, n_jobs=-1, random_state=RANDOM_STATE,
        verbose=1, return_train_score=False, error_score=np.nan
    )
    search.fit(X_train, y_train)

    idx = search.best_index_
    cvres = search.cv_results_
    row = {
        "model": name,
        "RMSE": -float(cvres["mean_test_neg_rmse"][idx]),
        "MAE":  -float(cvres["mean_test_neg_mae"][idx]),
        "MAPE": -float(cvres["mean_test_neg_mape"][idx]),
        "R2":    float(cvres["mean_test_r2"][idx]),
        "EVS":   float(cvres["mean_test_evs"][idx]),
        "best_params_len": int(len(search.best_params_)),
    }
    opt_rows.append(row)
    best_models.append((name, search.best_estimator_, -row["RMSE"], search.best_params_))  # para ordenar por RMSE asc
    print(f"{name:>4} | RMSE {row['RMSE']:.3f} | MAE {row['MAE']:.3f} | R2 {row['R2']:.3f}")

tuning_df = (pd.DataFrame(opt_rows)
             .sort_values(["RMSE","MAE","R2"], ascending=[True, True, False])
             .reset_index(drop=True))
best_models.sort(key=lambda x: x[2], reverse=True)  # mayor -RMSE == menor RMSE
best_name, final_pipe_opt, _, best_params = best_models[0]
print(f">>> GANADOR OPTIMIZADO: {best_name}")


Fitting 5 folds for each of 12 candidates, totalling 60 fits




 RID | RMSE 98.382 | MAE 84.072 | R2 0.996
Fitting 5 folds for each of 12 candidates, totalling 60 fits
 LAS | RMSE 87.549 | MAE 72.351 | R2 0.997
Fitting 5 folds for each of 12 candidates, totalling 60 fits


  model = cd_fast.enet_coordinate_descent(


 ENR | RMSE 98.445 | MAE 84.060 | R2 0.996
Fitting 5 folds for each of 12 candidates, totalling 60 fits
 KNR | RMSE 521.172 | MAE 443.007 | R2 0.886
Fitting 5 folds for each of 12 candidates, totalling 60 fits




 DTR | RMSE 474.182 | MAE 396.760 | R2 0.906
Fitting 3 folds for each of 15 candidates, totalling 45 fits




 RFR | RMSE 471.360 | MAE 395.769 | R2 0.918
Fitting 3 folds for each of 15 candidates, totalling 45 fits




 MLP | RMSE 1006.284 | MAE 822.107 | R2 0.621
Fitting 3 folds for each of 15 candidates, totalling 45 fits




XGBR | RMSE 339.480 | MAE 284.372 | R2 0.957
Fitting 3 folds for each of 15 candidates, totalling 45 fits




LGBR | RMSE 1705.444 | MAE 1516.250 | R2 -0.042
Fitting 3 folds for each of 15 candidates, totalling 45 fits




CATR | RMSE 365.216 | MAE 296.425 | R2 0.952
>>> GANADOR OPTIMIZADO: LAS




In [9]:
# =========================================
# 6.x) Exportar métricas tuning a CSV
# =========================================
os.makedirs("reports", exist_ok=True)
tuning_csv = os.path.join("reports", f"step6_tuning_metrics_{TS}.csv")
tuning_df.to_csv(tuning_csv, index=False)
print("✔ CSV exportado:", tuning_csv)


✔ CSV exportado: reports/step6_tuning_metrics_v1.csv


In [10]:
# =========================================
# 7) Comparación justa (solo CV) — baseline vs tuned
#     Mismo CV y métricas. Decide por RMSE (mejora ≥ 1%)
# =========================================
from sklearn.model_selection import KFold, cross_validate

same_cv = KFold(n_splits=5, shuffle=True, random_state=123)
scoring7 = {"neg_rmse":"neg_root_mean_squared_error", "neg_mae":"neg_mean_absolute_error", "r2":"r2", "evs":"explained_variance"}

def cv_summary(pipe, tag):
    s = cross_validate(pipe, X_train, y_train, cv=same_cv, scoring=scoring7, n_jobs=-1)
    rmse = -s["test_neg_rmse"].mean()
    mae  = -s["test_neg_mae"].mean()
    r2   =  s["test_r2"].mean()
    evs  =  s["test_evs"].mean()
    print(f"{tag:>14} | RMSE {rmse:.3f} | MAE {mae:.3f} | R2 {r2:.3f} | EVS {evs:.3f}")
    return {"RMSE": rmse, "MAE": mae, "R2": r2, "EVS": evs}

pipe_baseline_best = build_pipe(baseline_best_model)
pipe_tuned_best    = final_pipe_opt

row_base = cv_summary(pipe_baseline_best, f"Baseline({baseline_best_name})")
row_tune = cv_summary(pipe_tuned_best,   f"Tuned({best_name})")

delta_rmse = row_base["RMSE"] - row_tune["RMSE"]  # positivo = mejora
print(f"ΔRMSE (Baseline - Tuned): {delta_rmse:.4f}")

# Regla: mejora relativa ≥ 1% en RMSE → Tuned; si no, Baseline
rel = delta_rmse / row_base["RMSE"] if row_base["RMSE"] > 0 else 0.0
if rel >= 0.01:
    winner_name, winner_pipe = best_name, pipe_tuned_best
else:
    winner_name, winner_pipe = baseline_best_name, pipe_baseline_best

print(f">>> Modelo seleccionado para TEST: {winner_name}")

compare_df = pd.DataFrame([
    {"model": f"Baseline({baseline_best_name})", **row_base},
    {"model": f"Tuned({best_name})",            **row_tune},
])


 Baseline(LAS) | RMSE 92.422 | MAE 77.226 | R2 0.997 | EVS 0.998
    Tuned(LAS) | RMSE 90.694 | MAE 76.720 | R2 0.997 | EVS 0.998
ΔRMSE (Baseline - Tuned): 1.7272
>>> Modelo seleccionado para TEST: LAS




In [11]:
# =========================================
# 7.x) Exportar comparación justa a CSV
# =========================================
cmp_csv = os.path.join("reports", f"step7_fair_compare_{TS}.csv")
compare_df.to_csv(cmp_csv, index=False)
print("✔ CSV exportado:", cmp_csv)


✔ CSV exportado: reports/step7_fair_compare_v1.csv


In [12]:
# =========================================
# 8) Política de decisión (Residuales OOF en TRAIN)
#     - OOF con cross_val_predict (predict)
#     - Residuales y diagnóstico simple
# =========================================
from sklearn.model_selection import KFold, cross_val_predict

cv_res = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
oof_pred = cross_val_predict(winner_pipe, X_train, y_train, cv=cv_res, method="predict", n_jobs=-1)
residuals_train = y_train.values - oof_pred
residuals_df = pd.DataFrame({"y_true": y_train.values, "y_pred": oof_pred, "residual": residuals_train})
print(residuals_df.describe().round(4))

# (opcional export)
# residuals_df.to_csv(os.path.join("reports", f"step8_residuals_train_{TS}.csv"), index=False)


          y_true     y_pred  residual
count    48.0000    48.0000   48.0000
mean   4996.9375  4996.6272    0.3103
std    1707.4903  1702.0189   89.0275
min    2152.0000  2245.6755 -186.1989
25%    3527.2500  3489.1155  -69.3569
50%    5073.5000  5076.2277   -9.9643
75%    6402.7500  6344.4437   47.8293
max    7755.0000  7722.3031  172.3162




In [13]:
# =========================================
# 9) Evaluación final en TEST
# =========================================
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, explained_variance_score

winner_pipe.fit(X_train, y_train)
y_pred_test = winner_pipe.predict(X_test)

MAE  = mean_absolute_error(y_test, y_pred_test)
RMSE = mean_squared_error(y_test, y_pred_test, squared=False)
R2   = r2_score(y_test, y_pred_test)
EVS  = explained_variance_score(y_test, y_pred_test)

print(f"TEST | MAE {MAE:.3f} | RMSE {RMSE:.3f} | R2 {R2:.3f} | EVS {EVS:.3f}")

residuals_test = y_test.values - y_pred_test


TEST | MAE 100.873 | RMSE 120.524 | R2 0.995 | EVS 0.996


In [14]:
# =========================================
# 9.x) Exportar métricas y residuales TEST a CSV
# =========================================
os.makedirs("reports", exist_ok=True)
pd.DataFrame([{"MAE": MAE, "RMSE": RMSE, "R2": R2, "EVS": EVS}]).to_csv(
    os.path.join("reports", f"step9_test_metrics_{TS}.csv"), index=False
)
pd.DataFrame({"y_true": y_test.values, "y_pred": y_pred_test, "residual": residuals_test}).to_csv(
    os.path.join("reports", f"step9_residuals_test_{TS}.csv"), index=False
)
print("✔ CSV exportados (paso 9.x)")


✔ CSV exportados (paso 9.x)


In [15]:
# =========================================
# 10) Interpretabilidad + Error Analysis (sin export)
#     - Importancias (árboles) o Permutation
#     - SHAP si está disponible
# =========================================
from sklearn.inspection import permutation_importance

prep   = winner_pipe.named_steps["prep"]
model  = winner_pipe.named_steps["model"]
X_test_trans = prep.transform(X_test)

# nombres de features transformados
try:
    feat_names = prep.get_feature_names_out().tolist()
except Exception:
    feat_names = num_features + cat_features

# Importancias
imp_source, imp_values = "permutation", None
try:
    # Árboles con .feature_importances_
    imp_values = getattr(model, "feature_importances_", None)
    if imp_values is not None:
        imp_source = "model_feature_importances_"
    else:
        raise AttributeError
except Exception:
    pi = permutation_importance(model, X_test_trans, y_test, n_repeats=10, random_state=RANDOM_STATE, n_jobs=-1)
    imp_values = pi.importances_mean
    imp_source = "permutation_importance"

order = np.argsort(-np.abs(imp_values))[:20]
print(f"\n== Importancias (fuente={imp_source}) — top 20 ==")
for i, j in enumerate(order, 1):
    print(f"{i:>2}. {feat_names[j]:<40} | {float(imp_values[j]): .6f}")

# SHAP (opcional)
shap_abs_mean, shap_source = None, None
try:
    import shap
    rng = np.random.RandomState(RANDOM_STATE)
    idx = rng.choice(np.arange(X_test_trans.shape[0]), size=min(500, X_test_trans.shape[0]), replace=False)
    explainer = shap.Explainer(lambda X: model.predict(X), X_test_trans[idx])
    sv = explainer(X_test_trans[idx], max_evals=500)
    shap_abs_mean = np.mean(np.abs(sv.values), axis=0)
    shap_source = "shap_explainer"
    order_s = np.argsort(-shap_abs_mean)[:20]
    print(f"\n== SHAP mean |value| (fuente={shap_source}) — top 20 ==")
    for i, j in enumerate(order_s, 1):
        print(f"{i:>2}. {feat_names[j]:<40} | {float(shap_abs_mean[j]): .6f}")
except Exception:
    print("\n== SHAP no disponible; se omite ==")



== Importancias (fuente=permutation_importance) — top 20 ==
 1. num__Ingreso mes                         |  1.361383
 2. ordinal__Nivel estudio                   |  0.070791
 3. nominal__Trabajo dependiente_Sí          |  0.003737
 4. num__Edad                                |  0.000461
 5. nominal__Funcion principal_Servicio al cliente | -0.000262
 6. nominal__Sexo_Masculino                  | -0.000153
 7. num__Antiguedad ultimo trabajo           |  0.000000
 8. nominal__Funcion principal_Consultor     |  0.000000
 9. nominal__Funcion principal_Contratista   |  0.000000
10. nominal__Funcion principal_Emprendedor   |  0.000000
11. nominal__Funcion principal_Freelance     |  0.000000
12. nominal__Funcion principal_Producción    |  0.000000
13. nominal__Funcion principal_Ventas        |  0.000000

== SHAP no disponible; se omite ==


In [16]:
# =========================================
# 10.x) Exportar interpretabilidad/residuales outliers (opcional)
# =========================================
os.makedirs("reports", exist_ok=True)

# Importancias
imp_df = pd.DataFrame({"feature": feat_names, "importance": np.asarray(imp_values, dtype=float)})
imp_df["rank"] = imp_df["importance"].abs().rank(method="dense", ascending=False).astype(int)
imp_df.sort_values(["rank","feature"]).to_csv(os.path.join("reports", f"step10_feature_importance_{TS}.csv"), index=False)

# SHAP (si hay)
if 'shap_abs_mean' in globals() and shap_abs_mean is not None:
    shap_df = pd.DataFrame({"feature": feat_names, "shap_mean_abs": np.asarray(shap_abs_mean, dtype=float)})
    shap_df["rank"] = shap_df["shap_mean_abs"].rank(method="dense", ascending=False).astype(int)
    shap_df.sort_values(["rank","feature"]).to_csv(os.path.join("reports", f"step10_shap_meanabs_{TS}.csv"), index=False)

# Outliers: top |residual| en TEST
k = min(50, len(residuals_test))
out_idx = np.argsort(-np.abs(residuals_test))[:k]
pd.DataFrame({
    "idx": out_idx,
    "y_true": y_test.values[out_idx],
    "y_pred": y_pred_test[out_idx],
    "residual": residuals_test[out_idx]
}).to_csv(os.path.join("reports", f"step10_residuals_outliers_{TS}.csv"), index=False)

print("✔ CSV exportados (paso 10.x)")


✔ CSV exportados (paso 10.x)


In [17]:
# =========================================
# 11) Exportar artefactos (despliegue) — sin umbral
# =========================================
import json, joblib, sys, hashlib
from pathlib import Path
import sklearn

version_id = TS
ART_DIR = Path("artefactos") / version_id
ART_DIR.mkdir(parents=True, exist_ok=True)

# 11.1 Pipeline
pipe_path = ART_DIR / f"pipeline_{winner_name}.joblib"
joblib.dump(winner_pipe, pipe_path, compress=3)
print("✔ Pipeline:", pipe_path)

# 11.2 Esquema
schema = {
    "columns": X.columns.tolist(),
    "dtypes": {c: str(X[c].dtype) for c in X.columns},
    "num_features": num_features,
    "cat_features": cat_features,
}
(ART_DIR / "input_schema.json").write_text(json.dumps(schema, ensure_ascii=False, indent=2), encoding="utf-8")

# 11.3 Métricas TEST
test_metrics = {"MAE": float(MAE), "RMSE": float(RMSE), "R2": float(R2), "EVS": float(EVS)}
(ART_DIR / "decision_policy.json").write_text(json.dumps({"winner": winner_name, "test_metrics": test_metrics}, ensure_ascii=False, indent=2), encoding="utf-8")

# 11.4 Model Card mínima
model_card = f"""# Model Card — {winner_name} (Regresión)
**Versión:** {version_id}  
**Entorno:** Python {platform.python_version()} | scikit-learn {sklearn.__version__}

## Datos
Archivo: `{DATA_FILE}`  
Shape: {X.shape}  
Objetivo (continuo): `{TARGET}`

## Entrenamiento
Split 80/20 (random_state={RANDOM_STATE}).
Preprocesamiento: StandardScaler(num) + OHE(nominal) + OrdinalEncoder(ordinal).

## Métricas TEST
MAE={MAE:.3f} | RMSE={RMSE:.3f} | R2={R2:.3f} | EVS={EVS:.3f}

## Artefactos
- `pipeline_{winner_name}.joblib`
- `input_schema.json`
- `decision_policy.json`
"""
(ART_DIR / "model_card.md").write_text(model_card, encoding="utf-8")

# 11.5 Samples
samples_in  = X_test.iloc[:5].to_dict(orient="records")
samples_out = [{"y_pred": float(y_pred_test[i])} for i in range(min(5, len(y_pred_test)))]
(ART_DIR / "sample_inputs.json").write_text(json.dumps(samples_in, ensure_ascii=False, indent=2), encoding="utf-8")
(ART_DIR / "sample_outputs.json").write_text(json.dumps(samples_out, ensure_ascii=False, indent=2), encoding="utf-8")

# 11.6 Hash
sha256 = hashlib.sha256()
with open(pipe_path, "rb") as fh:
    for chunk in iter(lambda: fh.read(1<<20), b""):
        sha256.update(chunk)
(ART_DIR / "pipeline_hash.json").write_text(json.dumps({"file": pipe_path.name, "sha256": sha256.hexdigest()}, ensure_ascii=False, indent=2), encoding="utf-8")

print(f"✔ Artefactos guardados en: {ART_DIR.resolve()}")


✔ Pipeline: artefactos/v1/pipeline_LAS.joblib
✔ Artefactos guardados en: /opt/notebooks/252ml/Regre/artefactos/v1
