In [1]:
import pandas as pd

# Load the test and train splits (in that order) with identical options:
# no column is used as the index and each CSV is read from a zip archive.
datatest, datatrain = (
    pd.read_csv(archive, index_col=False, compression="zip")
    for archive in (
        "../files/input/test_data.csv.zip",
        "../files/input/train_data.csv.zip",
    )
)

In [2]:
# Quick look at the test split: dimensions, first rows and column summary.
print(datatest.shape)
print(datatest.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
datatest.info()

(9000, 25)
   ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
0   2     120000    2          2         2   26     -1      2      0      0   
1  10      20000    1          3         2   35     -2     -2     -2     -2   
2  11     200000    2          3         2   34      0      0      2      0   
3  15     250000    1          1         2   29      0      0      0      0   
4  16      50000    2          3         3   23      1      2      0      0   

   ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  \
0  ...       3272       3455       3261         0      1000      1000   
1  ...          0      13007      13912         0         0         0   
2  ...       2513       1828       3731      2306        12        50   
3  ...      59696      56875      55512      3000      3000      3000   
4  ...      28771      29531      30211         0      1500      1100   

   PAY_AMT4  PAY_AMT5  PAY_AMT6  default payment next month  
0      1000  

In [3]:
# Frequency of every EDUCATION code in datatest (assumes datatest is loaded).
col = "EDUCATION"
if col in datatest.columns:
    s = datatest[col]
    # dropna=False so that missing values, if any, get their own row.
    print(s.value_counts(dropna=False))
else:
    print(f"No existe la columna '{col}' en datatest")

EDUCATION
2    4271
1    3105
3    1490
5      93
4      25
6      11
0       5
Name: count, dtype: int64


In [4]:
# Quick look at the train split: dimensions, first rows and column summary.
print(datatrain.shape)
print(datatrain.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
datatrain.info()

(21000, 25)
      ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  \
0  10748     310000    1          3         1   32      0      0      0   
1  12574      10000    2          3         1   49     -1     -1     -2   
2  29677      50000    1          2         1   28     -1     -1     -1   
3   8857      80000    2          3         1   52      2      2      3   
4  21099     270000    1          1         2   34      1      2      0   

   PAY_4  ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  \
0      0  ...      84373      57779      14163      8295      6000      4000   
1     -1  ...       1690       1138        930         0         0      2828   
2      0  ...      45975       1300      43987         0     46257      2200   
3      3  ...      40748      39816      40607      3700      1600      1600   
4      0  ...      22448      15490      17343         0      4000      2000   

   PAY_AMT4  PAY_AMT5  PAY_AMT6  default payment next mo

In [5]:
def clean_dataframe(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Rename "default payment next month" to "default" when present.
      2. Drop the first identifier column found among "ID", "Id", "id".
      3. Drop every row that contains a missing value.
      4. When an EDUCATION column exists: coerce it to numeric, drop rows
         that could not be converted, and cap values above 4 at 4.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data frame.

    Returns
    -------
    pandas.DataFrame
        New frame with the transformations above applied.
    """
    # rename() ignores labels that are absent and always returns a new frame,
    # so no explicit membership check or copy is needed here.
    cleaned = df.rename(columns={"default payment next month": "default"})

    # Only the first matching spelling of the identifier column is removed.
    id_label = next(
        (name for name in ("ID", "Id", "id") if name in cleaned.columns),
        None,
    )
    if id_label is not None:
        cleaned = cleaned.drop(columns=[id_label])

    # Discard rows with any missing value.
    cleaned = cleaned.dropna()

    if "EDUCATION" not in cleaned.columns:
        return cleaned

    # Non-numeric entries become NaN and are removed right after.
    cleaned["EDUCATION"] = pd.to_numeric(cleaned["EDUCATION"], errors="coerce")
    cleaned = cleaned.dropna(subset=["EDUCATION"])

    # Collapse every code above 4 into code 4.
    above_four = cleaned["EDUCATION"] > 4
    cleaned.loc[above_four, "EDUCATION"] = 4

    return cleaned

In [6]:
# Run the same cleaning on both splits and rebind the names to the results.
datatest, datatrain = (
    clean_dataframe(frame) for frame in (datatest, datatrain)
)

In [7]:
# Re-check the frequency of every EDUCATION code in datatest
# (assumes datatest is loaded).
col = "EDUCATION"
if col in datatest.columns:
    s = datatest[col]
    # dropna=False so that missing values, if any, get their own row.
    print(s.value_counts(dropna=False))
else:
    print(f"No existe la columna '{col}' en datatest")


EDUCATION
2    4271
1    3105
3    1490
4     129
0       5
Name: count, dtype: int64


In [8]:
# Inspect the train split again: dimensions, first rows and column summary.
print(datatrain.shape)
print(datatrain.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would emit a stray "None" line after the report.
datatrain.info()

(21000, 24)
   LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
0     310000    1          3         1   32      0      0      0      0   
1      10000    2          3         1   49     -1     -1     -2     -1   
2      50000    1          2         1   28     -1     -1     -1      0   
3      80000    2          3         1   52      2      2      3      3   
4     270000    1          1         2   34      1      2      0      0   

   PAY_5  ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  \
0      0  ...      84373      57779      14163      8295      6000      4000   
1      2  ...       1690       1138        930         0         0      2828   
2     -1  ...      45975       1300      43987         0     46257      2200   
3      3  ...      40748      39816      40607      3700      1600      1600   
4      2  ...      22448      15490      17343         0      4000      2000   

   PAY_AMT4  PAY_AMT5  PAY_AMT6  default  
0      3000  

In [9]:

def split_xy(df_test, df_train):
    """Separate features and the "default" target for both splits.

    Note the ordering: the arguments are (test, train) while the result
    lists the train pair first.

    Parameters
    ----------
    df_test, df_train : pandas.DataFrame
        Frames that each contain a "default" column.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` where each ``X_*`` is the
        frame without "default" and each ``y_*`` is that column cast to int.
    """

    def _features_and_target(frame):
        # Work on a copy so the caller's frame is never touched.
        snapshot = frame.copy()
        return snapshot.drop(columns=["default"]), snapshot["default"].astype(int)

    X_test, y_test = _features_and_target(df_test)
    X_train, y_train = _features_and_target(df_train)
    return X_train, y_train, X_test, y_test

In [10]:
# split_xy takes the test frame first but returns the train pair first.
X_train, y_train, X_test, y_test = split_xy(df_test=datatest, df_train=datatrain)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(21000, 23) (21000,) (9000, 23) (9000,)


In [11]:
# Código: crear y entrenar pipeline (Paso 3)
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Start from the columns whose dtype is already categorical (object/category).
cat_cols = [*X_train.select_dtypes(include=["object", "category"]).columns]

# Numeric columns with at most 20 distinct values are treated as categorical
# codes and added to the list as well.
num_like = X_train.select_dtypes(
    include=["int64", "int32", "float64", "float32"]
).columns
cat_cols.extend(
    [c for c in num_like if X_train[c].nunique() <= 20 and c not in cat_cols]
)

# Never encode the target, and keep only labels that really exist in X_train.
cat_cols = [c for c in cat_cols if c != "default" and c in X_train.columns]

# Preprocessing: one-hot encode the categorical columns, forward the rest as is.
preprocessor = ColumnTransformer(
    [
        (
            "cat",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            cat_cols,
        )
    ],
    remainder="passthrough",
)

# Full model: preprocessing followed by a random forest.
pipe = Pipeline(
    steps=[
        ("pre", preprocessor),
        ("clf", RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)

# Fit on the training split.
pipe.fit(X_train, y_train)

# Short confirmation message.
print("Pipeline entrenado.")

Pipeline entrenado.


In [12]:
# Build and fit the preprocessing + random-forest pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

TARGET = "default"
THRESH = 20  # numeric columns with at most this many distinct values are one-hot encoded

# Dense one-hot encoder that works across scikit-learn versions. The current
# keyword (`sparse_output`) is tried first so that releases accepting both
# spellings do not go through the deprecated `sparse` path; the legacy keyword
# is used only when the current one is rejected with a TypeError.
try:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Explicit list of categorical columns in X_train (a plain list is handed to
# the ColumnTransformer instead of a selector function).
cat_cols = list(X_train.select_dtypes(include=["object", "category"]).columns)
num_like = X_train.select_dtypes(include=["number"]).columns
cat_cols += [
    c for c in num_like if X_train[c].nunique() <= THRESH and c not in cat_cols
]
# Never encode the target, and keep only labels that exist in X_train.
cat_cols = [c for c in cat_cols if c != TARGET and c in X_train.columns]

# `transformers` is a required argument of ColumnTransformer, so it is always
# supplied. With no categorical columns an empty list is passed and
# remainder="passthrough" forwards every column unchanged.
preprocessor = ColumnTransformer(
    transformers=[("cat", ohe, cat_cols)] if cat_cols else [],
    remainder="passthrough",
)

pipe = Pipeline([
    ("pre", preprocessor),
    ("clf", RandomForestClassifier(random_state=42, n_jobs=-1)),
])

# Fit on the training split; as the last expression of the cell, the fitted
# pipeline is also displayed.
pipe.fit(X_train, y_train)

0,1,2
,steps,"[('pre', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,False
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
# Código: búsqueda de hiperparámetros (Paso 4)
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, classification_report

# Hyper-parameter grid. The "clf__" prefix routes each entry to the pipeline
# step named "clf" (the final estimator).
param_grid = dict(
    clf__n_estimators=[100, 200],
    clf__max_depth=[None, 10, 20],
    clf__min_samples_split=[2, 5],
)

# 10-fold stratified cross-validation with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Exhaustive search scored by balanced accuracy; refit=True retrains the best
# candidate on the whole training split once the search is done.
gs = GridSearchCV(
    pipe,
    param_grid,
    scoring="balanced_accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=2,
    refit=True,
)
gs.fit(X_train, y_train)

print("Mejores parámetros encontrados:", gs.best_params_)
print("Mejor score CV (balanced_accuracy):", gs.best_score_)

# Evaluate the refitted best pipeline on the held-out test split.
best = gs.best_estimator_
y_pred = best.predict(X_test)
print(
    "Balanced accuracy en test:",
    balanced_accuracy_score(y_test, y_pred),
)
print(
    "\nReporte de clasificación (test):\n",
    classification_report(y_test, y_pred),
)

Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100; total time=  13.5s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100; total time=  13.9s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100; total time=  12.3s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100; total time=  15.2s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100; total time=  15.6s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100; total time=  17.0s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100; total time=  15.0s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100; total time=  16.3s
[CV] END clf__max_depth=None, clf__min_samples_split=2, clf__n_estimators=100; total time=  14.2s
[CV] END clf__max_depth=None, clf__min_samples_split=2,

In [14]:
import os
import gzip
import pickle

# Make sure the destination folder exists before anything else.
os.makedirs("../files/models", exist_ok=True)

# Prefer the best estimator of the grid search; fall back to the plain
# pipeline 'pipe' when no fitted search object is available. hasattr() is
# False for None, so a missing 'gs' takes the fallback as well.
model_to_save = globals().get("gs")
model = (
    model_to_save.best_estimator_
    if hasattr(model_to_save, "best_estimator_")
    else globals().get("pipe")
)

if model is None:
    raise RuntimeError("No se encontró ningún modelo ('gs.best_estimator_' ni 'pipe') para guardar.")

# Serialize the model as a gzip-compressed pickle.
out_path = "../files/models/model.pkl.gz"
with gzip.open(out_path, "wb") as f:
    pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)

print(f"Modelo guardado en: {out_path}")

Modelo guardado en: ../files/models/model.pkl.gz


In [15]:
# Compute train/test metrics and save them to ../files/output/metrics.json
import os
import json
from sklearn.metrics import precision_score, balanced_accuracy_score, recall_score, f1_score

# Make sure the output folder exists.
os.makedirs("../files/output", exist_ok=True)

# Pick the trained model: the grid search's best estimator when a fitted
# search object 'gs' exists, otherwise the plain pipeline 'pipe'.
# hasattr() is False for None, so an absent 'gs' takes the fallback too.
model = (
    globals()["gs"].best_estimator_
    if hasattr(globals().get("gs"), "best_estimator_")
    else globals().get("pipe")
)

if model is None:
    raise RuntimeError("No se encontró un modelo entrenado (gs.best_estimator_ ni pipe).")

# Predictions for both splits, train first.
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)

# calcular métricas
def calc_metrics(y_true, y_pred, dataset_name):
    """Build a dictionary of classification metrics for one split.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    dataset_name : str
        Value stored under the "dataset" key (e.g. "train" or "test").

    Returns
    -------
    dict
        Keys, in order: "dataset", "precision", "balanced_accuracy",
        "recall", "f1_score". Every score is converted to a plain float.
    """
    report = {"dataset": dataset_name}
    # zero_division=0 is passed to precision, recall and F1.
    report["precision"] = float(precision_score(y_true, y_pred, zero_division=0))
    report["balanced_accuracy"] = float(balanced_accuracy_score(y_true, y_pred))
    report["recall"] = float(recall_score(y_true, y_pred, zero_division=0))
    report["f1_score"] = float(f1_score(y_true, y_pred, zero_division=0))
    return report

metrics_train = calc_metrics(y_train, y_train_pred, "train")
metrics_test = calc_metrics(y_test, y_test_pred, "test")

out_path = "../files/output/metrics.json"
# JSON-lines layout: one dictionary per line, train first. Opening with "w"
# starts the file from scratch.
with open(out_path, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(record) + "\n" for record in (metrics_train, metrics_test)
    )

print("Métricas guardadas en:", out_path)
print("train:", metrics_train)
print("test:", metrics_test)

Métricas guardadas en: ../files/output/metrics.json
train: {'dataset': 'train', 'precision': 0.9957992998833138, 'balanced_accuracy': 0.9507902833750621, 'recall': 0.9026866934630844, 'f1_score': 0.9469596094096759}
test: {'dataset': 'test', 'precision': 0.6522864538395168, 'balanced_accuracy': 0.6695931266653316, 'recall': 0.39601885804085907, 'f1_score': 0.49282920469361147}


In [16]:
import os
import json
from sklearn.metrics import confusion_matrix

# Make sure the output folder exists and point at the shared metrics file.
os.makedirs("../files/output", exist_ok=True)
out_path = "../files/output/metrics.json"

# Pick the trained model: the grid search's best estimator when a fitted
# search object 'gs' exists, otherwise the plain pipeline 'pipe'.
# hasattr() is False for None, so an absent 'gs' takes the fallback too.
model = (
    globals()["gs"].best_estimator_
    if hasattr(globals().get("gs"), "best_estimator_")
    else globals().get("pipe")
)

if model is None:
    raise RuntimeError("No se encontró un modelo entrenado (gs.best_estimator_ ni pipe).")

# Predictions for both splits (recomputed here so the cell is self-sufficient).
y_train_pred, y_test_pred = (
    model.predict(features) for features in (X_train, X_test)
)

def cm_dict(cm, dataset_name):
    """Convert a 2x2 confusion matrix into a JSON-friendly dictionary.

    Parameters
    ----------
    cm : array-like exposing ``ravel()``
        Matrix whose flattened form has exactly four entries, read as
        (true 0 / predicted 0), (true 0 / predicted 1),
        (true 1 / predicted 0), (true 1 / predicted 1).
    dataset_name : str
        Value stored under the "dataset" key.

    Returns
    -------
    dict
        Record with "type" set to "cm_matrix", the dataset name, and the
        four counts as plain ints under "true_0" and "true_1".

    Raises
    ------
    ValueError
        If the flattened matrix does not contain exactly four values.
    """
    counts = [int(value) for value in cm.ravel()]
    true0_pred0, true0_pred1, true1_pred0, true1_pred1 = counts
    return {
        "type": "cm_matrix",
        "dataset": dataset_name,
        "true_0": {"predicted_0": true0_pred0, "predicted_1": true0_pred1},
        "true_1": {"predicted_0": true1_pred0, "predicted_1": true1_pred1},
    }

# labels=[0, 1] pins each matrix to 2x2 even when a split (or its predictions)
# contains a single class; cm_dict unpacks exactly four cells.
cm_train = confusion_matrix(y_train, y_train_pred, labels=[0, 1])
cm_test = confusion_matrix(y_test, y_test_pred, labels=[0, 1])

cm_train_dict = cm_dict(cm_train, "train")
cm_test_dict = cm_dict(cm_test, "test")


def _lines_to_keep(path, replaced_datasets):
    """Return the non-empty lines of ``path`` that must survive a rewrite.

    Lines holding a "cm_matrix" record for one of ``replaced_datasets`` are
    left out; every other line (including text that is not valid JSON) is
    returned unchanged, without its trailing newline. A missing file yields
    an empty list.
    """
    if not os.path.exists(path):
        return []
    kept = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue
            try:
                record = json.loads(text)
            except ValueError:
                # Not JSON: keep foreign content as it is.
                kept.append(text)
                continue
            stale = (
                isinstance(record, dict)
                and record.get("type") == "cm_matrix"
                and record.get("dataset") in replaced_datasets
            )
            if not stale:
                kept.append(text)
    return kept


# Add the matrices to the JSON-lines file without losing the records that are
# already there. Plain appending would duplicate the matrices every time this
# cell is re-run, so matrices from an earlier run are replaced instead.
previous_lines = _lines_to_keep(out_path, {"train", "test"})
with open(out_path, "w", encoding="utf-8") as f:
    f.writelines(text + "\n" for text in previous_lines)
    f.write(json.dumps(cm_train_dict) + "\n")
    f.write(json.dumps(cm_test_dict) + "\n")

print("Matrices de confusión guardadas en:", out_path)
print("train:", cm_train_dict)
print("test:", cm_test_dict)

Matrices de confusión guardadas en: ../files/output/metrics.json
train: {'type': 'cm_matrix', 'dataset': 'train', 'true_0': {'predicted_0': 16255, 'predicted_1': 18}, 'true_1': {'predicted_0': 460, 'predicted_1': 4267}}
test: {'type': 'cm_matrix', 'dataset': 'test', 'true_0': {'predicted_0': 6688, 'predicted_1': 403}, 'true_1': {'predicted_0': 1153, 'predicted_1': 756}}
