In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier, plot_tree,  _tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

from joblib import Parallel, delayed

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances, plot_slice, plot_contour

from time import time

import pickle

In [2]:
#Modificar segun la estructura de sus carpetas, no se olviden crear las que faltan
base_path = 'C:/Users/pedro/Documents/DMEyF/dmeyf2025/'
dataset_path = base_path + 'data/processed/'
modelos_path = base_path + 'models/'
db_path = base_path + 'db/'
dataset_file = 'competencia_01_fe.csv'

ganancia_acierto = 780000
costo_estimulo = 20000
prob_threshold = 0.025

mes_train = [202101, 202102, 202103]
mes_test = 202104
mes_predict = 202106

# agregue sus semillas
semillas = [154277, 204007, 223207, 301013, 639083]

data = pd.read_csv(dataset_path + dataset_file)

In [3]:
X = data[data['foto_mes'].isin(mes_train)]
y = X['clase_ternaria']
X = X.drop(columns=['clase_ternaria'])

In [4]:
X_futuro = data[data['foto_mes'] == mes_test]
y_futuro = X_futuro['clase_ternaria']
X_futuro = X_futuro.drop(columns=['clase_ternaria'])

In [5]:
def ganancia_prob(y_hat, y, prop=1, class_index=1, threshold=0.025):
  @np.vectorize
  def ganancia_row(predicted, actual, threshold=0.025):
    return  (predicted >= threshold) * (ganancia_acierto if actual == "BAJA+2" else -costo_estimulo)

  return ganancia_row(y_hat[:,class_index], y).sum() / prop

In [6]:
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
Xi = imp_mean.fit_transform(X)
Xif = imp_mean.fit_transform(X_futuro)

Los parámetros que se pueden ajustar en el **rf** son

1. **n_estimators**: Número de árboles en el bosque.
2. **max_depth**: Profundidad máxima de los árboles.
3. **min_samples_split**: Número mínimo de muestras requeridas para dividir un nodo interno.
4. **min_samples_leaf**: Número mínimo de muestras requeridas para estar en un nodo hoja.
5. **max_features**: Número de features a usar en cada árbol. **sqrt** es una elección histórica.
6. **max_leaf_nodes**: Número máximo de nodos hoja en cada árbol.
7. **oob_score**: Indica si se usa la muestra fuera de bolsa (out-of-bag) para estimar la calidad del modelo. Para evitar hacer un **montecarlo-cross-validation** que se toma su tiempo, usaremos esta opción para buscar el mejor modelo. No es la mejor opción. Pero no es tan mala.
8. **n_jobs**: Siempre -1, para que use todos los cores presentes en 9. **max_samples**: Fracción de los samples.

Finalmente nuestra función de optimización queda la siguiente forma:

In [7]:
def objective(trial):
    max_depth = trial.suggest_int('max_depth', 15, 40)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 2000)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 200)
    max_features = trial.suggest_float('max_features', 0.05, 0.7)

    model = RandomForestClassifier(
        n_estimators=100,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        max_samples=0.7,
        random_state=semillas[0],
        n_jobs=-1,
        oob_score=True
    )

    model.fit(Xi, y)

    return ganancia_prob(model.oob_decision_function_, y)

storage_name = "sqlite:///" + db_path + "optimization_tree.db"
study_name = "p_101_random-forest-opt-fe"

study = optuna.create_study(
    direction="maximize",
    study_name=study_name,
    storage=storage_name,
    load_if_exists=True,
)

[I 2025-10-01 00:02:17,870] Using an existing study with name 'p_101_random-forest-opt-fe' instead of creating a new one.


In [8]:
# study.optimize(objective, n_trials=10)

In [9]:
optuna.visualization.plot_optimization_history(study)

In [10]:
plot_param_importances(study)

In [11]:
plot_slice(study)

In [12]:
plot_contour(study)

In [13]:
plot_contour(study, params=["max_depth", "min_samples_split"])

In [14]:
model_rf = RandomForestClassifier(
        n_estimators=100,
        **study.best_params,
        max_samples=0.7,
        random_state=semillas[0],
        n_jobs=-1,
        oob_score=True
    )

model_rf.fit(Xi, y)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,36
,min_samples_split,2
,min_samples_leaf,62
,min_weight_fraction_leaf,0.0
,max_features,0.45981680395486785
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [15]:
filename = modelos_path + 'p_106_random_forest_model_100_1.sav'
pickle.dump(model_rf, open(filename, 'wb'))

In [16]:
model_rf = pickle.load(open(filename, 'rb'))
model_rf

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,36
,min_samples_split,2
,min_samples_leaf,62
,min_weight_fraction_leaf,0.0
,max_features,0.45981680395486785
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [17]:
y_pred_rf = model_rf.predict_proba(Xif)
ganancias_rf = ganancia_prob(y_pred_rf, y_futuro)
print(f"Ganancia de modelo RF: {ganancias_rf}")

Ganancia de modelo RF: 340500000.0


Agregar código para histogramas

In [18]:
importances = model_rf.feature_importances_

features = X.columns
feat_importances = pd.DataFrame({'feature': features, 'importance': importances})
feat_importances = feat_importances.sort_values('importance', ascending=False)

feat_importances.head(25)

Unnamed: 0,feature,importance
107,ctrx_quarter,0.144139
161,mbanco_total,0.054995
154,ctarjeta_total_transacciones,0.048865
22,mcuentas_saldo,0.04098
21,cdescubierto_preacordado,0.0404
11,mpasivos_margen,0.038667
2,active_quarter,0.035444
133,Visa_status,0.033705
162,mprestamos_total,0.032658
16,mcuenta_corriente,0.031231


### Crear predicción para Kaggle

#### Entrenar el modelo con todos los datos

In [19]:
mes_total = mes_train
mes_total.append(mes_test)
X_total = data[data['foto_mes'].isin(mes_total)]
y_total = X_total['clase_ternaria']
X_total = X_total.drop(columns=['clase_ternaria'])

X_predict = data[data['foto_mes'] == mes_predict].drop(columns=['clase_ternaria'])

In [20]:
Xi_total = imp_mean.fit_transform(X_total)
Xi_predict = imp_mean.fit_transform(X_predict) 

In [21]:
model_rf_total = RandomForestClassifier(
        n_estimators=100,
        **study.best_params,
        max_samples=0.7,
        random_state=semillas[0],
        n_jobs=-1,
        oob_score=True
    )

model_rf_total.fit(Xi_total, y_total)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,36
,min_samples_split,2
,min_samples_leaf,62
,min_weight_fraction_leaf,0.0
,max_features,0.45981680395486785
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
print(model_rf_total.classes_)

['BAJA+1' 'BAJA+2' 'CONTINUA']


In [23]:
y_pred_prob = model_rf_total.predict_proba(Xi_predict)
y_pred_prob_baja = y_pred_prob[:,1]

In [24]:
prob_threshold = 0.025
y_pred = (y_pred_prob_baja >= prob_threshold).astype(int)

In [25]:
valores, cuentas = np.unique(y_pred, return_counts=True)

print(valores)  # [1 2 3 4]
print(cuentas)  # [1 2 3 4]

[0 1]
[155519   8794]


In [26]:
y_pred

array([0, 0, 0, ..., 0, 1, 0], shape=(164313,))

#### Crear archivo con output para Kaggle

In [27]:
# Crear DataFrame con número de cliente y predicción
df_resultados = pd.DataFrame({
    "numero_de_cliente": X_predict["numero_de_cliente"].values,
    "Predicted": y_pred
})


# Exportar a CSV
df_resultados.to_csv(base_path+"predicted/predicciones_fe_5.csv", index=False)