# Machine Learning

Importación de librerias

In [193]:
import pandas as pd # Librería de lectura de datos
import numpy as np # Librería de cálculo numérico

import matplotlib.pyplot as plt # Librería de visualización de datos

from sklearn.model_selection import train_test_split,GridSearchCV # Función para dividir los datos en entrenamiento y prueba
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder,LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier,plot_tree, export_text

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, roc_curve # Funciones para evaluar el rendimiento del modelo

Carga del dataset

In [194]:
df = pd.read_csv('../clean_data/telco-customer.csv')

In [195]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   gender                  7043 non-null   object 
 1   seniorcitizen           7043 non-null   object 
 2   partner                 7043 non-null   object 
 3   dependents              7043 non-null   object 
 4   tenure                  7043 non-null   int64  
 5   phoneservice            7043 non-null   object 
 6   multiplelines           7043 non-null   object 
 7   internetservice         7043 non-null   object 
 8   onlinesecurity          7043 non-null   object 
 9   onlinebackup            7043 non-null   object 
 10  deviceprotection        7043 non-null   object 
 11  techsupport             7043 non-null   object 
 12  streamingtv             7043 non-null   object 
 13  streamingmovies         7043 non-null   object 
 14  contract                7043 non-null   

In [196]:
df_copy = df.copy()

Cambio necesario a variables objetos a categóricas

In [197]:
cols_object = df.select_dtypes(include=['object']).columns
df_copy[cols_object] = df_copy[cols_object].astype('category')
df_copy.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 22 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   gender                  7043 non-null   category
 1   seniorcitizen           7043 non-null   category
 2   partner                 7043 non-null   category
 3   dependents              7043 non-null   category
 4   tenure                  7043 non-null   int64   
 5   phoneservice            7043 non-null   category
 6   multiplelines           7043 non-null   category
 7   internetservice         7043 non-null   category
 8   onlinesecurity          7043 non-null   category
 9   onlinebackup            7043 non-null   category
 10  deviceprotection        7043 non-null   category
 11  techsupport             7043 non-null   category
 12  streamingtv             7043 non-null   category
 13  streamingmovies         7043 non-null   category
 14  contract                

# Codificación de variables categóricas

## One Hot Encoding

In [198]:
# Iniciliar one hot enconder
enconder = OneHotEncoder(sparse_output=False, dtype=int)

# Obtenemos las variables categóricas
cols_categoricas = df_copy.select_dtypes(include='category').columns
#print(cols_categoricas)

#Eliminamos la variable objetivo
cols_categoricas = cols_categoricas.drop('baja')
#print(cols_categoricas)

#Aplicar One Hot Encoding a las columnas categóricas
enconded = enconder.fit_transform(df_copy[cols_categoricas])

# Obtenemos el nombre de las nuevas columnas
column_names = enconder.get_feature_names_out(cols_categoricas)

# Creación del dataframe con valores codificados
df_encoded = pd.DataFrame(enconded,columns=column_names,index=df_copy.index)
df_encoded.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 43 columns):
 #   Column                                   Non-Null Count  Dtype
---  ------                                   --------------  -----
 0   gender_Female                            7043 non-null   int64
 1   gender_Male                              7043 non-null   int64
 2   seniorcitizen_SeniorCitizen              7043 non-null   int64
 3   seniorcitizen_noSeniorCitizen            7043 non-null   int64
 4   partner_No                               7043 non-null   int64
 5   partner_Yes                              7043 non-null   int64
 6   dependents_No                            7043 non-null   int64
 7   dependents_Yes                           7043 non-null   int64
 8   phoneservice_No                          7043 non-null   int64
 9   phoneservice_Yes                         7043 non-null   int64
 10  multiplelines_No                         7043 non-null   int64
 11  mult

## Label Enconding para variable objetivo

In [199]:
# Aplicar label enconder
label_encoder = LabelEncoder()
target_var = df_copy['baja']
#print(target_var)
target_var = label_encoder.fit_transform(target_var)
print(target_var)


[0 0 1 ... 0 1 0]


## Unir en un dataframe las codificaciones

In [200]:
# Obtenemos las variables numéricas
columns_num_int = df_copy.select_dtypes(include='int64').columns
print(columns_num_int)

columns_num_float = df_copy.select_dtypes(include='float64').columns
print(columns_num_float)

#Obtenemos las variables booleanas
colums_bool = df_copy.select_dtypes(include='bool').columns
print(colums_bool)

Index(['tenure'], dtype='object')
Index(['monthlycharges', 'totalcharges'], dtype='object')
Index(['cliente_larga_duracion', 'phone+internet'], dtype='object')


In [201]:
#Unimos todas los datos en un dataframe
column_target = pd.Series(target_var, name='baja', index=df_copy.index)


X_num_int   = df_copy[columns_num_int]
X_num_float = df_copy[columns_num_float]
X_bool      = df_copy[colums_bool]

df_final = pd.concat([column_target, X_num_int, X_num_float, X_bool, df_encoded], axis=1)

df_final.head()



Unnamed: 0,baja,tenure,monthlycharges,totalcharges,cliente_larga_duracion,phone+internet,gender_Female,gender_Male,seniorcitizen_SeniorCitizen,seniorcitizen_noSeniorCitizen,...,streamingmovies_Yes,contract_Month-to-month,contract_One year,contract_Two year,paperlessbilling_No,paperlessbilling_Yes,paymentmethod_Bank transfer (automatic),paymentmethod_Credit card (automatic),paymentmethod_Electronic check,paymentmethod_Mailed check
0,0,1,29.85,29.85,False,False,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
1,0,34,56.95,1889.5,True,True,0,1,0,1,...,0,0,1,0,1,0,0,0,0,1
2,1,2,53.85,108.15,False,True,0,1,0,1,...,0,1,0,0,0,1,0,0,0,1
3,0,45,42.3,1840.75,True,False,0,1,0,1,...,0,0,1,0,1,0,1,0,0,0
4,1,2,70.7,151.65,False,True,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0


## Algoritmos de clasificación

In [202]:
#Separación entre variable objetivo y las demás
#Demás variables
X = df_final.drop(columns='baja')
#X.info()
#Variable objetivo
Y = df_final['baja']

In [203]:
#Divisíon 70%/30%
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=42, stratify=Y
)

## Optimización de hiperparámetros con optuna

### Regresión logística

In [204]:

#Crear modelo
modelo = LogisticRegression()
#Entrenar modelo
modelo.fit(X_train,y_train)

print("Intercepto (β0):", modelo.intercept_[0])
print("Coeficiente (β1):", modelo.coef_[0][0])

Intercepto (β0): -0.14077662042989167
Coeficiente (β1): -0.05737512511200509


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Evaluación y Predicción del modelo

In [205]:
from sklearn.metrics import classification_report

y_pred = modelo.predict(X_test)
y_pred_prob = modelo.predict_proba(X_test)[:, 1]

confusion_matrix = confusion_matrix(y_test,y_pred)
precision = precision_score(y_test,y_pred, zero_division = 1)
recall = recall_score(y_test, y_pred, zero_division=1)
f1 = f1_score(y_test, y_pred, zero_division=1)
auc = roc_auc_score(y_test, y_pred_prob)

# Mostrar las métricas
print("Matriz de Confusión:\n", confusion_matrix)
print("Precisión:", precision)
print("Sensibilidad (Recall):", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc)

Matriz de Confusión:
 [[1402  150]
 [ 250  311]]
Precisión: 0.6746203904555315
Sensibilidad (Recall): 0.5543672014260249
F1 Score: 0.6086105675146771
AUC-ROC: 0.8448548936913097


## SVM

In [206]:
from sklearn import svm
from sklearn.metrics import confusion_matrix as cm_func
model_svm = svm.SVC(kernel="rbf",
    probability=True,
    class_weight="balanced")
model_svm.fit(X_train,y_train)

y_pred = model_svm.predict(X_test)
y_pred_prob = model_svm.predict_proba(X_test)[:, 1]

cm = cm_func(y_test,y_pred)
precision = precision_score(y_test,y_pred, zero_division = 1)
recall = recall_score(y_test, y_pred, zero_division=1)
f1 = f1_score(y_test, y_pred, zero_division=1)
auc = roc_auc_score(y_test, y_pred_prob)

# Mostrar las métricas
print("Matriz de Confusión:\n", cm)
print("Precisión:", precision)
print("Sensibilidad (Recall):", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc)


Matriz de Confusión:
 [[1103  449]
 [ 245  316]]
Precisión: 0.4130718954248366
Sensibilidad (Recall): 0.5632798573975044
F1 Score: 0.4766214177978884
AUC-ROC: 0.7205038177407794


## Arboles de decision

In [207]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import confusion_matrix as cm_func2

modelo_tree = DecisionTreeClassifier(criterion='entropy', random_state=0)
modelo_tree.fit(X_train, y_train)

y_pred = modelo_tree.predict(X_test)
y_pred_prob = modelo_tree.predict_proba(X_test)[:, 1]

cm2 = cm_func2(y_test,y_pred)
precision = precision_score(y_test,y_pred, zero_division = 1)
recall = recall_score(y_test, y_pred, zero_division=1)
f1 = f1_score(y_test, y_pred, zero_division=1)
auc = roc_auc_score(y_test, y_pred_prob)

# Mostrar las métricas
print("Matriz de Confusión:\n", cm2)
print("Precisión:", precision)
print("Sensibilidad (Recall):", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc)

Matriz de Confusión:
 [[1261  291]
 [ 279  282]]
Precisión: 0.49214659685863876
Sensibilidad (Recall): 0.5026737967914439
F1 Score: 0.4973544973544973
AUC-ROC: 0.6585545417792235


## K-nearest Neighbour(KNN)

In [208]:
from sklearn.metrics import confusion_matrix as cm_func3


knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
y_pred_prob = knn.predict_proba(X_test)[:, 1]

cm2 = cm_func3(y_test,y_pred)
precision = precision_score(y_test,y_pred, zero_division = 1)
recall = recall_score(y_test, y_pred, zero_division=1)
f1 = f1_score(y_test, y_pred, zero_division=1)
auc = roc_auc_score(y_test, y_pred_prob)

# Mostrar las métricas
print("Matriz de Confusión:\n", cm2)
print("Precisión:", precision)
print("Sensibilidad (Recall):", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc)

Matriz de Confusión:
 [[1357  195]
 [ 317  244]]
Precisión: 0.5558086560364465
Sensibilidad (Recall): 0.43493761140819964
F1 Score: 0.488
AUC-ROC: 0.7190227778084055


## Random forest

In [209]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)


rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    class_weight="balanced"  
)

# Entrenamos
rf.fit(X_train, y_train)

# Predicciones
y_pred = rf.predict(X_test)
y_pred_prob = rf.predict_proba(X_test)[:, 1]

# Métricas
cm_rf = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=1)
recall = recall_score(y_test, y_pred, zero_division=1)
f1 = f1_score(y_test, y_pred, zero_division=1)
auc = roc_auc_score(y_test, y_pred_prob)

# Mostrar resultados
print("Matriz de Confusión:\n", cm_rf)
print("Precisión:", precision)
print("Sensibilidad (Recall):", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc)


Matriz de Confusión:
 [[1387  165]
 [ 297  264]]
Precisión: 0.6153846153846154
Sensibilidad (Recall): 0.47058823529411764
F1 Score: 0.5333333333333333
AUC-ROC: 0.8163975641803111


## XGBoost

In [210]:
from xgboost import XGBClassifier
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
# Definimos el modelo XGBoost
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="logloss",
)

# Entrenamos
xgb.fit(X_train, y_train)

# Predicciones
y_pred = xgb.predict(X_test)
y_pred_prob = xgb.predict_proba(X_test)[:, 1]

# Métricas
cm_xgb = confusion_matrix(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=1)
recall = recall_score(y_test, y_pred, zero_division=1)
f1 = f1_score(y_test, y_pred, zero_division=1)
auc = roc_auc_score(y_test, y_pred_prob)

# Mostrar resultados
print("Matriz de Confusión:\n", cm_xgb)
print("Precisión:", precision)
print("Sensibilidad (Recall):", recall)
print("F1 Score:", f1)
print("AUC-ROC:", auc)


Matriz de Confusión:
 [[1396  156]
 [ 279  282]]
Precisión: 0.6438356164383562
Sensibilidad (Recall): 0.5026737967914439
F1 Score: 0.5645645645645646
AUC-ROC: 0.837059191061617


In [211]:
from sklearn.metrics import (
    confusion_matrix,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)

def evaluar_modelo(nombre, model, X_train, y_train, X_test, y_test):
    """
    Entrena un modelo, predice y devuelve un diccionario con métricas.
    """
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    # Probabilidades (si el modelo las soporta)
    if hasattr(model, "predict_proba"):
        y_pred_prob = model.predict_proba(X_test)[:, 1]
        auc = roc_auc_score(y_test, y_pred_prob)
    elif hasattr(model, "decision_function"):
        # Para modelos tipo SVC sin predict_proba (si no activaste probability=True)
        scores = model.decision_function(X_test)
        auc = roc_auc_score(y_test, scores)
    else:
        auc = None

    return {
        "Modelo": nombre,
        "Precision": precision_score(y_test, y_pred, zero_division=1),
        "Recall": recall_score(y_test, y_pred, zero_division=1),
        "F1-score": f1_score(y_test, y_pred, zero_division=1),
        "AUC-ROC": auc,
        "Confusion Matrix": confusion_matrix(y_test, y_pred)
    }

# 1) Define aquí tus modelos (los que quieras)
modelos = {
    "Regresión logística": modelo,     # ejemplo: tu objeto ya creado
    "KNN": knn,
    "Árbol de decisión": modelo_tree,
    "SVC": model_svm,
    "Random Forest": rf,
    "XGBoost": xgb
}

# 2) Evalúa todos y guarda resultados
resultados = []
cms = {}

for nombre, modelo in modelos.items():
    res = evaluar_modelo(nombre, modelo, X_train, y_train, X_test, y_test)
    cms[nombre] = res.pop("Confusion Matrix")  # guardamos CM aparte
    resultados.append(res)

# 3) Crea la tabla final ordenada por F1 (desc)
df_resultados = pd.DataFrame(resultados).sort_values("F1-score", ascending=False).reset_index(drop=True)

df_resultados

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Unnamed: 0,Modelo,Precision,Recall,F1-score,AUC-ROC
0,Regresión logística,0.67462,0.554367,0.608611,0.844855
1,XGBoost,0.643836,0.502674,0.564565,0.837059
2,Random Forest,0.615385,0.470588,0.533333,0.816398
3,Árbol de decisión,0.492147,0.502674,0.497354,0.658555
4,KNN,0.555809,0.434938,0.488,0.719023
5,SVC,0.413072,0.56328,0.476621,0.720504


## Optimizacion con Optuna

In [212]:
import optuna
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

# 1) Escalar UNA VEZ (como en tu ejemplo de clase)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)   # X e Y son todo tu dataset (no train/test aquí)

def objective(trial):
    params = {
        "C": trial.suggest_float("C", 1e-4, 1e2, log=True),
        "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]),
        "solver": "liblinear",  # liblinear soporta l1 y l2 sin líos
        "class_weight": trial.suggest_categorical("class_weight", [None, "balanced"]),
        "max_iter": 2000,
        "random_state": 42
    }

    model = LogisticRegression(**params)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # usa "f1" si es churn/baja; o "recall" si te interesa captar bajas
    scores = cross_val_score(model, X_scaled, Y, cv=cv, scoring="f1", n_jobs=-1)
    return scores.mean()

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50)

print("Best F1:", study.best_value)
print("Best params:", study.best_params)


[32m[I 2026-02-10 18:48:52,493][0m A new study created in memory with name: no-name-ca067099-c459-433d-9b8b-710307f97bc3[0m
[32m[I 2026-02-10 18:48:54,184][0m Trial 0 finished with value: 0.6267235324667099 and parameters: {'C': 9.39304726540782, 'penalty': 'l1', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6267235324667099.[0m
[32m[I 2026-02-10 18:48:54,477][0m Trial 1 finished with value: 0.626175020072371 and parameters: {'C': 52.7535076720208, 'penalty': 'l2', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6267235324667099.[0m
[32m[I 2026-02-10 18:48:55,862][0m Trial 2 finished with value: 0.6010294403393072 and parameters: {'C': 6.626539134470331, 'penalty': 'l1', 'class_weight': None}. Best is trial 0 with value: 0.6267235324667099.[0m
[32m[I 2026-02-10 18:48:56,009][0m Trial 3 finished with value: 0.0 and parameters: {'C': 0.00011008261189940374, 'penalty': 'l1', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6267235324667099.

Best F1: 0.6283612480147578
Best params: {'C': 0.09649132568952762, 'penalty': 'l1', 'class_weight': 'balanced'}


In [213]:
import optuna
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler

# Escalado previo (como en clase)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

def objective(trial):
    params = {
        "n_neighbors": trial.suggest_int("n_neighbors", 3, 25),
        "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
        "metric": trial.suggest_categorical("metric", ["euclidean", "manhattan"])
    }

    model = KNeighborsClassifier(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    scores = cross_val_score(model, X_scaled, Y, cv=cv, scoring="f1", n_jobs=-1)
    return scores.mean()

study_knn = optuna.create_study(direction="maximize")
study_knn.optimize(objective, n_trials=50)

print("KNN best F1:", study_knn.best_value)
print("KNN best params:", study_knn.best_params)


[32m[I 2026-02-10 18:50:43,682][0m A new study created in memory with name: no-name-39379907-b82b-4e02-800b-e9e0f2864622[0m
[32m[I 2026-02-10 18:50:43,976][0m Trial 0 finished with value: 0.54674842493537 and parameters: {'n_neighbors': 16, 'weights': 'uniform', 'metric': 'euclidean'}. Best is trial 0 with value: 0.54674842493537.[0m
[32m[I 2026-02-10 18:50:44,510][0m Trial 1 finished with value: 0.5061089567348029 and parameters: {'n_neighbors': 8, 'weights': 'uniform', 'metric': 'manhattan'}. Best is trial 0 with value: 0.54674842493537.[0m
[32m[I 2026-02-10 18:50:44,770][0m Trial 2 finished with value: 0.54386567796918 and parameters: {'n_neighbors': 20, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 0 with value: 0.54674842493537.[0m
[32m[I 2026-02-10 18:50:44,949][0m Trial 3 finished with value: 0.4992369041714838 and parameters: {'n_neighbors': 5, 'weights': 'distance', 'metric': 'euclidean'}. Best is trial 0 with value: 0.54674842493537.[0m
[32m[I 2

KNN best F1: 0.5886529788303608
KNN best params: {'n_neighbors': 25, 'weights': 'uniform', 'metric': 'manhattan'}


In [214]:
import optuna
from sklearn.svm import SVC

def objective(trial):
    params = {
        "C": trial.suggest_float("C", 1e-3, 1e2, log=True),
        "kernel": trial.suggest_categorical("kernel", ["rbf", "linear"]),
        "class_weight": trial.suggest_categorical("class_weight", [None, "balanced"]),
        "probability": True,
        "random_state": 42
    }

    model = SVC(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    scores = cross_val_score(model, X_scaled, Y, cv=cv, scoring="f1", n_jobs=-1)
    return scores.mean()

study_svc = optuna.create_study(direction="maximize")
study_svc.optimize(objective, n_trials=50)

print("SVC best F1:", study_svc.best_value)
print("SVC best params:", study_svc.best_params)


[32m[I 2026-02-10 18:51:21,708][0m A new study created in memory with name: no-name-4231218c-29b3-40ee-b15b-ef18a432b3af[0m
[32m[I 2026-02-10 18:51:32,645][0m Trial 0 finished with value: 0.6001966397467866 and parameters: {'C': 0.0022780571413683836, 'kernel': 'rbf', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6001966397467866.[0m
[32m[I 2026-02-10 18:51:38,632][0m Trial 1 finished with value: 0.5620528962456687 and parameters: {'C': 6.139870947851293, 'kernel': 'rbf', 'class_weight': None}. Best is trial 0 with value: 0.6001966397467866.[0m
[32m[I 2026-02-10 18:51:46,069][0m Trial 2 finished with value: 0.596143323483947 and parameters: {'C': 1.0550463535773407, 'kernel': 'linear', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6001966397467866.[0m
[32m[I 2026-02-10 18:51:51,406][0m Trial 3 finished with value: 0.48606789550512436 and parameters: {'C': 0.03271856140698768, 'kernel': 'rbf', 'class_weight': None}. Best is trial 0 with value: 0.6

SVC best F1: 0.6207788770534488
SVC best params: {'C': 0.011551495548948841, 'kernel': 'rbf', 'class_weight': 'balanced'}


In [215]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

def objective(trial):
    params = {
        "criterion": trial.suggest_categorical("criterion", ["gini", "entropy", "log_loss"]),
        "max_depth": trial.suggest_int("max_depth", 2, 30),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 50),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 30),
        "max_features": trial.suggest_categorical("max_features", [None, "sqrt", "log2"]),
        "class_weight": trial.suggest_categorical("class_weight", [None, "balanced"]),
        "random_state": 42
    }

    model = DecisionTreeClassifier(**params)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    scores = cross_val_score(model, X, Y, cv=cv, scoring="f1", n_jobs=-1)
    return scores.mean()

study_dt = optuna.create_study(direction="maximize")
study_dt.optimize(objective, n_trials=50)

print("Decision Tree best F1:", study_dt.best_value)
print("Decision Tree best params:", study_dt.best_params)


[32m[I 2026-02-10 19:01:14,719][0m A new study created in memory with name: no-name-731ed41d-778d-4a6e-915b-042407e25ef4[0m
[32m[I 2026-02-10 19:01:14,877][0m Trial 0 finished with value: 0.6064782799191705 and parameters: {'criterion': 'log_loss', 'max_depth': 23, 'min_samples_split': 17, 'min_samples_leaf': 30, 'max_features': 'log2', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6064782799191705.[0m
[32m[I 2026-02-10 19:01:15,046][0m Trial 1 finished with value: 0.5583058241850836 and parameters: {'criterion': 'gini', 'max_depth': 23, 'min_samples_split': 33, 'min_samples_leaf': 30, 'max_features': None, 'class_weight': None}. Best is trial 0 with value: 0.6064782799191705.[0m
[32m[I 2026-02-10 19:01:15,205][0m Trial 2 finished with value: 0.5836826473539485 and parameters: {'criterion': 'gini', 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 22, 'max_features': None, 'class_weight': None}. Best is trial 0 with value: 0.6064782799191705.[0m
[32m

Decision Tree best F1: 0.6175436922370185
Decision Tree best params: {'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 13, 'min_samples_leaf': 14, 'max_features': None, 'class_weight': 'balanced'}


In [216]:
import optuna
from sklearn.ensemble import RandomForestClassifier

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 30),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 20),
        "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
        "class_weight": "balanced",
        "random_state": 42,
        "n_jobs": -1
    }

    model = RandomForestClassifier(**params)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    scores = cross_val_score(model, X, Y, cv=cv, scoring="f1", n_jobs=-1)
    return scores.mean()

study_rf = optuna.create_study(direction="maximize")
study_rf.optimize(objective, n_trials=50)

print("RF best F1:", study_rf.best_value)
print("RF best params:", study_rf.best_params)

[32m[I 2026-02-10 19:01:58,175][0m A new study created in memory with name: no-name-6c76b38b-0f78-4926-a1a7-9e1b822f751f[0m
[32m[I 2026-02-10 19:01:59,650][0m Trial 0 finished with value: 0.6318822989852865 and parameters: {'n_estimators': 481, 'max_depth': 7, 'min_samples_split': 26, 'min_samples_leaf': 12, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.6318822989852865.[0m
[32m[I 2026-02-10 19:02:00,142][0m Trial 1 finished with value: 0.6317456630233416 and parameters: {'n_estimators': 118, 'max_depth': 16, 'min_samples_split': 7, 'min_samples_leaf': 12, 'max_features': 'log2'}. Best is trial 0 with value: 0.6318822989852865.[0m
[32m[I 2026-02-10 19:02:01,047][0m Trial 2 finished with value: 0.6214836403425619 and parameters: {'n_estimators': 337, 'max_depth': 4, 'min_samples_split': 21, 'min_samples_leaf': 10, 'max_features': 'log2'}. Best is trial 0 with value: 0.6318822989852865.[0m
[32m[I 2026-02-10 19:02:01,657][0m Trial 3 finished with value: 0.6241263825

RF best F1: 0.635134337876927
RF best params: {'n_estimators': 206, 'max_depth': 8, 'min_samples_split': 23, 'min_samples_leaf': 9, 'max_features': 'sqrt'}


In [217]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 5.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 5.0),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1.0, 5.0),
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "random_state": 42,
        "n_jobs": -1
    }

    model = XGBClassifier(**params)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    scores = cross_val_score(model, X, Y, cv=cv, scoring="f1", n_jobs=-1)
    return scores.mean()

study_xgb = optuna.create_study(direction="maximize")
study_xgb.optimize(objective, n_trials=50)

print("XGBoost best F1:", study_xgb.best_value)
print("XGBoost best params:", study_xgb.best_params)


[32m[I 2026-02-10 19:03:50,804][0m A new study created in memory with name: no-name-f91a1221-124e-4b29-84c6-077dd92943bd[0m
[32m[I 2026-02-10 19:03:51,431][0m Trial 0 finished with value: 0.6101660905285128 and parameters: {'n_estimators': 199, 'max_depth': 4, 'learning_rate': 0.10377387497109496, 'subsample': 0.8236992074428737, 'colsample_bytree': 0.8021673123523692, 'gamma': 4.87765608637658, 'reg_alpha': 2.4721858493178717, 'reg_lambda': 4.751557175009523, 'scale_pos_weight': 4.908447008728751}. Best is trial 0 with value: 0.6101660905285128.[0m
[32m[I 2026-02-10 19:03:52,483][0m Trial 1 finished with value: 0.6119309770760983 and parameters: {'n_estimators': 483, 'max_depth': 8, 'learning_rate': 0.12147024516106403, 'subsample': 0.6078968964431646, 'colsample_bytree': 0.8204420820959888, 'gamma': 1.8557328545032141, 'reg_alpha': 4.067709324395674, 'reg_lambda': 1.481966198251733, 'scale_pos_weight': 4.395379446170336}. Best is trial 1 with value: 0.6119309770760983.[0m
[

XGBoost best F1: 0.6356086001959926
XGBoost best params: {'n_estimators': 148, 'max_depth': 4, 'learning_rate': 0.015302534460625463, 'subsample': 0.6388573699052247, 'colsample_bytree': 0.9203587664629254, 'gamma': 0.40547658006196147, 'reg_alpha': 4.663995220297353, 'reg_lambda': 1.250469659790281, 'scale_pos_weight': 2.6601468079751056}


## Reentreno

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score


best_params = study_xgb.best_params.copy()

best_params.update({
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "random_state": 42,
    "n_jobs": -1
})


xgb_final = XGBClassifier(**best_params)
xgb_final.fit(X_train, y_train)


y_train_pred = xgb_final.predict(X_train)
y_test_pred = xgb_final.predict(X_test)


f1_train = f1_score(y_train, y_train_pred, zero_division=1)
f1_test = f1_score(y_test, y_test_pred, zero_division=1)

overfitting_f1 = abs(f1_train - f1_test)

print("F1 train:", round(f1_train, 4))
print("F1 test :", round(f1_test, 4))
print("Overfitting (F1):", round(overfitting_f1, 4))
print("Overfitting (%):", round(overfitting_f1 * 100, 2), "%")


F1 train: 0.6485
F1 test : 0.6335
Overfitting (F1): 0.015
Overfitting (%): 1.5 %


## Exportación del modelo

In [220]:
import joblib

joblib.dump(xgb_final, "../models/xgboost_model.pkl")


['../models/xgboost_model.pkl']