In [3]:
import pandas as pd
import numpy as np
import time
import re
import os
from sklearn.ensemble import RandomForestClassifier, StackingClassifier,ExtraTreesClassifier, HistGradientBoostingClassifier
#from xgboost import XGBClassifier
#from lightgbm import LGBMClassifier
#from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split,RandomizedSearchCV
from sklearn.preprocessing import OrdinalEncoder,StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, log_loss
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
import warnings

In [4]:
df_users = pd.read_csv('Datos/Originales/Datos look&like/customers_data_2.csv',sep=';')
df_products = pd.read_csv('Datos/Originales/Datos look&like/items_data.csv')
df_interactions = pd.read_csv('Datos/Originales/Datos look&like/look_and_like_data_2.csv',sep=';')

In [5]:
df_full = df_interactions.merge(df_users, on='user_id', how='left')
df_full = df_full.merge(df_products, on='product_variant_id', how='left')
target_map = {'true': 1, 'false': 0}
df_full['target'] = df_full['response'].astype(str).str.lower().map(target_map)
df_full = df_full.dropna(subset=['target'])
df_full['occurred_on_'] = pd.to_datetime(df_full['occurred_on_'], errors='coerce')
df_full['month'] = df_full['occurred_on_'].dt.month.fillna(-1).astype(int)
mapa_precios = {'30-60': 45, '60-100': 80, '100+': 140}
if 'prices' in df_full.columns:
    user_budget = df_full['prices'].map(mapa_precios).fillna(45)
    df_full['price_divergence'] = (df_full['current_price_eur'] - user_budget) / user_budget
    df_full['price_divergence'] = df_full['price_divergence'].fillna(0)
df_full.columns = [re.sub(r'[^\w]', '_', col) for col in df_full.columns]
cols_to_drop = ['user_id', 'product_variant_id', 'response', 'response_clean', 
                'place', 'occurred_on_', 'date_birth', 'target']
features = [c for c in df_full.columns if c not in cols_to_drop]

In [9]:
#df_full.to_csv("Datos/Transformados/look_like_full.csv")
df_full.columns

Index(['user_id', 'product_variant_id', 'response', 'place', 'occurred_on_',
       'user_market', 'frequency', 'newsletter_subscribed', 'dress_leisure',
       'dress_work', 'fit_top', 'fit_bottom', 'body_shape', 'eyes', 'hair',
       'size_top', 'size_bottom', 'size_footwear', 'size_bra', 'size_cup',
       'height', 'weight', 'adventurous_x', 'prices', 'job', 'date_birth',
       'age', 'is_mother', 'style_1', 'style_2', 'season', 'family', 'brand',
       'model', 'color', 'size', 'adventurous_y', 'back_neckline', 'basic',
       'bottom', 'chest_contour', 'chest_volume', 'closing',
       'composition_detail', 'cover', 'cut', 'elasticated_lining', 'fabric',
       'finishing', 'fit', 'heel_length', 'hip_contour', 'hips_volume',
       'leg_height', 'light', 'long_cm', 'neck', 'neckline', 'print',
       'rubber_waist', 'shot', 'shoulders_pad', 'size_feature', 'sizing',
       'sleeve', 'sleeve_long', 'sleeve_long_cm', 'sole_length', 'style',
       'thicknees', 'toecap', 'type_of

In [6]:
X = df_full[features].copy()
y = df_full['target'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
cat_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
num_cols = X.select_dtypes(include=['number']).columns.tolist()
imputer = SimpleImputer(strategy='median')
X_train[num_cols] = imputer.fit_transform(X_train[num_cols])
X_test[num_cols] = imputer.transform(X_test[num_cols])
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_train[cat_cols] = X_train[cat_cols].fillna("Unknown").astype(str)
X_test[cat_cols] = X_test[cat_cols].fillna("Unknown").astype(str)
X_train[cat_cols] = encoder.fit_transform(X_train[cat_cols])
X_test[cat_cols] = encoder.transform(X_test[cat_cols])
print(f"Datos listos. Train: {X_train.shape} Test: {X_test.shape}")

Datos listos. Train: (279409, 71) Test: (69853, 71)


**MODELOS**

In [None]:
ratio = float(np.sum(y_train == 0)) / np.sum(y_train == 1)

# XGBoost
xgb_params = {
    'n_estimators': [300, 500, 800, 1000],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'max_depth': [6, 8, 10, 12],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8],
    'gamma': [0, 0.1, 0.2, 0.5],            
    'reg_alpha': [0, 0.1, 1, 5],             
    'reg_lambda': [1, 5, 10],               
    'scale_pos_weight': [ratio]              
}

# Random Forest
rf_params = {
    'n_estimators': [300, 500, 700],
    'max_depth': [15, 20, 25, 30, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['sqrt', 'log2'],
    'class_weight': ['balanced', 'balanced_subsample']
}

# LightGBM
lgbm_params = {
    'n_estimators': [500, 800, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [31, 50, 70, 100],         
    'max_depth': [-1, 10, 15, 20],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'reg_alpha': [0, 0.1, 1],
    'class_weight': ['balanced']
}

# CatBoost
cat_params = {
    'iterations': [500, 800, 1000],
    'learning_rate': [0.03, 0.05, 0.1],
    'depth': [6, 8, 10],
    'l2_leaf_reg': [1, 3, 5, 7],
    'border_count': [64, 128, 254],
    'auto_class_weights': ['Balanced']
}
modelos_a_optimizar = [
    ("XGBoost", XGBClassifier(n_jobs=-1, random_state=42, eval_metric='auc'), xgb_params),
    ("Random Forest", RandomForestClassifier(n_jobs=-1, random_state=42), rf_params),
    ("LightGBM", LGBMClassifier(n_jobs=-1, random_state=42, verbose=-1), lgbm_params),
    ("CatBoost", CatBoostClassifier(verbose=0, random_state=42, allow_writing_files=False), cat_params)]

In [None]:
mejores_resultados = []
for nombre, modelo, params in modelos_a_optimizar:
    print(f"\nOptimizando {nombre}")
    start = time.time()
    search = RandomizedSearchCV(
        estimator=modelo,
        param_distributions=params,
        n_iter=15,               
        scoring='roc_auc',       
        cv=3,                
        verbose=1,
        n_jobs=-1,
        random_state=42)
    search.fit(X_train, y_train)
    best_auc = search.best_score_
    best_params = search.best_params_
    best_estimator = search.best_estimator_
    tiempo = time.time() - start
    print(f"Mejor AUC (CV): {best_auc:.4f}")
    print(f"Tiempo: {tiempo:.2f}s")
    mejores_resultados.append({
        'Modelo': nombre,
        'Best_AUC_CV': best_auc,
        'Best_Params': best_params,
        'Estimator': best_estimator})
    
print("RESULTADOS FINALES EN TEST")
tabla_final = []
for item in mejores_resultados:
    modelo_opt = item['Estimator']
    nombre = item['Modelo']
    y_prob = modelo_opt.predict_proba(X_test)[:, 1]
    y_pred = modelo_opt.predict(X_test)
    auc_final = roc_auc_score(y_test, y_prob)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0) # zero_division evita errores si no predice nada
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    ll = log_loss(y_test, y_prob)
    tabla_final.append({
        'Modelo': nombre,'AUC': auc_final,
        'Accuracy': acc,'F1-Score': f1,
        'Precision': prec,'Recall': rec,
        'Log Loss': ll,'Mejores Parámetros': item['Best_Params'],
        'Estimator': modelo_opt})
df_res = pd.DataFrame(tabla_final).sort_values(by='AUC', ascending=False)
print(df_res[['Modelo', 'AUC']].to_string(index=False))
ganador = df_res.iloc[0]
print(f"EL MEJOR MODELO OPTIMIZADO ES: **{ganador['Modelo']}**")
print(f"   AUC Score: {ganador['AUC']:.5f}")
print("   Configuración Ganadora:")
print(ganador['Mejores Parámetros'])


Optimizando XGBoost
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Mejor AUC (CV): 0.8070
Tiempo: 535.89s

Optimizando Random Forest
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Mejor AUC (CV): 0.8074
Tiempo: 2686.64s

Optimizando LightGBM
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Mejor AUC (CV): 0.8024
Tiempo: 451.44s

Optimizando CatBoost
Fitting 3 folds for each of 15 candidates, totalling 45 fits
Mejor AUC (CV): 0.8025
Tiempo: 1245.75s
RESULTADOS FINALES EN TEST
       Modelo      AUC
      XGBoost 0.819682
Random Forest 0.815550
     LightGBM 0.813532
     CatBoost 0.809855
EL MEJOR MODELO OPTIMIZADO ES: **XGBoost**
   AUC Score: 0.81968
   Configuración Ganadora:
{'subsample': 0.8, 'scale_pos_weight': np.float64(1.772932524835505), 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 800, 'max_depth': 12, 'learning_rate': 0.05, 'gamma': 0.2, 'colsample_bytree': 0.6}


In [None]:
umbral_final = float(ganador['Mejores Parámetros'].split('Threshold: ')[1].split(' ')[0]) if 'Threshold:' in str(ganador['Mejores Parámetros']) else 0.5
artifacts = {
    'model': ganador['Estimator'],
    'threshold': umbral_final,
    'encoder': encoder,
    'imputer': imputer,
    'features': features, 
    'num_cols': num_cols, 
    'cat_cols': cat_cols, 'input_dtypes': X_train.dtypes.to_dict(),
    'metrics': {'auc': ganador['AUC'], 'f1': ganador['F1-Score'],
        'Accuracy': ganador['Accuracy'],'Precision': ganador['Precision'],
        'Recall': ganador['Recall'],'Log Loss': ganador['Log Loss']}}
ruta_archivo = os.path.join('Modelos', 'modelo_looklike.pkl')
os.makedirs('Modelos', exist_ok=True)
joblib.dump(artifacts, ruta_archivo)

['Modelos\\modelo_looklike.pkl']

In [None]:
df_res

Unnamed: 0,Modelo,AUC,Accuracy,F1-Score,Precision,Recall,Log Loss,Mejores Parámetros,Estimator
0,XGBoost,0.819682,0.758178,0.660503,0.668919,0.652296,0.498529,"{'subsample': 0.8, 'scale_pos_weight': 1.77293...","XGBClassifier(base_score=None, booster=None, c..."
1,Random Forest,0.81555,0.759166,0.628689,0.707994,0.565361,0.50563,"{'n_estimators': 700, 'min_samples_split': 2, ...","(DecisionTreeClassifier(max_features='sqrt', m..."
2,LightGBM,0.813532,0.743204,0.663288,0.629135,0.701362,0.515946,"{'subsample': 0.8, 'reg_alpha': 1, 'num_leaves...","LGBMClassifier(class_weight='balanced', colsam..."
3,CatBoost,0.809855,0.736618,0.661876,0.616235,0.714819,0.528612,"{'learning_rate': 0.05, 'l2_leaf_reg': 3, 'ite...",<catboost.core.CatBoostClassifier object at 0x...


In [None]:
count_neg = len(y_train) - sum(y_train)
count_pos = sum(y_train)
scale_weight = count_neg / count_pos


xgb_params = {
    'n_estimators': 1000,'max_depth': 20,
    'learning_rate': 0.1,'subsample': 0.8,
    'colsample_bytree': 0.7,'reg_alpha': 1,
    'scale_pos_weight': scale_weight, 
    'n_jobs': -1,'random_state': 42,
    'eval_metric': 'logloss','use_label_encoder': False}

rf_params = {
    'n_estimators': 700,'min_samples_split': 2,
    'min_samples_leaf': 2,'max_features': 'sqrt',
    'max_depth': None,'class_weight': 'balanced_subsample',
    'n_jobs': -1,'random_state': 42}

lgbm_params = {
    'n_estimators': 1000,'max_depth': 20,
    'learning_rate': 0.1,'num_leaves': 100,
    'subsample': 0.8,'colsample_bytree': 0.7,
    'reg_alpha': 1,'class_weight': 'balanced',
    'n_jobs': -1,'random_state': 42,'verbose': -1}

estimators = [
    ('xgboost', XGBClassifier(**xgb_params)),
    ('lightgbm', LGBMClassifier(**lgbm_params)),
    ('rf', RandomForestClassifier(**rf_params)),
    ('knn', make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=15)))]

clf_stacking = StackingClassifier(
    estimators=estimators,final_estimator=LogisticRegression(C=1.0, random_state=42),
    passthrough=False, cv=5,n_jobs=1)
clf_stacking.fit(X_train, y_train)

y_prob_stack = clf_stacking.predict_proba(X_test)[:, 1]
thresholds = np.arange(0.1, 0.9, 0.01)
f1_scores = []
for thresh in thresholds:
    y_pred_thresh = (y_prob_stack >= thresh).astype(int)
    f1_scores.append(f1_score(y_test, y_pred_thresh, zero_division=0))
best_idx = np.argmax(f1_scores)
best_thresh = thresholds[best_idx]
y_pred_opt = (y_prob_stack >= best_thresh).astype(int)

acc_stack = accuracy_score(y_test, y_pred_opt)
prec_stack = precision_score(y_test, y_pred_opt, zero_division=0)
rec_stack = recall_score(y_test, y_pred_opt, zero_division=0)
f1_stack = f1_score(y_test, y_pred_opt, zero_division=0) 
auc_stack = roc_auc_score(y_test, y_prob_stack)
ll_stack = log_loss(y_test, y_prob_stack)
if 'tabla_final' not in locals(): tabla_final = []
tabla_final.append({
    'Modelo': 'Stacking (XGB+RF+LGB+KNN)','AUC': auc_stack,
    'Accuracy': acc_stack,'F1-Score': f1_stack,
    'Precision': prec_stack,'Recall': rec_stack,
    'Log Loss': ll_stack,'Mejores Parámetros': f'Threshold: {best_thresh:.2f}','Estimator': clf_stacking})
df_res = pd.DataFrame(tabla_final).sort_values(by='AUC', ascending=False)
print(f"Mejor Umbral encontrado: {best_thresh:.2f}")
print(df_res.head())

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Mejor Umbral encontrado: 0.33
                      Modelo       AUC  Accuracy  F1-Score  Precision  \
4  Stacking (XGB+RF+LGB+KNN)  0.823082  0.740942  0.677157   0.614951   
0                    XGBoost  0.819682  0.758178  0.660503   0.668919   
1              Random Forest  0.815550  0.759166  0.628689   0.707994   
2                   LightGBM  0.813532  0.743204  0.663288   0.629135   
3                   CatBoost  0.809855  0.736618  0.661876   0.616235   

     Recall  Log Loss                                 Mejores Parámetros  \
4  0.753364  0.489761                                    Threshold: 0.33   
0  0.652296  0.498529  {'subsample': 0.8, 'scale_pos_weight': 1.77293...   
1  0.565361  0.505630  {'n_estimators': 700, 'min_samples_split': 2, ...   
2  0.701362  0.515946  {'subsample': 0.8, 'reg_alpha': 1, 'num_leaves...   
3  0.714819  0.528612  {'learning_rate': 0.05, 'l2_leaf_reg': 3, 'ite...   

                                           Estimator  
4  StackingClassifi

In [None]:
artifacts = {
    'model': clf_stacking,
    'threshold': best_thresh,
    'features': X_train.columns.tolist(),
    'num_cols': num_cols, 
    'cat_cols': cat_cols,
    'input_dtypes': X_train.dtypes.to_dict(),
    'encoder': encoder,
    'imputer': imputer,
    'metrics': {
        'AUC': auc_stack,'F1': f1_stack,
        'Accuracy': acc_stack,'Precision': prec_stack,
        'Recall': rec_stack,'Log Loss': ll_stack}
}
ruta_archivo = os.path.join('Modelos', 'modelo_Stacking.pkl')
os.makedirs('Modelos', exist_ok=True)
joblib.dump(artifacts, ruta_archivo)

['Modelos\\modelo_Stacking.pkl']

In [None]:
estimators_diversos = [
    # ExtraTrees
    ('et', ExtraTreesClassifier(
        n_estimators=300, max_depth=10, 
        min_samples_leaf=2, class_weight='balanced', 
        n_jobs=-1, random_state=42)),
    
    # HistGradientBoosting
    ('hgb', HistGradientBoostingClassifier(
        max_iter=200, learning_rate=0.05, 
        max_depth=10,class_weight='balanced', 
        random_state=42))]

clf_stacking_div = StackingClassifier(
    estimators=estimators_diversos,final_estimator=LogisticRegression(penalty='l2', C=1.0, random_state=42),
    passthrough=False, cv=5,n_jobs=-1)

clf_stacking_div.fit(X_train, y_train)
y_prob_stack_div = clf_stacking_div.predict_proba(X_test)[:, 1]
thresholds = np.arange(0.1, 0.9, 0.01)
f1_scores_div = []
for thresh in thresholds:
    y_pred_thresh = (y_prob_stack_div >= thresh).astype(int)
    f1_scores_div.append(f1_score(y_test, y_pred_thresh, zero_division=0))
best_idx_div = np.argmax(f1_scores_div)
best_thresh_div = thresholds[best_idx_div]
y_pred_opt_div = (y_prob_stack_div >= best_thresh_div).astype(int)

acc_div = accuracy_score(y_test, y_pred_opt_div)
prec_div = precision_score(y_test, y_pred_opt_div, zero_division=0)
rec_div = recall_score(y_test, y_pred_opt_div, zero_division=0)
f1_div = f1_score(y_test, y_pred_opt_div, zero_division=0)
auc_div = roc_auc_score(y_test, y_prob_stack_div)
ll_div = log_loss(y_test, y_prob_stack_div)
tabla_final.append({
    'Modelo': 'Stacking (ET+HGB)',
    'AUC': auc_div,'Accuracy': acc_div,
    'F1-Score': f1_div,'Precision': prec_div,
    'Recall': rec_div,'Log Loss': ll_div,
    'Mejores Parámetros': f'Threshold: {best_thresh_div:.2f} | Diverse Stack','Estimator': clf_stacking_div})
df_res = pd.DataFrame(tabla_final).sort_values(by='AUC', ascending=False)
print(f"Mejor Umbral: {best_thresh_div:.2f}")
print(df_res[['Modelo', 'AUC', 'F1-Score', 'Recall']].head(10))

Mejor Umbral: 0.30
                      Modelo       AUC  F1-Score    Recall
4  Stacking (XGB+RF+LGB+KNN)  0.823082  0.677157  0.753364
0                    XGBoost  0.819682  0.660503  0.652296
1              Random Forest  0.815550  0.628689  0.565361
2                   LightGBM  0.813532  0.663288  0.701362
3                   CatBoost  0.809855  0.661876  0.714819
5          Stacking (ET+HGB)  0.760134  0.619749  0.767139


In [None]:
estimators_hybrid = [
    ('lgbm', LGBMClassifier(
        n_estimators=300, learning_rate=0.03,
        num_leaves=31,class_weight='balanced', 
        random_state=42, verbose=-1)),
    ('et', ExtraTreesClassifier(
        n_estimators=200, max_depth=12, 
        min_samples_leaf=3, class_weight='balanced', 
        n_jobs=-1, random_state=42)),
    ('knn', make_pipeline(
        StandardScaler(), KNeighborsClassifier(n_neighbors=25, weights='distance'))),
    ('logreg_base', make_pipeline(
        StandardScaler(), LogisticRegression(C=0.5, class_weight='balanced', solver='liblinear', random_state=42)))]

clf_stacking_hybrid = StackingClassifier(
    estimators=estimators_hybrid,
    final_estimator=LogisticRegression(C=1.0, penalty='l2', random_state=42),
    passthrough=False, cv=5,n_jobs=-1)

clf_stacking_hybrid.fit(X_train, y_train)
y_prob_stack_hyb = clf_stacking_hybrid.predict_proba(X_test)[:, 1]
thresholds = np.arange(0.1, 0.9, 0.01)
f1_scores_hyb = []
for thresh in thresholds:
    y_pred_thresh = (y_prob_stack_hyb >= thresh).astype(int)
    f1_scores_hyb.append(f1_score(y_test, y_pred_thresh, zero_division=0))
best_idx_hyb = np.argmax(f1_scores_hyb)
best_thresh_hyb = thresholds[best_idx_hyb]
y_pred_opt_hyb = (y_prob_stack_hyb >= best_thresh_hyb).astype(int)

acc_hyb = accuracy_score(y_test, y_pred_opt_hyb)
prec_hyb = precision_score(y_test, y_pred_opt_hyb, zero_division=0)
rec_hyb = recall_score(y_test, y_pred_opt_hyb, zero_division=0)
f1_hyb = f1_score(y_test, y_pred_opt_hyb, zero_division=0)
auc_hyb = roc_auc_score(y_test, y_prob_stack_hyb)
ll_hyb = log_loss(y_test, y_prob_stack_hyb)
tabla_final.append({
    'Modelo': 'Stacking (LGB+ET+KNN+LOG)','AUC': auc_hyb,
    'Accuracy': acc_hyb,'F1-Score': f1_hyb,
    'Precision': prec_hyb,'Recall': rec_hyb,
    'Log Loss': ll_hyb,'Mejores Parámetros': f'Threshold: {best_thresh_hyb:.2f} | Passthrough Enabled',
    'Estimator': clf_stacking_hybrid})

df_res = pd.DataFrame(tabla_final).sort_values(by='AUC', ascending=False)
print(f"Umbral óptimo: {best_thresh_hyb:.2f}")
print(df_res[['Modelo', 'AUC', 'F1-Score', 'Log Loss']].to_string(index=False))

Umbral óptimo: 0.32
                   Modelo      AUC  F1-Score  Log Loss
Stacking (XGB+RF+LGB+KNN) 0.823082  0.677157  0.489761
                  XGBoost 0.819682  0.660503  0.498529
            Random Forest 0.815550  0.628689  0.505630
                 LightGBM 0.813532  0.663288  0.515946
                 CatBoost 0.809855  0.661876  0.528612
Stacking (LGB+ET+KNN+LOG) 0.788336  0.643457  0.524282
        Stacking (ET+HGB) 0.760134  0.619749  0.549169


In [None]:
count_neg = len(y_train) - sum(y_train)
count_pos = sum(y_train)
scale_weight = count_neg / count_pos

estimators_champions = [
    ('xgboost', XGBClassifier(
        n_estimators=300,max_depth=5, 
        learning_rate=0.05,scale_pos_weight=scale_weight, 
        use_label_encoder=False, eval_metric='logloss', random_state=42)),
    ('rf', RandomForestClassifier(
        n_estimators=300,max_depth=15, 
        class_weight='balanced',random_state=42)),
    ('lightgbm', LGBMClassifier(
        n_estimators=300, learning_rate=0.04,
        class_weight='balanced', random_state=42, verbose=-1))]

meta_model = LogisticRegression(
    C=0.1,solver='lbfgs',random_state=42)

clf_stacking_champ = StackingClassifier(
    estimators=estimators_champions,final_estimator=meta_model,
    passthrough=False,cv=5,n_jobs=-1)

clf_stacking_champ.fit(X_train, y_train)
y_prob_stack_champ = clf_stacking_champ.predict_proba(X_test)[:, 1]
thresholds = np.arange(0.1, 0.9, 0.01)
f1_scores_champ = []
for thresh in thresholds:
    y_pred_thresh = (y_prob_stack_champ >= thresh).astype(int)
    f1_scores_champ.append(f1_score(y_test, y_pred_thresh, zero_division=0))
best_idx_champ = np.argmax(f1_scores_champ)
best_thresh_champ = thresholds[best_idx_champ]
y_pred_opt_champ = (y_prob_stack_champ >= best_thresh_champ).astype(int)

acc_champ = accuracy_score(y_test, y_pred_opt_champ)
prec_champ = precision_score(y_test, y_pred_opt_champ, zero_division=0)
rec_champ = recall_score(y_test, y_pred_opt_champ, zero_division=0)
f1_champ = f1_score(y_test, y_pred_opt_champ, zero_division=0)
auc_champ = roc_auc_score(y_test, y_prob_stack_champ)
ll_champ = log_loss(y_test, y_prob_stack_champ)
tabla_final.append({
    'Modelo': 'Stacking (XGB+RF+LGB)',
    'AUC': auc_champ,'Accuracy': acc_champ,
    'F1-Score': f1_champ,'Precision': prec_champ,
    'Recall': rec_champ,'Log Loss': ll_champ,
    'Mejores Parámetros': f'Threshold: {best_thresh_champ:.2f} | Top3 Models','Estimator': clf_stacking_champ})
df_res = pd.DataFrame(tabla_final).sort_values(by='AUC', ascending=False)
df_res.to_csv('Datos/Transformados/Resultado_Modelos.csv')
print(f"Mejor Umbral: {best_thresh_champ:.2f}")
print(df_res[['Modelo', 'AUC', 'Accuracy', 'F1-Score']].head(10))

Mejor Umbral: 0.34
                      Modelo       AUC  Accuracy  F1-Score
4  Stacking (XGB+RF+LGB+KNN)  0.823082  0.740942  0.677157
0                    XGBoost  0.819682  0.758178  0.660503
1              Random Forest  0.815550  0.759166  0.628689
2                   LightGBM  0.813532  0.743204  0.663288
3                   CatBoost  0.809855  0.736618  0.661876
7      Stacking (XGB+RF+LGB)  0.794229  0.709075  0.648372
6  Stacking (LGB+ET+KNN+LOG)  0.788336  0.702962  0.643457
5          Stacking (ET+HGB)  0.760134  0.660516  0.619749


In [None]:
print(df_res)

                      Modelo       AUC  Accuracy  F1-Score  Precision  \
4  Stacking (XGB+RF+LGB+KNN)  0.823082  0.740942  0.677157   0.614951   
0                    XGBoost  0.819682  0.758178  0.660503   0.668919   
1              Random Forest  0.815550  0.759166  0.628689   0.707994   
2                   LightGBM  0.813532  0.743204  0.663288   0.629135   
3                   CatBoost  0.809855  0.736618  0.661876   0.616235   
7      Stacking (XGB+RF+LGB)  0.794229  0.709075  0.648372   0.574671   
6  Stacking (LGB+ET+KNN+LOG)  0.788336  0.702962  0.643457   0.567295   
5          Stacking (ET+HGB)  0.760134  0.660516  0.619749   0.519867   

     Recall  Log Loss                                 Mejores Parámetros  \
4  0.753364  0.489761                                    Threshold: 0.33   
0  0.652296  0.498529  {'subsample': 0.8, 'scale_pos_weight': 1.77293...   
1  0.565361  0.505630  {'n_estimators': 700, 'min_samples_split': 2, ...   
2  0.701362  0.515946  {'subsample': 0

In [None]:
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)
pd.set_option('mode.chained_assignment', None)
from reglas import *
user_random = df_users['user_id'].sample(1).iloc[0]
prod_random = df_products['product_variant_id'].sample(1).iloc[0]
simular_match(df_users,df_products,user_random,prod_random)

SIMULADOR
User: d897293cfa134d53a717521430782460
Prod: 1ccd9fc6-07c4-4eee-9171-6f6430ae3abd
--------------------------------------------------
Probabilidad: 65.37%
Resultado:    ❤️ LIKE
