In [None]:
# ⬅︎ Cell 1: Imports & Config
import os
import json
import warnings
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.base import BaseEstimator, TransformerMixin
import joblib
# מודלים
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
# Optuna
import optuna
from optuna.pruners import MedianPruner

warnings.filterwarnings("ignore")

RANDOM_STATE = 42
DATA_PATH = "../data/raw/telco_churn.csv"  # התאם אם צריך
TARGET_COL = "Churn"


In [2]:
# ⬅︎ Cell 2: Load & Basic cleaning
df = pd.read_csv(DATA_PATH)

# Telco quirks
# TotalCharges מגיע כמחרוזת; ננקה לרמה נומרית ונטפל בחסרים
df["TotalCharges_num"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
mask_na = df["TotalCharges_num"].isna()
df.loc[mask_na, "TotalCharges_num"] = df.loc[mask_na, "MonthlyCharges"] * df.loc[mask_na, "tenure"].clip(lower=1)

# יעד לבוליאני
df[TARGET_COL] = df[TARGET_COL].map({"Yes": 1, "No": 0}).astype(int)

# פיצ’ר־אנג’ינירינג קל בתוך מחלקת טרנספורמר כדי לשלב ב-Pipeline
service_cols = [
    "PhoneService","MultipleLines","InternetService","OnlineSecurity","OnlineBackup",
    "DeviceProtection","TechSupport","StreamingTV","StreamingMovies"
]

class AddFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, service_cols):
        self.service_cols = service_cols
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X = X.copy()
        # תקופת מנוי בשנים
        X["tenure_years"] = X["tenure"] / 12.0
        
        # יחס חיובים לותק (charge per month בפועל)
        X["charges_per_tenure"] = X["TotalCharges_num"] / np.maximum(X["tenure"], 1)
        
        # ספירת שירותים פעילים (Yes)
        def yes(x): 
            return 1 if str(x).strip().lower()=="yes" else 0
        X["services_count"] = X[self.service_cols].applymap(yes).sum(axis=1)
        
        # אינדיקטורים שימושיים
        X["has_streaming"] = ((X.get("StreamingTV","No")=="Yes") | (X.get("StreamingMovies","No")=="Yes")).astype(int)
        X["has_fiber"] = (X.get("InternetService","None")=="Fiber optic").astype(int)
        X["is_electronic_check"] = (X.get("PaymentMethod","").str.contains("Electronic check", case=False, na=False)).astype(int)
        X["auto_pay"] = (X.get("PaymentMethod","").str.contains("automatic", case=False, na=False)).astype(int)
        
        # מיפוי משך חוזה לחודשים
        contract_map = {"Month-to-month": 1, "One year": 12, "Two year": 24}
        X["contract_term"] = X.get("Contract","Month-to-month").map(contract_map).fillna(1).astype(int)
        
        # אינטראקציות פשוטות
        X["monthly_x_term"] = X["MonthlyCharges"] * X["contract_term"]
        X["tenure_x_services"] = X["tenure"] * X["services_count"]
        
        return X


In [3]:
# ⬅︎ Cell 3: Split
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL].values

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=RANDOM_STATE
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=RANDOM_STATE
)

pos = y_train.sum()
neg = len(y_train) - pos
scale_pos_weight = neg / max(pos, 1)
scale_pos_weight


2.7691131498470947

In [4]:
# ⬅︎ Cell 4: Preprocessing pipeline
feature_adder = AddFeatures(service_cols=service_cols)

numeric_proc = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False))  # with_mean=False לשילוב עם sparse
])

categorical_proc = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_proc, selector(dtype_include=["int64","float64","int32","float32","uint8","bool"])),
        ("cat", categorical_proc, selector(dtype_include=object))
    ],
    remainder="drop"
)

# נבנה Pipeline כללי: add features -> preprocess -> model (יוחדר בדינמיות)
def make_pipeline(model):
    return Pipeline(steps=[
        ("add", feature_adder),
        ("prep", preprocessor),
        ("model", model)
    ])


In [5]:
# ⬅︎ Cell 5: Eval helper
def evaluate(model, X_tr, y_tr, X_va, y_va):
    proba_tr = model.predict_proba(X_tr)[:,1]
    proba_va = model.predict_proba(X_va)[:,1]
    metrics = {
        "roc_auc_train": roc_auc_score(y_tr, proba_tr),
        "pr_auc_train":  average_precision_score(y_tr, proba_tr),
        "roc_auc_valid": roc_auc_score(y_va, proba_va),
        "pr_auc_valid":  average_precision_score(y_va, proba_va),
    }
    return metrics


In [6]:
# ⬅︎ Cell 6: Optuna objective (5-fold CV on train)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

def objective(trial):
    model_type = trial.suggest_categorical("model_type", ["xgb", "lgbm"])
    
    if model_type == "xgb":
        params = {
            "n_estimators": trial.suggest_int("xgb_n_estimators", 200, 800),
            "max_depth": trial.suggest_int("xgb_max_depth", 3, 8),
            "learning_rate": trial.suggest_float("xgb_lr", 1e-2, 3e-1, log=True),
            "subsample": trial.suggest_float("xgb_subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("xgb_colsample", 0.6, 1.0),
            "reg_alpha": trial.suggest_float("xgb_alpha", 1e-8, 10.0, log=True),
            "reg_lambda": trial.suggest_float("xgb_lambda", 1e-8, 10.0, log=True),
            "min_child_weight": trial.suggest_int("xgb_min_child_weight", 1, 10),
            "gamma": trial.suggest_float("xgb_gamma", 0.0, 5.0),
            "random_state": RANDOM_STATE,
            "n_jobs": -1,
            "objective": "binary:logistic",
            "tree_method": "hist",
            "eval_metric": "aucpr",
            "scale_pos_weight": scale_pos_weight
        }
        base_model = XGBClassifier(**params)
    else:
        params = {
            "n_estimators": trial.suggest_int("lgbm_n_estimators", 300, 1000),
            "num_leaves": trial.suggest_int("lgbm_num_leaves", 31, 255),
            "learning_rate": trial.suggest_float("lgbm_lr", 1e-2, 3e-1, log=True),
            "subsample": trial.suggest_float("lgbm_subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("lgbm_colsample", 0.5, 1.0),
            "min_child_samples": trial.suggest_int("lgbm_min_child_samples", 5, 100),
            "reg_alpha": trial.suggest_float("lgbm_alpha", 1e-8, 10.0, log=True),
            "reg_lambda": trial.suggest_float("lgbm_lambda", 1e-8, 10.0, log=True),
            "random_state": RANDOM_STATE,
            "n_jobs": -1,
            "objective": "binary",
            "metric": "average_precision",
            "verbose": -1,
            "scale_pos_weight": scale_pos_weight
        }
        base_model = LGBMClassifier(**params)
    
    pipe = make_pipeline(base_model)

    pr_aucs = []
    for fold, (tr_idx, va_idx) in enumerate(cv.split(X_train, y_train), start=1):
        X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        y_tr, y_va = y_train[tr_idx], y_train[va_idx]

        pipe.fit(X_tr, y_tr)
        proba = pipe.predict_proba(X_va)[:, 1]
        pr_auc = average_precision_score(y_va, proba)
        pr_aucs.append(pr_auc)
        
        trial.report(np.mean(pr_aucs), step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()
    
    return float(np.mean(pr_aucs))


In [7]:
# ⬅︎ Cell 7: Run study
study = optuna.create_study(direction="maximize", pruner=MedianPruner())
study.optimize(objective, n_trials=40, timeout=None, n_jobs=1, show_progress_bar=True)

print("Best value (PR-AUC):", study.best_value)
print("Best params:", study.best_params)


[I 2025-08-30 20:01:44,262] A new study created in memory with name: no-name-47eb74f8-920a-4f33-ac8e-8a16b61f3572
Best trial: 0. Best value: 0.58022:   2%|▎         | 1/40 [00:07<04:51,  7.47s/it]

[I 2025-08-30 20:01:51,733] Trial 0 finished with value: 0.5802196151770449 and parameters: {'model_type': 'lgbm', 'lgbm_n_estimators': 899, 'lgbm_num_leaves': 175, 'lgbm_lr': 0.16126147911448108, 'lgbm_subsample': 0.9292528655795912, 'lgbm_colsample': 0.8620782818264461, 'lgbm_min_child_samples': 50, 'lgbm_alpha': 9.392959240742093e-05, 'lgbm_lambda': 0.00406647984315883}. Best is trial 0 with value: 0.5802196151770449.


Best trial: 1. Best value: 0.626587:   5%|▌         | 2/40 [00:11<03:17,  5.20s/it]

[I 2025-08-30 20:01:55,348] Trial 1 finished with value: 0.6265870801208628 and parameters: {'model_type': 'lgbm', 'lgbm_n_estimators': 756, 'lgbm_num_leaves': 58, 'lgbm_lr': 0.03157870399942338, 'lgbm_subsample': 0.7369003632320126, 'lgbm_colsample': 0.6251482979911165, 'lgbm_min_child_samples': 81, 'lgbm_alpha': 0.00034240966258748066, 'lgbm_lambda': 2.645081158570957e-07}. Best is trial 1 with value: 0.6265870801208628.


Best trial: 2. Best value: 0.650662:   8%|▊         | 3/40 [00:13<02:24,  3.91s/it]

[I 2025-08-30 20:01:57,712] Trial 2 finished with value: 0.6506616029291081 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 308, 'xgb_max_depth': 5, 'xgb_lr': 0.24984116262333178, 'xgb_subsample': 0.9640247752119213, 'xgb_colsample': 0.6163980054378699, 'xgb_alpha': 0.6959004804688235, 'xgb_lambda': 0.0025253533861880942, 'xgb_min_child_weight': 6, 'xgb_gamma': 1.3428037769064816}. Best is trial 2 with value: 0.6506616029291081.


Best trial: 2. Best value: 0.650662:  10%|█         | 4/40 [00:18<02:41,  4.48s/it]

[I 2025-08-30 20:02:03,082] Trial 3 finished with value: 0.5732173983872716 and parameters: {'model_type': 'lgbm', 'lgbm_n_estimators': 737, 'lgbm_num_leaves': 127, 'lgbm_lr': 0.1803110035980306, 'lgbm_subsample': 0.5507898611253473, 'lgbm_colsample': 0.974501180163092, 'lgbm_min_child_samples': 73, 'lgbm_alpha': 0.00055256440201886, 'lgbm_lambda': 0.002751794644092757}. Best is trial 2 with value: 0.6506616029291081.


Best trial: 2. Best value: 0.650662:  12%|█▎        | 5/40 [00:22<02:20,  4.02s/it]

[I 2025-08-30 20:02:06,272] Trial 4 finished with value: 0.607046140456428 and parameters: {'model_type': 'lgbm', 'lgbm_n_estimators': 311, 'lgbm_num_leaves': 144, 'lgbm_lr': 0.13657230401614484, 'lgbm_subsample': 0.7489991024163175, 'lgbm_colsample': 0.7734012845236162, 'lgbm_min_child_samples': 71, 'lgbm_alpha': 0.00028505817709386104, 'lgbm_lambda': 1.1833803940478857e-07}. Best is trial 2 with value: 0.6506616029291081.


Best trial: 2. Best value: 0.650662:  15%|█▌        | 6/40 [00:23<01:52,  3.30s/it]

[I 2025-08-30 20:02:08,180] Trial 5 pruned. 


Best trial: 6. Best value: 0.666212:  18%|█▊        | 7/40 [00:33<02:56,  5.34s/it]

[I 2025-08-30 20:02:17,710] Trial 6 finished with value: 0.6662122204175436 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 759, 'xgb_max_depth': 5, 'xgb_lr': 0.021402863195369392, 'xgb_subsample': 0.7877274672271997, 'xgb_colsample': 0.7541871116634934, 'xgb_alpha': 0.004880891308414537, 'xgb_lambda': 0.07489029518524612, 'xgb_min_child_weight': 7, 'xgb_gamma': 3.411025413028421}. Best is trial 6 with value: 0.6662122204175436.


Best trial: 7. Best value: 0.668617:  20%|██        | 8/40 [00:43<03:43,  6.98s/it]

[I 2025-08-30 20:02:28,203] Trial 7 finished with value: 0.6686173085071581 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 295, 'xgb_max_depth': 6, 'xgb_lr': 0.011215013204143432, 'xgb_subsample': 0.6406242991999294, 'xgb_colsample': 0.6749075316191062, 'xgb_alpha': 1.186251597200353e-05, 'xgb_lambda': 0.00019667873899066994, 'xgb_min_child_weight': 2, 'xgb_gamma': 1.6965925200062448}. Best is trial 7 with value: 0.6686173085071581.


Best trial: 7. Best value: 0.668617:  22%|██▎       | 9/40 [00:45<02:41,  5.20s/it]

[I 2025-08-30 20:02:29,482] Trial 8 pruned. 


Best trial: 9. Best value: 0.670756:  25%|██▌       | 10/40 [00:50<02:37,  5.24s/it]

[I 2025-08-30 20:02:34,819] Trial 9 finished with value: 0.6707563664807674 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 380, 'xgb_max_depth': 3, 'xgb_lr': 0.011223932063958247, 'xgb_subsample': 0.7713718912140309, 'xgb_colsample': 0.9418656817480863, 'xgb_alpha': 7.058033539491168e-08, 'xgb_lambda': 0.00014801570804099607, 'xgb_min_child_weight': 9, 'xgb_gamma': 0.3636009522477901}. Best is trial 9 with value: 0.6707563664807674.


Best trial: 10. Best value: 0.671221:  28%|██▊       | 11/40 [00:55<02:33,  5.29s/it]

[I 2025-08-30 20:02:40,218] Trial 10 finished with value: 0.6712214030830241 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 461, 'xgb_max_depth': 3, 'xgb_lr': 0.039122466726426657, 'xgb_subsample': 0.7725953487820547, 'xgb_colsample': 0.9838373738836463, 'xgb_alpha': 1.3279441034994423e-08, 'xgb_lambda': 6.871020925721481e-07, 'xgb_min_child_weight': 10, 'xgb_gamma': 4.8565974930513764}. Best is trial 10 with value: 0.6712214030830241.


Best trial: 10. Best value: 0.671221:  30%|███       | 12/40 [01:01<02:32,  5.44s/it]

[I 2025-08-30 20:02:45,993] Trial 11 finished with value: 0.6706816806187561 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 457, 'xgb_max_depth': 3, 'xgb_lr': 0.04299831470530539, 'xgb_subsample': 0.7703507174638998, 'xgb_colsample': 0.9714162137360043, 'xgb_alpha': 1.0370224540744482e-08, 'xgb_lambda': 5.749096104173173e-07, 'xgb_min_child_weight': 10, 'xgb_gamma': 4.962011826024938}. Best is trial 10 with value: 0.6712214030830241.


Best trial: 10. Best value: 0.671221:  32%|███▎      | 13/40 [01:09<02:43,  6.06s/it]

[I 2025-08-30 20:02:53,494] Trial 12 finished with value: 0.6619211004914793 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 464, 'xgb_max_depth': 3, 'xgb_lr': 0.06237489183042325, 'xgb_subsample': 0.7063406496439146, 'xgb_colsample': 0.9873932296051184, 'xgb_alpha': 1.1880569498513635e-08, 'xgb_lambda': 2.595860388975485e-05, 'xgb_min_child_weight': 10, 'xgb_gamma': 3.1322236302805955}. Best is trial 10 with value: 0.6712214030830241.


Best trial: 10. Best value: 0.671221:  35%|███▌      | 14/40 [01:17<02:57,  6.84s/it]

[I 2025-08-30 20:03:02,148] Trial 13 pruned. 


Best trial: 10. Best value: 0.671221:  38%|███▊      | 15/40 [01:24<02:47,  6.68s/it]

[I 2025-08-30 20:03:08,463] Trial 14 finished with value: 0.6682033056632994 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 585, 'xgb_max_depth': 4, 'xgb_lr': 0.04997172781825461, 'xgb_subsample': 0.8427633196644202, 'xgb_colsample': 0.8948738970395248, 'xgb_alpha': 8.324176758593911e-07, 'xgb_lambda': 1.0222189744053295e-08, 'xgb_min_child_weight': 9, 'xgb_gamma': 4.9530253374023925}. Best is trial 10 with value: 0.6712214030830241.


Best trial: 10. Best value: 0.671221:  40%|████      | 16/40 [01:30<02:38,  6.62s/it]

[I 2025-08-30 20:03:14,939] Trial 15 finished with value: 0.6708387537296489 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 212, 'xgb_max_depth': 4, 'xgb_lr': 0.02144012849435319, 'xgb_subsample': 0.7102666688858237, 'xgb_colsample': 0.8959578568136578, 'xgb_alpha': 2.0073616453937027e-07, 'xgb_lambda': 2.137780738878139, 'xgb_min_child_weight': 5, 'xgb_gamma': 3.6980553441231745}. Best is trial 10 with value: 0.6712214030830241.


Best trial: 16. Best value: 0.672963:  42%|████▎     | 17/40 [01:35<02:22,  6.22s/it]

[I 2025-08-30 20:03:20,208] Trial 16 finished with value: 0.6729633709471536 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 201, 'xgb_max_depth': 4, 'xgb_lr': 0.026604318470071057, 'xgb_subsample': 0.6794910626490949, 'xgb_colsample': 0.8538589681187536, 'xgb_alpha': 0.00012548964513500356, 'xgb_lambda': 8.088844275488134, 'xgb_min_child_weight': 4, 'xgb_gamma': 3.9520539561675396}. Best is trial 16 with value: 0.6729633709471536.


Best trial: 16. Best value: 0.672963:  45%|████▌     | 18/40 [01:37<01:44,  4.76s/it]

[I 2025-08-30 20:03:21,591] Trial 17 pruned. 


Best trial: 16. Best value: 0.672963:  48%|████▊     | 19/40 [01:39<01:24,  4.01s/it]

[I 2025-08-30 20:03:23,851] Trial 18 pruned. 


Best trial: 16. Best value: 0.672963:  50%|█████     | 20/40 [01:46<01:35,  4.79s/it]

[I 2025-08-30 20:03:30,439] Trial 19 finished with value: 0.6692176733187226 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 247, 'xgb_max_depth': 4, 'xgb_lr': 0.03239926683832711, 'xgb_subsample': 0.7254137282815938, 'xgb_colsample': 0.9297701549470863, 'xgb_alpha': 7.096496843371748e-05, 'xgb_lambda': 0.33580025716390627, 'xgb_min_child_weight': 1, 'xgb_gamma': 4.196170942332686}. Best is trial 16 with value: 0.6729633709471536.


Best trial: 16. Best value: 0.672963:  52%|█████▎    | 21/40 [01:48<01:16,  4.03s/it]

[I 2025-08-30 20:03:32,726] Trial 20 pruned. 


Best trial: 16. Best value: 0.672963:  55%|█████▌    | 22/40 [01:54<01:22,  4.58s/it]

[I 2025-08-30 20:03:38,583] Trial 21 finished with value: 0.6682757406690165 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 204, 'xgb_max_depth': 4, 'xgb_lr': 0.01791010013357103, 'xgb_subsample': 0.6668554462096298, 'xgb_colsample': 0.8925377541571926, 'xgb_alpha': 2.797754586977561e-07, 'xgb_lambda': 9.96570989817957, 'xgb_min_child_weight': 5, 'xgb_gamma': 3.5046236293504895}. Best is trial 16 with value: 0.6729633709471536.


Best trial: 16. Best value: 0.672963:  57%|█████▊    | 23/40 [02:02<01:35,  5.64s/it]

[I 2025-08-30 20:03:46,675] Trial 22 finished with value: 0.6704913721141736 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 201, 'xgb_max_depth': 5, 'xgb_lr': 0.02040947274792727, 'xgb_subsample': 0.7270359919234385, 'xgb_colsample': 0.8556522594561796, 'xgb_alpha': 5.461283616322755e-06, 'xgb_lambda': 0.6675281898211727, 'xgb_min_child_weight': 6, 'xgb_gamma': 2.7718499376976564}. Best is trial 16 with value: 0.6729633709471536.


Best trial: 16. Best value: 0.672963:  60%|██████    | 24/40 [02:03<01:08,  4.28s/it]

[I 2025-08-30 20:03:47,787] Trial 23 pruned. 


Best trial: 16. Best value: 0.672963:  62%|██████▎   | 25/40 [02:04<00:49,  3.32s/it]

[I 2025-08-30 20:03:48,865] Trial 24 pruned. 


Best trial: 16. Best value: 0.672963:  65%|██████▌   | 26/40 [02:09<00:53,  3.85s/it]

[I 2025-08-30 20:03:53,961] Trial 25 finished with value: 0.6690751918626568 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 248, 'xgb_max_depth': 4, 'xgb_lr': 0.016091810459226148, 'xgb_subsample': 0.8209077625782829, 'xgb_colsample': 0.8508269066540279, 'xgb_alpha': 8.015797255065084e-08, 'xgb_lambda': 2.400941519258643, 'xgb_min_child_weight': 7, 'xgb_gamma': 4.586909187185974}. Best is trial 16 with value: 0.6729633709471536.


Best trial: 16. Best value: 0.672963:  68%|██████▊   | 27/40 [02:10<00:37,  2.92s/it]

[I 2025-08-30 20:03:54,708] Trial 26 pruned. 


Best trial: 16. Best value: 0.672963:  70%|███████   | 28/40 [02:12<00:32,  2.68s/it]

[I 2025-08-30 20:03:56,814] Trial 27 pruned. 


Best trial: 16. Best value: 0.672963:  72%|███████▎  | 29/40 [02:21<00:49,  4.46s/it]

[I 2025-08-30 20:04:05,445] Trial 28 finished with value: 0.671860375896944 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 673, 'xgb_max_depth': 3, 'xgb_lr': 0.014505327605514665, 'xgb_subsample': 0.7556588147970215, 'xgb_colsample': 0.877986222639177, 'xgb_alpha': 0.0009392912662014197, 'xgb_lambda': 0.09907253074611642, 'xgb_min_child_weight': 7, 'xgb_gamma': 2.242221440934748}. Best is trial 16 with value: 0.6729633709471536.


Best trial: 16. Best value: 0.672963:  75%|███████▌  | 30/40 [02:21<00:33,  3.31s/it]

[I 2025-08-30 20:04:06,064] Trial 29 pruned. 


Best trial: 16. Best value: 0.672963:  78%|███████▊  | 31/40 [02:29<00:41,  4.56s/it]

[I 2025-08-30 20:04:13,531] Trial 30 finished with value: 0.667340567913831 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 654, 'xgb_max_depth': 3, 'xgb_lr': 0.030143991510030874, 'xgb_subsample': 0.7547939506013681, 'xgb_colsample': 0.799064392623379, 'xgb_alpha': 0.0011033740856710948, 'xgb_lambda': 0.0010960337454006485, 'xgb_min_child_weight': 8, 'xgb_gamma': 2.1276668706263973}. Best is trial 16 with value: 0.6729633709471536.


Best trial: 16. Best value: 0.672963:  80%|████████  | 32/40 [02:41<00:53,  6.74s/it]

[I 2025-08-30 20:04:25,380] Trial 31 finished with value: 0.6691768257552213 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 776, 'xgb_max_depth': 3, 'xgb_lr': 0.015365227912833707, 'xgb_subsample': 0.6894647533427344, 'xgb_colsample': 0.8798523825013357, 'xgb_alpha': 7.824568151802178e-05, 'xgb_lambda': 0.06424719958463297, 'xgb_min_child_weight': 5, 'xgb_gamma': 3.015618081919194}. Best is trial 16 with value: 0.6729633709471536.


Best trial: 16. Best value: 0.672963:  82%|████████▎ | 33/40 [02:43<00:37,  5.37s/it]

[I 2025-08-30 20:04:27,545] Trial 32 pruned. 


Best trial: 16. Best value: 0.672963:  85%|████████▌ | 34/40 [02:53<00:40,  6.80s/it]

[I 2025-08-30 20:04:37,682] Trial 33 finished with value: 0.6693672259261702 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 656, 'xgb_max_depth': 3, 'xgb_lr': 0.014218732225014652, 'xgb_subsample': 0.8449091810779092, 'xgb_colsample': 0.8563443463088936, 'xgb_alpha': 2.7303176375841826e-08, 'xgb_lambda': 0.18968436603121422, 'xgb_min_child_weight': 6, 'xgb_gamma': 4.6343677991853705}. Best is trial 16 with value: 0.6729633709471536.


Best trial: 16. Best value: 0.672963:  88%|████████▊ | 35/40 [02:54<00:25,  5.11s/it]

[I 2025-08-30 20:04:38,851] Trial 34 pruned. 


Best trial: 16. Best value: 0.672963:  90%|█████████ | 36/40 [02:56<00:16,  4.16s/it]

[I 2025-08-30 20:04:40,797] Trial 35 pruned. 


Best trial: 16. Best value: 0.672963:  92%|█████████▎| 37/40 [02:57<00:09,  3.28s/it]

[I 2025-08-30 20:04:42,013] Trial 36 pruned. 


Best trial: 16. Best value: 0.672963:  95%|█████████▌| 38/40 [03:04<00:08,  4.24s/it]

[I 2025-08-30 20:04:48,487] Trial 37 finished with value: 0.6720757078534583 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 477, 'xgb_max_depth': 3, 'xgb_lr': 0.013436047448706563, 'xgb_subsample': 0.636385375546444, 'xgb_colsample': 0.7955327293112278, 'xgb_alpha': 0.03946320789636545, 'xgb_lambda': 1.3706944695549545e-08, 'xgb_min_child_weight': 8, 'xgb_gamma': 1.8888131891350917}. Best is trial 16 with value: 0.6729633709471536.


Best trial: 16. Best value: 0.672963:  98%|█████████▊| 39/40 [03:10<00:04,  4.86s/it]

[I 2025-08-30 20:04:54,799] Trial 38 finished with value: 0.6728431752906807 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 506, 'xgb_max_depth': 3, 'xgb_lr': 0.013809205742227134, 'xgb_subsample': 0.6275850136598596, 'xgb_colsample': 0.7741402979495506, 'xgb_alpha': 0.12712513365458816, 'xgb_lambda': 1.1751123641656253e-08, 'xgb_min_child_weight': 8, 'xgb_gamma': 1.733776316402445}. Best is trial 16 with value: 0.6729633709471536.


Best trial: 16. Best value: 0.672963: 100%|██████████| 40/40 [03:11<00:00,  4.78s/it]

[I 2025-08-30 20:04:55,558] Trial 39 pruned. 
Best value (PR-AUC): 0.6729633709471536
Best params: {'model_type': 'xgb', 'xgb_n_estimators': 201, 'xgb_max_depth': 4, 'xgb_lr': 0.026604318470071057, 'xgb_subsample': 0.6794910626490949, 'xgb_colsample': 0.8538589681187536, 'xgb_alpha': 0.00012548964513500356, 'xgb_lambda': 8.088844275488134, 'xgb_min_child_weight': 4, 'xgb_gamma': 3.9520539561675396}





In [8]:
# ⬅︎ Cell 8: Refit on train+valid and evaluate on test
best = study.best_params
best_type = best["model_type"]

if best_type == "xgb":
    model = XGBClassifier(
        n_estimators=best["xgb_n_estimators"],
        max_depth=best["xgb_max_depth"],
        learning_rate=best["xgb_lr"],
        subsample=best["xgb_subsample"],
        colsample_bytree=best["xgb_colsample"],
        reg_alpha=best["xgb_alpha"],
        reg_lambda=best["xgb_lambda"],
        min_child_weight=best["xgb_min_child_weight"],
        gamma=best["xgb_gamma"],
        random_state=RANDOM_STATE,
        n_jobs=-1,
        objective="binary:logistic",
        tree_method="hist",
        eval_metric="aucpr",
        scale_pos_weight=scale_pos_weight
    )
else:
    model = LGBMClassifier(
        n_estimators=best["lgbm_n_estimators"],
        num_leaves=best["lgbm_num_leaves"],
        learning_rate=best["lgbm_lr"],
        subsample=best["lgbm_subsample"],
        colsample_bytree=best["lgbm_colsample"],
        min_child_samples=best["lgbm_min_child_samples"],
        reg_alpha=best["lgbm_alpha"],
        reg_lambda=best["lgbm_lambda"],
        random_state=RANDOM_STATE,
        n_jobs=-1,
        objective="binary",
        metric="average_precision",
        verbose=-1,
        scale_pos_weight=scale_pos_weight
    )

pipe_best = make_pipeline(model)
pipe_best.fit(pd.concat([X_train, X_valid]), np.concatenate([y_train, y_valid]))

# Evaluate
metrics = evaluate(pipe_best, pd.concat([X_train, X_valid]), np.concatenate([y_train, y_valid]), X_test, y_test)
metrics


{'roc_auc_train': 0.875278993886619,
 'pr_auc_train': 0.7092686381097678,
 'roc_auc_valid': 0.8440469420699268,
 'pr_auc_valid': 0.6683111396512327}

In [9]:
# ⬅︎ Cell 9: Save artifacts
os.makedirs("../models", exist_ok=True)

joblib.dump(pipe_best, "../models/best_pipeline.pkl")

# שמירת מטא־דאטה כולל שמות פיצ'רים אחרי OneHot (יעיל להסברים וחשיבות פיצ'רים)
# שים לב: get_feature_names_out קיים בגרסאות sklearn חדשות
prep = pipe_best.named_steps["prep"]
feature_names = prep.get_feature_names_out()
meta = {
    "model_type": best_type,
    "best_params": best,
    "feature_names": feature_names.tolist()
}
with open("../models/metadata.json", "w") as f:
    json.dump(meta, f, indent=2)

print("Saved ../models/best_pipeline.pkl and ../models/metadata.json")


Saved ../models/best_pipeline.pkl and ../models/metadata.json


In [10]:
# ⬅︎ Cell 10: Feature importance (top 25)
import numpy as np
def get_importances(pipe):
    mdl = pipe.named_steps["model"]
    if hasattr(mdl, "feature_importances_"):
        imp = mdl.feature_importances_
    elif hasattr(mdl, "booster_") and hasattr(mdl.booster_, "feature_importances_"):
        imp = mdl.booster_.feature_importances_
    else:
        return None
    names = pipe.named_steps["prep"].get_feature_names_out()
    return pd.DataFrame({"feature": names, "importance": imp}).sort_values("importance", ascending=False)

fi = get_importances(pipe_best)
if fi is not None:
    display(fi.head(25))
    fi.to_csv("../models/feature_importance_top.csv", index=False)
else:
    print("No feature_importances_ available for this model.")


Unnamed: 0,feature,importance
6032,cat__Contract_Month-to-month,0.227361
11,num__contract_term,0.182746
6012,cat__InternetService_Fiber optic,0.080113
8,num__has_fiber,0.051039
6013,cat__InternetService_No,0.033348
12,num__monthly_x_term,0.031696
6014,cat__OnlineSecurity_No,0.028228
6023,cat__TechSupport_No,0.024719
6039,cat__PaymentMethod_Electronic check,0.023015
4,num__tenure_years,0.020648


In [11]:
# ⬅︎ Cell 11: Quick smoke test on a small sample
Xs = X_train.sample(200, random_state=RANDOM_STATE)
ys = y_train[np.isin(X_train.index, Xs.index)]

pipe_smoke = make_pipeline(LGBMClassifier(n_estimators=50, random_state=RANDOM_STATE))
pipe_smoke.fit(Xs, ys)
proba = pipe_smoke.predict_proba(Xs)[:,1]
print("Smoke PR-AUC:", average_precision_score(ys, proba))


Smoke PR-AUC: 0.9691152348567608
