In [5]:
# ⬅︎ Cell 1: Imports & Config
import json
import os
import warnings

import joblib
import numpy as np

# Optuna
import optuna
import pandas as pd
from lightgbm import LGBMClassifier
from optuna.pruners import MedianPruner
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.impute import SimpleImputer
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# מודלים
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")

RANDOM_STATE = 42
DATA_PATH = "../data/raw/telco_churn.csv"  # התאם אם צריך
TARGET_COL = "Churn"


In [6]:
import pathlib
import sys

# מזהה את שורש הפרויקט כך ש-src תהיה בתוכו
cwd = pathlib.Path().resolve()
project_root = cwd if (cwd / "src").exists() else cwd.parent  # אם אתה בתוך notebooks/
sys.path.append(str(project_root))

# בדיקה מהירה:
assert (project_root / "src" / "features" / "build_features.py").exists()

from src.features.build_features import AddFeatures

print("OK:", AddFeatures)


OK: <class 'src.features.build_features.AddFeatures'>


In [7]:
# ⬅︎ Cell 2: Load & Basic cleaning
df = pd.read_csv(DATA_PATH)

# Telco quirks
# TotalCharges מגיע כמחרוזת; ננקה לרמה נומרית ונטפל בחסרים
df["TotalCharges_num"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
mask_na = df["TotalCharges_num"].isna()
df.loc[mask_na, "TotalCharges_num"] = df.loc[mask_na, "MonthlyCharges"] * df.loc[mask_na, "tenure"].clip(lower=1)

# יעד לבוליאני
df[TARGET_COL] = df[TARGET_COL].map({"Yes": 1, "No": 0}).astype(int)

# פיצ’ר־אנג’ינירינג קל בתוך מחלקת טרנספורמר כדי לשלב ב-Pipeline
service_cols = [
    "PhoneService","MultipleLines","InternetService","OnlineSecurity","OnlineBackup",
    "DeviceProtection","TechSupport","StreamingTV","StreamingMovies"
]



In [8]:
# ⬅︎ Cell 3: Split
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL].values

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, stratify=y, random_state=RANDOM_STATE
)
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=RANDOM_STATE
)

pos = y_train.sum()
neg = len(y_train) - pos
scale_pos_weight = neg / max(pos, 1)
scale_pos_weight


2.7691131498470947

In [9]:
# ⬅︎ Cell 4: Preprocessing pipeline
feature_adder = AddFeatures(service_cols=service_cols)

numeric_proc = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler(with_mean=False))  # with_mean=False לשילוב עם sparse
])

categorical_proc = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_proc, selector(dtype_include=["int64","float64","int32","float32","uint8","bool"])),
        ("cat", categorical_proc, selector(dtype_include=object))
    ],
    remainder="drop"
)

# נבנה Pipeline כללי: add features -> preprocess -> model (יוחדר בדינמיות)
def make_pipeline(model):
    return Pipeline(steps=[
        ("add", feature_adder),
        ("prep", preprocessor),
        ("model", model)
    ])


In [10]:
# ⬅︎ Cell 5: Eval helper
def evaluate(model, X_tr, y_tr, X_va, y_va):
    proba_tr = model.predict_proba(X_tr)[:,1]
    proba_va = model.predict_proba(X_va)[:,1]
    metrics = {
        "roc_auc_train": roc_auc_score(y_tr, proba_tr),
        "pr_auc_train":  average_precision_score(y_tr, proba_tr),
        "roc_auc_valid": roc_auc_score(y_va, proba_va),
        "pr_auc_valid":  average_precision_score(y_va, proba_va),
    }
    return metrics


In [11]:
# ⬅︎ Cell 6: Optuna objective (5-fold CV on train)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

def objective(trial):
    model_type = trial.suggest_categorical("model_type", ["xgb", "lgbm"])
    
    if model_type == "xgb":
        params = {
            "n_estimators": trial.suggest_int("xgb_n_estimators", 200, 800),
            "max_depth": trial.suggest_int("xgb_max_depth", 3, 8),
            "learning_rate": trial.suggest_float("xgb_lr", 1e-2, 3e-1, log=True),
            "subsample": trial.suggest_float("xgb_subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("xgb_colsample", 0.6, 1.0),
            "reg_alpha": trial.suggest_float("xgb_alpha", 1e-8, 10.0, log=True),
            "reg_lambda": trial.suggest_float("xgb_lambda", 1e-8, 10.0, log=True),
            "min_child_weight": trial.suggest_int("xgb_min_child_weight", 1, 10),
            "gamma": trial.suggest_float("xgb_gamma", 0.0, 5.0),
            "random_state": RANDOM_STATE,
            "n_jobs": -1,
            "objective": "binary:logistic",
            "tree_method": "hist",
            "eval_metric": "aucpr",
            "scale_pos_weight": scale_pos_weight
        }
        base_model = XGBClassifier(**params)
    else:
        params = {
            "n_estimators": trial.suggest_int("lgbm_n_estimators", 300, 1000),
            "num_leaves": trial.suggest_int("lgbm_num_leaves", 31, 255),
            "learning_rate": trial.suggest_float("lgbm_lr", 1e-2, 3e-1, log=True),
            "subsample": trial.suggest_float("lgbm_subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("lgbm_colsample", 0.5, 1.0),
            "min_child_samples": trial.suggest_int("lgbm_min_child_samples", 5, 100),
            "reg_alpha": trial.suggest_float("lgbm_alpha", 1e-8, 10.0, log=True),
            "reg_lambda": trial.suggest_float("lgbm_lambda", 1e-8, 10.0, log=True),
            "random_state": RANDOM_STATE,
            "n_jobs": -1,
            "objective": "binary",
            "metric": "average_precision",
            "verbose": -1,
            "scale_pos_weight": scale_pos_weight
        }
        base_model = LGBMClassifier(**params)
    
    pipe = make_pipeline(base_model)

    pr_aucs = []
    for fold, (tr_idx, va_idx) in enumerate(cv.split(X_train, y_train), start=1):
        X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        y_tr, y_va = y_train[tr_idx], y_train[va_idx]

        pipe.fit(X_tr, y_tr)
        proba = pipe.predict_proba(X_va)[:, 1]
        pr_auc = average_precision_score(y_va, proba)
        pr_aucs.append(pr_auc)
        
        trial.report(np.mean(pr_aucs), step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()
    
    return float(np.mean(pr_aucs))


In [12]:
# ⬅︎ Cell 7: Run study
study = optuna.create_study(direction="maximize", pruner=MedianPruner())
study.optimize(objective, n_trials=40, timeout=None, n_jobs=1, show_progress_bar=True)

print("Best value (PR-AUC):", study.best_value)
print("Best params:", study.best_params)


[I 2025-09-04 17:00:19,340] A new study created in memory with name: no-name-01872ff7-c0a9-40f0-837b-5700b35bd15c
Best trial: 0. Best value: 0.66435:   2%|▎         | 1/40 [00:14<09:43, 14.97s/it]

[I 2025-09-04 17:00:34,312] Trial 0 finished with value: 0.6643503380345617 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 675, 'xgb_max_depth': 6, 'xgb_lr': 0.011660629980382855, 'xgb_subsample': 0.7847610420673713, 'xgb_colsample': 0.7263495806606615, 'xgb_alpha': 0.00017955931112961662, 'xgb_lambda': 0.00036364163408810776, 'xgb_min_child_weight': 6, 'xgb_gamma': 1.7451964095804362}. Best is trial 0 with value: 0.6643503380345617.


Best trial: 1. Best value: 0.667969:   5%|▌         | 2/40 [00:23<07:15, 11.45s/it]

[I 2025-09-04 17:00:43,302] Trial 1 finished with value: 0.6679689693361087 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 219, 'xgb_max_depth': 7, 'xgb_lr': 0.011614068148080407, 'xgb_subsample': 0.763941062613682, 'xgb_colsample': 0.6705027321313649, 'xgb_alpha': 3.035665372093254e-06, 'xgb_lambda': 4.671822394047869e-06, 'xgb_min_child_weight': 9, 'xgb_gamma': 1.6859117171732074}. Best is trial 1 with value: 0.6679689693361087.


Best trial: 1. Best value: 0.667969:   8%|▊         | 3/40 [00:33<06:28, 10.50s/it]

[I 2025-09-04 17:00:52,677] Trial 2 finished with value: 0.6663078345483255 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 778, 'xgb_max_depth': 4, 'xgb_lr': 0.017087472570303547, 'xgb_subsample': 0.6884406224712076, 'xgb_colsample': 0.7008779530061466, 'xgb_alpha': 8.558355380426362e-07, 'xgb_lambda': 3.1137403320175977e-06, 'xgb_min_child_weight': 2, 'xgb_gamma': 2.375622431399292}. Best is trial 1 with value: 0.6679689693361087.


Best trial: 3. Best value: 0.672116:  10%|█         | 4/40 [00:38<05:03,  8.43s/it]

[I 2025-09-04 17:00:57,939] Trial 3 finished with value: 0.6721159252417129 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 679, 'xgb_max_depth': 4, 'xgb_lr': 0.06139407172509567, 'xgb_subsample': 0.8496706063048791, 'xgb_colsample': 0.7857466464862991, 'xgb_alpha': 3.223882195537856e-05, 'xgb_lambda': 9.984111524261742, 'xgb_min_child_weight': 3, 'xgb_gamma': 4.398403650674861}. Best is trial 3 with value: 0.6721159252417129.


Best trial: 3. Best value: 0.672116:  12%|█▎        | 5/40 [00:43<04:07,  7.07s/it]

[I 2025-09-04 17:01:02,589] Trial 4 finished with value: 0.6285881011725575 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 258, 'xgb_max_depth': 7, 'xgb_lr': 0.2555642836008814, 'xgb_subsample': 0.8555656103906959, 'xgb_colsample': 0.8152854318237908, 'xgb_alpha': 1.0673728735501308e-08, 'xgb_lambda': 6.104098000236868, 'xgb_min_child_weight': 2, 'xgb_gamma': 1.314677888745499}. Best is trial 3 with value: 0.6721159252417129.


Best trial: 3. Best value: 0.672116:  15%|█▌        | 6/40 [00:44<02:48,  4.97s/it]

[I 2025-09-04 17:01:03,478] Trial 5 pruned. 


Best trial: 3. Best value: 0.672116:  18%|█▊        | 7/40 [00:51<03:07,  5.67s/it]

[I 2025-09-04 17:01:10,585] Trial 6 finished with value: 0.6703680129450198 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 667, 'xgb_max_depth': 3, 'xgb_lr': 0.013767594407620803, 'xgb_subsample': 0.9119678874680749, 'xgb_colsample': 0.8074683402450583, 'xgb_alpha': 1.8847371703379534, 'xgb_lambda': 0.3071633073759995, 'xgb_min_child_weight': 2, 'xgb_gamma': 1.209577311564336}. Best is trial 3 with value: 0.6721159252417129.


Best trial: 3. Best value: 0.672116:  20%|██        | 8/40 [00:51<02:09,  4.06s/it]

[I 2025-09-04 17:01:11,200] Trial 7 pruned. 


Best trial: 3. Best value: 0.672116:  22%|██▎       | 9/40 [00:54<01:52,  3.62s/it]

[I 2025-09-04 17:01:13,853] Trial 8 pruned. 


Best trial: 3. Best value: 0.672116:  25%|██▌       | 10/40 [00:56<01:34,  3.14s/it]

[I 2025-09-04 17:01:15,923] Trial 9 pruned. 


Best trial: 3. Best value: 0.672116:  28%|██▊       | 11/40 [00:57<01:15,  2.60s/it]

[I 2025-09-04 17:01:17,313] Trial 10 pruned. 


Best trial: 3. Best value: 0.672116:  30%|███       | 12/40 [00:58<00:58,  2.09s/it]

[I 2025-09-04 17:01:18,235] Trial 11 pruned. 


Best trial: 3. Best value: 0.672116:  32%|███▎      | 13/40 [01:03<01:14,  2.78s/it]

[I 2025-09-04 17:01:22,586] Trial 12 finished with value: 0.6688920875925002 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 581, 'xgb_max_depth': 3, 'xgb_lr': 0.049394770312419845, 'xgb_subsample': 0.9777294211544861, 'xgb_colsample': 0.864356494093447, 'xgb_alpha': 0.6634790779487254, 'xgb_lambda': 0.14341978018018853, 'xgb_min_child_weight': 4, 'xgb_gamma': 3.6660105088931862}. Best is trial 3 with value: 0.6721159252417129.


Best trial: 3. Best value: 0.672116:  35%|███▌      | 14/40 [01:07<01:20,  3.10s/it]

[I 2025-09-04 17:01:26,425] Trial 13 finished with value: 0.6694462565499988 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 564, 'xgb_max_depth': 3, 'xgb_lr': 0.046361086436017405, 'xgb_subsample': 0.9157485490333988, 'xgb_colsample': 0.6154984604356809, 'xgb_alpha': 0.08683567842556275, 'xgb_lambda': 0.03830044107921511, 'xgb_min_child_weight': 3, 'xgb_gamma': 3.492167585945788}. Best is trial 3 with value: 0.6721159252417129.


Best trial: 3. Best value: 0.672116:  38%|███▊      | 15/40 [01:07<00:59,  2.38s/it]

[I 2025-09-04 17:01:27,132] Trial 14 pruned. 


Best trial: 3. Best value: 0.672116:  40%|████      | 16/40 [01:09<00:53,  2.22s/it]

[I 2025-09-04 17:01:28,989] Trial 15 pruned. 


Best trial: 3. Best value: 0.672116:  42%|████▎     | 17/40 [01:14<01:11,  3.10s/it]

[I 2025-09-04 17:01:34,138] Trial 16 finished with value: 0.6663204393016231 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 468, 'xgb_max_depth': 4, 'xgb_lr': 0.030515171800342, 'xgb_subsample': 0.9855330095821778, 'xgb_colsample': 0.9950966435127773, 'xgb_alpha': 0.03211035865474305, 'xgb_lambda': 1.6083374556223593e-08, 'xgb_min_child_weight': 4, 'xgb_gamma': 2.869242375376577}. Best is trial 3 with value: 0.6721159252417129.


Best trial: 3. Best value: 0.672116:  45%|████▌     | 18/40 [01:15<00:54,  2.48s/it]

[I 2025-09-04 17:01:35,166] Trial 17 pruned. 


Best trial: 3. Best value: 0.672116:  48%|████▊     | 19/40 [01:17<00:44,  2.10s/it]

[I 2025-09-04 17:01:36,372] Trial 18 pruned. 


Best trial: 3. Best value: 0.672116:  50%|█████     | 20/40 [01:18<00:37,  1.86s/it]

[I 2025-09-04 17:01:37,682] Trial 19 pruned. 


Best trial: 3. Best value: 0.672116:  52%|█████▎    | 21/40 [01:23<00:55,  2.94s/it]

[I 2025-09-04 17:01:43,144] Trial 20 finished with value: 0.6679767265120164 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 601, 'xgb_max_depth': 4, 'xgb_lr': 0.029954325225373894, 'xgb_subsample': 0.8749851915970969, 'xgb_colsample': 0.7811260305947243, 'xgb_alpha': 0.009950271569719093, 'xgb_lambda': 0.2632969136826796, 'xgb_min_child_weight': 7, 'xgb_gamma': 3.6691504495844596}. Best is trial 3 with value: 0.6721159252417129.


Best trial: 3. Best value: 0.672116:  55%|█████▌    | 22/40 [01:27<00:56,  3.13s/it]

[I 2025-09-04 17:01:46,718] Trial 21 finished with value: 0.6697928437161316 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 553, 'xgb_max_depth': 3, 'xgb_lr': 0.050941974728899476, 'xgb_subsample': 0.9366265586368809, 'xgb_colsample': 0.6045847754722513, 'xgb_alpha': 0.3704809848106717, 'xgb_lambda': 0.020134801903732888, 'xgb_min_child_weight': 3, 'xgb_gamma': 3.6611967727650065}. Best is trial 3 with value: 0.6721159252417129.


Best trial: 3. Best value: 0.672116:  57%|█████▊    | 23/40 [01:31<00:58,  3.43s/it]

[I 2025-09-04 17:01:50,832] Trial 22 finished with value: 0.6660583243637652 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 794, 'xgb_max_depth': 3, 'xgb_lr': 0.08053876648466886, 'xgb_subsample': 0.9504721342078295, 'xgb_colsample': 0.6049012933661515, 'xgb_alpha': 9.785303277041592, 'xgb_lambda': 0.016301317106210987, 'xgb_min_child_weight': 3, 'xgb_gamma': 4.372089669462932}. Best is trial 3 with value: 0.6721159252417129.


Best trial: 3. Best value: 0.672116:  60%|██████    | 24/40 [01:32<00:41,  2.61s/it]

[I 2025-09-04 17:01:51,524] Trial 23 pruned. 


Best trial: 3. Best value: 0.672116:  62%|██████▎   | 25/40 [01:33<00:32,  2.20s/it]

[I 2025-09-04 17:01:52,766] Trial 24 pruned. 


Best trial: 3. Best value: 0.672116:  65%|██████▌   | 26/40 [01:34<00:26,  1.92s/it]

[I 2025-09-04 17:01:54,056] Trial 25 pruned. 


Best trial: 3. Best value: 0.672116:  68%|██████▊   | 27/40 [01:35<00:19,  1.51s/it]

[I 2025-09-04 17:01:54,607] Trial 26 pruned. 


Best trial: 3. Best value: 0.672116:  70%|███████   | 28/40 [01:39<00:26,  2.20s/it]

[I 2025-09-04 17:01:58,423] Trial 27 finished with value: 0.6687826191718895 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 553, 'xgb_max_depth': 3, 'xgb_lr': 0.06345650342208474, 'xgb_subsample': 0.9506614135470279, 'xgb_colsample': 0.742590869093584, 'xgb_alpha': 1.5432947599109144, 'xgb_lambda': 4.847089328824392e-05, 'xgb_min_child_weight': 5, 'xgb_gamma': 3.1703144785699595}. Best is trial 3 with value: 0.6721159252417129.


Best trial: 3. Best value: 0.672116:  72%|███████▎  | 29/40 [01:39<00:19,  1.75s/it]

[I 2025-09-04 17:01:59,126] Trial 28 pruned. 


Best trial: 3. Best value: 0.672116:  75%|███████▌  | 30/40 [01:40<00:15,  1.54s/it]

[I 2025-09-04 17:02:00,158] Trial 29 pruned. 


Best trial: 3. Best value: 0.672116:  78%|███████▊  | 31/40 [01:42<00:13,  1.46s/it]

[I 2025-09-04 17:02:01,426] Trial 30 pruned. 


Best trial: 3. Best value: 0.672116:  80%|████████  | 32/40 [01:46<00:19,  2.38s/it]

[I 2025-09-04 17:02:05,954] Trial 31 finished with value: 0.6691136383351487 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 545, 'xgb_max_depth': 3, 'xgb_lr': 0.04061262024370887, 'xgb_subsample': 0.9118529122914744, 'xgb_colsample': 0.609521038254819, 'xgb_alpha': 0.1348168504150836, 'xgb_lambda': 0.0808928619772485, 'xgb_min_child_weight': 3, 'xgb_gamma': 3.479683016342258}. Best is trial 3 with value: 0.6721159252417129.


Best trial: 3. Best value: 0.672116:  82%|████████▎ | 33/40 [01:50<00:20,  2.96s/it]

[I 2025-09-04 17:02:10,287] Trial 32 finished with value: 0.670416099868352 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 598, 'xgb_max_depth': 3, 'xgb_lr': 0.04462746853341049, 'xgb_subsample': 0.9549061492731824, 'xgb_colsample': 0.6402861476377715, 'xgb_alpha': 2.03882726948712, 'xgb_lambda': 0.015768576656097624, 'xgb_min_child_weight': 4, 'xgb_gamma': 3.522854712973806}. Best is trial 3 with value: 0.6721159252417129.


Best trial: 3. Best value: 0.672116:  85%|████████▌ | 34/40 [01:51<00:14,  2.36s/it]

[I 2025-09-04 17:02:11,246] Trial 33 pruned. 


Best trial: 3. Best value: 0.672116:  88%|████████▊ | 35/40 [01:52<00:09,  1.88s/it]

[I 2025-09-04 17:02:11,985] Trial 34 pruned. 


Best trial: 3. Best value: 0.672116:  90%|█████████ | 36/40 [01:53<00:06,  1.63s/it]

[I 2025-09-04 17:02:13,027] Trial 35 pruned. 


Best trial: 3. Best value: 0.672116:  92%|█████████▎| 37/40 [01:56<00:05,  1.94s/it]

[I 2025-09-04 17:02:15,692] Trial 36 pruned. 


Best trial: 3. Best value: 0.672116:  95%|█████████▌| 38/40 [02:01<00:06,  3.03s/it]

[I 2025-09-04 17:02:21,286] Trial 37 finished with value: 0.6697029111422438 and parameters: {'model_type': 'xgb', 'xgb_n_estimators': 514, 'xgb_max_depth': 3, 'xgb_lr': 0.010008160123367405, 'xgb_subsample': 0.8103149113392549, 'xgb_colsample': 0.7036328165381951, 'xgb_alpha': 2.629847586052216e-05, 'xgb_lambda': 0.0016445122994628713, 'xgb_min_child_weight': 6, 'xgb_gamma': 3.9398614047757765}. Best is trial 3 with value: 0.6721159252417129.


Best trial: 3. Best value: 0.672116:  98%|█████████▊| 39/40 [02:03<00:02,  2.46s/it]

[I 2025-09-04 17:02:22,420] Trial 38 pruned. 


Best trial: 3. Best value: 0.672116: 100%|██████████| 40/40 [02:04<00:00,  3.12s/it]

[I 2025-09-04 17:02:23,952] Trial 39 pruned. 
Best value (PR-AUC): 0.6721159252417129
Best params: {'model_type': 'xgb', 'xgb_n_estimators': 679, 'xgb_max_depth': 4, 'xgb_lr': 0.06139407172509567, 'xgb_subsample': 0.8496706063048791, 'xgb_colsample': 0.7857466464862991, 'xgb_alpha': 3.223882195537856e-05, 'xgb_lambda': 9.984111524261742, 'xgb_min_child_weight': 3, 'xgb_gamma': 4.398403650674861}





In [13]:
# ⬅︎ Cell 8: Refit on train+valid and evaluate on test
best = study.best_params
best_type = best["model_type"]

if best_type == "xgb":
    model = XGBClassifier(
        n_estimators=best["xgb_n_estimators"],
        max_depth=best["xgb_max_depth"],
        learning_rate=best["xgb_lr"],
        subsample=best["xgb_subsample"],
        colsample_bytree=best["xgb_colsample"],
        reg_alpha=best["xgb_alpha"],
        reg_lambda=best["xgb_lambda"],
        min_child_weight=best["xgb_min_child_weight"],
        gamma=best["xgb_gamma"],
        random_state=RANDOM_STATE,
        n_jobs=-1,
        objective="binary:logistic",
        tree_method="hist",
        eval_metric="aucpr",
        scale_pos_weight=scale_pos_weight
    )
else:
    model = LGBMClassifier(
        n_estimators=best["lgbm_n_estimators"],
        num_leaves=best["lgbm_num_leaves"],
        learning_rate=best["lgbm_lr"],
        subsample=best["lgbm_subsample"],
        colsample_bytree=best["lgbm_colsample"],
        min_child_samples=best["lgbm_min_child_samples"],
        reg_alpha=best["lgbm_alpha"],
        reg_lambda=best["lgbm_lambda"],
        random_state=RANDOM_STATE,
        n_jobs=-1,
        objective="binary",
        metric="average_precision",
        verbose=-1,
        scale_pos_weight=scale_pos_weight
    )

pipe_best = make_pipeline(model)
pipe_best.fit(pd.concat([X_train, X_valid]), np.concatenate([y_train, y_valid]))

# Evaluate
metrics = evaluate(pipe_best, pd.concat([X_train, X_valid]), np.concatenate([y_train, y_valid]), X_test, y_test)
metrics


{'roc_auc_train': 0.8822909256898314,
 'pr_auc_train': 0.7224447789715509,
 'roc_auc_valid': 0.8423042704626335,
 'pr_auc_valid': 0.6646421326089454}

In [14]:
# ⬅︎ Cell 9: Save artifacts
os.makedirs("../models", exist_ok=True)

joblib.dump(pipe_best, "../models/best_pipeline.pkl")

# שמירת מטא־דאטה כולל שמות פיצ'רים אחרי OneHot (יעיל להסברים וחשיבות פיצ'רים)
# שים לב: get_feature_names_out קיים בגרסאות sklearn חדשות
prep = pipe_best.named_steps["prep"]
feature_names = prep.get_feature_names_out()
meta = {
    "model_type": best_type,
    "best_params": best,
    "feature_names": feature_names.tolist()
}
with open("../models/metadata.json", "w") as f:
    json.dump(meta, f, indent=2)

print("Saved ../models/best_pipeline.pkl and ../models/metadata.json")


Saved ../models/best_pipeline.pkl and ../models/metadata.json


In [15]:
# ⬅︎ Cell 10: Feature importance (top 25)
import numpy as np


def get_importances(pipe):
    mdl = pipe.named_steps["model"]
    if hasattr(mdl, "feature_importances_"):
        imp = mdl.feature_importances_
    elif hasattr(mdl, "booster_") and hasattr(mdl.booster_, "feature_importances_"):
        imp = mdl.booster_.feature_importances_
    else:
        return None
    names = pipe.named_steps["prep"].get_feature_names_out()
    return pd.DataFrame({"feature": names, "importance": imp}).sort_values("importance", ascending=False)

fi = get_importances(pipe_best)
if fi is not None:
    display(fi.head(25))
    fi.to_csv("../models/feature_importance_top.csv", index=False)
else:
    print("No feature_importances_ available for this model.")


Unnamed: 0,feature,importance
6032,cat__Contract_Month-to-month,0.375308
11,num__contract_term,0.182006
6012,cat__InternetService_Fiber optic,0.046414
8,num__has_fiber,0.043145
6014,cat__OnlineSecurity_No,0.037685
6034,cat__Contract_Two year,0.020311
6023,cat__TechSupport_No,0.020236
4,num__tenure_years,0.017728
6011,cat__InternetService_DSL,0.014983
12,num__monthly_x_term,0.01442


In [None]:
# ⬅︎ Cell 11: Quick smoke test on a small sample
Xs = X_train.sample(200, random_state=RANDOM_STATE)
ys = y_train[np.isin(X_train.index, Xs.index)]

pipe_smoke = make_pipeline(LGBMClassifier(n_estimators=50, random_state=RANDOM_STATE))
pipe_smoke.fit(Xs, ys)
proba = pipe_smoke.predict_proba(Xs)[:,1]
print("Smoke PR-AUC:", average_precision_score(ys, proba))


Smoke PR-AUC: 0.9691152348567608


: 