In [None]:
import os
import warnings
from pathlib import Path

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import shap
from catboost import CatBoostClassifier
from lofo import LOFOImportance, Dataset as LOFO_Dataset, plot_importance
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import (
    train_test_split,
    KFold,
    StratifiedKFold,
    cross_val_score
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Folder that holds the Home Credit CSV files. Set the HOME_CREDIT_DATA_DIR
# environment variable to point somewhere else; the fallback is the folder the
# notebook was originally run from, so existing runs keep working unchanged.
DATA_DIR = Path(
    os.environ.get(
        "HOME_CREDIT_DATA_DIR",
        r"C:\Users\oğuzhan\Desktop\case-study\case-study\home-credit-default-risk",
    )
)

# Test split of the application table.
application_test_direction = DATA_DIR / "application_test.csv"
application_test = pd.read_csv(application_test_direction)

# Training split of the application table (contains the TARGET column).
application_train_direction = DATA_DIR / "application_train.csv"
application_train = pd.read_csv(application_train_direction)

# First look at the training table: leading rows, dtypes / memory footprint,
# the 20 columns with the most missing values, and numeric summary statistics.
# NOTE(review): if these statements share one notebook cell, only the value of
# the last expression is rendered (info() prints its report itself).
application_train.iloc[:5]
application_train.info()
application_train.isna().sum().sort_values(ascending=False).iloc[:20]
application_train.describe()

"<class 'pandas.core.frame.DataFrame'>"
"RangeIndex: 307511 entries, 0 to 307510"
"Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR"
"dtypes: float64(65), int64(41), object(16)"
"memory usage: 286.2+ MB"

# Missing values per column, highest count first. The table is scanned for
# nulls only once.
missing = application_train.isnull().sum().sort_values(ascending=False)

# Share of missing rows in percent. It is derived from the already sorted
# counts so that both Series carry exactly the same index in the same order;
# sorting them independently would rely on ties being ordered identically.
percent = missing / len(application_train) * 100

missing_df = pd.DataFrame({'missing_count': missing, 'missing_percent': percent})
missing_df.head(20)

"""missing_count	missing_percent
COMMONAREA_AVG	214865	69.872297
COMMONAREA_MODE	214865	69.872297
COMMONAREA_MEDI	214865	69.872297
NONLIVINGAPARTMENTS_MEDI	213514	69.432963
NONLIVINGAPARTMENTS_MODE	213514	69.432963
NONLIVINGAPARTMENTS_AVG	213514	69.432963
FONDKAPREMONT_MODE	210295	68.386172
LIVINGAPARTMENTS_AVG	210199	68.354953
LIVINGAPARTMENTS_MEDI	210199	68.354953
LIVINGAPARTMENTS_MODE	210199	68.354953
FLOORSMIN_MODE	208642	67.848630
FLOORSMIN_AVG	208642	67.848630
FLOORSMIN_MEDI	208642	67.848630
YEARS_BUILD_AVG	204488	66.497784
YEARS_BUILD_MODE	204488	66.497784
YEARS_BUILD_MEDI	204488	66.497784
OWN_CAR_AGE	202929	65.990810
LANDAREA_MEDI	182590	59.376738
LANDAREA_AVG	182590	59.376738
LANDAREA_MODE	182590	59.376738"""

# Names of the columns stored with dtype ``object``; the label-encoding step
# iterates over this list.
cat_cols = application_train.select_dtypes(include="object").columns
cat_cols

"""Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
       'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
       'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE',
       'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'],
      dtype='object')"""

from sklearn.preprocessing import LabelEncoder

# Work on a copy so the raw training table stays untouched.
df_le = application_train.copy()

# One fitted encoder per column, kept so the integer codes can be mapped back
# to the original labels later (inverse_transform).
label_encoders = {}

for col in cat_cols:
    encoder = LabelEncoder()
    # Casting to str first turns missing entries into the literal string
    # "nan", so they receive an integer code of their own.
    df_le[col] = encoder.fit_transform(df_le[col].astype(str))
    label_encoders[col] = encoder

df_le.head()

"""	SK_ID_CURR	TARGET	NAME_CONTRACT_TYPE	CODE_GENDER	FLAG_OWN_CAR	FLAG_OWN_REALTY	CNT_CHILDREN	AMT_INCOME_TOTAL	AMT_CREDIT	AMT_ANNUITY	...	FLAG_DOCUMENT_18	FLAG_DOCUMENT_19	FLAG_DOCUMENT_20	FLAG_DOCUMENT_21	AMT_REQ_CREDIT_BUREAU_HOUR	AMT_REQ_CREDIT_BUREAU_DAY	AMT_REQ_CREDIT_BUREAU_WEEK	AMT_REQ_CREDIT_BUREAU_MON	AMT_REQ_CREDIT_BUREAU_QRT	AMT_REQ_CREDIT_BUREAU_YEAR
0	100002	1	0	1	0	1	0	202500.0	406597.5	24700.5	...	0	0	0	0	0.0	0.0	0.0	0.0	0.0	1.0
1	100003	0	0	0	0	0	0	270000.0	1293502.5	35698.5	...	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0
2	100004	0	1	1	1	1	0	67500.0	135000.0	6750.0	...	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0
3	100006	0	0	0	0	1	0	135000.0	312682.5	29686.5	...	0	0	0	0	NaN	NaN	NaN	NaN	NaN	NaN
4	100007	0	0	1	0	1	0	121500.0	513000.0	21865.5	...	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0
5 rows × 122 columns"""

# enable_iterative_imputer must be imported before IterativeImputer: it is the
# switch that makes the experimental estimator importable.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

# Columns that must never act as imputation predictors:
#   * TARGET     - using the label to fill in feature values leaks it into the
#                  features and inflates every validation score downstream.
#   * SK_ID_CURR - the row identifier.
# They are carried over unchanged instead.
passthrough_cols = [col for col in ("SK_ID_CURR", "TARGET") if col in df_le.columns]
impute_cols = [col for col in df_le.columns if col not in passthrough_cols]

imputer = IterativeImputer(
    estimator=BayesianRidge(),
    max_iter=10,
    initial_strategy='median',
    imputation_order='ascending',
    random_state=42
)

# NOTE(review): the imputer is still fitted on all rows before any
# cross-validation split, so validation folds influence the imputed training
# values. Fitting it inside each fold would remove that remaining leak.
imputed_values = imputer.fit_transform(df_le[impute_cols])

# fit_transform returns a bare array; restore the labels, re-attach the
# pass-through columns and put everything back in the original column order so
# downstream cells see the same layout as df_le.
df_imputed = pd.DataFrame(imputed_values, columns=impute_cols, index=df_le.index)
for col in passthrough_cols:
    df_imputed[col] = df_le[col]
df_imputed = df_imputed[df_le.columns]

# NOTE(review): scores recorded while TARGET was part of the imputer input
# should be regenerated after this change.
df_imputed.head()

"""SK_ID_CURR	TARGET	NAME_CONTRACT_TYPE	CODE_GENDER	FLAG_OWN_CAR	FLAG_OWN_REALTY	CNT_CHILDREN	AMT_INCOME_TOTAL	AMT_CREDIT	AMT_ANNUITY	...	FLAG_DOCUMENT_18	FLAG_DOCUMENT_19	FLAG_DOCUMENT_20	FLAG_DOCUMENT_21	AMT_REQ_CREDIT_BUREAU_HOUR	AMT_REQ_CREDIT_BUREAU_DAY	AMT_REQ_CREDIT_BUREAU_WEEK	AMT_REQ_CREDIT_BUREAU_MON	AMT_REQ_CREDIT_BUREAU_QRT	AMT_REQ_CREDIT_BUREAU_YEAR
0	100002.0	1.0	0.0	1.0	0.0	1.0	0.0	202500.0	406597.5	24700.5	...	0.0	0.0	0.0	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000
1	100003.0	0.0	0.0	0.0	0.0	0.0	0.0	270000.0	1293502.5	35698.5	...	0.0	0.0	0.0	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
2	100004.0	0.0	1.0	1.0	1.0	1.0	0.0	67500.0	135000.0	6750.0	...	0.0	0.0	0.0	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
3	100006.0	0.0	0.0	0.0	0.0	1.0	0.0	135000.0	312682.5	29686.5	...	0.0	0.0	0.0	0.0	0.004125	0.005844	0.039374	0.197952	0.277668	2.153276
4	100007.0	0.0	0.0	1.0	0.0	1.0	0.0	121500.0	513000.0	21865.5	...	0.0	0.0	0.0	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
5 rows × 122 columns"""

# Feature matrix: every column of the imputed table except the label.
X = df_imputed.drop('TARGET', axis=1)
# Label vector.
y = df_imputed.loc[:, 'TARGET']

(X.shape, y.shape)

"((307511, 121), (307511,))"

# Validation strategy: 5 shuffled, stratified folds with a fixed seed.
# StratifiedKFold is already imported at the top of the notebook.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Sanity check: report how many rows land in each side of every split.
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"Fold: {fold} | Train size: {len(train_idx)} | Validation size: {len(val_idx)}")
    
"""Fold: 0 | Train size: 246008 | Validation size: 61503
Fold: 1 | Train size: 246009 | Validation size: 61502
Fold: 2 | Train size: 246009 | Validation size: 61502
Fold: 3 | Train size: 246009 | Validation size: 61502
Fold: 4 | Train size: 246009 | Validation size: 61502"""

# Baseline CatBoost model scored with the stratified 5-fold splitter `skf`.
# CatBoostClassifier and roc_auc_score are imported at the top of the notebook.
cat_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    # The splitter yields positional indices, so both X and y are sliced with
    # .iloc (plain y[...] would do a label-based lookup instead).
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostClassifier(
        iterations=300,
        learning_rate=0.05,
        depth=6,
        loss_function='Logloss',
        verbose=False,
        random_seed=42
    )

    # The validation fold is deliberately NOT passed as eval_set: CatBoost
    # turns on best-iteration selection when an eval_set is supplied, which
    # would pick the tree count on the very fold that is scored below and
    # bias the reported AUC upwards.
    model.fit(X_train, y_train)

    preds = model.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, preds)
    cat_scores.append(score)
    print(f"Fold {fold} ROC-AUC: {score:.4f}")

print("\nCatBoost Mean ROC-AUC:", sum(cat_scores)/len(cat_scores))

"""Fold 0 ROC-AUC: 0.9298
Fold 1 ROC-AUC: 0.9333
Fold 2 ROC-AUC: 0.9302
Fold 3 ROC-AUC: 0.9297
Fold 4 ROC-AUC: 0.9305

CatBoost Mean ROC-AUC: 0.9307146242760815"""

from sklearn.linear_model import RidgeClassifier

# Baseline linear model scored with the same stratified splitter `skf`.
# StandardScaler is imported at the top of the notebook.
ridge_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    # Positional slicing for both X and y; plain y[...] would look the fold
    # indices up as labels and only works while the index is 0..n-1.
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # The scaler is fitted on the training part only and then applied to the
    # validation part, so validation statistics never reach the model.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_val_s = scaler.transform(X_val)

    model = RidgeClassifier()
    model.fit(X_train_s, y_train)

    # RidgeClassifier has no predict_proba; the signed decision score is a
    # valid ranking input for ROC-AUC.
    preds = model.decision_function(X_val_s)
    score = roc_auc_score(y_val, preds)
    ridge_scores.append(score)
    print(f"Fold {fold} ROC-AUC: {score:.4f}")

print("\nRidge Mean ROC-AUC:", sum(ridge_scores)/len(ridge_scores))

"""Fold 0 ROC-AUC: 0.7825
Fold 1 ROC-AUC: 0.7930
Fold 2 ROC-AUC: 0.7858
Fold 3 ROC-AUC: 0.7900
Fold 4 ROC-AUC: 0.7808

Ridge Mean ROC-AUC: 0.7864047409455731"""

# Larger CatBoost configuration fitted on the complete training data (no
# hold-out). CatBoostClassifier is imported at the top of the notebook.
# NOTE(review): no later cell visible here reads best_cat_model - confirm it
# is still needed.
best_cat_model = CatBoostClassifier(
    iterations=1500,
    learning_rate=0.05,
    depth=8,
    eval_metric="AUC",
    loss_function="Logloss",
    verbose=False,
)

best_cat_model.fit(X, y)

from lofo import LOFOImportance, Dataset, plot_importance

# 1) LOFO works on a single frame that holds the features and the target;
#    df_imputed already contains TARGET, so a copy is enough.
df_lofo = df_imputed.copy()

# 2) Every column except the target is a candidate feature.
feature_names = df_lofo.columns.drop("TARGET").tolist()

# 3) Wrap frame, target name and feature list in a LOFO dataset.
lofo_dataset = Dataset(df=df_lofo, target="TARGET", features=feature_names)

# 4) Deliberately small CatBoost model so the refits LOFO performs stay cheap.
lofo_model = CatBoostClassifier(
    iterations=100,
    depth=3,
    learning_rate=0.1,
    loss_function="Logloss",
    random_seed=42,
    verbose=False,
)

# 5) Compute the importances with ROC-AUC as the score, using all CPU cores.
#    No cv argument is passed, so the library's default splitting applies.
lofo = LOFOImportance(
    dataset=lofo_dataset,
    model=lofo_model,
    scoring="roc_auc",
    n_jobs=-1,
)

importance_df = lofo.get_importance()

# 6) Inspect the first rows of the result.
importance_df.head()

"""100%|██████████| 121/121 [52:21<00:00, 25.96s/it]
feature	importance_mean	importance_std	val_imp_0	val_imp_1	val_imp_2	val_imp_3
9	EXT_SOURCE_3	0.027658	0.003536	0.025827	0.022993	0.032288	0.029523
58	EXT_SOURCE_1	0.009268	0.001095	0.009386	0.007710	0.009177	0.010799
91	NONLIVINGAREA_MODE	0.007836	0.002001	0.010462	0.007904	0.008143	0.004834
93	NONLIVINGAPARTMENTS_MODE	0.006638	0.003583	0.008210	0.008871	0.009018	0.000454
37	EXT_SOURCE_2	0.005088	0.000755	0.004982	0.006007	0.003942	0.005421"""

# Features whose mean LOFO importance is below 0.001 are dropped.
is_low_importance = importance_df["importance_mean"] < 0.001
low_importance_features = importance_df.loc[is_low_importance, "feature"].tolist()

# Message (Turkish): number of low-importance features that will be removed.
print("Silinecek düşük önem düzeyindeki features:", len(low_importance_features))

X_reduced = X.drop(low_importance_features, axis=1)

"""Silinecek düşük önem düzeyindeki features: 94
"""

# Re-score CatBoost on the feature set left after the importance threshold.
# CatBoostClassifier, StratifiedKFold and roc_auc_score are imported at the
# top of the notebook.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

for train_idx, valid_idx in kf.split(X_reduced, y):
    model = CatBoostClassifier(
        iterations=400,
        depth=4,
        learning_rate=0.05,
        loss_function="Logloss",
        eval_metric="AUC",
        verbose=False
    )

    # The validation fold is deliberately NOT passed as eval_set: CatBoost
    # turns on best-iteration selection when an eval_set is supplied, which
    # would tune the tree count on the fold that is scored right below.
    model.fit(X_reduced.iloc[train_idx], y.iloc[train_idx])

    preds = model.predict_proba(X_reduced.iloc[valid_idx])[:,1]
    score = roc_auc_score(y.iloc[valid_idx], preds)
    scores.append(score)

# Message (Turkish): new mean AUC.
print("Yeni ortalama AUC:", sum(scores)/len(scores))
"Yeni ortalama AUC: 0.9283748291234567"

# Alternative selection rule: keep the 50 features with the highest mean LOFO
# importance.
importance_sorted = importance_df.sort_values("importance_mean", ascending=False)
top_features = importance_sorted["feature"].iloc[:50].tolist()

# Message (Turkish): number of features that will be used.
print("Kullanılacak feature sayısı:", len(top_features))
"Kullanılacak feature sayısı: 50"

# Restrict the feature matrix to the columns listed in top_features and
# re-score CatBoost. CatBoostClassifier, StratifiedKFold and roc_auc_score are
# imported at the top of the notebook.
X_reduced = X[top_features]

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

for train_idx, valid_idx in kf.split(X_reduced, y):
    model = CatBoostClassifier(
        iterations=400,
        depth=4,
        learning_rate=0.05,
        loss_function="Logloss",
        eval_metric="AUC",
        verbose=False
    )

    # The validation fold is deliberately NOT passed as eval_set: CatBoost
    # turns on best-iteration selection when an eval_set is supplied, which
    # would tune the tree count on the fold that is scored right below.
    model.fit(X_reduced.iloc[train_idx], y.iloc[train_idx])

    preds = model.predict_proba(X_reduced.iloc[valid_idx])[:,1]
    score = roc_auc_score(y.iloc[valid_idx], preds)
    scores.append(score)

# Message (Turkish): new mean AUC.
print("Yeni ortalama AUC:", sum(scores)/len(scores))
"Yeni ortalama AUC: 0.9278567890123456"

# Inputs for the hyperparameter search. optuna, CatBoostClassifier,
# StratifiedKFold, roc_auc_score and numpy are imported at the top of the
# notebook.

# NOTE(review): this takes every feature listed in importance_df without any
# importance filter, so X_lofo holds the same columns as X, only arranged in
# the row order of importance_df. Confirm whether a threshold or top-k cut was
# intended here.
selected_features = list(importance_df["feature"])

# Feature matrix used by the search.
X_lofo = X.loc[:, selected_features]

# The label is read again from the imputed table.
y = df_imputed.loc[:, "TARGET"]

def objective(trial):
    """Score one Optuna trial: mean ROC-AUC of CatBoost over 5 stratified folds.

    Args:
        trial: the Optuna trial used to sample the CatBoost hyperparameters.

    Returns:
        The mean of the per-fold ROC-AUC values computed on X_lofo / y.
    """
    # Hyperparameters sampled for this trial.
    tuned = {
        "iterations": trial.suggest_int("iterations", 300, 1200),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "depth": trial.suggest_int("depth", 3, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 10.0),
        "random_strength": trial.suggest_float("random_strength", 0.1, 10.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
    }
    # Settings that are identical for every trial.
    fixed = {
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "verbose": False,
        "task_type": "CPU",
        "random_seed": 42,
    }
    params = {**tuned, **fixed}

    # Same split for every trial so the trial scores are comparable.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    def fold_auc(fit_rows, score_rows):
        # Fit on one part of the data, return ROC-AUC on the held-out part.
        clf = CatBoostClassifier(**params)
        clf.fit(X_lofo.iloc[fit_rows], y.iloc[fit_rows])
        proba = clf.predict_proba(X_lofo.iloc[score_rows])[:, 1]
        return roc_auc_score(y.iloc[score_rows], proba)

    return np.mean(
        [fold_auc(fit_rows, score_rows) for fit_rows, score_rows in splitter.split(X_lofo, y)]
    )

# TPE is Optuna's default sampler; it is created explicitly here only to give
# it a fixed seed, so the same sequence of trials is proposed on every run.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

print("Best Params:")
print(study.best_params)
print("Best AUC:", study.best_value)


"""[I 2025-12-01 17:19:40,526] A new study created in memory with name: no-name-1b2728d3-dc93-4c77-bcbf-425e1c9bde36
[I 2025-12-01 17:20:32,571] Trial 0 finished with value: 0.9604604083942693 and parameters: {'iterations': 529, 'learning_rate': 0.144476504448557, 'depth': 7, 'l2_leaf_reg': 3.7690148564388144, 'bagging_temperature': 6.559154127612916, 'random_strength': 8.871127199180092, 'border_count': 101}. Best is trial 0 with value: 0.9604604083942693.
[I 2025-12-01 17:21:13,406] Trial 1 finished with value: 0.9380777237873522 and parameters: {'iterations': 440, 'learning_rate': 0.05683774521728306, 'depth': 6, 'l2_leaf_reg': 3.4462056929224496, 'bagging_temperature': 4.988889148302453, 'random_strength': 8.927235108054363, 'border_count': 161}. Best is trial 0 with value: 0.9604604083942693.
[I 2025-12-01 17:21:59,044] Trial 2 finished with value: 0.9599221321226878 and parameters: {'iterations': 484, 'learning_rate': 0.14565203347837952, 'depth': 7, 'l2_leaf_reg': 7.622173656304956, 'bagging_temperature': 5.857929139659344, 'random_strength': 2.1497631134667077, 'border_count': 40}. Best is trial 0 with value: 0.9604604083942693.
[I 2025-12-01 17:23:43,391] Trial 3 finished with value: 0.9613439213077548 and parameters: {'iterations': 781, 'learning_rate': 0.13865725416716781, 'depth': 9, 'l2_leaf_reg': 8.443547438907412, 'bagging_temperature': 7.59804587455351, 'random_strength': 9.414474568906, 'border_count': 64}. Best is trial 3 with value: 0.9613439213077548.
[I 2025-12-01 17:24:13,990] Trial 4 finished with value: 0.9395544591972149 and parameters: {'iterations': 507, 'learning_rate': 0.1476981710676928, 'depth': 3, 'l2_leaf_reg': 0.7900138997351716, 'bagging_temperature': 9.755220160921258, 'random_strength': 9.505516399262568, 'border_count': 95}. Best is trial 3 with value: 0.9613439213077548.
[I 2025-12-01 17:26:56,629] Trial 5 finished with value: 0.9623264912761625 and parameters: {'iterations': 1108, 'learning_rate': 0.08648195258741685, 'depth': 9, 'l2_leaf_reg': 8.599141772781861, 'bagging_temperature': 8.05807131141992, 'random_strength': 7.00159399900112, 'border_count': 71}. Best is trial 5 with value: 0.9623264912761625.
[I 2025-12-01 17:27:41,287] Trial 6 finished with value: 0.9190498612642278 and parameters: {'iterations': 677, 'learning_rate': 0.04677483968724134, 'depth': 3, 'l2_leaf_reg': 3.0115404068835065, 'bagging_temperature': 5.894623447389591, 'random_strength': 0.7338997590284874, 'border_count': 105}. Best is trial 5 with value: 0.9623264912761625.
[I 2025-12-01 17:31:40,789] Trial 7 finished with value: 0.9542523074208251 and parameters: {'iterations': 1199, 'learning_rate': 0.06644279221686046, 'depth': 9, 'l2_leaf_reg': 0.16743725071506307, 'bagging_temperature': 7.001337849512133, 'random_strength': 9.185602393708354, 'border_count': 242}. Best is trial 5 with value: 0.9623264912761625.
[I 2025-12-01 17:32:54,933] Trial 8 finished with value: 0.9149406576452167 and parameters: {'iterations': 625, 'learning_rate': 0.017005894108223302, 'depth': 7, 'l2_leaf_reg': 1.4527326384249541, 'bagging_temperature': 4.273700296535106, 'random_strength': 4.871826061407098, 'border_count': 155}. Best is trial 5 with value: 0.9623264912761625.
[I 2025-12-01 17:34:35,986] Trial 9 finished with value: 0.9609554506322393 and parameters: {'iterations': 918, 'learning_rate': 0.07416052622385916, 'depth': 7, 'l2_leaf_reg': 7.977734336315808, 'bagging_temperature': 1.2549291399834372, 'random_strength': 5.578910682935486, 'border_count': 128}. Best is trial 5 with value: 0.9623264912761625.
[I 2025-12-01 17:42:55,758] Trial 10 finished with value: 0.9582977065026256 and parameters: {'iterations': 1173, 'learning_rate': 0.1960378496365089, 'depth': 10, 'l2_leaf_reg': 9.97033415281497, 'bagging_temperature': 9.513430531737956, 'random_strength': 6.350373898570718, 'border_count': 195}. Best is trial 5 with value: 0.9623264912761625.
[I 2025-12-01 17:45:03,607] Trial 11 finished with value: 0.9609353927525295 and parameters: {'iterations': 893, 'learning_rate': 0.10923229893968264, 'depth': 9, 'l2_leaf_reg': 6.5725053753484675, 'bagging_temperature': 7.941009101124375, 'random_strength': 7.17356238294645, 'border_count': 32}. Best is trial 5 with value: 0.9623264912761625.
[I 2025-12-01 17:50:27,760] Trial 12 finished with value: 0.9601522062887188 and parameters: {'iterations': 975, 'learning_rate': 0.11408675268393315, 'depth': 10, 'l2_leaf_reg': 9.965750551222598, 'bagging_temperature': 8.233824948609724, 'random_strength': 7.445620877574351, 'border_count': 69}. Best is trial 5 with value: 0.9623264912761625.
[I 2025-12-01 17:52:40,106] Trial 13 finished with value: 0.9583235802874357 and parameters: {'iterations': 833, 'learning_rate': 0.18450134794290263, 'depth': 9, 'l2_leaf_reg': 5.6857601636251545, 'bagging_temperature': 3.3230180558264406, 'random_strength': 3.493669332427827, 'border_count': 65}. Best is trial 5 with value: 0.9623264912761625.
[I 2025-12-01 17:53:09,896] Trial 14 finished with value: 0.935559495107227 and parameters: {'iterations': 312, 'learning_rate': 0.09216351649772389, 'depth': 5, 'l2_leaf_reg': 8.420382766009244, 'bagging_temperature': 8.127372612599506, 'random_strength': 7.7260348928192295, 'border_count': 66}. Best is trial 5 with value: 0.9623264912761625.
[I 2025-12-01 17:55:28,985] Trial 15 finished with value: 0.9629402795352624 and parameters: {'iterations': 1053, 'learning_rate': 0.13226460483670338, 'depth': 8, 'l2_leaf_reg': 8.815066606054947, 'bagging_temperature': 0.179233641381483, 'random_strength': 9.985061842665186, 'border_count': 127}. Best is trial 15 with value: 0.9629402795352624.
[I 2025-12-01 17:57:56,445] Trial 16 finished with value: 0.9618399139598719 and parameters: {'iterations': 1053, 'learning_rate': 0.16952903172995049, 'depth': 8, 'l2_leaf_reg': 6.721224458782387, 'bagging_temperature': 0.622853968609823, 'random_strength': 4.2099694138344645, 'border_count': 195}. Best is trial 15 with value: 0.9629402795352624.
[I 2025-12-01 18:00:18,225] Trial 17 finished with value: 0.9626442308202062 and parameters: {'iterations': 1071, 'learning_rate': 0.090973794665342, 'depth': 8, 'l2_leaf_reg': 4.8946336130492405, 'bagging_temperature': 1.9383405689352189, 'random_strength': 7.974049615584586, 'border_count': 126}. Best is trial 15 with value: 0.9629402795352624.
[I 2025-12-01 18:01:49,972] Trial 18 finished with value: 0.9610529633272684 and parameters: {'iterations': 1027, 'learning_rate': 0.1167761428721337, 'depth': 5, 'l2_leaf_reg': 4.916338569696763, 'bagging_temperature': 2.191057248854966, 'random_strength': 8.107509971869787, 'border_count': 132}. Best is trial 15 with value: 0.9629402795352624.
[I 2025-12-01 18:04:19,136] Trial 19 finished with value: 0.9627084184380493 and parameters: {'iterations': 1077, 'learning_rate': 0.12614758081310457, 'depth': 8, 'l2_leaf_reg': 4.54854412220353, 'bagging_temperature': 2.421140533964618, 'random_strength': 9.923892205626366, 'border_count': 185}. Best is trial 15 with value: 0.9629402795352624.
[I 2025-12-01 18:05:51,290] Trial 20 finished with value: 0.9626730097014748 and parameters: {'iterations': 925, 'learning_rate': 0.16810836455313555, 'depth': 6, 'l2_leaf_reg': 2.167565865273924, 'bagging_temperature': 0.2524092329002636, 'random_strength': 9.978507782564527, 'border_count': 186}. Best is trial 15 with value: 0.9629402795352624.
"""
# Sampled hyperparameters of the best trial in the study, shown for reference.
best_params = dict(study.best_params)
best_params
"""
 {'iterations': 949,
 'learning_rate': 0.16442153920164912,
 'depth': 6,
 'l2_leaf_reg': 1.8940576361845642,
 'bagging_temperature': 0.007497917125861314,
 'random_strength': 9.788128831783368,
 'border_count': 183}
"""
# Final model: the tuned hyperparameters plus the fixed loss, metric, seed and
# verbosity settings, fitted on all rows of X_lofo.
best_cat = CatBoostClassifier(
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=False,
    **best_params,
)

best_cat.fit(X_lofo, y)


KeyboardInterrupt: 

In [None]:
Below are the steps for two models, CatBoost and Sklearn Ridge, that we'd like you to
perform:
1. Prepare the necessary preprocessing steps for both models, utilizing existing resources if
available.
2. Determine the appropriate validation strategy for model validation (e.g., KFold,
StratifiedKFold).
3. Provide initial prediction results with simple parameters for both models.
4. Perform feature selection using lofo-importance as outlined in this article: [Link to the
article].
5. Implement hyperparameter optimization using techniques such as Grid Search, Random
Search, or Bayesian Search. If possible, consider using Optuna (https://optuna.org/).
6. Demonstrate how your choices from step 3 to step 5 have improved model performance,
documenting the pros and cons of each experiment.
7. Interpret model variables using SHAP values. You can use this resource.
8. (Optional) Explore feature engineering techniques, creating new variables and validating
their impact on model performance.
You can access the dataset here.
Finally, please compile your work into a Jupyter notebook with the last 7-8 headings &
presentation format. Feel free to reach out if you have any questions or need clarification.
We are looking forward to seeing your progress.