# In [1]:  (Jupyter notebook cell prompt kept from the export)
# ----------------------------
# 0️⃣ Kütüphaneler
# ----------------------------
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge, RidgeClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier
from lofo import LOFOImportance, Dataset as LOFO_Dataset

import optuna
import shap

# ----------------------------
# 1️⃣ Data loading
# ----------------------------
# Absolute locations of the application train/test CSV files.
application_train_direction = r"C:\Users\oğuzhan\Desktop\case-study\case-study\home-credit-default-risk\application_train.csv"
application_test_direction = r"C:\Users\oğuzhan\Desktop\case-study\case-study\home-credit-default-risk\application_test.csv"

# Read both files, train first and then test, with pandas' default parsing options.
application_train, application_test = (
    pd.read_csv(csv_path)
    for csv_path in (application_train_direction, application_test_direction)
)

# ----------------------------
# 2️⃣ Encoding the categorical variables
# ----------------------------
# Work on a copy so the raw training frame stays untouched.
df_le = application_train.copy()
cat_cols = application_train.select_dtypes(include=['object']).columns
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    # Casting to str first turns missing values into the literal level "nan",
    # so they receive an integer code like every other category.
    df_le[col] = le.fit_transform(df_le[col].astype(str))
    # Keep the fitted encoder for this column.
    label_encoders[col] = le

# ----------------------------
# 3️⃣ Missing-value imputation
# ----------------------------
# IterativeImputer models each column as a function of all the other columns
# it is given. TARGET is therefore kept out of the imputation: otherwise the
# filled-in feature values would be derived from the label (target leakage),
# and the fitted imputer could not be applied to data that has no TARGET.
feature_cols = df_le.columns.drop("TARGET")
imputer = IterativeImputer(estimator=BayesianRidge(), max_iter=10, initial_strategy='median', random_state=42)
df_imputed = pd.DataFrame(
    imputer.fit_transform(df_le[feature_cols]),
    columns=feature_cols,
    index=df_le.index,
)
# Re-attach the original, un-imputed label so later steps that expect a TARGET
# column in df_imputed keep working.
# NOTE(review): assumes TARGET itself has no missing values — confirm.
df_imputed["TARGET"] = df_le["TARGET"]

X = df_imputed.drop(columns=['TARGET'])
y = df_imputed['TARGET']

# ----------------------------
# 4️⃣ Feature selection with LOFO
# ----------------------------
df_lofo = df_imputed.copy()

# Every column except the label is a candidate feature.
feature_names = df_lofo.columns.tolist()
feature_names.pop(feature_names.index("TARGET"))

# Small, fast CatBoost model used only to score feature importance.
lofo_model = CatBoostClassifier(
    depth=3,
    iterations=100,
    learning_rate=0.1,
    loss_function="Logloss",
    verbose=False,
    random_seed=42,
)
lofo_dataset = LOFO_Dataset(df=df_lofo, target="TARGET", features=feature_names)

lofo = LOFOImportance(
    dataset=lofo_dataset,
    model=lofo_model,
    scoring="roc_auc",
    n_jobs=-1,
)
importance_df = lofo.get_importance()

# Drop the features whose mean importance is below 0.001.
low_importance_mask = importance_df["importance_mean"] < 0.001
low_importance_features = importance_df.loc[low_importance_mask, "feature"].tolist()
X_lofo = X.drop(columns=low_importance_features)

# ----------------------------
# 5️⃣ CatBoost hyperparameter tuning
# ----------------------------
def objective_cat(trial):
    """Optuna objective: mean 5-fold ROC AUC of a CatBoost classifier.

    Draws one hyperparameter configuration from ``trial``, trains a fresh
    ``CatBoostClassifier`` on each stratified training fold of the
    module-level ``X_lofo`` / ``y``, and scores the positive-class
    probabilities on the matching held-out fold.

    Args:
        trial: Optuna trial used to sample the hyperparameter values.

    Returns:
        Mean of the per-fold ROC AUC scores.
    """
    # Values explored by the search.
    sampled = {
        "iterations": trial.suggest_int("iterations", 300, 1200),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2),
        "depth": trial.suggest_int("depth", 3, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 10.0),
        "random_strength": trial.suggest_float("random_strength", 0.1, 10.0),
        "border_count": trial.suggest_int("border_count", 32, 255),
    }
    # Settings held constant across all trials.
    fixed = {
        "loss_function": "Logloss",
        "eval_metric": "AUC",
        "verbose": False,
        "random_seed": 42,
    }
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    def fold_auc(fit_rows, eval_rows):
        # Train on one fold's training rows and score its validation rows.
        clf = CatBoostClassifier(**sampled, **fixed)
        clf.fit(X_lofo.iloc[fit_rows], y.iloc[fit_rows])
        positive_proba = clf.predict_proba(X_lofo.iloc[eval_rows])[:, 1]
        return roc_auc_score(y.iloc[eval_rows], positive_proba)

    return np.mean(
        [fold_auc(fit_rows, eval_rows) for fit_rows, eval_rows in splitter.split(X_lofo, y)]
    )

# Seed the sampler so the hyperparameter search is repeatable, in line with
# the fixed seed (42) already used for the model itself.
study_cat = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_cat.optimize(objective_cat, n_trials=20)

# best_params holds only the sampled values, so the fixed settings are passed
# again when building the final model, which is then fitted on all rows.
best_cat_params = study_cat.best_params
best_cat = CatBoostClassifier(**best_cat_params, loss_function="Logloss", eval_metric="AUC", verbose=False, random_seed=42)
best_cat.fit(X_lofo, y)

# ----------------------------
# 6️⃣ Ridge hyperparameter tuning
# ----------------------------
def objective_ridge(trial):
    """Optuna objective: mean 5-fold ROC AUC of a RidgeClassifier.

    Samples the regularisation strength ``alpha`` from ``trial`` and
    cross-validates a ``RidgeClassifier`` on the module-level ``X_lofo`` /
    ``y``. Features are standardised inside each fold, with the scaler fitted
    on the training rows only, so no statistics of the validation rows reach
    the model being scored.

    Args:
        trial: Optuna trial used to sample ``alpha``.

    Returns:
        Mean of the per-fold ROC AUC scores, computed from the classifier's
        decision-function values.
    """
    alpha = trial.suggest_float("alpha", 0.001, 10.0)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    auc_scores = []
    for train_idx, valid_idx in cv.split(X_lofo, y):
        y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
        # Fit the scaler on the training fold only; the validation fold is
        # merely transformed with those training-fold statistics.
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_lofo.iloc[train_idx])
        X_valid = scaler.transform(X_lofo.iloc[valid_idx])
        model = RidgeClassifier(alpha=alpha)
        model.fit(X_train, y_train)
        # RidgeClassifier has no predict_proba; the signed decision scores
        # rank the samples, which is all ROC AUC needs.
        preds = model.decision_function(X_valid)
        auc_scores.append(roc_auc_score(y_valid, preds))
    return np.mean(auc_scores)

# Seed the sampler so the alpha search is repeatable, in line with the fixed
# seed (42) used for the cross-validation splits.
study_ridge = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_ridge.optimize(objective_ridge, n_trials=20)

# Final model: standardise all rows and refit with the best alpha found.
best_ridge_alpha = study_ridge.best_params["alpha"]
ridge_model = RidgeClassifier(alpha=best_ridge_alpha)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_lofo)
ridge_model.fit(X_scaled, y)

# ----------------------------
# 7️⃣ SHAP visualization
# ----------------------------
# CatBoost: tree explainer evaluated on the selected features.
explainer_cat = shap.TreeExplainer(best_cat)
shap_values_cat = explainer_cat.shap_values(X_lofo)
shap.summary_plot(
    shap_values_cat,
    features=X_lofo,
    max_display=20,
    plot_type="bar",
)

# Ridge: linear explainer evaluated on the standardised matrix; X_lofo is
# passed to the plot so the columns keep their names.
explainer_ridge = shap.LinearExplainer(
    ridge_model,
    X_scaled,
    feature_perturbation="interventional",
)
shap_values_ridge = explainer_ridge.shap_values(X_scaled)
shap.summary_plot(
    shap_values_ridge,
    features=X_lofo,
    max_display=20,
    plot_type="bar",
)


# Notebook output: KeyboardInterrupt (the cell run was interrupted).