In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import (
    train_test_split,
    KFold,
    StratifiedKFold,
    cross_val_score
)
from sklearn.metrics import roc_auc_score
from lofo import LOFOImportance, Dataset as LOFO_Dataset, plot_importance
import optuna
import shap

application_test_direction=r"C:\Users\oğuzhan\Desktop\case-study\case-study\home-credit-default-risk\application_test.csv"
application_test = pd.read_csv(application_test_direction,)

application_train_direction=r"C:\Users\oğuzhan\Desktop\case-study\case-study\home-credit-default-risk\application_train.csv"
application_train = pd.read_csv(application_train_direction,)

application_train.head()
application_train.info()
application_train.isnull().sum().sort_values(ascending=False).head(20)
application_train.describe()

"<class 'pandas.core.frame.DataFrame'>"
"RangeIndex: 307511 entries, 0 to 307510"
"Columns: 122 entries, SK_ID_CURR to AMT_REQ_CREDIT_BUREAU_YEAR"
"dtypes: float64(65), int64(41), object(16)"
"memory usage: 286.2+ MB"

missing = application_train.isnull().sum().sort_values(ascending=False)
percent = (application_train.isnull().sum() / len(application_train) * 100).sort_values(ascending=False)

missing_df = pd.DataFrame({'missing_count': missing, 'missing_percent': percent})
missing_df.head(20)

"""missing_count	missing_percent
COMMONAREA_AVG	214865	69.872297
COMMONAREA_MODE	214865	69.872297
COMMONAREA_MEDI	214865	69.872297
NONLIVINGAPARTMENTS_MEDI	213514	69.432963
NONLIVINGAPARTMENTS_MODE	213514	69.432963
NONLIVINGAPARTMENTS_AVG	213514	69.432963
FONDKAPREMONT_MODE	210295	68.386172
LIVINGAPARTMENTS_AVG	210199	68.354953
LIVINGAPARTMENTS_MEDI	210199	68.354953
LIVINGAPARTMENTS_MODE	210199	68.354953
FLOORSMIN_MODE	208642	67.848630
FLOORSMIN_AVG	208642	67.848630
FLOORSMIN_MEDI	208642	67.848630
YEARS_BUILD_AVG	204488	66.497784
YEARS_BUILD_MODE	204488	66.497784
YEARS_BUILD_MEDI	204488	66.497784
OWN_CAR_AGE	202929	65.990810
LANDAREA_MEDI	182590	59.376738
LANDAREA_AVG	182590	59.376738
LANDAREA_MODE	182590	59.376738"""

cat_cols = application_train.select_dtypes(include=['object']).columns
cat_cols

"""Index(['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
       'NAME_TYPE_SUITE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
       'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'OCCUPATION_TYPE',
       'WEEKDAY_APPR_PROCESS_START', 'ORGANIZATION_TYPE', 'FONDKAPREMONT_MODE',
       'HOUSETYPE_MODE', 'WALLSMATERIAL_MODE', 'EMERGENCYSTATE_MODE'],
      dtype='object')"""

from sklearn.preprocessing import LabelEncoder

df_le = application_train.copy()

label_encoders = {}  # Sonradan inverse transform için gerekli

for col in cat_cols:
    le = LabelEncoder()
    df_le[col] = df_le[col].astype(str)  # NaN ve kategoriler için güvenli
    df_le[col] = le.fit_transform(df_le[col])
    label_encoders[col] = le

df_le.head()

"""	SK_ID_CURR	TARGET	NAME_CONTRACT_TYPE	CODE_GENDER	FLAG_OWN_CAR	FLAG_OWN_REALTY	CNT_CHILDREN	AMT_INCOME_TOTAL	AMT_CREDIT	AMT_ANNUITY	...	FLAG_DOCUMENT_18	FLAG_DOCUMENT_19	FLAG_DOCUMENT_20	FLAG_DOCUMENT_21	AMT_REQ_CREDIT_BUREAU_HOUR	AMT_REQ_CREDIT_BUREAU_DAY	AMT_REQ_CREDIT_BUREAU_WEEK	AMT_REQ_CREDIT_BUREAU_MON	AMT_REQ_CREDIT_BUREAU_QRT	AMT_REQ_CREDIT_BUREAU_YEAR
0	100002	1	0	1	0	1	0	202500.0	406597.5	24700.5	...	0	0	0	0	0.0	0.0	0.0	0.0	0.0	1.0
1	100003	0	0	0	0	0	0	270000.0	1293502.5	35698.5	...	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0
2	100004	0	1	1	1	1	0	67500.0	135000.0	6750.0	...	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0
3	100006	0	0	0	0	1	0	135000.0	312682.5	29686.5	...	0	0	0	0	NaN	NaN	NaN	NaN	NaN	NaN
4	100007	0	0	1	0	1	0	121500.0	513000.0	21865.5	...	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0
5 rows × 122 columns"""

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

imputer = IterativeImputer(
    estimator=BayesianRidge(),
    max_iter=10,
    initial_strategy='median',
    imputation_order='ascending',
    random_state=42
)

df_imputed = imputer.fit_transform(df_le)

df_imputed = pd.DataFrame(df_imputed, columns=df_le.columns)

df_imputed.head()

"""SK_ID_CURR	TARGET	NAME_CONTRACT_TYPE	CODE_GENDER	FLAG_OWN_CAR	FLAG_OWN_REALTY	CNT_CHILDREN	AMT_INCOME_TOTAL	AMT_CREDIT	AMT_ANNUITY	...	FLAG_DOCUMENT_18	FLAG_DOCUMENT_19	FLAG_DOCUMENT_20	FLAG_DOCUMENT_21	AMT_REQ_CREDIT_BUREAU_HOUR	AMT_REQ_CREDIT_BUREAU_DAY	AMT_REQ_CREDIT_BUREAU_WEEK	AMT_REQ_CREDIT_BUREAU_MON	AMT_REQ_CREDIT_BUREAU_QRT	AMT_REQ_CREDIT_BUREAU_YEAR
0	100002.0	1.0	0.0	1.0	0.0	1.0	0.0	202500.0	406597.5	24700.5	...	0.0	0.0	0.0	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000
1	100003.0	0.0	0.0	0.0	0.0	0.0	0.0	270000.0	1293502.5	35698.5	...	0.0	0.0	0.0	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
2	100004.0	0.0	1.0	1.0	1.0	1.0	0.0	67500.0	135000.0	6750.0	...	0.0	0.0	0.0	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
3	100006.0	0.0	0.0	0.0	0.0	1.0	0.0	135000.0	312682.5	29686.5	...	0.0	0.0	0.0	0.0	0.004125	0.005844	0.039374	0.197952	0.277668	2.153276
4	100007.0	0.0	0.0	1.0	0.0	1.0	0.0	121500.0	513000.0	21865.5	...	0.0	0.0	0.0	0.0	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
5 rows × 122 columns"""

X = df_imputed.drop(columns=['TARGET'])
y = df_imputed['TARGET']

X.shape, y.shape

"((307511, 121), (307511,))"

from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    print("Fold:", fold, 
          "| Train size:", len(train_idx), 
          "| Validation size:", len(val_idx))
    
"""Fold: 0 | Train size: 246008 | Validation size: 61503
Fold: 1 | Train size: 246009 | Validation size: 61502
Fold: 2 | Train size: 246009 | Validation size: 61502
Fold: 3 | Train size: 246009 | Validation size: 61502
Fold: 4 | Train size: 246009 | Validation size: 61502"""

from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

cat_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    model = CatBoostClassifier(
        iterations=300,
        learning_rate=0.05,
        depth=6,
        loss_function='Logloss',
        verbose=False,
        random_seed=42
    )

    model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=False)

    preds = model.predict_proba(X_val)[:, 1]
    score = roc_auc_score(y_val, preds)
    cat_scores.append(score)
    print(f"Fold {fold} ROC-AUC: {score:.4f}")

print("\nCatBoost Mean ROC-AUC:", sum(cat_scores)/len(cat_scores))

"""Fold 0 ROC-AUC: 0.9298
Fold 1 ROC-AUC: 0.9333
Fold 2 ROC-AUC: 0.9302
Fold 3 ROC-AUC: 0.9297
Fold 4 ROC-AUC: 0.9305

CatBoost Mean ROC-AUC: 0.9307146242760815"""

from sklearn.linear_model import RidgeClassifier
from sklearn.preprocessing import StandardScaler

ridge_scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_val_s = scaler.transform(X_val)

    model = RidgeClassifier()
    model.fit(X_train_s, y_train)

    preds = model.decision_function(X_val_s)
    score = roc_auc_score(y_val, preds)
    ridge_scores.append(score)
    print(f"Fold {fold} ROC-AUC: {score:.4f}")

print("\nRidge Mean ROC-AUC:", sum(ridge_scores)/len(ridge_scores))

"""Fold 0 ROC-AUC: 0.7825
Fold 1 ROC-AUC: 0.7930
Fold 2 ROC-AUC: 0.7858
Fold 3 ROC-AUC: 0.7900
Fold 4 ROC-AUC: 0.7808

Ridge Mean ROC-AUC: 0.7864047409455731"""

from catboost import CatBoostClassifier

best_cat_model = CatBoostClassifier(
    depth=8,
    learning_rate=0.05,
    iterations=1500,
    loss_function="Logloss",
    eval_metric="AUC",
    verbose=False
)

best_cat_model.fit(X, y)

df_lofo = df_imputed.copy()  # TARGET dahil

feature_names = df_lofo.columns.tolist()
feature_names.remove("TARGET")

lofo_dataset = Dataset(
    df_lofo,           # tek dataframe
    "TARGET",          # target'ın kolon adı (string)
    feature_names      # feature listesi
)

lofo_model = CatBoostClassifier(
    depth=4,
    iterations=500,
    learning_rate=0.05,
    loss_function="Logloss",
    eval_metric="AUC",
    verbose=False,
    random_seed=42
)
importance_df = lofo.get_importance()

plot_importance(importance_df)


KeyboardInterrupt: 

In [None]:
Below are the steps for two models, CatBoost and Sklearn Ridge, that we&#39;d like you to
perform:
1. Prepare the necessary preprocessing steps for both models, utilizing existing resources if
available.
2. Determine the appropriate validation strategy for model validation (e.g., KFold,
StratifiedKFold).
3. Provide initial prediction results with simple parameters for both models.
4. Perform feature selection using lofo-importance as outlined in this article: [Link to the
article].
5. Implement hyperparameter optimization using techniques such as Grid Search, Random
Search, or Bayesian Search. If possible, consider using Optuna (https://optuna.org/).
6. Demonstrate how your choices from step 3 to step 5 have improved model performance,
documenting the pros and cons of each experiment.
7. Interpret model variables using SHAP values. You can use this resource.
8. (Optional) Explore feature engineering techniques, creating new variables and validating
their impact on model performance.
You can access the dataset here.
Finally, please compile your work into a Jupyter notebook with the last 7-8 headings &amp;
presentation format. Feel free to reach out if you have any questions or need clarification.
We are looking forward to seeing your progress.