In [19]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from config import normalized_features, encoded_features, to_remove_features
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, roc_auc_score
import mlflow
import dagshub
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
import json
import optuna
import warnings

warnings.filterwarnings("ignore", category=UserWarning)

In [20]:
# sql engine
engine = create_engine("postgresql://user:password@localhost:5432/home_credit_db")

In [21]:
# dagshub init
dagshub.init(
    repo_owner="maulanasyaa", repo_name="Home-Credit_Default_Risk", mlflow=True
)

# mlflow.autolog()

In [22]:
query_app = """
    select * from application_train_clean
"""

df = pd.read_sql(query_app, engine)


## feature engineering

In [23]:
# feature engineering
df = df.drop(columns=to_remove_features)

print(df.shape)
df.head()

(307507, 53)


Unnamed: 0,sk_id_curr,target,name_contract_type,code_gender,flag_own_car,flag_own_realty,amt_income_total,amt_credit,amt_annuity,amt_goods_price,...,livingarea_mode,floorsmax_medi,fondkapremont_mode,housetype_mode,totalarea_mode,wallsmaterial_mode,emergencystate_mode,obs_30_cnt_social_circle,days_last_phone_change,amt_req_credit_bureau_year
0,100176,0,Cash loans,F,N,Y,135000.0,1078200.0,31653.0,900000.0,...,0.0729,0.3333,reg oper account,block of flats,0.0685,Panel,No,1.0,-131.0,1.0
1,100177,0,Cash loans,M,N,N,112500.0,312682.5,33309.0,297000.0,...,,,,,,,,0.0,-328.0,0.0
2,100178,0,Cash loans,F,N,Y,119250.0,679500.0,28917.0,679500.0,...,,,,,,,,0.0,-2085.0,0.0
3,100179,0,Cash loans,F,Y,N,202500.0,675000.0,53329.5,675000.0,...,0.7488,0.9167,reg oper account,block of flats,0.7334,Monolithic,No,0.0,-1792.0,4.0
4,100180,0,Cash loans,F,N,Y,315000.0,1288350.0,37800.0,1125000.0,...,0.0874,0.1667,reg oper account,block of flats,0.0763,Panel,No,1.0,-1020.0,3.0


In [24]:
# create feature
# external source mean
df["ext_source_avg"] = np.mean(
    df[["ext_source_1", "ext_source_2", "ext_source_3"]], axis=1
)

# debt to income ratio
df["debt_to_income_ratio"] = df["amt_credit"] / df["amt_income_total"]

# payment to income ratio
df["payment_to_income_ratio"] = df["amt_annuity"] / df["amt_income_total"]

# credit to goods ratio
df["credit_to_goods_ratio"] = df["amt_credit"] / df["amt_goods_price"]

# days employed percentage
df["days_employed_percentage"] = df["days_employed"] / df["days_birth"]

# income per person
df["income_per_person"] = df["amt_income_total"] / df["cnt_fam_members"]

df.head(3)

Unnamed: 0,sk_id_curr,target,name_contract_type,code_gender,flag_own_car,flag_own_realty,amt_income_total,amt_credit,amt_annuity,amt_goods_price,...,emergencystate_mode,obs_30_cnt_social_circle,days_last_phone_change,amt_req_credit_bureau_year,ext_source_avg,debt_to_income_ratio,payment_to_income_ratio,credit_to_goods_ratio,days_employed_percentage,income_per_person
0,100176,0,Cash loans,F,N,Y,135000.0,1078200.0,31653.0,900000.0,...,No,1.0,-131.0,1.0,0.700692,7.986667,0.234467,1.198,0.24426,67500.0
1,100177,0,Cash loans,M,N,N,112500.0,312682.5,33309.0,297000.0,...,,0.0,-328.0,0.0,0.506156,2.7794,0.29608,1.052803,0.023339,37500.0
2,100178,0,Cash loans,F,N,Y,119250.0,679500.0,28917.0,679500.0,...,,0.0,-2085.0,0.0,0.635199,5.698113,0.242491,1.0,0.076999,119250.0


# ------------------------------------------

In [25]:
mlflow.set_experiment("home_credit_modelling")

<Experiment: artifact_location='mlflow-artifacts:/53e8be0b12414e6896ea4ac1452766e3', creation_time=1767969132883, experiment_id='0', last_update_time=1767969132883, lifecycle_stage='active', name='home_credit_modelling', tags={}>

## preprocessing pipeline

In [26]:
X = df.drop(columns=["sk_id_curr", "target"])
y = df["target"]

# select feature to scaled
num_features = X.select_dtypes(include=np.number).columns
cat_features = X.select_dtypes(include=np.object_).columns


col_to_drop = [
    col for col in (normalized_features + encoded_features) if col in num_features
]

col_to_scaled = df[num_features].drop(columns=col_to_drop).columns

In [27]:
# preprocessing pipeline

numeric_transformer = Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(strategy="median"),
        ),
        ("scaler", StandardScaler()),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(strategy="most_frequent"),
        ),
        (
            "encoder",
            OneHotEncoder(handle_unknown="ignore"),
        ),
    ]
)

preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, col_to_scaled),
        ("cat", categorical_transformer, cat_features),
    ]
)

## data split

In [28]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Logistic Regression

In [29]:
# with mlflow.start_run(run_name="LogisticRegression"):
#     LR_pipeline = Pipeline(
#         steps=[
#             ("preprocessing", preprocessing_pipeline),
#             (
#                 "logistic_regression",
#                 LogisticRegression(
#                     solver="lbfgs",
#                     penalty="l2",
#                     C=1.0,
#                     max_iter=1000,
#                     random_state=42,
#                     class_weight="balanced",
#                 ),
#             ),
#         ]
#     )

#     # cross validation
#     cv_scores = cross_val_score(
#         LR_pipeline, X_train, y_train, cv=cv, scoring="accuracy"
#     )

#     mlflow.log_metric("avg_cv_accuracy", np.mean(cv_scores))

#     LR_pipeline.fit(X_train, y_train)

#     # test validation
#     y_pred = LR_pipeline.predict(X_test)
#     acc_score = accuracy_score(y_test, y_pred)

#     y_pred_proba = LR_pipeline.predict_proba(X_test)[:, 1]
#     auc = roc_auc_score(y_test, y_pred_proba)

#     mlflow.log_metric("test_accuracy", acc_score)
#     mlflow.log_metric("test_auc", auc)

#     # feature artifact
#     feature_names = X_train.columns.tolist()

#     with open("feature_names.json", "w") as f:
#         json.dump(feature_names, f)

#     mlflow.log_artifact("feature_names.json")


## XGBoost

In [30]:
# with mlflow.start_run(run_name="XGBoost"):
#     XGB_pipeline = Pipeline(
#         steps=[
#             ("preprocessing", preprocessing_pipeline),
#             (
#                 "xgboost",
#                 XGBClassifier(
#                     max_depth=5,
#                     n_estimators=200,
#                     learning_rate=0.1,
#                     scale_pos_weight=11,
#                     eval_metric="auc",
#                     objective="binary:logistic",
#                     tree_method="hist",
#                     random_state=42,
#                 ),
#             ),
#         ]
#     )

#     # cross validation
#     cv_scores = cross_val_score(
#         XGB_pipeline, X_train, y_train, cv=cv, scoring="accuracy"
#     )

#     mlflow.log_metric("avg_cv_accuracy", np.mean(cv_scores))

#     XGB_pipeline.fit(X_train, y_train)

#     # test validation
#     y_pred = XGB_pipeline.predict(X_test)
#     acc_score = accuracy_score(y_test, y_pred)

#     y_pred_proba = XGB_pipeline.predict_proba(X_test)[:, 1]
#     auc = roc_auc_score(y_test, y_pred_proba)

#     mlflow.log_metric("test_accuracy", acc_score)
#     mlflow.log_metric("test_auc", auc)

#     # feature artifact
#     feature_names = X_train.columns.tolist()

#     with open("feature_names.json", "w") as f:
#         json.dump(feature_names, f)

#     mlflow.log_artifact("feature_names.json")

## LightGBM

In [31]:
# with mlflow.start_run(run_name="LightGBM_remove_feat_5"):
#     LGBM_pipeline = Pipeline(
#         steps=[
#             ("preprocessing", preprocessing_pipeline),
#             (
#                 "lightgbm",
#                 LGBMClassifier(
#                     n_estimators=200,
#                     learning_rate=0.1,
#                     num_leaves=31,
#                     is_unbalance=True,
#                     objective="binary",
#                     metric="auc",
#                     random_state=42,
#                     n_jobs=-1,
#                     verbose=-1,
#                 ),
#             ),
#         ]
#     )

#     # cross validation
#     cv_scores = cross_val_score(
#         LGBM_pipeline, X_train, y_train, cv=cv, scoring="accuracy"
#     )

#     mlflow.log_metric("avg_cv_accuracy", np.mean(cv_scores))

#     LGBM_pipeline.fit(X_train, y_train)

#     # test validation
#     y_pred = LGBM_pipeline.predict(X_test)
#     acc_score = accuracy_score(y_test, y_pred)

#     y_pred_proba = LGBM_pipeline.predict_proba(X_test)[:, 1]
#     auc = roc_auc_score(y_test, y_pred_proba)

#     mlflow.log_metric("test_accuracy", acc_score)
#     mlflow.log_metric("test_auc", auc)

#     # feature artifact
#     feature_names = X_train.columns.tolist()

#     with open("feature_names.json", "w") as f:
#         json.dump(feature_names, f)

#     # removed features
#     removed_feat = to_remove_features

#     with open("removed_feat.json", "w") as f:
#         json.dump(removed_feat, f)

#     mlflow.log_artifact("feature_names.json")
#     mlflow.log_artifact("removed_feat.json")

In [None]:
# # lgbm after add 6 feature from feat engineering
# with mlflow.start_run(run_name="LightGBM_add_6_feature_engineering"):
#     mlflow.autolog()
#     LGBM_pipeline = Pipeline(
#         steps=[
#             ("preprocessing", preprocessing_pipeline),
#             (
#                 "lightgbm",
#                 LGBMClassifier(
#                     n_estimators=200,
#                     learning_rate=0.1,
#                     num_leaves=31,
#                     is_unbalance=True,
#                     objective="binary",
#                     metric="auc",
#                     random_state=42,
#                     n_jobs=-1,
#                     verbose=-1,
#                 ),
#             ),
#         ]
#     )

#     # cross validation
#     cv_scores = cross_val_score(
#         LGBM_pipeline, X_train, y_train, cv=cv, scoring="accuracy"
#     )

#     mlflow.log_metric("avg_cv_accuracy", np.mean(cv_scores))

#     LGBM_pipeline.fit(X_train, y_train)

#     # test validation
#     y_pred = LGBM_pipeline.predict(X_test)
#     acc_score = accuracy_score(y_test, y_pred)

#     y_pred_proba = LGBM_pipeline.predict_proba(X_test)[:, 1]
#     auc = roc_auc_score(y_test, y_pred_proba)

#     mlflow.log_metric("test_accuracy", acc_score)
#     mlflow.log_metric("test_auc", auc)

#     # feature artifact
#     feature_names = X_train.columns.tolist()

#     with open("feature_names.json", "w") as f:
#         json.dump(feature_names, f)

#     mlflow.log_artifact("feature_names.json")
#     mlflow.autolog(disable=True)

2026/02/10 20:27:26 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2026/02/10 20:27:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2026/02/10 20:27:27 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


üèÉ View run LightGBM_add_6_feature_engineering at: https://dagshub.com/maulanasyaa/Home-Credit_Default_Risk.mlflow/#/experiments/0/runs/25d102123bf848b4ba84b26b48a81ca9
üß™ View experiment at: https://dagshub.com/maulanasyaa/Home-Credit_Default_Risk.mlflow/#/experiments/0


In [None]:
# # feature importance after feature engineering
# model_imp = LGBM_pipeline.named_steps["lightgbm"]
# feature_names_imp = LGBM_pipeline.named_steps["preprocessing"].get_feature_names_out()

# importance_df = pd.DataFrame(
#     {"feature": feature_names_imp, "importance": model_imp.feature_importances_}
# )

# importance_df = importance_df.sort_values("importance", ascending=False)
# importance_df

# # all created features is good

Unnamed: 0,feature,importance
14,num__ext_source_avg,471
2,num__amt_annuity,393
4,num__days_birth,376
7,num__days_id_publish,375
12,num__days_last_phone_change,337
...,...,...
124,cat__organization_type_Religion,0
38,cat__name_income_type_Maternity leave,0
53,cat__name_family_status_Unknown,0
30,cat__name_type_suite_Group of people,0


## hyperparameter tuning

In [None]:
def objective(trial):
    # param
    param_learning_rate = trial.suggest_float("learning_rate", 0.001, 0.3)
    param_num_leaves = trial.suggest_int("num_leaves", 20, 300)
    param_max_depth = trial.suggest_int("max_depth", 3, 12)
    param_reg_alpha = trial.suggest_float("reg_alpha", 0.0, 1.0)
    param_is_unbalance = trial.suggest_categorical("is_unbalance", [True, False])

    # model
    model = LGBMClassifier(
        n_estimators=100,
        random_state=42,
        learning_rate=param_learning_rate,
        num_leaves=param_num_leaves,
        max_depth=param_max_depth,
        reg_alpha=param_reg_alpha,
        is_unbalance=param_is_unbalance,
        verbosity=-1,
    )

    trial_pipeline = Pipeline(
        steps=[("preprocessing", preprocessing_pipeline), ("lgbm_model", model)]
    )

    cv_score = cross_val_score(
        trial_pipeline, X_train, y_train, cv=cv, scoring="roc_auc"
    )

    avg_cv_score = np.mean(cv_score)

    return avg_cv_score


# opt study
study = optuna.create_study(direction="maximize")
# study.optimize(objective, n_trials=100) -> turn off

# best val
print(study.best_value)
print(study.best_params)

[32m[I 2026-02-04 21:14:08,618][0m A new study created in memory with name: no-name-969b0636-8401-4086-92ab-8412de89293d[0m
[32m[I 2026-02-04 21:14:40,044][0m Trial 0 finished with value: 0.6754108368198544 and parameters: {'learning_rate': 0.11856353816472162, 'num_leaves': 204, 'max_depth': 12, 'reg_alpha': 0.38525553696534065, 'is_unbalance': False}. Best is trial 0 with value: 0.6754108368198544.[0m
[32m[I 2026-02-04 21:14:53,170][0m Trial 1 finished with value: 0.6841358804828721 and parameters: {'learning_rate': 0.12909898186900495, 'num_leaves': 112, 'max_depth': 3, 'reg_alpha': 0.19308199989179808, 'is_unbalance': False}. Best is trial 1 with value: 0.6841358804828721.[0m
[32m[I 2026-02-04 21:15:08,738][0m Trial 2 finished with value: 0.684554822214082 and parameters: {'learning_rate': 0.04597203190064387, 'num_leaves': 68, 'max_depth': 5, 'reg_alpha': 0.4547386680989389, 'is_unbalance': True}. Best is trial 2 with value: 0.684554822214082.[0m
[32m[I 2026-02-04 21:

0.6881935752669801
{'learning_rate': 0.1946416270095889, 'num_leaves': 149, 'max_depth': 4, 'reg_alpha': 0.2696708391664497, 'is_unbalance': False}


In [None]:
# best param
best_params = study.best_params
print(best_params)

{'learning_rate': 0.1946416270095889, 'num_leaves': 149, 'max_depth': 4, 'reg_alpha': 0.2696708391664497, 'is_unbalance': False}


In [None]:
# # final model
# with mlflow.start_run(run_name="LGBM_optuna_hyperparam_tuning_100"):
#     mlflow.autolog()

#     final_model = LGBMClassifier(**best_params, n_estimators=100, random_state=42)

#     final_pipeline = Pipeline(
#         steps=[("preprocessing", preprocessing_pipeline), ("model", final_model)]
#     )

#     final_pipeline.fit(X_train, y_train)

#     # cross validation
#     cv_scores = cross_val_score(
#         final_pipeline, X_train, y_train, cv=cv, scoring="accuracy"
#     )

#     mlflow.log_metric("avg_cv_accuracy", np.mean(cv_scores))

#     # test validation
#     y_pred = final_pipeline.predict(X_test)
#     acc_score = accuracy_score(y_test, y_pred)

#     y_pred_proba = final_pipeline.predict_proba(X_test)[:, 1]
#     auc = roc_auc_score(y_test, y_pred_proba)
#     print(f"roc_auc score on test data: {auc}")

#     mlflow.log_metric("test_accuracy", acc_score)
#     mlflow.log_metric("test_auc", auc)

#     # feature artifact
#     feature_names = X_train.columns.tolist()

#     with open("feature_names.json", "w") as f:
#         json.dump(feature_names, f)

#     mlflow.log_artifact("feature_names.json")
#     mlflow.autolog(disable=True)

2026/02/04 21:43:48 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2026/02/04 21:43:49 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2026/02/04 21:43:49 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


roc_auc score on test data: 0.6886154906925607
üèÉ View run LGBM_optuna_hyperparam_tuning_100 at: https://dagshub.com/maulanasyaa/Home-Credit_Default_Risk.mlflow/#/experiments/0/runs/7c317d9c65dc44d3b7ed510f44ea953e
üß™ View experiment at: https://dagshub.com/maulanasyaa/Home-Credit_Default_Risk.mlflow/#/experiments/0


## next session task:
- feature engineering:
    1. External Sources Mean (Skor Eksternal Rata-rata)

    Ini biasanya adalah fitur paling kuat #1. Kolom ext_source_1, 2, dan 3 adalah skor kredit dari lembaga lain. Menggabungkannya mengurangi noise dan mengisi data yang kosong.

    Formula: Rata-rata dari (ext_source_1, ext_source_2, ext_source_3).

    2. Debt-to-Income Ratio (Rasio Utang thd Pendapatan)

    Indikator utama apakah nasabah "kebesaran pasak daripada tiang".

    Formula: amt_credit / amt_income_total

    3. Payment-to-Income Ratio (Beban Cicilan Bulanan)

    Berapa persen gaji nasabah yang habis hanya untuk membayar cicilan ini?

    Formula: amt_annuity / amt_income_total

    4. Credit-to-Goods Ratio (LTV Proxy)

    Apakah nasabah meminjam uang pas seharga barang, atau lebih? Jika rasionya > 1, berarti dia meminjam lebih dari harga barang (uang tunai tambahan), yang bisa berisiko lebih tinggi.

    Formula: amt_credit / amt_goods_price

    5. Days Employed Percentage (Stabilitas Kerja)

    Berapa persen dari hidup nasabah yang dihabiskan untuk bekerja? Orang yang sudah bekerja sejak muda dan terus bekerja biasanya lebih stabil secara finansial daripada yang baru bekerja.

    Formula: days_employed / days_birth

    6. Income Per Person (Pendapatan Riil Keluarga)

    Gaji 10 juta untuk bujangan berbeda nilainya dengan gaji 10 juta untuk ayah dengan 3 anak.

    Formula: amt_income_total / cnt_fam_members

- try to research about  before trying to optuna tuning with 1000 trial
- research:
    - TPE 
    - earlystop
    - pruning
    - save study to db
            study = optuna.create_study(
            study_name="lgbm_optimization",
            storage="sqlite:///lgbm_optuna.db", # Disimpan dalam file
            load_if_exists=True,
            direction="maximize"
        )
    - optuna.visualization.plot_optimization_history(study)
    - resume trial
        import optuna

        # Nama study harus konsisten agar bisa dipanggil kembali
        study_name = "lgbm_tuning_project" 
        storage_name = "sqlite:///optuna_database.db"

        study = optuna.create_study(
            study_name=study_name, 
            storage=storage_name, 
            direction="maximize",
            load_if_exists=True  # PENTING: Jika sudah ada di DB, dia akan ambil data lama
        )

        # Menjalankan 50 trial awal
        study.optimize(objective, n_trials=50)
    
    note: try using postgres docker

Act as a Senior Data Science & Coding Mentor. I want you to teach me all of topic we research before using a "Best Practice" approach, focusing on modularity, reproducibility, and clarity.

Please strictly follow this teaching style and structure for every response:

1.  **Iterative Learning:** Do NOT give me the full code at once. Break the lesson down into small, logical "Steps." Teach one step at a time and wait for my implementation before moving to the next.
2.  **Analogy & Concept First:** Before asking me to code, explain the *why* and the *what* using simple, relatable metaphors (e.g., calling the objective function an "Exam," or the study object a "Manager").
3.  **The "Assignment" Structure:** In every step, end with a section clearly labeled **"Your Assignment"**. Inside this section:
    * Give me numbered, specific instructions on what to write.
    * Tell me specific variable names to use (to keep us in sync).
    * Ask me to write the code in my environment and paste it back to you for review.
4.  **Feedback Loop:** When I provide the code, start your next response by validating it (e.g., "Excellent start!", "Spot on!"). If I make a mistake, gently correct it before moving to the next step.
5.  **Context Aware:** Acknowledge the variables and setup we have already built in previous steps.

**Tone:** Professional yet encouraging, structured, and insightful. Use headers to organize the text.

Let's start with Step 1.