In [1]:
import json
import os

import dagshub
import mlflow
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sqlalchemy import create_engine
from xgboost import XGBClassifier

from config import normalized_features, encoded_features, to_remove_features

In [2]:
# sql engine
# The connection URL (which embeds the user name and password) is taken from the
# HOME_CREDIT_DB_URL environment variable so real credentials never have to be
# committed with the notebook. The fallback is the original local placeholder,
# so the cell behaves exactly as before when the variable is not set.
DEFAULT_DB_URL = "postgresql://user:password@localhost:5432/home_credit_db"
engine = create_engine(os.environ.get("HOME_CREDIT_DB_URL", DEFAULT_DB_URL))

In [3]:
# dagshub init
# Connect this session to the DagsHub repository, with its MLflow integration on.
dagshub.init(
    repo_owner="maulanasyaa",
    repo_name="Home-Credit_Default_Risk",
    mlflow=True,
)

# Enable MLflow autologging for the ML libraries used in this notebook.
mlflow.autolog()

2026/01/15 18:02:13 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2026/01/15 18:02:13 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2026/01/15 18:02:13 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [4]:
# Load every row and column of the cleaned application table into a DataFrame.
query_app = """
    select * from application_train_clean
"""

df = pd.read_sql(sql=query_app, con=engine)


In [5]:
# feature engineering
# Drop the columns listed in `to_remove_features`. Only columns that are still
# present are dropped, so re-running this cell after `df` has already been
# trimmed no longer raises KeyError. Anything skipped is printed so that a
# misspelled name in the config list does not go unnoticed.
cols_present = [col for col in to_remove_features if col in df.columns]
cols_missing = [col for col in to_remove_features if col not in df.columns]
if cols_missing:
    print(
        "to_remove_features not found in df (already dropped or misspelled): "
        f"{cols_missing}"
    )
df = df.drop(columns=cols_present)

print(df.shape)
df.head()

(307507, 53)


Unnamed: 0,sk_id_curr,target,name_contract_type,code_gender,flag_own_car,flag_own_realty,amt_income_total,amt_credit,amt_annuity,amt_goods_price,...,livingarea_mode,floorsmax_medi,fondkapremont_mode,housetype_mode,totalarea_mode,wallsmaterial_mode,emergencystate_mode,obs_30_cnt_social_circle,days_last_phone_change,amt_req_credit_bureau_year
0,100370,0,Cash loans,F,N,Y,45000.0,448056.0,16222.5,315000.0,...,0.0462,0.1667,reg oper account,block of flats,0.0387,"Stone, brick",No,8.0,-1088.0,4.0
1,100371,0,Cash loans,F,Y,Y,450000.0,808650.0,31464.0,675000.0,...,0.1794,0.6667,reg oper spec account,block of flats,0.6093,Panel,No,0.0,0.0,0.0
2,100372,0,Cash loans,F,N,N,90000.0,531000.0,29781.0,531000.0,...,,,,,,,,0.0,0.0,0.0
3,100373,0,Cash loans,F,N,N,225000.0,906228.0,46269.0,810000.0,...,,,,,,,,0.0,-1053.0,0.0
4,100374,0,Cash loans,F,Y,Y,112500.0,1308964.5,42354.0,1143000.0,...,,,,,,,,1.0,-644.0,2.0


In [6]:
# Select the MLflow experiment that the runs started below are recorded under.
mlflow.set_experiment(experiment_name="home_credit_modelling")

<Experiment: artifact_location='mlflow-artifacts:/53e8be0b12414e6896ea4ac1452766e3', creation_time=1767969132883, experiment_id='0', last_update_time=1767969132883, lifecycle_stage='active', name='home_credit_modelling', tags={}>

## preprocessing pipeline

In [7]:
# Separate the label from the feature matrix; the id column is not a feature.
y = df["target"]
X = df.drop(columns=["sk_id_curr", "target"])

# Partition the feature columns by dtype: object (categorical) vs. numeric.
cat_features = X.select_dtypes(include=np.object_).columns
num_features = X.select_dtypes(include=np.number).columns

# Numeric columns named in the config lists are kept out of the scaling step
# (presumably because they are already normalized / encoded — confirm in config).
excluded_from_scaling = normalized_features + encoded_features
col_to_drop = [name for name in excluded_from_scaling if name in num_features]

# Select the features to be scaled: every numeric column not excluded above,
# in the original column order.
col_to_scaled = num_features[~num_features.isin(col_to_drop)]

In [8]:
# preprocessing pipeline

# Numeric columns that need scaling: fill missing values with the column
# median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(strategy="median"),
        ),
        ("scaler", StandardScaler()),
    ]
)

# Numeric columns excluded from scaling (`col_to_drop`): impute only, no scaling.
# NOTE(review): assumes the config lists mean "already normalized / encoded,
# do not rescale" rather than "exclude from the model" — confirm against config.
unscaled_numeric_transformer = SimpleImputer(strategy="median")

# Object-dtype columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time instead of raising.
categorical_transformer = Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(strategy="most_frequent"),
        ),
        (
            "encoder",
            OneHotEncoder(handle_unknown="ignore"),
        ),
    ]
)

# ColumnTransformer defaults to remainder="drop": any column not routed to a
# transformer is silently removed from the model input. Previously that removed
# every column in `col_to_drop`, although those were only meant to skip scaling.
# They are now routed through the imputer-only branch. dict.fromkeys removes
# duplicate names (a column listed in both config lists) while keeping order.
unscaled_num_features = list(dict.fromkeys(col_to_drop))

preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, col_to_scaled),
        ("num_unscaled", unscaled_numeric_transformer, unscaled_num_features),
        ("cat", categorical_transformer, cat_features),
    ]
)

## data split

In [9]:
# train test split
# Hold out 20% of the rows for the final test. The split is stratified on the
# label so the hold-out set keeps the same class ratio as the training set,
# consistent with the stratified folds used for cross-validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5-fold stratified CV, shuffled with a fixed seed for reproducible folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Logistic Regression

In [10]:
# with mlflow.start_run(run_name="LogisticRegression"):
#     LR_pipeline = Pipeline(
#         steps=[
#             ("preprocessing", preprocessing_pipeline),
#             (
#                 "logistic_regression",
#                 LogisticRegression(
#                     solver="lbfgs",
#                     penalty="l2",
#                     C=1.0,
#                     max_iter=1000,
#                     random_state=42,
#                     class_weight="balanced",
#                 ),
#             ),
#         ]
#     )

#     # cross validation
#     cv_scores = cross_val_score(
#         LR_pipeline, X_train, y_train, cv=cv, scoring="accuracy"
#     )

#     mlflow.log_metric("avg_cv_accuracy", np.mean(cv_scores))

#     LR_pipeline.fit(X_train, y_train)

#     # test validation
#     y_pred = LR_pipeline.predict(X_test)
#     acc_score = accuracy_score(y_test, y_pred)

#     y_pred_proba = LR_pipeline.predict_proba(X_test)[:, 1]
#     auc = roc_auc_score(y_test, y_pred_proba)

#     mlflow.log_metric("test_accuracy", acc_score)
#     mlflow.log_metric("test_auc", auc)

#     # feature artifact
#     feature_names = X_train.columns.tolist()

#     with open("feature_names.json", "w") as f:
#         json.dump(feature_names, f)

#     mlflow.log_artifact("feature_names.json")


## XGBoost

In [11]:
# with mlflow.start_run(run_name="XGBoost"):
#     XGB_pipeline = Pipeline(
#         steps=[
#             ("preprocessing", preprocessing_pipeline),
#             (
#                 "xgboost",
#                 XGBClassifier(
#                     max_depth=5,
#                     n_estimators=200,
#                     learning_rate=0.1,
#                     scale_pos_weight=11,
#                     eval_metric="auc",
#                     objective="binary:logistic",
#                     tree_method="hist",
#                     random_state=42,
#                 ),
#             ),
#         ]
#     )

#     # cross validation
#     cv_scores = cross_val_score(
#         XGB_pipeline, X_train, y_train, cv=cv, scoring="accuracy"
#     )

#     mlflow.log_metric("avg_cv_accuracy", np.mean(cv_scores))

#     XGB_pipeline.fit(X_train, y_train)

#     # test validation
#     y_pred = XGB_pipeline.predict(X_test)
#     acc_score = accuracy_score(y_test, y_pred)

#     y_pred_proba = XGB_pipeline.predict_proba(X_test)[:, 1]
#     auc = roc_auc_score(y_test, y_pred_proba)

#     mlflow.log_metric("test_accuracy", acc_score)
#     mlflow.log_metric("test_auc", auc)

#     # feature artifact
#     feature_names = X_train.columns.tolist()

#     with open("feature_names.json", "w") as f:
#         json.dump(feature_names, f)

#     mlflow.log_artifact("feature_names.json")

## LightGBM

In [12]:
with mlflow.start_run(run_name="LightGBM_remove_feat_5"):
    # Hyper-parameters for the LightGBM classifier used in this run.
    lgbm_params = {
        "n_estimators": 200,
        "learning_rate": 0.1,
        "num_leaves": 31,
        "is_unbalance": True,
        "objective": "binary",
        "metric": "auc",
        "random_state": 42,
        "n_jobs": -1,
        "verbose": -1,
    }

    # Full model: shared preprocessing followed by the classifier.
    LGBM_pipeline = Pipeline(
        steps=[
            ("preprocessing", preprocessing_pipeline),
            ("lightgbm", LGBMClassifier(**lgbm_params)),
        ]
    )

    # Cross-validated accuracy on the training split; the mean is logged.
    cv_scores = cross_val_score(
        estimator=LGBM_pipeline,
        X=X_train,
        y=y_train,
        scoring="accuracy",
        cv=cv,
    )
    mlflow.log_metric("avg_cv_accuracy", cv_scores.mean())

    # Fit on the whole training split before scoring the hold-out set.
    LGBM_pipeline.fit(X_train, y_train)

    # Hold-out accuracy from the hard class predictions.
    y_pred = LGBM_pipeline.predict(X_test)
    acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)

    # Hold-out ROC AUC from the probability of the positive class (column 1).
    y_pred_proba = LGBM_pipeline.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_true=y_test, y_score=y_pred_proba)

    for metric_name, metric_value in (
        ("test_accuracy", acc_score),
        ("test_auc", auc),
    ):
        mlflow.log_metric(metric_name, metric_value)

    # Record the input columns of X_train and the removed-feature list with the run.
    feature_names = X_train.columns.tolist()
    removed_feat = to_remove_features

    run_artifacts = {
        "feature_names.json": feature_names,
        "removed_feat.json": removed_feat,
    }

    # Write both JSON files first, then upload them, in the same order.
    for artifact_path, payload in run_artifacts.items():
        with open(artifact_path, "w") as handle:
            json.dump(payload, handle)

    for artifact_path in run_artifacts:
        mlflow.log_artifact(artifact_path)



🏃 View run LightGBM_remove_feat_5 at: https://dagshub.com/maulanasyaa/Home-Credit_Default_Risk.mlflow/#/experiments/0/runs/31a2db8e44e147829a2137d1e439f654
🧪 View experiment at: https://dagshub.com/maulanasyaa/Home-Credit_Default_Risk.mlflow/#/experiments/0
