In [24]:
import os

import dagshub
import mlflow
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sqlalchemy import create_engine
from xgboost import XGBClassifier

from config import encoded_features, normalized_features

In [14]:
# sql engine
# The connection URL is taken from the HOME_CREDIT_DATABASE_URL environment
# variable so real credentials never have to be committed with the notebook.
# The previous hard-coded URL is kept as the fallback for local development.
DATABASE_URL = os.environ.get(
    "HOME_CREDIT_DATABASE_URL",
    "postgresql://user:password@localhost:5432/home_credit_db",
)
engine = create_engine(DATABASE_URL)

In [15]:
# Point MLflow tracking at the DagsHub repository for this project.
dagshub.init(
    repo_owner="maulanasyaa",
    repo_name="Home-Credit_Default_Risk",
    mlflow=True,
)

# Enable MLflow autologging (the log below shows lightgbm, sklearn, xgboost).
mlflow.autolog()

2026/01/13 15:05:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for lightgbm.
2026/01/13 15:05:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2026/01/13 15:05:05 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [16]:
# Read every column of the application_train_clean table into a DataFrame.
query_app = """
    select * from application_train_clean
"""

df = pd.read_sql(sql=query_app, con=engine)

# Show the frame's dimensions, then preview its first rows.
print(df.shape)
df.head()

(307507, 102)


Unnamed: 0,sk_id_curr,target,name_contract_type,code_gender,flag_own_car,flag_own_realty,cnt_children,amt_income_total,amt_credit,amt_annuity,amt_goods_price,name_type_suite,name_income_type,name_education_type,name_family_status,name_housing_type,region_population_relative,days_birth,days_employed,days_registration,days_id_publish,own_car_age,flag_emp_phone,flag_work_phone,flag_phone,flag_email,occupation_type,cnt_fam_members,region_rating_client,region_rating_client_w_city,weekday_appr_process_start,hour_appr_process_start,reg_region_not_live_region,reg_region_not_work_region,live_region_not_work_region,reg_city_not_live_city,reg_city_not_work_city,live_city_not_work_city,organization_type,ext_source_1,...,floorsmax_mode,floorsmin_mode,landarea_mode,livingapartments_mode,livingarea_mode,nonlivingapartments_mode,nonlivingarea_mode,apartments_medi,basementarea_medi,years_beginexpluatation_medi,years_build_medi,commonarea_medi,entrances_medi,floorsmax_medi,floorsmin_medi,landarea_medi,livingapartments_medi,livingarea_medi,nonlivingapartments_medi,nonlivingarea_medi,fondkapremont_mode,housetype_mode,totalarea_mode,wallsmaterial_mode,emergencystate_mode,obs_30_cnt_social_circle,def_30_cnt_social_circle,obs_60_cnt_social_circle,def_60_cnt_social_circle,days_last_phone_change,flag_document_3,flag_document_6,flag_document_8,flag_document_16,amt_req_credit_bureau_hour,amt_req_credit_bureau_day,amt_req_credit_bureau_week,amt_req_credit_bureau_mon,amt_req_credit_bureau_qrt,amt_req_credit_bureau_year
0,100176,0,Cash loans,F,N,Y,0,135000.0,1078200.0,31653.0,900000.0,Unaccompanied,Working,Secondary / secondary special,Married,House / apartment,0.010006,-17682,-4319.0,-2630.0,-1211,,1,0,1,0,Laborers,2.0,2,2,WEDNESDAY,6,0,0,0,0,0,0,Government,,...,0.3333,0.375,0.0098,0.0606,0.0729,0.0078,0.0174,0.0708,0.0533,0.996,0.9463,0.0181,0.069,0.3333,0.375,0.0098,0.0564,0.0713,0.0078,0.0168,reg oper account,block of flats,0.0685,Panel,No,1.0,0.0,1.0,0.0,-131.0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100177,0,Cash loans,M,N,N,1,112500.0,312682.5,33309.0,297000.0,Family,Working,Secondary / secondary special,Civil marriage,House / apartment,0.010643,-14311,-334.0,-8401.0,-4248,,1,1,0,0,,3.0,2,2,THURSDAY,10,0,1,1,0,1,1,Self-employed,0.319357,...,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-328.0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100178,0,Cash loans,F,N,Y,0,119250.0,679500.0,28917.0,679500.0,Unaccompanied,Working,Higher education,Separated,House / apartment,0.008866,-19286,-1485.0,-7887.0,-2835,,1,1,1,0,,1.0,2,2,MONDAY,13,0,0,0,0,0,0,Self-employed,0.723371,...,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,0.0,-2085.0,1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100179,0,Cash loans,F,Y,N,0,202500.0,675000.0,53329.5,675000.0,Unaccompanied,Commercial associate,Higher education,Married,House / apartment,0.031329,-11375,-2311.0,-180.0,-2009,4.0,1,0,0,0,Managers,2.0,2,2,MONDAY,13,0,0,0,0,0,0,Trade: type 7,0.674832,...,0.9167,0.5,0.2912,0.4683,0.7488,0.2218,0.8183,0.5902,0.457,0.9945,0.9262,0.349,0.2414,0.9167,0.5,0.2896,0.4361,0.7317,0.2213,0.7892,reg oper account,block of flats,0.7334,Monolithic,No,0.0,0.0,0.0,0.0,-1792.0,0,0,1,0,0.0,0.0,0.0,1.0,0.0,4.0
4,100180,0,Cash loans,F,N,Y,1,315000.0,1288350.0,37800.0,1125000.0,Unaccompanied,Commercial associate,Higher education,Married,House / apartment,0.020713,-15581,-4940.0,-9520.0,-5239,,1,0,0,0,Accountants,3.0,3,2,FRIDAY,15,0,0,0,0,1,1,Transport: type 4,0.72909,...,0.1667,0.2083,0.2729,0.0808,0.0874,0.0,0.0,0.0916,0.1072,0.9811,0.7451,0.019,0.2069,0.1667,0.2083,0.2715,0.0752,0.0854,0.0,0.0,reg oper account,block of flats,0.0763,Panel,No,1.0,0.0,1.0,0.0,-1020.0,1,0,0,0,0.0,0.0,0.0,2.0,0.0,3.0


In [17]:
# All runs started below are recorded under this MLflow experiment.
mlflow.set_experiment(experiment_name="home_credit_modelling")

<Experiment: artifact_location='mlflow-artifacts:/53e8be0b12414e6896ea4ac1452766e3', creation_time=1767969132883, experiment_id='0', last_update_time=1767969132883, lifecycle_stage='active', name='home_credit_modelling', tags={}>

## Preprocessing pipeline

In [18]:
# Separate the label from the model inputs; the id column is not a feature.
y = df["target"]
X = df.drop(columns=["sk_id_curr", "target"])

# Partition the input columns by dtype.
num_features = X.select_dtypes(include=np.number).columns
cat_features = X.select_dtypes(include=np.object_).columns

# Numeric columns that config lists as already normalized or already encoded;
# these are excluded from the set of columns to scale.
already_prepared = normalized_features + encoded_features
col_to_drop = [col for col in already_prepared if col in num_features]

# Remaining numeric columns, in their original order.
col_to_scaled = num_features[~num_features.isin(col_to_drop)]

In [19]:
# preprocessing pipeline
#
# Three branches are applied column-wise and concatenated:
#   * "num"           - numeric columns that still need scaling
#   * "num_prescaled" - numeric columns listed in config as already
#                       normalized / encoded (col_to_drop): imputed only
#   * "cat"           - object-dtype columns, one-hot encoded

# Numeric columns to scale: fill gaps with the median, then standardize.
numeric_transformer = Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(strategy="median"),
        ),
        ("scaler", StandardScaler()),
    ]
)

# Numeric columns that must not be rescaled: impute missing values only.
# NOTE(review): median is used for both the normalized and the encoded
# columns; confirm that is acceptable for the encoded (flag-like) ones.
prescaled_transformer = Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(strategy="median"),
        ),
    ]
)

# Categorical columns: fill gaps with the mode, then one-hot encode.
# Categories unseen during fit are encoded as all zeros instead of raising.
categorical_transformer = Pipeline(
    steps=[
        (
            "imputer",
            SimpleImputer(strategy="most_frequent"),
        ),
        (
            "encoder",
            OneHotEncoder(handle_unknown="ignore"),
        ),
    ]
)

# ColumnTransformer drops every column that is not named in a transformer
# (remainder="drop" is the default). Without the "num_prescaled" branch the
# columns in col_to_drop would be silently removed from the model inputs
# rather than just being skipped by the scaler.
preprocessing_pipeline = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, col_to_scaled),
        ("num_prescaled", prescaled_transformer, col_to_drop),
        ("cat", categorical_transformer, cat_features),
    ]
)

## Data split

In [20]:
# train test split
# Hold out 20% of the rows for the final test. The split is stratified on the
# target so the hold-out set keeps the same class ratio as the training set,
# which matches the stratified folds used for cross validation below.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5-fold stratified splitter, shuffled with a fixed seed for reproducibility.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

## Logistic Regression

In [21]:
with mlflow.start_run(run_name="LogisticRegression"):
    # Preprocessing + L2-regularized logistic regression; class_weight
    # "balanced" reweights the classes to counter the target imbalance.
    LR_pipeline = Pipeline(
        steps=[
            ("preprocessing", preprocessing_pipeline),
            (
                "logistic_regression",
                LogisticRegression(
                    solver="lbfgs",
                    penalty="l2",
                    C=1.0,
                    max_iter=1000,
                    random_state=42,
                    class_weight="balanced",
                ),
            ),
        ]
    )

    # cross validation
    # Both accuracy and ROC AUC are scored in a single pass over the folds, so
    # the CV metrics line up with the two test metrics logged below.
    cv_results = cross_validate(
        LR_pipeline, X_train, y_train, cv=cv, scoring=["accuracy", "roc_auc"]
    )
    cv_scores = cv_results["test_accuracy"]

    mlflow.log_metric("avg_cv_accuracy", np.mean(cv_scores))
    mlflow.log_metric("avg_cv_auc", np.mean(cv_results["test_roc_auc"]))

    # Refit on the full training split before scoring the hold-out set.
    LR_pipeline.fit(X_train, y_train)

    # test validation
    y_pred = LR_pipeline.predict(X_test)
    acc_score = accuracy_score(y_test, y_pred)

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_proba = LR_pipeline.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)

    mlflow.log_metric("test_accuracy", acc_score)
    mlflow.log_metric("test_auc", auc)




🏃 View run LogisticRegression at: https://dagshub.com/maulanasyaa/Home-Credit_Default_Risk.mlflow/#/experiments/0/runs/27c31e4a2cb34f008e6182076fb4a8f1
🧪 View experiment at: https://dagshub.com/maulanasyaa/Home-Credit_Default_Risk.mlflow/#/experiments/0


## XGBoost

In [22]:
with mlflow.start_run(run_name="XGBoost"):
    # Preprocessing + gradient-boosted trees (histogram tree method).
    # NOTE(review): scale_pos_weight=11 presumably approximates the
    # negative/positive ratio of the target - confirm against y_train.
    XGB_pipeline = Pipeline(
        steps=[
            ("preprocessing", preprocessing_pipeline),
            (
                "xgboost",
                XGBClassifier(
                    max_depth=5,
                    n_estimators=200,
                    learning_rate=0.1,
                    scale_pos_weight=11,
                    eval_metric="auc",
                    objective="binary:logistic",
                    tree_method="hist",
                    random_state=42,
                ),
            ),
        ]
    )

    # cross validation
    # Both accuracy and ROC AUC are scored in a single pass over the folds, so
    # the CV metrics line up with the two test metrics logged below.
    cv_results = cross_validate(
        XGB_pipeline, X_train, y_train, cv=cv, scoring=["accuracy", "roc_auc"]
    )
    cv_scores = cv_results["test_accuracy"]

    mlflow.log_metric("avg_cv_accuracy", np.mean(cv_scores))
    mlflow.log_metric("avg_cv_auc", np.mean(cv_results["test_roc_auc"]))

    # Refit on the full training split before scoring the hold-out set.
    XGB_pipeline.fit(X_train, y_train)

    # test validation
    y_pred = XGB_pipeline.predict(X_test)
    acc_score = accuracy_score(y_test, y_pred)

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_proba = XGB_pipeline.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)

    mlflow.log_metric("test_accuracy", acc_score)
    mlflow.log_metric("test_auc", auc)



🏃 View run XGBoost at: https://dagshub.com/maulanasyaa/Home-Credit_Default_Risk.mlflow/#/experiments/0/runs/378f052311df460b9ddc1ae3c5469a5a
🧪 View experiment at: https://dagshub.com/maulanasyaa/Home-Credit_Default_Risk.mlflow/#/experiments/0


## LightGBM

In [23]:
with mlflow.start_run(run_name="LightGBM"):
    # Preprocessing + LightGBM binary classifier; is_unbalance=True asks
    # LightGBM to compensate for the imbalanced target.
    LGBM_pipeline = Pipeline(
        steps=[
            ("preprocessing", preprocessing_pipeline),
            (
                "lightgbm",
                LGBMClassifier(
                    n_estimators=200,
                    learning_rate=0.1,
                    num_leaves=31,
                    is_unbalance=True,
                    objective="binary",
                    metric="auc",
                    random_state=42,
                    n_jobs=-1,
                    verbose=-1,
                ),
            ),
        ]
    )

    # cross validation
    # Both accuracy and ROC AUC are scored in a single pass over the folds, so
    # the CV metrics line up with the two test metrics logged below.
    cv_results = cross_validate(
        LGBM_pipeline, X_train, y_train, cv=cv, scoring=["accuracy", "roc_auc"]
    )
    cv_scores = cv_results["test_accuracy"]

    mlflow.log_metric("avg_cv_accuracy", np.mean(cv_scores))
    mlflow.log_metric("avg_cv_auc", np.mean(cv_results["test_roc_auc"]))

    # Refit on the full training split before scoring the hold-out set.
    LGBM_pipeline.fit(X_train, y_train)

    # test validation
    y_pred = LGBM_pipeline.predict(X_test)
    acc_score = accuracy_score(y_test, y_pred)

    # Column 1 of predict_proba is the probability of the positive class.
    y_pred_proba = LGBM_pipeline.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_proba)

    mlflow.log_metric("test_accuracy", acc_score)
    mlflow.log_metric("test_auc", auc)



🏃 View run LightGBM at: https://dagshub.com/maulanasyaa/Home-Credit_Default_Risk.mlflow/#/experiments/0/runs/7ad28b5c712f402283807357b81cfe44
🧪 View experiment at: https://dagshub.com/maulanasyaa/Home-Credit_Default_Risk.mlflow/#/experiments/0
