In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn Imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.metrics import classification_report, roc_auc_score

# Models Imports
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier


In [2]:

# Load the semicolon-separated CSV and rename the target column 'y' to 'deposit'.
# Dropping 'duration' is optional but recommended to avoid leakage in a strict
# setting; for this exercise it is kept as a feature, as in the earlier code.
df = pd.read_csv('./bank-additional-full.csv', sep=';').rename(columns={'y': 'deposit'})

# Target: 'yes' -> 1, 'no' -> 0. Features: every column except the target.
y = df['deposit'].map({'yes': 1, 'no': 0})
X = df.drop(columns='deposit')

# 80/20 split with a fixed seed; stratifying on y keeps the class ratio the same
# in both parts, which matters because the target is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

print(f"Shape X_train: {X_train.shape}")
print(f"Shape X_test:  {X_test.shape}")

Shape X_train: (32950, 20)
Shape X_test:  (8238, 20)


In [3]:
# --- CUSTOM TRANSFORMERS ---

class CustomImputer(BaseEstimator, TransformerMixin):
    """Stateless transformer that rewrites some 'unknown' job/education labels.

    Nothing is learned in ``fit``; ``transform`` applies a fixed, ordered list
    of rules. The input must be a DataFrame with ``age``, ``job`` and
    ``education`` columns. A modified copy is returned and the input frame is
    left untouched.
    """

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so the transformer can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the rule-based replacements applied."""
        out = X.copy()

        def is_unknown(column):
            # Evaluated against the current state of `out`, so every rule sees
            # the replacements made by the rules before it.
            return out[column] == 'unknown'

        # Unknown job and age above 60 -> 'retired'.
        out.loc[is_unknown('job') & (out['age'] > 60), 'job'] = 'retired'

        # Unknown education, filled in from the job.
        education_by_job = (
            ('management', 'university.degree'),
            ('services', 'high.school'),
            ('housemaid', 'basic.4y'),
        )
        for job, education in education_by_job:
            out.loc[is_unknown('education') & (out['job'] == job), 'education'] = education

        # Unknown job, filled in from the education level.
        job_by_education = (
            ('basic.4y', 'blue-collar'),
            ('basic.6y', 'blue-collar'),
            ('basic.9y', 'blue-collar'),
            ('professional.course', 'technician'),
        )
        for education, job in job_by_education:
            out.loc[is_unknown('job') & (out['education'] == education), 'job'] = job

        return out

class CyclicalFeatureTransformer(BaseEstimator, TransformerMixin):
    """Replace ``month`` / ``day_of_week`` label columns with sine/cosine pairs.

    For each of the two columns that is present in the input DataFrame, the
    text label is mapped to a number, projected onto a circle (period 12 for
    months, 5 for days) and stored as ``<prefix>_sin`` / ``<prefix>_cos``; the
    original label column is dropped. Labels that are not in the lookup tables
    become NaN in both derived columns.

    The transformer is stateless: ``fit`` learns nothing.
    """

    # All twelve month abbreviations. The previous table started at 'mar', so
    # 'jan' and 'feb' rows were silently encoded as NaN.
    MONTH_TO_NUM = {
        'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
        'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
    }
    # Five-label working week, encoded with a period of 5.
    DAY_TO_NUM = {'mon': 1, 'tue': 2, 'wed': 3, 'thu': 4, 'fri': 5}

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    @staticmethod
    def _encode_cycle(frame, column, mapping, period, prefix):
        """Add ``<prefix>_sin``/``<prefix>_cos`` to ``frame`` and drop ``column``."""
        position = frame[column].map(mapping)
        angle = 2 * np.pi * position / period
        frame[f'{prefix}_sin'] = np.sin(angle)
        frame[f'{prefix}_cos'] = np.cos(angle)
        return frame.drop(columns=[column])

    def transform(self, X):
        """Return a copy of ``X`` with the available cyclical columns encoded.

        Parameters
        ----------
        X : pandas.DataFrame
            May contain ``month`` and/or ``day_of_week``; other columns are
            passed through unchanged.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` without the encoded label columns and with the new
            sine/cosine columns appended (month pair first, then day pair).
        """
        X_copy = X.copy()

        if 'month' in X_copy.columns:
            X_copy = self._encode_cycle(X_copy, 'month', self.MONTH_TO_NUM, 12, 'month')

        if 'day_of_week' in X_copy.columns:
            X_copy = self._encode_cycle(X_copy, 'day_of_week', self.DAY_TO_NUM, 5, 'day')

        return X_copy

# Category orders for the OrdinalEncoder: a label's position in its list is the
# integer code it receives.
education_order = [
    'illiterate', 'basic.4y', 'basic.6y', 'basic.9y',
    'high.school', 'professional.course', 'university.degree', 'unknown'
]
poutcome_order = ['nonexistent', 'failure', 'success']
default_map = ['unknown', 'no', 'yes']
housing_loan_map = ['unknown', 'no', 'yes']

# --- COLUMN GROUPS ---

cyclical_features = ['month', 'day_of_week']
ordinal_features = ['education', 'poutcome']
binary_features = ['default', 'housing', 'loan']
onehot_features = ['job', 'marital', 'contact']
numerical_features = [
    'age', 'duration', 'campaign', 'pdays', 'previous',
    'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
]

# --- PER-GROUP TRANSFORMERS ---

# The encoder needs one category list per column, in the same order as the
# columns it is given, so the list is derived from the column order itself.
_category_orders = {
    'education': education_order,
    'poutcome': poutcome_order,
    'default': default_map,
    'housing': housing_loan_map,
    'loan': housing_loan_map,
}
_ordinal_bin_columns = ordinal_features + binary_features

ordinal_transformer = Pipeline([
    ('ordinal', OrdinalEncoder(
        categories=[_category_orders[column] for column in _ordinal_bin_columns],
        # Labels outside the lists above are encoded as -1 instead of raising.
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    )),
    ('scaler', StandardScaler()),
])

onehot_transformer = Pipeline([
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

numerical_transformer = Pipeline([
    ('scaler', StandardScaler()),
])

# --- COLUMN TRANSFORMER ---

# Columns not listed in any group are dropped.
preprocessor = ColumnTransformer(
    [
        ('cyclic', CyclicalFeatureTransformer(), cyclical_features),
        ('ordinal_bin', ordinal_transformer, _ordinal_bin_columns),
        ('onehot', onehot_transformer, onehot_features),
        ('num_only', numerical_transformer, numerical_features),
    ],
    remainder='drop',
    verbose_feature_names_out=False,
)

# Combined pipeline: rule-based imputation first, then the column-wise preprocessing.
preprocessor_full = Pipeline([
    ('impute_custom', CustomImputer()),
    ('preprocessing', preprocessor),
])

In [4]:
# This cell intentionally contains no code.
# It used to repeat, line for line, the transformer classes, category orders,
# feature lists and the `preprocessor_full` pipeline that the previous cell
# already defines. Re-running identical definitions only shadowed the earlier
# objects, so the copy was removed to keep a single source of truth.

In [5]:
# Positive-class weight used by XGBoost below.
# Formula: number of negatives / number of positives in the training target.
neg_count = (y_train == 0).sum()
pos_count = (y_train == 1).sum()
scale_pos_weight_val = neg_count / pos_count

print(f"Rasio Imbalance (Scale Pos Weight): {scale_pos_weight_val:.2f}")


def build_boosting_pipeline(classifier):
    """Return a preprocessing + classifier pipeline for one model.

    Each pipeline receives its own unfitted copy of ``preprocessor_full``.
    Pipeline does not copy its steps, so passing the same instance to several
    pipelines would make them share one fitted preprocessor, and fitting any
    one of them would refit the preprocessing used by all the others.
    """
    return Pipeline(steps=[
        ('preprocessor_final', clone(preprocessor_full)),
        ('classifier', classifier)
    ])


# 1. XGBoost - imbalance handled through scale_pos_weight.
# `use_label_encoder` is no longer passed: XGBoost reported it as an unused
# parameter during training.
full_pipeline_xgb = build_boosting_pipeline(XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    scale_pos_weight=scale_pos_weight_val
))

# 2. LightGBM - imbalance handled through class_weight='balanced'.
full_pipeline_lgbm = build_boosting_pipeline(LGBMClassifier(
    random_state=42,
    class_weight='balanced',
    verbose=-1
))

# 3. CatBoost - imbalance handled through auto_class_weights='Balanced'.
full_pipeline_cat = build_boosting_pipeline(CatBoostClassifier(
    random_state=42,
    auto_class_weights='Balanced',
    verbose=0
))

# Collect the pipelines by display name; insertion order is the training order.
models_boosting = {
    "XGBoost": full_pipeline_xgb,
    "LightGBM": full_pipeline_lgbm,
    "CatBoost": full_pipeline_cat,
}

# Training loop: fit every pipeline on the same training split.
for name, model in models_boosting.items():
    print(f"\nMemulai Pelatihan Model {name}...")
    model.fit(X_train, y_train)
    print(f"Pelatihan {name} Selesai.")

Rasio Imbalance (Scale Pos Weight): 7.88

Memulai Pelatihan Model XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Pelatihan XGBoost Selesai.

Memulai Pelatihan Model LightGBM...
Pelatihan LightGBM Selesai.

Memulai Pelatihan Model CatBoost...
Pelatihan CatBoost Selesai.


In [6]:
print("HASIL PERBANDINGAN MODEL BOOSTING PADA DATA TEST")

# Display names for classes 0 and 1 in the classification report.
class_labels = ['No Deposit (0)', 'Deposit (1)']
results_summary = []

for name, model in models_boosting.items():
    print(f"\n--- Model: {name} ---")

    # Hard labels feed the classification report; the probability of class 1
    # feeds the ROC AUC.
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)

    print(classification_report(y_test, y_pred, target_names=class_labels))
    print(f"Area Under the ROC Curve (AUC): {auc:.4f}")

    # One record per model for the quick comparison table below; the first
    # three predicted probabilities are kept as a sample.
    results_summary.append(dict(Model=name, AUC=auc, Score_Example=y_proba[:3]))

# AUC summary, best model first.
print("\n--- RINGKASAN PERFORMA (AUC) ---")
summary_df = pd.DataFrame(results_summary).sort_values(by='AUC', ascending=False)
print(summary_df[['Model', 'AUC']])

HASIL PERBANDINGAN MODEL BOOSTING PADA DATA TEST

--- Model: XGBoost ---
                precision    recall  f1-score   support

No Deposit (0)       0.98      0.89      0.93      7310
   Deposit (1)       0.50      0.87      0.64       928

      accuracy                           0.89      8238
     macro avg       0.74      0.88      0.78      8238
  weighted avg       0.93      0.89      0.90      8238

Area Under the ROC Curve (AUC): 0.9475

--- Model: LightGBM ---
                precision    recall  f1-score   support

No Deposit (0)       0.99      0.87      0.93      7310
   Deposit (1)       0.48      0.93      0.63       928

      accuracy                           0.88      8238
     macro avg       0.73      0.90      0.78      8238
  weighted avg       0.93      0.88      0.89      8238

Area Under the ROC Curve (AUC): 0.9544

--- Model: CatBoost ---




                precision    recall  f1-score   support

No Deposit (0)       0.99      0.88      0.93      7310
   Deposit (1)       0.49      0.92      0.64       928

      accuracy                           0.88      8238
     macro avg       0.74      0.90      0.78      8238
  weighted avg       0.93      0.88      0.90      8238

Area Under the ROC Curve (AUC): 0.9542

--- RINGKASAN PERFORMA (AUC) ---
      Model       AUC
1  LightGBM  0.954406
2  CatBoost  0.954242
0   XGBoost  0.947523


In [7]:
# Save the fitted CatBoost estimator held in the pipeline's 'classifier' step.
# NOTE(review): only the classifier step is written to disk; the pipeline's
# preprocessing step is not. The saved file presumably expects features that
# have already been preprocessed - confirm how that is reproduced at inference.
catboost_classifier = models_boosting["CatBoost"].named_steps['classifier']
catboost_classifier.save_model('catboost_model.cbm')
print("CatBoost model saved successfully as 'catboost_model.cbm'")

CatBoost model saved successfully as 'catboost_model.cbm'
