<a href="https://colab.research.google.com/github/mariorizki-lang/mid-term-machine-learning/blob/main/clustering_midterm_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Cell 1: Instalasi dependensi dan pengunduhan dataset
!pip install -q gdown scikit-learn xgboost

In [2]:
# Cell 2: Memuat dataset dan melakukan EDA
import pandas as pd

# Baca dataset
df = pd.read_csv('clusteringmidterm.csv')

# Drop CUST_ID jika ada
df = df.drop(columns=[col for col in df.columns if col.upper() == 'CUST_ID'])

# Tampilkan info dataset
df_info = df.info()
print(df_info)

# Deskripsi singkat
print('Statistik deskriptif singkat:')
print(df.describe().transpose().head())

# Missing values
print('Persentase missing value per kolom:')
print((df.isnull().mean() * 100).sort_values(ascending=False))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8950 entries, 0 to 8949
Data columns (total 17 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   BALANCE                           8950 non-null   float64
 1   BALANCE_FREQUENCY                 8950 non-null   float64
 2   PURCHASES                         8950 non-null   float64
 3   ONEOFF_PURCHASES                  8950 non-null   float64
 4   INSTALLMENTS_PURCHASES            8950 non-null   float64
 5   CASH_ADVANCE                      8950 non-null   float64
 6   PURCHASES_FREQUENCY               8950 non-null   float64
 7   ONEOFF_PURCHASES_FREQUENCY        8950 non-null   float64
 8   PURCHASES_INSTALLMENTS_FREQUENCY  8950 non-null   float64
 9   CASH_ADVANCE_FREQUENCY            8950 non-null   float64
 10  CASH_ADVANCE_TRX                  8950 non-null   int64  
 11  PURCHASES_TRX                     8950 non-null   int64  
 12  CREDIT

In [3]:
# Cell 3: Membuat label HighBalance berdasarkan median BALANCE
import numpy as np

# Pastikan kolom BALANCE ada
if 'BALANCE' not in df.columns:
    raise ValueError('Kolom BALANCE tidak ditemukan dalam dataset.')

median_balance = df['BALANCE'].median()
print('Median BALANCE:', median_balance)

# Tambahkan kolom HighBalance
labels = (df['BALANCE'] >= median_balance).astype(int)
df['HighBalance'] = labels

print('Distribusi label HighBalance:')
print(df['HighBalance'].value_counts())

# Fitur dan target
X = df.drop(columns=['HighBalance'])
y = df['HighBalance']

Median BALANCE: 873.385231
Distribusi label HighBalance:
HighBalance
0    4475
1    4475
Name: count, dtype: int64


In [4]:
# Cell 4: Preprocessing dan split data
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Semua kolom adalah numerik
numeric_cols = X.columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numeric_cols)
    ]
)

# Bagi data menjadi train dan test (stratify)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [7]:

# Cell 5: Melatih beberapa model klasifikasi
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Logistic Regression
log_reg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

log_reg_pipeline.fit(X_train, y_train)

y_pred_log = log_reg_pipeline.predict(X_test)
y_proba_log = log_reg_pipeline.predict_proba(X_test)[:, 1]

print('--- Logistic Regression ---')
print(classification_report(y_test, y_pred_log))
print('ROC-AUC:', roc_auc_score(y_test, y_proba_log))

# RandomForestClassifier
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=300, random_state=42))
])

rf_pipeline.fit(X_train, y_train)

y_pred_rf = rf_pipeline.predict(X_test)
y_proba_rf = rf_pipeline.predict_proba(X_test)[:, 1]

print('--- RandomForestClassifier ---')
print(classification_report(y_test, y_pred_rf))
print('ROC-AUC:', roc_auc_score(y_test, y_proba_rf))

# XGBoost Classifier
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(
        n_estimators=300,
        max_depth=4,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric='auc',
        use_label_encoder=False
    ))
])

xgb_pipeline.fit(X_train, y_train)

y_pred_xgb = xgb_pipeline.predict(X_test)
y_proba_xgb = xgb_pipeline.predict_proba(X_test)[:, 1]

print('--- XGBoost Classifier ---')
print(classification_report(y_test, y_pred_xgb))
print('ROC-AUC:', roc_auc_score(y_test, y_proba_xgb))

--- Logistic Regression ---
              precision    recall  f1-score   support

           0       0.99      0.98      0.99       895
           1       0.98      0.99      0.99       895

    accuracy                           0.99      1790
   macro avg       0.99      0.99      0.99      1790
weighted avg       0.99      0.99      0.99      1790

ROC-AUC: 0.9995705502325146
--- RandomForestClassifier ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       895
           1       1.00      1.00      1.00       895

    accuracy                           1.00      1790
   macro avg       1.00      1.00      1.00      1790
weighted avg       1.00      1.00      1.00      1790

ROC-AUC: 1.0


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


--- XGBoost Classifier ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       895
           1       1.00      1.00      1.00       895

    accuracy                           1.00      1790
   macro avg       1.00      1.00      1.00      1790
weighted avg       1.00      1.00      1.00      1790

ROC-AUC: 1.0


In [9]:
# Cell 6: Hyperparameter tuning untuk RandomForest
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'classifier__n_estimators': [200, 300, 400, 500],
    'classifier__max_depth': [None, 5, 8, 12],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4],
    'classifier__bootstrap': [True, False]
}

rf_random_search = RandomizedSearchCV(
    estimator=Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(random_state=42))
    ]),
    param_distributions=param_dist,
    n_iter=10,
    scoring='f1',
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

rf_random_search.fit(X_train, y_train)

print('Best parameters (RandomForest):', rf_random_search.best_params_)

best_rf = rf_random_search.best_estimator_

# Evaluasi model terbaik
y_pred_best_rf = best_rf.predict(X_test)
y_proba_best_rf = best_rf.predict_proba(X_test)[:, 1]

print('--- Tuned RandomForest ---')
print(classification_report(y_test, y_pred_best_rf))
print('ROC-AUC:', roc_auc_score(y_test, y_proba_best_rf))


Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best parameters (RandomForest): {'classifier__n_estimators': 300, 'classifier__min_samples_split': 10, 'classifier__min_samples_leaf': 1, 'classifier__max_depth': 5, 'classifier__bootstrap': True}
--- Tuned RandomForest ---
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       895
           1       1.00      1.00      1.00       895

    accuracy                           1.00      1790
   macro avg       1.00      1.00      1.00      1790
weighted avg       1.00      1.00      1.00      1790

ROC-AUC: 1.0
