In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.exceptions import ConvergenceWarning
import warnings

# Silence scikit-learn convergence warnings so the printed output stays clean.
warnings.simplefilter("ignore", ConvergenceWarning)

def run_classification_analysis(file_path='bank-additional-full.csv'):
    """Load, preprocess, train and evaluate classifiers on bank marketing data.

    The function reads a semicolon-separated CSV, encodes the ``y`` target
    ('no' -> 0, 'yes' -> 1), drops the ``duration`` column if present,
    performs a stratified 80/20 train/test split, fits a Logistic Regression
    and a Decision Tree (each behind its own scaling / one-hot-encoding
    preprocessor), prints classification reports and AUC scores, and saves
    two plots.

    Args:
        file_path: Path to the semicolon-separated CSV file. The data must
            contain a target column ``y`` whose values are 'yes' or 'no'.

    Returns:
        None. Results are printed to stdout and the images
        'confusion_matrices.png' and 'roc_curves.png' are written relative
        to the current working directory. If the file cannot be read, or the
        target column is missing or contains unexpected values, an error
        message is printed and the function returns early.
    """

    # 1. Load the data
    try:
        df = pd.read_csv(file_path, sep=';')
        print("--- 1. Data Berhasil Dimuat ---")
    except FileNotFoundError:
        print(f"Error: File '{file_path}' tidak ditemukan.")
        return
    except Exception as e:
        # Top-level boundary: report any other read/parse failure and stop.
        print(f"Error saat membaca file: {e}")
        return

    print(df.head())
    print("\n--- 2. Data Info ---")
    df.info()

    # Fail with a clear message instead of a KeyError further down.
    if 'y' not in df.columns:
        print("Error: Kolom target 'y' tidak ditemukan dalam data.")
        return

    print("\n--- 3. Distribusi Variabel Target (Original) ---")
    print(df['y'].value_counts(normalize=True))

    # 2. EDA and preprocessing

    # Encode the target as binary. Labels outside the mapping become NaN,
    # so they are detected here rather than surfacing as an obscure error
    # inside train_test_split.
    target = df['y'].map({'no': 0, 'yes': 1})
    unmapped = target.isna()
    if unmapped.any():
        bad_values = sorted(df.loc[unmapped, 'y'].astype(str).unique())
        print(f"Error: Kolom target 'y' berisi nilai selain 'yes'/'no': {bad_values}")
        return

    # Separate features (X) and target (y)
    X = df.drop('y', axis=1)
    y = target.astype(int)

    # IMPORTANT: 'duration' is a data leak. The call duration is not known
    # *before* the call is made, so a realistic predictive model must not
    # use it.
    if 'duration' in X.columns:
        X = X.drop('duration', axis=1)
        print("\nInfo: Menghapus kolom 'duration' untuk menghindari data leakage.")

    # Identify numeric and categorical columns. Every non-numeric column is
    # treated as categorical so that no feature is silently dropped by the
    # ColumnTransformer, whatever dtype pandas used for the text columns.
    numeric_features = X.select_dtypes(include=np.number).columns.tolist()
    categorical_features = X.select_dtypes(exclude=np.number).columns.tolist()

    print(f"\nFitur Numerik: {numeric_features}")
    print(f"Fitur Kategorikal: {categorical_features}")

    # Split into train and test BEFORE fitting any preprocessing.
    # stratify=y keeps the class proportions the same in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"\nData dibagi: {len(X_train)} train, {len(X_test)} test.")

    # Preprocessing: scale numeric columns, one-hot encode categorical ones.
    # handle_unknown='ignore' tolerates categories absent from the train set.
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ])

    # 3. Model training

    print("\n--- 4. Memulai Pelatihan Model ---")

    # Imported here (not at module level) so the function stays
    # self-contained with respect to the file's import block.
    from sklearn.base import clone

    # Each pipeline gets its own clone of the preprocessor: Pipeline.fit
    # fits its steps in place, so sharing one instance would make the second
    # fit overwrite the state used by the first pipeline.

    # Model 1: Logistic Regression
    # class_weight='balanced' addresses the class imbalance; max_iter is
    # raised to help the solver converge.
    pipeline_lr = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000))
    ])
    pipeline_lr.fit(X_train, y_train)
    print("Model Logistic Regression selesai dilatih.")

    # Model 2: Decision Tree
    # class_weight='balanced' plus a depth limit (max_depth) to curb
    # overfitting.
    pipeline_dt = Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', DecisionTreeClassifier(random_state=42, class_weight='balanced', max_depth=10))
    ])
    pipeline_dt.fit(X_train, y_train)
    print("Model Decision Tree selesai dilatih.")

    # 4. Model evaluation

    print("\n--- 5. Memulai Evaluasi Model ---")

    # Hard predictions and positive-class (1) probabilities
    y_pred_lr = pipeline_lr.predict(X_test)
    y_prob_lr = pipeline_lr.predict_proba(X_test)[:, 1]

    y_pred_dt = pipeline_dt.predict(X_test)
    y_prob_dt = pipeline_dt.predict_proba(X_test)[:, 1]

    class_labels = ['No (0)', 'Yes (1)']

    # Classification reports
    print("\n--- Laporan Klasifikasi: Logistic Regression ---")
    print(classification_report(y_test, y_pred_lr, target_names=class_labels))

    print("\n--- Laporan Klasifikasi: Decision Tree ---")
    print(classification_report(y_test, y_pred_dt, target_names=class_labels))

    # AUC
    auc_lr = roc_auc_score(y_test, y_prob_lr)
    auc_dt = roc_auc_score(y_test, y_prob_dt)
    print(f"\nAUC Logistic Regression: {auc_lr:.4f}")
    print(f"AUC Decision Tree: {auc_dt:.4f}")

    # Visualisation: confusion matrices.
    # labels=[0, 1] pins the matrix to 2x2 in a fixed order so the tick
    # labels below always line up with the rows/columns.
    cm_lr = confusion_matrix(y_test, y_pred_lr, labels=[0, 1])
    cm_dt = confusion_matrix(y_test, y_pred_dt, labels=[0, 1])

    fig_cm, axes = plt.subplots(1, 2, figsize=(14, 6))
    panels = [
        ('Confusion Matrix: Logistic Regression', cm_lr, 'Blues'),
        ('Confusion Matrix: Decision Tree', cm_dt, 'Greens'),
    ]
    for axis, (title, matrix, cmap) in zip(axes, panels):
        sns.heatmap(matrix, annot=True, fmt='d', cmap=cmap, ax=axis, cbar=False)
        axis.set_title(title, fontsize=14)
        axis.set_xlabel('Predicted')
        axis.set_ylabel('Actual')
        axis.set_xticklabels(class_labels)
        axis.set_yticklabels(class_labels)

    fig_cm.tight_layout()
    fig_cm.savefig('confusion_matrices.png')
    print("\nGambar 'confusion_matrices.png' telah disimpan.")
    # Close (not just clear) the figure so it is released and no empty
    # figure is left registered with pyplot.
    plt.close(fig_cm)

    # Visualisation: ROC curves
    fpr_lr, tpr_lr, _ = roc_curve(y_test, y_prob_lr)
    fpr_dt, tpr_dt, _ = roc_curve(y_test, y_prob_dt)

    fig_roc, ax_roc = plt.subplots(figsize=(10, 7))
    ax_roc.plot(fpr_lr, tpr_lr, label=f'Logistic Regression (AUC = {auc_lr:.3f})', linewidth=2)
    ax_roc.plot(fpr_dt, tpr_dt, label=f'Decision Tree (AUC = {auc_dt:.3f})', linewidth=2)
    ax_roc.plot([0, 1], [0, 1], 'k--', label='Random Guess (AUC = 0.5)')
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title('ROC Curve Comparison')
    ax_roc.legend(loc='lower right')
    ax_roc.grid(True)
    fig_roc.savefig('roc_curves.png')
    print("Gambar 'roc_curves.png' telah disimpan.")
    plt.close(fig_roc)

    print("\n--- Analisis Selesai ---")

if __name__ == "__main__":
    # The dataset path is relative, so 'bank-additional-full.csv' must be
    # present in the current working directory when this runs.
    run_classification_analysis(file_path='bank-additional-full.csv')

--- 1. Data Berhasil Dimuat ---
   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.

<Figure size 1400x600 with 0 Axes>

<Figure size 1000x700 with 0 Axes>