In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
data = pd.read_csv('./data/UCI_Credit_Card.csv')

# Preprocessing: normalise every header to lower snake_case.
# Both spaces and dots are mapped to '_' so that the label column resolves to
# 'default_payment_next_month' whether the file spells it
# 'default payment next month' or 'default.payment.next.month'.
# NOTE(review): the Kaggle CSV is believed to use the dotted spelling, which
# the previous space-only replacement would not have matched - confirm
# against the actual file.
data.rename(
    columns=lambda x: x.strip().lower().replace(' ', '_').replace('.', '_'),
    inplace=True,
)

# The feature engineering below expects 'pay_1' ... 'pay_6'. Distributions of
# this dataset commonly name the first repayment-status column 'PAY_0', so it
# is aliased to 'pay_1' when that is the case; otherwise this is a no-op.
if 'pay_0' in data.columns and 'pay_1' not in data.columns:
    data.rename(columns={'pay_0': 'pay_1'}, inplace=True)

# Short alias for the label column
data['default'] = data['default_payment_next_month']

# Feature Engineering: Create new features
bill_columns = [f'bill_amt{month}' for month in range(1, 7)]
pay_columns = [f'pay_{month}' for month in range(1, 7)]

# Mean of the six bill amounts divided by the credit limit.
# NOTE(review): a zero 'limit_bal' would produce inf/NaN here - presumably the
# limit is always positive; verify against the data.
data['credit_utilization'] = data[bill_columns].mean(axis=1) / data['limit_bal']
# Mean of the six repayment-status values
data['avg_payment_delay'] = data[pay_columns].mean(axis=1)

# Drop unnecessary columns: the row id and both copies of the label
features = data.drop(columns=['id', 'default_payment_next_month', 'default'])
target = data['default']

# Split into training and test sets
# 20% of the rows are held out; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# Standardize the features
# The scaler is fitted on the training rows only and then applied to both sets
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Baseline classifier using the library's default settings
logistic_model = LogisticRegression().fit(X_train, y_train)

# Column 1 of predict_proba is kept as the score
# (presumably the default=1 class - assumes the label is coded 0/1)
logistic_proba = logistic_model.predict_proba(X_test)
logistic_preds = logistic_proba[:, 1]

# Random Forest Classifier with Hyperparameter Tuning
from sklearn.ensemble import RandomForestClassifier

# Search space: 3 * 4 * 3 * 3 = 108 parameter combinations
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf_model = RandomForestClassifier(random_state=42)

# 3-fold cross-validated grid search, ranked by accuracy, with n_jobs=-1
grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid_rf,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search_rf.fit(X_train, y_train)

# Score the test set with the estimator the search selected
rf_best_model = grid_search_rf.best_estimator_
rf_best_proba = rf_best_model.predict_proba(X_test)
rf_best_preds = rf_best_proba[:, 1]

# XGBoost Classifier with Hyperparameter Tuning
from xgboost import XGBClassifier

# Search space: 3 values for each of 5 parameters = 243 combinations
param_grid_xgb = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
)

# NOTE(review): 'use_label_encoder' may be ignored or deprecated in newer
# xgboost releases - confirm against the installed version.
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# 3-fold cross-validated grid search, ranked by accuracy, with n_jobs=-1
grid_search_xgb = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search_xgb.fit(X_train, y_train)

# Score the test set with the estimator the search selected
xgb_best_model = grid_search_xgb.best_estimator_
xgb_best_proba = xgb_best_model.predict_proba(X_test)
xgb_best_preds = xgb_best_proba[:, 1]

# Improved Neural Network (Deep Learning)
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

# One input unit per column of the standardized training matrix
input_width = X_train.shape[1]

# Three ReLU hidden layers (256 -> 128 -> 64) with dropout after each,
# batch normalisation after the second, and a single sigmoid output unit
layer_stack = [
    Dense(256, activation='relu', input_shape=(input_width,)),
    Dropout(0.4),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
]
nn_model = Sequential(layer_stack)

nn_model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# 30 epochs in batches of 64; 20% of the training data is used for validation
nn_model.fit(
    X_train,
    y_train,
    epochs=30,
    batch_size=64,
    validation_split=0.2,
    verbose=1,
)

# Collapse the model output to a 1-D array of scores
nn_raw_output = nn_model.predict(X_test)
nn_preds = nn_raw_output.flatten()

# Model Stacking
from sklearn.ensemble import StackingClassifier

# Base learners: the tuned random forest and the tuned XGBoost model
estimators = [('rf', rf_best_model), ('xgb', xgb_best_model)]

# A logistic regression combines the base learners' outputs (cv=5, n_jobs=-1)
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1,
)
stacking_model.fit(X_train, y_train)

# Column 1 of predict_proba is kept as the score
stacking_proba = stacking_model.predict_proba(X_test)
stacking_preds = stacking_proba[:, 1]

# Scores of all five models, in a fixed order:
# logistic regression, random forest, XGBoost, neural network, stacking
model_scores = (logistic_preds, rf_best_preds, xgb_best_preds, nn_preds, stacking_preds)

# Convert probabilities to binary predictions (1 when the score is >= 0.5)
(
    logistic_binary_preds,
    rf_binary_preds,
    xgb_binary_preds,
    nn_binary_preds,
    stacking_binary_preds,
) = [(scores >= 0.5).astype(int) for scores in model_scores]

# Evaluate models: ROC-AUC and Accuracy
# ROC-AUC is computed from the raw scores
(
    logistic_roc_auc,
    rf_roc_auc,
    xgb_roc_auc,
    nn_roc_auc,
    stacking_roc_auc,
) = [roc_auc_score(y_test, scores) for scores in model_scores]

# Accuracy is computed from the thresholded predictions
(
    logistic_acc,
    rf_acc,
    xgb_acc,
    nn_acc,
    stacking_acc,
) = [
    accuracy_score(y_test, labels)
    for labels in (
        logistic_binary_preds,
        rf_binary_preds,
        xgb_binary_preds,
        nn_binary_preds,
        stacking_binary_preds,
    )
]

# Print out results
# One line per model, both metrics formatted to four decimal places
report_lines = (
    f'Logistic Regression - ROC-AUC: {logistic_roc_auc:.4f}, Accuracy: {logistic_acc:.4f}',
    f'Random Forest (Tuned) - ROC-AUC: {rf_roc_auc:.4f}, Accuracy: {rf_acc:.4f}',
    f'XGBoost (Tuned) - ROC-AUC: {xgb_roc_auc:.4f}, Accuracy: {xgb_acc:.4f}',
    f'Neural Network (Improved) - ROC-AUC: {nn_roc_auc:.4f}, Accuracy: {nn_acc:.4f}',
    f'Stacking Model - ROC-AUC: {stacking_roc_auc:.4f}, Accuracy: {stacking_acc:.4f}',
)
for report_line in report_lines:
    print(report_line)

# Labels shared by both comparison charts, in the same order as the score lists
model_names = ['Logistic Regression', 'Random Forest', 'XGBoost', 'Neural Network', 'Stacking Model']


def _plot_score_bars(scores, palette, title, ylabel):
    """Show a 10x6 bar chart with one bar per entry of ``model_names``.

    The y-axis is fixed to the range 0.7-1.0.
    """
    plt.figure(figsize=(10, 6))
    sns.barplot(x=model_names, y=scores, palette=palette)
    plt.title(title)
    plt.ylabel(ylabel)
    plt.ylim(0.7, 1.0)
    plt.show()


# Plotting ROC-AUC comparison
roc_auc_scores = [logistic_roc_auc, rf_roc_auc, xgb_roc_auc, nn_roc_auc, stacking_roc_auc]
_plot_score_bars(
    roc_auc_scores,
    palette="viridis",
    title='ROC-AUC Comparison of Different Models',
    ylabel='ROC-AUC Score',
)

# Plotting Accuracy comparison
acc_scores = [logistic_acc, rf_acc, xgb_acc, nn_acc, stacking_acc]
_plot_score_bars(
    acc_scores,
    palette="magma",
    title='Accuracy Comparison of Different Models',
    ylabel='Accuracy Score',
)
