In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Load the cleaned credit-card dataset from the processed-data folder.
# The path uses forward slashes so it resolves on Windows, macOS and Linux;
# a backslash-separated path is only valid on Windows.
pd.options.display.max_columns = 50  # display up to 50 columns when previewing frames

file_path = "../data/processed/credit_card_cleaned.csv"
df = pd.read_csv(file_path)

df.head()

In [None]:
# Choose columns based on business domain and understanding from EDA
# required_cols = ['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_1', 'DEFAULT', 'age_group']
# df = df[required_cols]

In [None]:
# Replace the raw AGE column with a coarse life-stage category:
# - 18-25: Young adults, limited credit history
# - 26-35: Early career, establishing credit
# - 36-50: Established career, stable income
# - 51-65: Peak earning years
# - 65+: Retirement age, fixed income

age_bin_edges = [18, 25, 35, 50, 65, 100]
age_bin_labels = ["18-25", "26-35", "36-50", "51-65", "65+"]

df["age_group_dk"] = pd.cut(
    df["AGE"],
    bins=age_bin_edges,
    labels=age_bin_labels,
    # Bins are right-closed, so without include_lowest the first bin is
    # (18, 25] and a customer aged exactly 18 would silently become NaN.
    include_lowest=True,
)

# Human-readable name for each age band. These strings become the one-hot
# column names downstream, so spelling matters.
mapping = dict(
    zip(
        age_bin_labels,
        [
            "Young adults",
            "Early career",
            "Established career",
            "Peak earning years",
            "Retirement age",
        ],
    )
)

df["age_category"] = df["age_group_dk"].map(mapping)

# Keep only the derived category; the raw age, the pre-existing age_group
# column and the intermediate band label are dropped.
df = df.drop(columns=["AGE", "age_group", "age_group_dk"])

In [None]:
# Encode categoricals, split train/test, rebalance the training set with SMOTE
# and standardise the numeric features.
target_col = "DEFAULT"
categorical_features = ["SEX", "EDUCATION", "MARRIAGE", "age_category"]

# Numeric feature columns to standardise. The columns that are about to be
# one-hot encoded are excluded here: get_dummies replaces them with dummy
# columns, so selecting them by their original names afterwards would raise a
# KeyError if any of them is stored as a number.
numerical_cols = (
    df.drop(columns=[target_col, *categorical_features])
    .select_dtypes(include="number")
    .columns
)

df_encoded = pd.get_dummies(
    df,
    columns=categorical_features,
    drop_first=True,
)

X = df_encoded.drop(target_col, axis=1)
y = df_encoded[target_col]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,  # keep the class distribution consistent across train and test
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")
print(f"Training set default rate: {y_train.mean():.2%}")
print(f"Test set default rate: {y_test.mean():.2%}")

# Oversample the minority class on the training split only; the test split is
# left untouched so evaluation reflects the real class balance.
# NOTE(review): SMOTE runs here on unscaled features and on the one-hot
# columns as if they were continuous - consider scaling first or using
# SMOTENC; confirm against the imbalanced-learn docs.
smote = SMOTE(random_state=123, k_neighbors=5)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("\nAfter SMOTE:")
print(f"Training samples: {len(X_train_smote)}")
print(pd.Series(y_train_smote).value_counts())

# One-hot columns are taken from the feature matrix (not df_encoded) so the
# target column can never end up in the list.
categorical_cols = X.select_dtypes(include="bool").columns

# Fit the scaler on the (resampled) training data only, then apply the same
# transformation to the test data.
scaler = StandardScaler()
X_train_scaled_numerical = pd.DataFrame(
    scaler.fit_transform(X_train_smote[numerical_cols]),
    columns=numerical_cols,
    index=X_train_smote.index,
)
# The test frame must be built from the scaler output for X_test. Building it
# from the scaled training frame would evaluate every model on
# training-derived rows instead of the held-out data.
X_test_scaled_numerical = pd.DataFrame(
    scaler.transform(X_test[numerical_cols]),
    columns=numerical_cols,
    index=X_test.index,
)

In [None]:
def _report_scores(y_true, y_hat):
    """Print accuracy, F1 and the classification report; return (accuracy, f1)."""
    acc = accuracy_score(y_true, y_hat)
    f1_value = f1_score(y_true, y_hat)
    print(f"Accuracy: {acc:.4f}")
    print(f"F1 Score: {f1_value:.4f}")
    print(f"Classification Report:\n{classification_report(y_true, y_hat)}")
    return acc, f1_value


# Reassemble the final design matrices: scaled numeric columns + one-hot columns.
X_train_categorical = X_train_smote[categorical_cols]
X_test_categorical = X_test[categorical_cols]

X_train_final = pd.concat([X_train_scaled_numerical, X_train_categorical], axis=1)
X_test_final = pd.concat([X_test_scaled_numerical, X_test_categorical], axis=1)

# --- SVC on the first 10 principal components ---
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_final)
X_test_pca = pca.transform(X_test_final)

model = SVC(kernel='rbf', C=1.0, gamma='scale')
model.fit(X_train_pca, y_train_smote)

y_pred = model.predict(X_test_pca)
print(classification_report(y_test, y_pred))

# --- SVC with default settings on the full feature set ---
svm = SVC()
svm.fit(X_train_final, y_train_smote)

y_pred = svm.predict(X_test_final)
# y_pred_prob = svm.predict_proba(X_test_final)[:, 1]

accuracy, f1 = _report_scores(y_test, y_pred)

# --- Logistic regression ---
logistic_model = LogisticRegression(random_state=42, max_iter=1000)
logistic_model.fit(X_train_final, y_train_smote)

y_pred = logistic_model.predict(X_test_final)
y_pred_prob = logistic_model.predict_proba(X_test_final)[:, 1]

accuracy, f1 = _report_scores(y_test, y_pred)

# --- Random forest ---
rf = RandomForestClassifier(random_state=42, class_weight='balanced')
rf.fit(X_train_final, y_train_smote)
y_pred = rf.predict(X_test_final)
print(classification_report(y_test, y_pred))

# Probabilities must come from the same preprocessed matrix the forest was
# trained on; scoring the raw, unscaled X_test would give a misleading ROC AUC.
y_pred_prob = rf.predict_proba(X_test_final)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"ROC AUC: {roc_auc:.4f}")

# Confusion matrix for the random forest predictions computed above.
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Random Forest', fontsize=16)
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('Actual', fontsize=12)
plt.show()