In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import classification_report, recall_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load the raw table; the 'DEFAULT' column is the classification target.
data = pd.read_csv('data/data.csv')
y = np.array(data['DEFAULT'])

# Keep the target out of the feature frame so it can never leak into a
# transformer or model that consumes columns it was not explicitly given.
features = data.drop(columns=['DEFAULT'])

# Stratified 70/30 hold-out split; the fixed seed keeps it reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, y, stratify=y, test_size=0.3, random_state=42
)

In [2]:
# Numeric columns handed to the StandardScaler branch of `preprocessor`.
# The order is significant only in that it fixes the column order of the
# scaled matrix; it is kept exactly as originally selected.
# NOTE(review): the naming suggests T_* are 12-month totals and R_* are
# ratios between two quantities -- confirm against the data dictionary.
selected_numeric_features = [
    # Income, savings, debt
    'INCOME', 'SAVINGS', 'DEBT',
    'R_SAVINGS_INCOME', 'R_DEBT_INCOME', 'R_DEBT_SAVINGS',
    # Clothing
    'T_CLOTHING_12',
    'R_CLOTHING_INCOME', 'R_CLOTHING_SAVINGS', 'R_CLOTHING_DEBT',
    # Education
    'T_EDUCATION_12',
    'R_EDUCATION_INCOME', 'R_EDUCATION_SAVINGS', 'R_EDUCATION_DEBT',
    # Entertainment
    'T_ENTERTAINMENT_12',
    'R_ENTERTAINMENT_INCOME', 'R_ENTERTAINMENT_SAVINGS', 'R_ENTERTAINMENT_DEBT',
    # Fines
    'T_FINES_12',
    'R_FINES_INCOME', 'R_FINES_SAVINGS', 'R_FINES_DEBT',
    # Gambling
    'T_GAMBLING_12',
    'R_GAMBLING_INCOME', 'R_GAMBLING_SAVINGS', 'R_GAMBLING_DEBT',
    # Groceries
    'T_GROCERIES_12',
    'R_GROCERIES_INCOME', 'R_GROCERIES_SAVINGS', 'R_GROCERIES_DEBT',
    # Health
    'T_HEALTH_12',
    'R_HEALTH_INCOME', 'R_HEALTH_SAVINGS', 'R_HEALTH_DEBT',
    # Housing
    'T_HOUSING_12',
    'R_HOUSING_INCOME', 'R_HOUSING_SAVINGS', 'R_HOUSING_DEBT',
    # Tax
    'T_TAX_12',
    'R_TAX_INCOME', 'R_TAX_SAVINGS', 'R_TAX_DEBT',
    # Travel
    'T_TRAVEL_12',
    'R_TRAVEL_INCOME', 'R_TRAVEL_SAVINGS', 'R_TRAVEL_DEBT',
    # Utilities and overall expenditure
    'T_UTILITIES_12', 'T_EXPENDITURE_12',
    'R_UTILITIES_INCOME', 'R_UTILITIES_SAVINGS', 'R_UTILITIES_DEBT',
    'R_EXPENDITURE_INCOME', 'R_EXPENDITURE_SAVINGS', 'R_EXPENDITURE_DEBT',
]

class OutliersReplacer(BaseEstimator, TransformerMixin):
    """Cap extreme values of selected columns at fixed upper bounds.

    Values above a column's bound are replaced by the bound itself; values at
    or below it (and missing values) are left as they are. Columns without a
    configured bound pass through unchanged.

    Parameters
    ----------
    caps : dict or None, default=None
        Mapping of column name -> upper bound. ``None`` selects
        ``DEFAULT_CAPS``, the thresholds this notebook originally hard-coded.

    Notes
    -----
    ``transform`` works on a copy, so the DataFrame passed in (for example
    ``X_train`` or a cross-validation slice of it) is never modified.
    """

    # Upper bounds applied when no explicit ``caps`` mapping is supplied.
    DEFAULT_CAPS = {
        'SAVINGS': 2500000,
        'DEBT': 4000000,
        'T_CLOTHING_12': 32000,
        'T_HEALTH_12': 25000,
        'T_TRAVEL_12': 150000,
    }

    def __init__(self, caps=None):
        # Stored unmodified so BaseEstimator.get_params() / clone() work.
        self.caps = caps

    def fit(self, X, y=None):
        """Do nothing and return ``self``: the bounds are constants, not learned."""
        return self

    def transform(self, X):
        """Return a copy of DataFrame ``X`` with the configured columns capped."""
        caps = self.DEFAULT_CAPS if self.caps is None else self.caps
        capped = X.copy()
        for column, upper in caps.items():
            # Bounds for columns that are absent from X are simply skipped.
            if column in capped.columns:
                capped[column] = capped[column].clip(upper=upper)
        return capped
    
# Column-wise preprocessing: standardise the selected numeric features and
# one-hot encode the gambling category (categories unseen during fit are
# ignored at transform time instead of raising).
numeric_branch = ('num', StandardScaler(), selected_numeric_features)
categorical_branch = ('cat', OneHotEncoder(handle_unknown='ignore'), ['CAT_GAMBLING'])

preprocessor = ColumnTransformer(transformers=[numeric_branch, categorical_branch])

In [3]:
# Pipeline labelled "best accuracy".
# NOTE(review): the hyper-parameters presumably come from an earlier search
# that optimised accuracy -- confirm where they were obtained.
accuracy_forest = RandomForestClassifier(
    n_estimators=200,
    max_depth=6,
    max_features='sqrt',
    min_samples_split=34,
    min_samples_leaf=12,
    bootstrap=False,
    random_state=0,
)

# Cap outliers -> scale / encode -> reduce to 15 principal components -> classify.
clf_acc = Pipeline(steps=[
    ('outliers_replacer', OutliersReplacer()),
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=15)),
    ('classifier', accuracy_forest),
])

In [4]:
# Fit on the training split, then score the held-out test split.
clf_acc.fit(X_train, y_train)
predictions = clf_acc.predict(X_test)

test_report = classification_report(y_test, predictions)
test_accuracy = accuracy_score(y_test, predictions)

print(test_report)
print("Accuracy:", test_accuracy)

              precision    recall  f1-score   support

           0       0.75      0.97      0.84       172
           1       0.69      0.16      0.26        68

    accuracy                           0.74       240
   macro avg       0.72      0.57      0.55       240
weighted avg       0.73      0.74      0.68       240

Accuracy: 0.7416666666666667


In [5]:
def recall_for_class_0(y_true, y_pred):
    """Recall computed with label 0 treated as the positive class."""
    positive_label = 0
    score = recall_score(y_true, y_pred, pos_label=positive_label)
    return score

def recall_for_class_1(y_true, y_pred):
    """Recall computed with label 1 treated as the positive class."""
    positive_label = 1
    score = recall_score(y_true, y_pred, pos_label=positive_label)
    return score

# Repeated stratified 5-fold CV on the training split; the number of repeats
# is left at the scikit-learn default and the seed fixes the fold assignment.
cv = RepeatedStratifiedKFold(n_splits=5, random_state=42)

# Score all three metrics in ONE pass: every fold is fitted once and then
# evaluated with each scorer, instead of refitting the whole pipeline once per
# metric. All metrics now share the same n_jobs / error_score settings.
cv_results = cross_validate(
    clf_acc, X_train, y_train,
    cv=cv,
    scoring={
        'accuracy': 'accuracy',
        'recall_0': make_scorer(recall_for_class_0),
        'recall_1': make_scorer(recall_for_class_1),
    },
    n_jobs=-1,
    error_score='raise',
)

# Per-fold score arrays. The names are kept for any later cells that use them;
# note this is plain repeated CV, not nested CV.
cv_scores = cv_results['test_accuracy']
nested_recall_0 = cv_results['test_recall_0']
nested_recall_1 = cv_results['test_recall_1']

print("Mean cross-validation score:", np.mean(cv_scores))
print("Mean recall for class 0:", np.mean(nested_recall_0))
print("Mean recall for class 1:", np.mean(nested_recall_1))

Mean cross-validation score: 0.7344642857142857
Mean recall for class 0: 0.9750648148148148
Mean recall for class 1: 0.12768145161290323
