In [72]:
import pandas as pd

df = pd.read_csv('data\credit_customers.csv')

print(df.head())
print(df.info())
print(df.describe())

  checking_status  duration                  credit_history  \
0              <0       6.0  critical/other existing credit   
1        0<=X<200      48.0                   existing paid   
2     no checking      12.0  critical/other existing credit   
3              <0      42.0                   existing paid   
4              <0      24.0              delayed previously   

               purpose  credit_amount    savings_status employment  \
0             radio/tv         1169.0  no known savings        >=7   
1             radio/tv         5951.0              <100     1<=X<4   
2            education         2096.0              <100     4<=X<7   
3  furniture/equipment         7882.0              <100     4<=X<7   
4              new car         4870.0              <100     1<=X<4   

   installment_commitment     personal_status other_parties  ...  \
0                     4.0         male single          none  ...   
1                     2.0  female div/dep/mar          none  ...

In [73]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler


categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
categorical_cols.remove('class')  

le = LabelEncoder()
binary_cols = [col for col in categorical_cols if df[col].nunique() == 2]

for col in binary_cols:
    df[col] = le.fit_transform(df[col])


In [74]:
multi_cat_cols = [col for col in categorical_cols if df[col].nunique() > 2]
df = pd.get_dummies(df, columns=multi_cat_cols, drop_first=True)
df['class'] = df['class'].map({'good': 1, 'bad': 0})

In [75]:

scaler = StandardScaler()
num_cols = df.select_dtypes(include=['float64', 'int']).columns.tolist()
num_cols.remove('class')  # exclude the target
df[num_cols] = scaler.fit_transform(df[num_cols])


In [76]:

print(df.head())
print(df.shape)

   duration  credit_amount  installment_commitment  residence_since       age  \
0 -1.236478      -0.745131                0.918477         1.046987  2.766456   
1  2.248194       0.949817               -0.870183        -0.765977 -1.191404   
2 -0.738668      -0.416562               -0.870183         0.140505  1.183312   
3  1.750384       1.634247               -0.870183         1.046987  0.831502   
4  0.256953       0.566664                0.024147         1.046987  1.535122   

   existing_credits  num_dependents  own_telephone  foreign_worker  class  \
0          1.027079       -0.428290       1.214598        0.196014      1   
1         -0.704926       -0.428290      -0.823318        0.196014      0   
2         -0.704926        2.334869      -0.823318        0.196014      1   
3         -0.704926        2.334869      -0.823318        0.196014      1   
4          1.027079        2.334869      -0.823318        0.196014      0   

   ...  property_magnitude_life insurance  \
0  ..

In [77]:
from sklearn.model_selection import train_test_split
# Split the data
X = df.drop('class', axis=1)
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [79]:
from xgboost import XGBClassifier

from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix



# Train XGBoost model
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train.values, y_train.values)


# Predictions
y_pred = xgb_model.predict(X_test)
y_proba = xgb_model.predict_proba(X_test)[:, 1]

# Evaluation
roc_auc = roc_auc_score(y_test, y_proba)
print("ROC AUC Score:", roc_auc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

ROC AUC Score: 0.7398809523809524

Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.47      0.50        60
           1       0.79      0.84      0.81       140

    accuracy                           0.72       200
   macro avg       0.67      0.65      0.66       200
weighted avg       0.71      0.72      0.72       200


Confusion Matrix:
 [[ 28  32]
 [ 23 117]]


In [108]:
# import pandas as pd

# df_fe = df.copy()
# df_fe['age_group'] = pd.cut(df_fe['age'], bins=[18, 25, 35, 50, 65, 100], labels=['18-25', '26-35', '36-50', '51-65', '66+'])
# df_fe['credit_per_age'] = df_fe['credit_amount'] / (df_fe['age'] + 1)
# df_fe['credit_per_month'] = df_fe['credit_amount'] / (df_fe['duration'] + 1)
# df_fe['savings_employment'] = df_fe['savings_status'].astype(str) + "_" + df_fe['employment'].astype(str)
# df_fe['young_high_credit'] = ((df_fe['age'] < 30) & (df_fe['credit_amount'] > df_fe['credit_amount'].median())).astype(int)

In [83]:
df.columns

Index(['duration', 'credit_amount', 'installment_commitment',
       'residence_since', 'age', 'existing_credits', 'num_dependents',
       'own_telephone', 'foreign_worker', 'class', 'checking_status_<0',
       'checking_status_>=200', 'checking_status_no checking',
       'credit_history_critical/other existing credit',
       'credit_history_delayed previously', 'credit_history_existing paid',
       'credit_history_no credits/all paid', 'purpose_domestic appliance',
       'purpose_education', 'purpose_furniture/equipment', 'purpose_new car',
       'purpose_other', 'purpose_radio/tv', 'purpose_repairs',
       'purpose_retraining', 'purpose_used car', 'savings_status_500<=X<1000',
       'savings_status_<100', 'savings_status_>=1000',
       'savings_status_no known savings', 'employment_4<=X<7', 'employment_<1',
       'employment_>=7', 'employment_unemployed',
       'personal_status_male div/sep', 'personal_status_male mar/wid',
       'personal_status_male single', 'other_par

In [None]:
df['age_group'] = pd.cut(df['age'], bins=[18, 30, 40, 50, 60, 100], labels=['18-30', '31-40', '41-50', '51-60', '61+'])


In [85]:
df['credit_per_age'] = df['credit_amount'] / (df['age'] + 1)


In [86]:
df['credit_per_month'] = df['credit_amount'] / (df['duration'] + 1)


In [87]:
df['checking_credit_history'] = df['checking_status_<0'].astype(str) + '_' + df['credit_history_critical/other existing credit'].astype(str)


In [88]:
df['young_high_credit'] = ((df['age'] < 30) & (df['credit_amount'] > df['credit_amount'].median())).astype(int)


In [90]:
df['employment_personal_status'] = df['employment_unemployed'].astype(str) + '_' + df['personal_status_male single'].astype(str)


In [None]:

categorical_cols = ['checking_status_<0', 'checking_status_>=200', 'checking_status_no checking',
                    'credit_history_critical/other existing credit', 'credit_history_delayed previously',
                    'credit_history_existing paid', 'credit_history_no credits/all paid', 'purpose_domestic appliance',
                    'purpose_education', 'purpose_furniture/equipment', 'purpose_new car', 'purpose_other',
                    'purpose_radio/tv', 'purpose_repairs', 'purpose_retraining', 'purpose_used car',
                    'savings_status_500<=X<1000', 'savings_status_<100', 'savings_status_>=1000', 
                    'savings_status_no known savings', 'employment_4<=X<7', 'employment_<1', 'employment_>=7', 
                    'employment_unemployed', 'personal_status_male div/sep', 'personal_status_male mar/wid',
                    'personal_status_male single', 'other_parties_guarantor', 'other_parties_none', 
                    'property_magnitude_life insurance', 'property_magnitude_no known property', 
                    'property_magnitude_real estate', 'other_payment_plans_none', 'other_payment_plans_stores', 
                    'housing_own', 'housing_rent', 'job_skilled', 'job_unemp/unskilled non res', 
                    'job_unskilled resident', 'age_group', 'employment_personal_status']

df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


In [None]:
from sklearn.preprocessing import StandardScaler

numeric_cols = ['duration', 'credit_amount', 'installment_commitment', 'residence_since', 'age', 
                'existing_credits', 'num_dependents', 'own_telephone', 'foreign_worker']

scaler = StandardScaler()
df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])


In [None]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model = XGBClassifier(use_label_encoder=False)
model.fit(X_train.values, y_train.values)


y_pred = model.predict(X_test)


roc_auc = roc_auc_score(y_test, y_pred)
print(f"ROC AUC Score: {roc_auc}")


print("Classification Report:")
print(classification_report(y_test, y_pred))


print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


ROC AUC Score: 0.6974996994831109
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.51      0.57        59
           1       0.81      0.89      0.85       141

    accuracy                           0.78       200
   macro avg       0.73      0.70      0.71       200
weighted avg       0.76      0.78      0.77       200

Confusion Matrix:
[[ 30  29]
 [ 16 125]]


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier


param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
    'subsample': [0.8, 0.9, 1],
    'colsample_bytree': [0.8, 0.9, 1],
}


grid_search = GridSearchCV(estimator=XGBClassifier(use_label_encoder=False),
                           param_grid=param_grid,
                           cv=3,
                           n_jobs=-1,
                           verbose=1,
                           scoring='roc_auc')

grid_search.fit(X_train_res.values, y_train_res.values)

print(f"Best Parameters: {grid_search.best_params_}")

Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100, 'subsample': 0.9}


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [None]:

best_model = XGBClassifier(
    colsample_bytree=0.8,
    learning_rate=0.2,
    max_depth=7,
    n_estimators=100,
    subsample=0.9,
    
)


best_model.fit(X_train_res.values, y_train_res.values)


y_pred = best_model.predict(X_test)


roc_auc = roc_auc_score(y_test, y_pred)
print(f"ROC AUC Score: {roc_auc}")


print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


ROC AUC Score: 0.7243058059862963
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.58      0.61        59
           1       0.83      0.87      0.85       141

    accuracy                           0.79       200
   macro avg       0.74      0.72      0.73       200
weighted avg       0.78      0.79      0.78       200

Confusion Matrix:
[[ 34  25]
 [ 18 123]]


In [111]:
from imblearn.over_sampling import SMOTE


smote = SMOTE(sampling_strategy='auto', random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

In [112]:


model = XGBClassifier(colsample_bytree=0.8,
    learning_rate=0.2,
    max_depth=7,
    n_estimators=100,
    subsample=0.9,
    )
model.fit(X_res.values, y_res.values)

y_pred_res = model.predict(X_test)


roc_auc_res = roc_auc_score(y_test, y_pred_res)
print(f"ROC AUC Score: {roc_auc_res}")

print("Classification Report ")
print(classification_report(y_test, y_pred_res))

print("Confusion Matrix ")
print(confusion_matrix(y_test, y_pred_res))


ROC AUC Score: 0.7243058059862963
Classification Report 
              precision    recall  f1-score   support

           0       0.65      0.58      0.61        59
           1       0.83      0.87      0.85       141

    accuracy                           0.79       200
   macro avg       0.74      0.72      0.73       200
weighted avg       0.78      0.79      0.78       200

Confusion Matrix 
[[ 34  25]
 [ 18 123]]


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report

log_reg_model = LogisticRegression(random_state=42)


log_reg_model.fit(X_res.values, y_res.values)


y_pred = log_reg_model.predict(X_test)


roc_auc = roc_auc_score(y_test, y_pred)
print(f'ROC-AUC: {roc_auc}')


print(classification_report(y_test, y_pred))

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values