# Credit card default prediction

Dataset src: <https://www.kaggle.com/datasets/pratjain/credit-card-default>

In [34]:
import pandas as pd
import pprint
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Load the training CSV. skiprows=1 discards the first line of the file, so the
# second line is the one pandas parses as the header row.
df = pd.read_csv(filepath_or_buffer="credit_card_default_TRAIN.csv", skiprows=1)

## Data preparation

In [8]:
# Size of the raw frame: rows are samples, columns are features.
# The column count still includes `id` and the target at this point.

print(f"# of samples: {df.shape[0]}")
print(f"# of features: {df.shape[1]}")

# of samples: 22500
# of features: 25


In [11]:
# Normalise the column labels: lower-case, with spaces turned into underscores.

df.columns = df.columns.map(lambda label: label.lower().replace(" ", "_"))
df.head(2)

Unnamed: 0,id,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default_payment_next_month
0,1,20000,2,2,1,24,2,2,-1,-1,...,0,0,0,0,689,0,0,0,0,1
1,2,120000,2,2,2,26,-1,2,0,0,...,3272,3455,3261,0,1000,1000,1000,0,2000,1


In [13]:
# inspect data types
# every column is stored as int64, including the categorical features,
# which are encoded as numerics
# so the categorical columns cannot be picked out by dtype and
# need to be extracted manually

df.dtypes

id                            int64
limit_bal                     int64
sex                           int64
education                     int64
marriage                      int64
age                           int64
pay_0                         int64
pay_2                         int64
pay_3                         int64
pay_4                         int64
pay_5                         int64
pay_6                         int64
bill_amt1                     int64
bill_amt2                     int64
bill_amt3                     int64
bill_amt4                     int64
bill_amt5                     int64
bill_amt6                     int64
pay_amt1                      int64
pay_amt2                      int64
pay_amt3                      int64
pay_amt4                      int64
pay_amt5                      int64
pay_amt6                      int64
default_payment_next_month    int64
dtype: object

In [18]:
# Number of distinct values per column, used to tell categorical features
# apart from continuous ones. The pay_i columns need a closer look.

feature_map = {name: column.nunique() for name, column in df.items()}
pprint.pprint(feature_map)

{'age': 55,
 'bill_amt1': 17712,
 'bill_amt2': 17398,
 'bill_amt3': 17152,
 'bill_amt4': 16769,
 'bill_amt5': 16359,
 'bill_amt6': 15972,
 'default_payment_next_month': 2,
 'education': 7,
 'id': 22500,
 'limit_bal': 77,
 'marriage': 4,
 'pay_0': 11,
 'pay_2': 11,
 'pay_3': 11,
 'pay_4': 11,
 'pay_5': 10,
 'pay_6': 10,
 'pay_amt1': 6592,
 'pay_amt2': 6476,
 'pay_amt3': 6050,
 'pay_amt4': 5732,
 'pay_amt5': 5674,
 'pay_amt6': 5684,
 'sex': 2}


In [19]:
# pay_i denotes payment status for month i (i=0,2,3,...,6) where i=0 is 09/2005 and i=6 is 04/2005
# (there is no pay_1 column)
# pay_i = -2 means no consumption for the month
# pay_i = -1 means pay duly
# pay_i = 0 means revolving credit
# pay_i = 1 means payment delay for one month
# pay_i = 2 means payment delay for two months
# and so on: pay_i = k (k >= 1) means payment delay for k months
# NOTE(review): the meanings of -2 and 0 are the commonly cited interpretation;
# verify them against the dataset documentation

df["pay_0"].value_counts()

pay_0
 0    11069
-1     4322
 1     2830
 2     2093
-2     1830
 3      243
 4       63
 8       17
 5       17
 6        9
 7        7
Name: count, dtype: int64

In [21]:
# Column groups reused throughout the notebook.

# Integer-coded columns that are treated as categories.
categorical_features = [
    "education",
    "marriage",
    "pay_0",
    "pay_2",
    "pay_3",
    "pay_4",
    "pay_5",
    "pay_6",
    "sex",
]

# Everything else is numerical, apart from the row identifier and the target.
numerical_features = [
    col
    for col in df.columns
    if col not in (*categorical_features, "id", "default_payment_next_month")
]

In [23]:
# Drop the row identifier: it has one distinct value per row, so it carries no
# signal for the model.
# Reassigning with errors="ignore" (rather than an in-place drop) keeps this
# cell idempotent: re-running it does not raise KeyError once "id" is gone.

df = df.drop(columns="id", errors="ignore")

In [24]:
# Missing values per column.

df.isna().sum()  # every count is zero, so nothing needs imputing

limit_bal                     0
sex                           0
education                     0
marriage                      0
age                           0
pay_0                         0
pay_2                         0
pay_3                         0
pay_4                         0
pay_5                         0
pay_6                         0
bill_amt1                     0
bill_amt2                     0
bill_amt3                     0
bill_amt4                     0
bill_amt5                     0
bill_amt6                     0
pay_amt1                      0
pay_amt2                      0
pay_amt3                      0
pay_amt4                      0
pay_amt5                      0
pay_amt6                      0
default_payment_next_month    0
dtype: int64

In [25]:
# Class balance of the target: it is imbalanced (roughly 77% vs 23%).
# Worth considering other methods - ensemble, boosting, forests, bagging, etc.

# some recommendations:
# https://medium.com/@mmalinda/testing-recommendations-for-binary-classification-with-an-imbalanced-target-variable-ff8b120ea8c9

df.default_payment_next_month.value_counts(normalize=True)

default_payment_next_month
0    0.773867
1    0.226133
Name: proportion, dtype: float64

## Validation framework

In [27]:
from sklearn.model_selection import train_test_split

In [30]:
# Hold out 20% as the test set, then carve 25% of the remaining 80% off as the
# validation set, i.e. a 60/20/20 train/val/test split.
# NOTE(review): the split is not stratified although the target is imbalanced;
# consider passing stratify= if class proportions should match across splits.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Give each split a fresh 0..n-1 index (df_full_train keeps its original index).
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [29]:
# Row counts of the train / validation / test splits.
print(*map(len, (df_train, df_val, df_test)))

13500 4500 4500


In [31]:
# Split the target off each partition: pop() returns the column and removes it
# from the frame in the same step, so the feature frames no longer contain it.
y_train = df_train.pop("default_payment_next_month").values
y_val = df_val.pop("default_payment_next_month").values
y_test = df_test.pop("default_payment_next_month").values

## EDA

In [32]:
# Work on an independent copy of the train+validation data for exploration.
# df_full_train still contains the target column.
eda = df_full_train.copy(deep=True)

In [33]:
# Peek at a single row to confirm the columns of the exploration frame.
eda.head(n=1)

Unnamed: 0,limit_bal,sex,education,marriage,age,pay_0,pay_2,pay_3,pay_4,pay_5,...,bill_amt4,bill_amt5,bill_amt6,pay_amt1,pay_amt2,pay_amt3,pay_amt4,pay_amt5,pay_amt6,default_payment_next_month
8586,170000,2,1,1,63,0,0,0,0,0,...,59560,61442,63422,3000,2500,2500,3000,3000,2651,0


In [37]:
# Share of each sex code in the exploration data.
eda["sex"].value_counts(normalize=True)

sex
2    0.6075
1    0.3925
Name: proportion, dtype: float64

In [41]:
# Default rate per sex code: the two groups are close, so sex is not very predictive.

eda.groupby("sex")["default_payment_next_month"].mean()

sex
1    0.245435
2    0.210608
Name: default_payment_next_month, dtype: float64

In [43]:
# Share of each education code in the exploration data.
eda["education"].value_counts(normalize=True)

education
2    0.473500
1    0.353833
3    0.159556
5    0.008056
4    0.003167
6    0.001389
0    0.000500
Name: proportion, dtype: float64

In [44]:
# Default rate per education code: the rates differ between codes, so this
# feature has some predictive value.

eda.groupby("education")["default_payment_next_month"].mean()

education
0    0.000000
1    0.192966
2    0.240174
3    0.259053
4    0.052632
5    0.075862
6    0.120000
Name: default_payment_next_month, dtype: float64

In [46]:
# Rank the categorical features by their mutual information with the target.

from sklearn.metrics import mutual_info_score


def calculate_mi(series):
    """Return the mutual information between a feature column and the target.

    Parameters
    ----------
    series : pandas.Series
        One feature column of ``eda``.

    Returns
    -------
    float
        The score computed by ``mutual_info_score`` for the column against
        ``eda.default_payment_next_month``.
    """
    # Read the target from ``eda``, the same frame the feature columns come
    # from. mutual_info_score pairs values by position, so taking the labels
    # from a different frame would silently misalign them if ``eda`` were ever
    # filtered or reordered.
    return mutual_info_score(series, eda.default_payment_next_month)


mi = eda[categorical_features].apply(calculate_mi)
mi = mi.sort_values(ascending=False)

mi

pay_0        0.076982
pay_2        0.049405
pay_3        0.037636
pay_4        0.032250
pay_5        0.031638
pay_6        0.026163
education    0.003083
marriage     0.000913
sex          0.000825
dtype: float64

In [47]:
# Absolute correlation of every numerical feature with the target, strongest first.

(
    eda[numerical_features]
    .corrwith(eda["default_payment_next_month"])
    .abs()
    .sort_values(ascending=False)
)

limit_bal    0.144338
pay_amt1     0.074535
pay_amt2     0.056004
pay_amt4     0.054297
pay_amt6     0.054103
pay_amt3     0.054061
pay_amt5     0.048757
bill_amt1    0.024862
age          0.018236
bill_amt2    0.017842
bill_amt3    0.016042
bill_amt4    0.007900
bill_amt5    0.003639
bill_amt6    0.000925
dtype: float64