In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from functions import *
from pipeline import ProcessingPipeline
from statsmodels.stats.outliers_influence import variance_inflation_factor

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, RobustScaler


raw

In [2]:
credit_card_balance = pd.read_csv('raw-data/dseb63_credit_card_balance.csv')
credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,...,AMT_TOTAL_RECEIVABLE,CNT_DRAWINGS_ATM_CURRENT,CNT_DRAWINGS_CURRENT,CNT_DRAWINGS_OTHER_CURRENT,CNT_DRAWINGS_POS_CURRENT,CNT_INSTALMENT_MATURE_CUM,NAME_CONTRACT_STATUS,SK_DPD,SK_DPD_DEF,SK_ID_CURR
0,2582071,-1,63975.555,45000,2250.0,2250.0,0.0,0.0,2250.0,2250.0,...,64875.555,1.0,1.0,0.0,0.0,69.0,Active,0.0,0.0,87788.0
1,2582071,-82,16809.21,67500,0.0,0.0,0.0,0.0,3375.0,9000.0,...,16809.21,0.0,0.0,0.0,0.0,18.0,Active,0.0,0.0,87788.0
2,2582071,-84,27577.89,67500,0.0,0.0,0.0,0.0,3375.0,4500.0,...,27577.89,0.0,0.0,0.0,0.0,16.0,Active,0.0,0.0,87788.0
3,2582071,-7,65159.235,45000,0.0,0.0,0.0,0.0,2250.0,2250.0,...,65609.235,0.0,0.0,0.0,0.0,63.0,Active,0.0,0.0,87788.0
4,2582071,-59,70475.85,67500,24750.0,24750.0,0.0,0.0,3375.0,4500.0,...,70475.85,4.0,4.0,0.0,0.0,41.0,Active,0.0,0.0,87788.0


In [3]:
credit_card_balance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2092174 entries, 0 to 2092173
Data columns (total 23 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   SK_ID_PREV                  int64  
 1   MONTHS_BALANCE              int64  
 2   AMT_BALANCE                 float64
 3   AMT_CREDIT_LIMIT_ACTUAL     int64  
 4   AMT_DRAWINGS_ATM_CURRENT    float64
 5   AMT_DRAWINGS_CURRENT        float64
 6   AMT_DRAWINGS_OTHER_CURRENT  float64
 7   AMT_DRAWINGS_POS_CURRENT    float64
 8   AMT_INST_MIN_REGULARITY     float64
 9   AMT_PAYMENT_CURRENT         float64
 10  AMT_PAYMENT_TOTAL_CURRENT   float64
 11  AMT_RECEIVABLE_PRINCIPAL    float64
 12  AMT_RECIVABLE               float64
 13  AMT_TOTAL_RECEIVABLE        float64
 14  CNT_DRAWINGS_ATM_CURRENT    float64
 15  CNT_DRAWINGS_CURRENT        float64
 16  CNT_DRAWINGS_OTHER_CURRENT  float64
 17  CNT_DRAWINGS_POS_CURRENT    float64
 18  CNT_INSTALMENT_MATURE_CUM   float64
 19  NAME_CONTRACT_STATUS 

check null

In [4]:
credit_card_balance.isnull().sum() / len(credit_card_balance)

SK_ID_PREV                    0.000000e+00
MONTHS_BALANCE                0.000000e+00
AMT_BALANCE                   0.000000e+00
AMT_CREDIT_LIMIT_ACTUAL       0.000000e+00
AMT_DRAWINGS_ATM_CURRENT      1.449229e-01
AMT_DRAWINGS_CURRENT          0.000000e+00
AMT_DRAWINGS_OTHER_CURRENT    1.449229e-01
AMT_DRAWINGS_POS_CURRENT      1.449229e-01
AMT_INST_MIN_REGULARITY       8.354133e-02
AMT_PAYMENT_CURRENT           1.422410e-01
AMT_PAYMENT_TOTAL_CURRENT     0.000000e+00
AMT_RECEIVABLE_PRINCIPAL      4.779717e-07
AMT_RECIVABLE                 4.779717e-07
AMT_TOTAL_RECEIVABLE          4.779717e-07
CNT_DRAWINGS_ATM_CURRENT      1.449234e-01
CNT_DRAWINGS_CURRENT          4.779717e-07
CNT_DRAWINGS_OTHER_CURRENT    1.449234e-01
CNT_DRAWINGS_POS_CURRENT      1.449234e-01
CNT_INSTALMENT_MATURE_CUM     8.354181e-02
NAME_CONTRACT_STATUS          4.779717e-07
SK_DPD                        4.779717e-07
SK_DPD_DEF                    4.779717e-07
SK_ID_CURR                    4.779717e-07
dtype: floa

In [5]:
# fill na with mode
credit_card_balance = credit_card_balance.fillna(credit_card_balance.mode().iloc[0])
credit_card_balance.isnull().sum().sum()

0

eda

In [6]:
# # plot correlation matrix
# corr = credit_card_balance.select_dtypes(include=np.number).corr()
# plt.figure(figsize=(20, 20))
# sns.heatmap(corr, annot=True, fmt='.2f')
# plt.show()

In [7]:
# Aggregated features
aggregations = {
    'AMT_BALANCE': ['mean', 'max', 'min', 'sum', 'var'],
    'AMT_CREDIT_LIMIT_ACTUAL': ['mean', 'max', 'min', 'sum', 'var'],
    'AMT_DRAWINGS_ATM_CURRENT': ['mean', 'sum'],
    'AMT_DRAWINGS_CURRENT': ['mean', 'sum'],
    'AMT_PAYMENT_CURRENT': ['mean', 'sum'],
    'CNT_DRAWINGS_ATM_CURRENT': ['mean', 'sum'],
    'CNT_DRAWINGS_CURRENT': ['mean', 'sum'],
    'SK_DPD': ['max', 'mean', 'sum'],
    'SK_DPD_DEF': ['max', 'mean', 'sum']
}

# Apply the aggregation functions to the dataframe
credit_card_agg = credit_card_balance.groupby('SK_ID_CURR').agg(aggregations)
credit_card_agg.columns = pd.Index(['{}_{}'.format(e[0], e[1].upper()) for e in credit_card_agg.columns.tolist()])

# Credit utilization and payment ratios
credit_card_balance['UTILIZATION_RATIO'] = credit_card_balance['AMT_BALANCE'] / credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL']
credit_card_balance['PAYMENT_RATIO'] = credit_card_balance['AMT_PAYMENT_CURRENT'] / credit_card_balance['AMT_INST_MIN_REGULARITY']
credit_card_balance['PAYMENT_TO_BALANCE_RATIO'] = credit_card_balance['AMT_PAYMENT_CURRENT'] / credit_card_balance['AMT_BALANCE']

# Overlimit feature
credit_card_balance['OVERLIMIT'] = credit_card_balance['AMT_BALANCE'] > credit_card_balance['AMT_CREDIT_LIMIT_ACTUAL']

# Rolling behavioral features (assuming MONTHS_BALANCE is in ascending order)
credit_card_balance.sort_values(by=['SK_ID_CURR', 'MONTHS_BALANCE'], inplace=True)
credit_card_balance['AMT_BALANCE_ROLLING_MEAN'] = credit_card_balance.groupby('SK_ID_CURR')['AMT_BALANCE'].transform(lambda x: x.rolling(window=3).mean())
credit_card_balance['AMT_PAYMENT_ROLLING_SUM'] = credit_card_balance.groupby('SK_ID_CURR')['AMT_PAYMENT_CURRENT'].transform(lambda x: x.rolling(window=3).sum())

# Merge aggregated data with the original dataframe (if needed)
credit_card_balance = credit_card_balance.merge(credit_card_agg, on='SK_ID_CURR', how='left')

# Ensure to handle missing values appropriately, which may be created by the rolling function or other operations
credit_card_balance.fillna(0, inplace=True)

# At this point, the credit_card_balance dataframe has new engineered features which can be used for modeling
credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,...,CNT_DRAWINGS_ATM_CURRENT_MEAN,CNT_DRAWINGS_ATM_CURRENT_SUM,CNT_DRAWINGS_CURRENT_MEAN,CNT_DRAWINGS_CURRENT_SUM,SK_DPD_MAX,SK_DPD_MEAN,SK_DPD_SUM,SK_DPD_DEF_MAX,SK_DPD_DEF_MEAN,SK_DPD_DEF_SUM
0,1989688,-11,4711.5,900000,68400.0,68400.0,0.0,0.0,0.0,73422.0,...,0.272727,3.0,0.272727,3.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1989688,-10,0.0,900000,0.0,0.0,0.0,0.0,2250.0,5022.0,...,0.272727,3.0,0.272727,3.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1989688,-9,0.0,0,0.0,0.0,0.0,0.0,0.0,0.135,...,0.272727,3.0,0.272727,3.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1989688,-8,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.272727,3.0,0.272727,3.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1989688,-7,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.272727,3.0,0.272727,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# encode categorical features
credit_card_balance = pd.get_dummies(credit_card_balance, drop_first=True)
credit_card_balance.head()

Unnamed: 0,SK_ID_PREV,MONTHS_BALANCE,AMT_BALANCE,AMT_CREDIT_LIMIT_ACTUAL,AMT_DRAWINGS_ATM_CURRENT,AMT_DRAWINGS_CURRENT,AMT_DRAWINGS_OTHER_CURRENT,AMT_DRAWINGS_POS_CURRENT,AMT_INST_MIN_REGULARITY,AMT_PAYMENT_CURRENT,...,SK_DPD_SUM,SK_DPD_DEF_MAX,SK_DPD_DEF_MEAN,SK_DPD_DEF_SUM,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Refused,NAME_CONTRACT_STATUS_Sent proposal,NAME_CONTRACT_STATUS_Signed
0,1989688,-11,4711.5,900000,68400.0,68400.0,0.0,0.0,0.0,73422.0,...,0.0,0.0,0.0,0.0,False,False,False,False,False,False
1,1989688,-10,0.0,900000,0.0,0.0,0.0,0.0,2250.0,5022.0,...,0.0,0.0,0.0,0.0,False,False,False,False,False,False
2,1989688,-9,0.0,0,0.0,0.0,0.0,0.0,0.0,0.135,...,0.0,0.0,0.0,0.0,False,True,False,False,False,False
3,1989688,-8,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,False,True,False,False,False,False
4,1989688,-7,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,False,True,False,False,False,False


In [9]:
app_train = pd.read_csv('processed-data/app_train.csv')
app_train.set_index('SK_ID_CURR', inplace=True)
app_train.head()

Unnamed: 0_level_0,TARGET,CNT_CHILDREN,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,NAME_HOUSING_TYPE_Municipal apartment,NAME_HOUSING_TYPE_Other,NAME_HOUSING_TYPE_With parents,OCCUPATION_TYPE_Core staff,OCCUPATION_TYPE_Laborers,OCCUPATION_TYPE_Other,OCCUPATION_TYPE_Sales staff,ORGANIZATION_TYPE_Business Entity Type 3,ORGANIZATION_TYPE_Other,ORGANIZATION_TYPE_Self-employed
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1197000.0,44487.0,1197000.0,0.026392,-11945,-376,-574.0,-580,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,1,0,900000.0,26316.0,900000.0,0.003122,-19158,-9203,-12984.0,-2568,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0,1,265851.0,11263.5,229500.0,0.031329,-14434,-3759,-4976.0,-3989,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0,2,545040.0,20547.0,450000.0,0.004849,-15957,-6018,-10110.0,-5219,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0,0,512064.0,25033.5,360000.0,0.018801,-17851,-495,-43.0,-181,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [10]:
# merge train data with installments
app_train = app_train.merge(credit_card_balance, left_index=True, right_index=True, how='left')
app_train.head()


Unnamed: 0_level_0,TARGET,CNT_CHILDREN,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,...,SK_DPD_SUM,SK_DPD_DEF_MAX,SK_DPD_DEF_MEAN,SK_DPD_DEF_SUM,NAME_CONTRACT_STATUS_Approved,NAME_CONTRACT_STATUS_Completed,NAME_CONTRACT_STATUS_Demand,NAME_CONTRACT_STATUS_Refused,NAME_CONTRACT_STATUS_Sent proposal,NAME_CONTRACT_STATUS_Signed
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,1197000.0,44487.0,1197000.0,0.026392,-11945,-376,-574.0,-580,...,0.0,0.0,0.0,0.0,False,False,False,False,False,False
1,1,0,900000.0,26316.0,900000.0,0.003122,-19158,-9203,-12984.0,-2568,...,0.0,0.0,0.0,0.0,False,False,False,False,False,False
2,0,1,265851.0,11263.5,229500.0,0.031329,-14434,-3759,-4976.0,-3989,...,0.0,0.0,0.0,0.0,False,True,False,False,False,False
3,0,2,545040.0,20547.0,450000.0,0.004849,-15957,-6018,-10110.0,-5219,...,0.0,0.0,0.0,0.0,False,True,False,False,False,False
4,0,0,512064.0,25033.5,360000.0,0.018801,-17851,-495,-43.0,-181,...,0.0,0.0,0.0,0.0,False,True,False,False,False,False


In [11]:
# check inf values
app_train.replace([np.inf, -np.inf], np.nan, inplace=True)
app_train.isnull().sum().sum()

# fill na with 0
for col in app_train.columns:
    app_train[col].fillna(0, inplace=True)
app_train.isnull().sum().sum()

0

In [17]:
X, y = app_train.drop('TARGET', axis=1), app_train['TARGET']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scale data
scaler = RobustScaler(quantile_range=(10, 90))
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# standardize data
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
logreg = LogisticRegression(class_weight='balanced', solver='newton-cholesky')
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)
y_pred_proba = logreg.predict_proba(X_test)[:, 1]

gini(y_test, y_pred_proba)

0.47018312649552474

: 