In [96]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import roc_auc_score 
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import Pool, CatBoostClassifier


import optuna
import gc 
import shap 

import warnings
warnings.filterwarnings('ignore')

In [97]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
original = pd.read_csv('original_loan.csv')
sample_sub = pd.read_csv('sample_submission.csv')

In [98]:
train_df = pd.concat([train_df, original], axis=0).reset_index(drop=True)

In [99]:
train_df.drop(['id'], axis=1, inplace=True)
test_df.drop(['id'], axis=1, inplace=True)

In [100]:
train_df.duplicated().sum()

165

In [101]:
train_df.drop_duplicates(inplace=True)

In [102]:
train_df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [103]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91061 entries, 0 to 91225
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  91061 non-null  int64  
 1   person_income               91061 non-null  int64  
 2   person_home_ownership       91061 non-null  object 
 3   person_emp_length           90174 non-null  float64
 4   loan_intent                 91061 non-null  object 
 5   loan_grade                  91061 non-null  object 
 6   loan_amnt                   91061 non-null  int64  
 7   loan_int_rate               87966 non-null  float64
 8   loan_percent_income         91061 non-null  float64
 9   cb_person_default_on_file   91061 non-null  object 
 10  cb_person_cred_hist_length  91061 non-null  int64  
 11  loan_status                 91061 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 9.0+ MB


In [104]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39098 entries, 0 to 39097
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  39098 non-null  int64  
 1   person_income               39098 non-null  int64  
 2   person_home_ownership       39098 non-null  object 
 3   person_emp_length           39098 non-null  float64
 4   loan_intent                 39098 non-null  object 
 5   loan_grade                  39098 non-null  object 
 6   loan_amnt                   39098 non-null  int64  
 7   loan_int_rate               39098 non-null  float64
 8   loan_percent_income         39098 non-null  float64
 9   cb_person_default_on_file   39098 non-null  object 
 10  cb_person_cred_hist_length  39098 non-null  int64  
dtypes: float64(3), int64(4), object(4)
memory usage: 3.3+ MB


In [105]:
imputer = KNNImputer(n_neighbors=5)

train_df['loan_int_rate'] = imputer.fit_transform(train_df[['loan_int_rate']])
train_df['person_emp_length'] = imputer.fit_transform(train_df[['person_emp_length']])

In [106]:
categorical_columns = train_df.select_dtypes(include=['object']).columns
unique_values = {col: train_df[col].nunique() for col in categorical_columns}
for col, unique_count in unique_values.items():
    print(f"{col}: {unique_count} unique values")
    
gc.collect()

person_home_ownership: 4 unique values
loan_intent: 6 unique values
loan_grade: 7 unique values
cb_person_default_on_file: 2 unique values


1291

In [107]:
categorical_columns = test_df.select_dtypes(include=['object']).columns
unique_values = {col: test_df[col].nunique() for col in categorical_columns}
for col, unique_count in unique_values.items():
    print(f"{col}: {unique_count} unique values")
    
gc.collect()

person_home_ownership: 4 unique values
loan_intent: 6 unique values
loan_grade: 7 unique values
cb_person_default_on_file: 2 unique values


0

In [108]:
train_df.describe()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,loan_status
count,91061.0,91061.0,91061.0,91061.0,91061.0,91061.0,91061.0,91061.0
mean,27.620683,64774.32,4.732306,9351.508352,10.791001,0.163158,5.812752,0.169546
std,6.150047,47922.86,4.006237,5848.017127,3.056025,0.097486,4.03982,0.375235
min,20.0,4000.0,0.0,500.0,5.42,0.0,2.0,0.0
25%,23.0,40000.0,2.0,5000.0,7.88,0.09,3.0,0.0
50%,26.0,57000.0,4.0,8000.0,10.791001,0.14,4.0,0.0
75%,30.0,78000.0,7.0,12000.0,13.04,0.22,8.0,0.0
max,144.0,6000000.0,123.0,35000.0,23.22,0.83,30.0,1.0


In [109]:
loan_grade_mapping = {
    'A': 0,
    'B': 1,
    'C': 2,
    'D': 3,
    'E': 4,
    'F': 5,
    'G': 6
}
cb_person_default_on_file_mapping = {
    'Y': 1,
    'N': 0
}
person_home_ownership_mapping = {
    'RENT': 0, 
    'MORTGAGE': 1, 
    'OWN': 2,
    'OTHER': 3

}
def credit_instability_indicator(cred_hist_length, default_on_file):
    if default_on_file == 'Y' and cred_hist_length >= 5:
        return 'High Risk'
    elif default_on_file == 'Y' and cred_hist_length < 5:
        return 'Medium Risk'
    elif default_on_file == 'N' and cred_hist_length >= 5:
        return 'Low Risk'
    else:
        return 'Very Low Risk'

credit_instability_indicator_mapping = {
    'Very Low Risk': 0,
    'Low Risk': 1,
    'Medium Risk': 2,
    'High Risk': 3
}

train_df['credit_instability_indicator'] = train_df.apply(lambda row: credit_instability_indicator(row['cb_person_cred_hist_length'], row['cb_person_default_on_file']), axis=1)
train_df['credit_instability_indicator'] = train_df['credit_instability_indicator'].map(credit_instability_indicator_mapping)
train_df['loan_grade'] = train_df['loan_grade'].map(loan_grade_mapping)
train_df['cb_person_default_on_file'] = train_df['cb_person_default_on_file'].map(cb_person_default_on_file_mapping)
train_df['person_home_ownership'] = train_df['person_home_ownership'].map(person_home_ownership_mapping)

test_df['credit_instability_indicator'] = test_df.apply(lambda row: credit_instability_indicator(row['cb_person_cred_hist_length'], row['cb_person_default_on_file']), axis=1)
test_df['credit_instability_indicator'] = test_df['credit_instability_indicator'].map(credit_instability_indicator_mapping)
test_df['loan_grade'] = test_df['loan_grade'].map(loan_grade_mapping)
test_df['cb_person_default_on_file'] = test_df['cb_person_default_on_file'].map(cb_person_default_on_file_mapping)
test_df['person_home_ownership'] = test_df['person_home_ownership'].map(person_home_ownership_mapping)


In [110]:
train_df = pd.get_dummies(train_df, columns=['loan_intent'], drop_first=True, dtype=np.uint8)
test_df = pd.get_dummies(test_df, columns=['loan_intent'], drop_first=True, dtype=np.uint8) 

In [111]:
train_df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,credit_instability_indicator,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
0,37,35000,0,0.0,1,6000,11.49,0.17,0,14,0,1,1,0,0,0,0
1,22,56000,2,6.0,2,4000,13.35,0.07,0,2,0,0,0,0,1,0,0
2,29,28800,2,8.0,0,6000,8.9,0.21,0,10,0,1,0,0,0,1,0
3,30,70000,0,14.0,1,12000,11.11,0.17,0,5,0,1,0,0,0,0,1
4,22,60000,0,2.0,0,6000,6.92,0.1,0,3,0,0,0,0,1,0,0


In [112]:
y = train_df['loan_status'] 
train_df = train_df.drop(['loan_status'],axis=1)
X = train_df

In [113]:
scaler = StandardScaler()
scaled_train_data = train_df
scaled_test_data  = test_df

In [116]:
cat_list = X.columns.values

xgb_params= {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 7,
    'eta': 0.07964177396162775,
    'reg_lambda': 38.499443612904315,
    'subsample': 0.8778759317150353,
    'colsample_bytree': 0.6504220261795185,
    'random_state': 42,
    'verbosity':0,
    'eneable_categorical': True,
    'min_child_weight': 5,
    #'tree_method': 'hist',
}

# LGBM params
lgb_params= {
    'objective': 'binary',
    'metric': 'auc',
    'max_depth': -1,
    'verbosity': -1,
    'n_estimators': 1500,
    'max_bin': 1024,
    'boosting_type': 'gbdt', #'dart'
    'colsample_bytree': 0.5673775386473462,        
    'eta': 0.05446876730023387,
    'reg_lambda': 10.787843597294561,
    'min_child_samples': 69,
    'random_state': 42,
    'early_stopping_rounds': 150,
    'verbose':1,
    #'categorical_feature': cat_indices,
}

cat_params={
    'iterations': 1500,
    'depth': 6,
    'eta': 0.28901888228959255, 
    'reg_lambda': 41.0642500499563, 
    #'colsample_bylevel': 0.6,
    #'subsample': 0.8,
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'cat_features': cat_list, 
    'random_state': 42,
    'min_data_in_leaf': 51,
    'early_stopping_rounds': 150,
    'verbose':200,
    #'random_strength': 1.5,
    #'bootstrap_type': 'Bernoulli',
}

In [117]:
'''xgb_params = {
    'n_estimators': 900,
    'max_depth': 10,
    'learning_rate': 0.03,
    'random_state': 42,
    'eval_metric': 'auc'
}'''

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

xgb_predictions = np.zeros(len(scaled_train_data))
xgb_true_labels = np.zeros(len(scaled_train_data))
xgb_test_predictions = np.zeros(len(scaled_test_data))

for fold, (train_idx, val_idx) in enumerate(kf.split(scaled_train_data, y)):
    X_train, X_val = scaled_train_data.iloc[train_idx], scaled_train_data.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    xgb_model = XGBClassifier(**xgb_params)

    xgb_model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_val, y_val)],
                  verbose=100)
    xgb_fold_preds = xgb_model.predict_proba(X_val)[:, 1] 
    xgb_fold_test_preds = xgb_model.predict_proba(scaled_test_data)[:, 1]
    xgb_predictions[val_idx] = xgb_fold_preds
    xgb_true_labels[val_idx] = y_val
    xgb_test_predictions += xgb_fold_test_preds / n_splits

overall_auc_xgb = roc_auc_score(xgb_true_labels, xgb_predictions)
print("Overall AUC (XGBClassifier):", overall_auc_xgb) # Overall AUC (XGBClassifier): 0.9538333227791258




[0]	validation_0-auc:0.84581	validation_1-auc:0.84577
[99]	validation_0-auc:0.95187	validation_1-auc:0.94871
[0]	validation_0-auc:0.84708	validation_1-auc:0.84506
[99]	validation_0-auc:0.95151	validation_1-auc:0.94291
[0]	validation_0-auc:0.84515	validation_1-auc:0.85069
[99]	validation_0-auc:0.95142	validation_1-auc:0.95052
[0]	validation_0-auc:0.84671	validation_1-auc:0.84070
[99]	validation_0-auc:0.95129	validation_1-auc:0.94650
[0]	validation_0-auc:0.84647	validation_1-auc:0.83841
[99]	validation_0-auc:0.95094	validation_1-auc:0.94278
Overall AUC (XGBClassifier): 0.9462188355391112


In [118]:
'''lgb_params = {
    'objective': 'binary',
        'n_estimators': 3000,
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'random_state': 42,
        'learning_rate':0.0322942967545754,
        'num_leaves': 24,
        'max_depth': 15,
        'min_data_in_leaf': 25,
        'feature_fraction': 0.6236144085285287,
        'bagging_fraction': 0.9596685778433888,
        'bagging_freq': 3,      
        'verbose' : -1
} '''

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

lgbm_predictions = np.zeros(len(scaled_train_data))
lgbm_true_labels = np.zeros(len(scaled_train_data))
lgbm_test_predictions = np.zeros(len(scaled_test_data))

for fold, (train_idx, val_idx) in enumerate(kf.split(scaled_train_data, y)):
    X_train, X_val = scaled_train_data.iloc[train_idx], scaled_train_data.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]  
    lgbm_model = LGBMClassifier(**lgb_params)
    lgbm_model.fit(X_train, y_train,
                   eval_set=[(X_val, y_val)],
                   eval_metric='auc'
                   )
    
    lgbm_fold_preds = lgbm_model.predict_proba(X_val)[:, 1]  
    lgbm_fold_test_preds = lgbm_model.predict_proba(scaled_test_data)[:, 1]
    lgbm_predictions[val_idx] = lgbm_fold_preds
    lgbm_true_labels[val_idx] = y_val
    lgbm_test_predictions += lgbm_fold_test_preds / n_splits

overall_metric_lgbm = roc_auc_score(lgbm_true_labels, lgbm_predictions)
print("Overall AUC (LGBMClassifier):", overall_metric_lgbm) # Overall AUC (LGBMClassifier): 0.9596065955958379


Overall AUC (LGBMClassifier): 0.961016179707092


In [119]:
''' catboost_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'learning_rate': 0.08114394459649094,
    'iterations': 1000,
    'depth': 6,
    'random_strength': 0,
    'l2_leaf_reg': 0.7047064221215757,
    'task_type': 'GPU',
    'random_seed': 42,
    'verbose': -1    
} '''

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

catboost_predictions = np.zeros(len(scaled_train_data))
catboost_true_labels = np.zeros(len(scaled_train_data))
catboost_test_predictions = np.zeros(len(scaled_test_data))

for fold, (train_idx, val_idx) in enumerate(kf.split(scaled_train_data, y)):
    X_train, X_val = scaled_train_data.iloc[train_idx], scaled_train_data.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    catboost_model = CatBoostClassifier(**catboost_params)

    catboost_model.fit(X_train, y_train,
                       eval_set=[(X_val, y_val)],
                       verbose=100)
    
    catboost_fold_preds = catboost_model.predict_proba(X_val)[:, 1]
    catboost_fold_test_preds = catboost_model.predict_proba(scaled_test_data)[:, 1]
    catboost_predictions[val_idx] = catboost_fold_preds
    catboost_true_labels[val_idx] = y_val
    catboost_test_predictions += catboost_fold_test_preds / n_splits

overall_auc_catboost = roc_auc_score(catboost_true_labels, catboost_predictions)
print("Overall AUC (CatBoostClassifier):", overall_auc_catboost) # Overall AUC (CatBoostClassifier): 0.9550313496620053

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8808179	best: 0.8808179 (0)	total: 5.29ms	remaining: 5.29s
100:	test: 0.9397067	best: 0.9397067 (100)	total: 360ms	remaining: 3.2s
200:	test: 0.9478214	best: 0.9478214 (200)	total: 711ms	remaining: 2.83s
300:	test: 0.9511506	best: 0.9511506 (300)	total: 1.07s	remaining: 2.48s
400:	test: 0.9527909	best: 0.9528214 (395)	total: 1.41s	remaining: 2.11s
500:	test: 0.9538834	best: 0.9539655 (490)	total: 1.76s	remaining: 1.75s
600:	test: 0.9546835	best: 0.9546896 (585)	total: 2.1s	remaining: 1.4s
700:	test: 0.9551763	best: 0.9552056 (690)	total: 2.45s	remaining: 1.04s
800:	test: 0.9555261	best: 0.9555818 (795)	total: 2.79s	remaining: 694ms
900:	test: 0.9556359	best: 0.9556500 (840)	total: 3.14s	remaining: 345ms
999:	test: 0.9558733	best: 0.9558771 (990)	total: 3.47s	remaining: 0us
bestTest = 0.9558771253
bestIteration = 990
Shrink model to first 991 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8785154	best: 0.8785154 (0)	total: 6.74ms	remaining: 6.74s
100:	test: 0.9362471	best: 0.9362471 (100)	total: 357ms	remaining: 3.18s
200:	test: 0.9429380	best: 0.9429380 (200)	total: 706ms	remaining: 2.81s
300:	test: 0.9464454	best: 0.9464454 (300)	total: 1.06s	remaining: 2.45s
400:	test: 0.9486895	best: 0.9486895 (400)	total: 1.41s	remaining: 2.1s
500:	test: 0.9503083	best: 0.9503083 (500)	total: 1.76s	remaining: 1.75s
600:	test: 0.9516155	best: 0.9516155 (600)	total: 2.11s	remaining: 1.4s
700:	test: 0.9523916	best: 0.9524382 (695)	total: 2.46s	remaining: 1.05s
800:	test: 0.9524411	best: 0.9525780 (775)	total: 2.81s	remaining: 699ms
900:	test: 0.9529923	best: 0.9529923 (900)	total: 3.16s	remaining: 347ms
999:	test: 0.9531352	best: 0.9532049 (985)	total: 3.53s	remaining: 0us
bestTest = 0.9532048702
bestIteration = 985
Shrink model to first 986 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8861009	best: 0.8861009 (0)	total: 4.29ms	remaining: 4.29s
100:	test: 0.9427542	best: 0.9427542 (100)	total: 360ms	remaining: 3.21s
200:	test: 0.9507331	best: 0.9507331 (200)	total: 709ms	remaining: 2.82s
300:	test: 0.9540911	best: 0.9540911 (300)	total: 1.07s	remaining: 2.5s
400:	test: 0.9559013	best: 0.9559013 (400)	total: 1.43s	remaining: 2.14s
500:	test: 0.9570478	best: 0.9570478 (500)	total: 1.8s	remaining: 1.79s
600:	test: 0.9574021	best: 0.9574021 (600)	total: 2.16s	remaining: 1.44s
700:	test: 0.9578791	best: 0.9578791 (700)	total: 2.52s	remaining: 1.07s
800:	test: 0.9581788	best: 0.9581866 (790)	total: 2.87s	remaining: 712ms
900:	test: 0.9582006	best: 0.9584273 (845)	total: 3.25s	remaining: 357ms
999:	test: 0.9580827	best: 0.9584273 (845)	total: 3.6s	remaining: 0us
bestTest = 0.9584272504
bestIteration = 845
Shrink model to first 846 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8827231	best: 0.8827231 (0)	total: 4.99ms	remaining: 4.98s
100:	test: 0.9394441	best: 0.9394441 (100)	total: 348ms	remaining: 3.1s
200:	test: 0.9461343	best: 0.9461343 (200)	total: 712ms	remaining: 2.83s
300:	test: 0.9499102	best: 0.9499102 (300)	total: 1.16s	remaining: 2.69s
400:	test: 0.9515696	best: 0.9515933 (395)	total: 1.58s	remaining: 2.36s
500:	test: 0.9528448	best: 0.9528448 (500)	total: 1.98s	remaining: 1.97s
600:	test: 0.9535662	best: 0.9535662 (600)	total: 2.37s	remaining: 1.58s
700:	test: 0.9539687	best: 0.9539687 (700)	total: 2.74s	remaining: 1.17s
800:	test: 0.9542959	best: 0.9543359 (765)	total: 3.1s	remaining: 771ms
900:	test: 0.9545021	best: 0.9545472 (885)	total: 3.51s	remaining: 386ms
999:	test: 0.9545469	best: 0.9545785 (940)	total: 3.92s	remaining: 0us
bestTest = 0.9545785189
bestIteration = 940
Shrink model to first 941 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8773059	best: 0.8773059 (0)	total: 8.11ms	remaining: 8.1s
100:	test: 0.9349714	best: 0.9349714 (100)	total: 458ms	remaining: 4.08s
200:	test: 0.9410344	best: 0.9410344 (200)	total: 920ms	remaining: 3.65s
300:	test: 0.9459745	best: 0.9459745 (300)	total: 1.37s	remaining: 3.18s
400:	test: 0.9482958	best: 0.9482958 (400)	total: 1.78s	remaining: 2.65s
500:	test: 0.9500356	best: 0.9500356 (500)	total: 2.19s	remaining: 2.18s
600:	test: 0.9508359	best: 0.9509138 (580)	total: 2.64s	remaining: 1.75s
700:	test: 0.9512992	best: 0.9512992 (700)	total: 3.07s	remaining: 1.31s
800:	test: 0.9519677	best: 0.9519965 (795)	total: 3.58s	remaining: 891ms
900:	test: 0.9526670	best: 0.9526670 (900)	total: 4.03s	remaining: 442ms
999:	test: 0.9530442	best: 0.9530707 (965)	total: 4.43s	remaining: 0us
bestTest = 0.9530706704
bestIteration = 965
Shrink model to first 966 iterations.
Overall AUC (CatBoostClassifier): 0.9549561634603558


In [120]:
sample_sub['loan_status'] = xgb_test_predictions * 0.2 + lgbm_test_predictions * 0.5 + catboost_test_predictions * 0.3

In [121]:
sample_sub.to_csv('submission7.csv', index=False)