# Credit Risk

Credit Risk is the possibility of a loss resulting from a borrower's failure to repay a
loan or meet a contractual obligation. The primary goal of a credit risk assessment is to find out whether potential borrowers are creditworthy and have the means to repay their debts so that credit risk or loss can be minimized and the loan is granted to only creditworthy applicants.

If the borrower shows an acceptable level of default risk, then their loan application can be approved upon agreed terms. 

This project involves understanding the financial terminology attached to credit risk and building a classification model for default prediction with PyCaret. Hyperparameter optimization is also done using the PyCaret library.

# Libraries

In [1]:
import pandas as pd
import inflection
import missingno as msno
import seaborn as sns
import numpy as np
import re

import scipy.stats as stats
from scipy import stats as ss
from scipy.stats import chi2_contingency
from matplotlib import pyplot as plt
from IPython.display import Image

import ipywidgets as widgets
from ipywidgets import interact
import plotly.express as px

from bokeh.plotting import figure, show, output_file
from bokeh.models import ColumnDataSource, HoverTool, CDSView, IndexFilter
from bokeh.layouts import row, column
from scipy.stats import gaussian_kde
from bokeh.models import RangeSlider

from sklearn import preprocessing as pp
from imblearn import under_sampling as us
from imblearn import over_sampling as oversamp
from pandas_profiling import ProfileReport
from category_encoders import TargetEncoder
from sklearn import model_selection as ms
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor

from sklearn.cluster import KMeans
import matplotlib.gridspec as gridspec
from sklearn.cluster import DBSCAN

import random
from sklearn import metrics as m
from sklearn.metrics import classification_report, cohen_kappa_score, roc_curve, brier_score_loss
from sklearn.metrics import accuracy_score, classification_report, cohen_kappa_score, recall_score, f1_score, roc_auc_score, precision_score, roc_curve
#from sklearn.metrics import plot_precision_recall_curve

from scikitplot import metrics as mt
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, cross_val_predict, RandomizedSearchCV
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn import svm
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.decomposition import PCA
import category_encoders

from datetime import datetime, timedelta
import pickle

# Loading Data

In [2]:
# IPython automagic command: show the notebook's current working directory.
pwd

'/Users/lucasquemelli/Documents/repos/credit_default_risk_prediction'

In [3]:
# Location of the raw credit-risk dataset on disk.
csv_path = "/Users/lucasquemelli/Documents/repos/credit_default_risk_prediction/credit_risk_data.csv"

# Load the CSV into a DataFrame and preview its first rows.
data = pd.read_csv(csv_path)
data.head()

  data = pd.read_csv("/Users/lucasquemelli/Documents/repos/credit_default_risk_prediction/credit_risk_data.csv")


Unnamed: 0,User_id,employment_type,tier_of_employment,industry,role,work_experience,total_income,gender,married,dependents,...,interest_received,number_of_loans,emi_1_dpd,emi_2_dpd,emi_3_dpd,emi_4_dpd,emi_5_dpd,emi_6_dpd,max_dpd,yearmo
0,7013527,Salaried,B,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGAZSEWdkcndwkcnCCM,1-2,125000.0,Female,Yes,4,...,852.69,0,0,90,90,90,90,90,90,202203
1,7014291,Self - Employeed,D,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGNCSEWdkcndwkcnCCM,10+,61000.0,Female,No,1,...,4912.53,0,0,0,0,0,0,0,0,202203
2,7014327,,,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGNYSEWdkcndwkcnCCM,5-10,100000.0,Other,,3,...,3310.68,0,90,90,90,90,90,90,90,202205
3,7014304,,,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGCASEWdkcndwkcnCCM,2-3,30000.0,Male,,1,...,1459.76,0,0,0,0,0,0,0,0,202203
4,7031995,,,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGCASEWdkcndwkcnCCM,10+,65000.0,Male,,3,...,1793.0,0,0,0,0,0,0,0,0,202203


In [4]:
def snake_case(x):
    """Return the label *x* as converted by ``inflection.underscore``."""
    return inflection.underscore(x)


# Keep the original labels in ``cols_old`` and assign their underscored
# (snake_case) counterparts back onto the DataFrame.
cols_old = data.columns
cols_new = [snake_case(label) for label in cols_old]
data.columns = cols_new

In [5]:
# Display the column labels to confirm they are now in snake_case.
data.columns

Index(['user_id', 'employment_type', 'tier_of_employment', 'industry', 'role',
       'work_experience', 'total_income', 'gender', 'married', 'dependents',
       'home_type', 'pincode', 'has_social_profile', 'is_verified',
       'delinq_2yrs', 'total_payement', 'received_principal',
       'interest_received', 'number_of_loans', 'emi_1_dpd', 'emi_2_dpd',
       'emi_3_dpd', 'emi_4_dpd', 'emi_5_dpd', 'emi_6_dpd', 'max_dpd',
       'yearmo'],
      dtype='object')

# Duplication Test

Since this analysis should be at the user level, is it acceptable to have more rows than user IDs?

In [6]:
# Number of distinct user_id values in the dataset.
data["user_id"].nunique()

133752

In [7]:
# Total number of rows, to compare against the number of distinct users.
data.shape[0]

143727

Notice below that some users appear more than once. Some of them appear 16 times. Let's check it out.

In [8]:
# Count how many rows each user_id has (most frequent users listed first).
data["user_id"].value_counts()

3166625     16
2803481     16
2789468     16
3440082     16
3062454     16
            ..
2814679      1
2819122      1
2818821      1
2819179      1
56122051     1
Name: user_id, Length: 133752, dtype: int64

In [9]:
# Inspect every row belonging to one of the most repeated users (user_id 3166625).
data[data["user_id"] == 3166625]

Unnamed: 0,user_id,employment_type,tier_of_employment,industry,role,work_experience,total_income,gender,married,dependents,...,interest_received,number_of_loans,emi_1_dpd,emi_2_dpd,emi_3_dpd,emi_4_dpd,emi_5_dpd,emi_6_dpd,max_dpd,yearmo
36527,3166625,Salaried,C,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGHISEWdkcndwkcnCCM,10+,45000.0,Female,Yes,3,...,3355.56,0,90,90,90,90,90,90,90,202201
36528,3166625,Salaried,C,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGHISEWdkcndwkcnCCM,10+,45000.0,Female,Yes,3,...,550.89,0,0,0,0,0,0,0,0,202203
36529,3166625,Salaried,C,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGHISEWdkcndwkcnCCM,10+,45000.0,Female,Yes,2,...,3355.56,0,90,90,90,90,90,90,90,202203
36530,3166625,Salaried,C,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGHISEWdkcndwkcnCCM,10+,45000.0,Female,Yes,2,...,550.89,0,0,0,0,0,0,0,0,202203
36531,3166625,Salaried,D,0,KHMbckjadbckIFGMDSEWdkcndwkcnCCM,0,115000.0,Female,Yes,3,...,3355.56,0,90,90,90,90,90,90,90,202205
36532,3166625,Salaried,D,0,KHMbckjadbckIFGMDSEWdkcndwkcnCCM,0,115000.0,Female,Yes,3,...,550.89,0,0,0,0,0,0,0,0,202205
36533,3166625,Salaried,D,0,KHMbckjadbckIFGMDSEWdkcndwkcnCCM,0,115000.0,Female,Yes,2,...,3355.56,0,90,90,90,90,90,90,90,202202
36534,3166625,Salaried,D,0,KHMbckjadbckIFGMDSEWdkcndwkcnCCM,0,115000.0,Female,Yes,2,...,550.89,0,0,0,0,0,0,0,0,202205
36535,3166625,Salaried,C,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGHISEWdkcndwkcnCCM,10+,45000.0,Female,Yes,3,...,3355.56,0,0,90,90,90,90,90,90,202201
36536,3166625,Salaried,C,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGHISEWdkcndwkcnCCM,10+,45000.0,Female,Yes,3,...,550.89,0,0,0,0,0,0,0,0,202205


In [10]:
# Preview the first 15 columns (positions 0 to 14) for all rows.
data.iloc[:, 0:15]

Unnamed: 0,user_id,employment_type,tier_of_employment,industry,role,work_experience,total_income,gender,married,dependents,home_type,pincode,has_social_profile,is_verified,delinq_2yrs
0,7013527,Salaried,B,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGAZSEWdkcndwkcnCCM,1-2,125000.0,Female,Yes,4,rent,XX852X,No,,0
1,7014291,Self - Employeed,D,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGNCSEWdkcndwkcnCCM,10+,61000.0,Female,No,1,mortgage,XX286X,,Source Verified,0
2,7014327,,,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGNYSEWdkcndwkcnCCM,5-10,100000.0,Other,,3,own,XX113X,No,,0
3,7014304,,,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGCASEWdkcndwkcnCCM,2-3,30000.0,Male,,1,rent,XX941X,Yes,,0
4,7031995,,,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGCASEWdkcndwkcnCCM,10+,65000.0,Male,,3,rent,XX913X,No,Verified,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143722,55993166,,,0,KHMbckjadbckIFGTNSEWdkcndwkcnCCM,0,100000.0,Female,No,2,mortgage,XX374X,,Source Verified,1
143723,56263229,,,0,KHMbckjadbckIFGOHSEWdkcndwkcnCCM,0,27000.0,Male,Yes,3,rent,XX436X,No,Source Verified,0
143724,55992716,,,0,KHMbckjadbckIFGCASEWdkcndwkcnCCM,0,53000.0,Female,Yes,2,rent,XX908X,,,0
143725,56363313,,,0,KHMbckjadbckIFGVASEWdkcndwkcnCCM,0,75400.0,Other,No,0,mortgage,XX229X,,Source Verified,0


In [14]:
# Preview the remaining columns, from position 15 onward. The slice end (30)
# is larger than the number of columns, so it simply stops at the last one.
data.iloc[:, 15:30]

Unnamed: 0,total_payement,received_principal,interest_received,number_of_loans,emi_1_dpd,emi_2_dpd,emi_3_dpd,emi_4_dpd,emi_5_dpd,emi_6_dpd,max_dpd,yearmo
0,1824.150000,971.46,852.69,0,0,90,90,90,90,90,90,202203
1,22912.532998,18000.00,4912.53,0,0,0,0,0,0,0,0,202203
2,7800.440000,4489.76,3310.68,0,90,90,90,90,90,90,90,202205
3,6672.050000,5212.29,1459.76,0,0,0,0,0,0,0,0,202203
4,11793.001345,10000.00,1793.00,0,0,0,0,0,0,0,0,202203
...,...,...,...,...,...,...,...,...,...,...,...,...
143722,12842.400000,7193.83,5648.57,0,0,0,0,30,0,0,30,202205
143723,839.200000,581.67,257.53,0,0,0,0,0,0,0,0,202203
143724,6085.800000,2434.49,3651.31,0,0,0,0,0,0,0,0,202204
143725,4080.640000,3286.82,793.82,0,0,0,0,0,0,30,30,202202


# Deduplication

For deduplication, let's keep a single row per user, chosen by:

1. The most recent update date (`yearmo`) available for that user.
2. Among rows with that same date, the smallest total income.

In [10]:
# Deduplicate to one row per user in a single chain:
#   1. order rows by 'yearmo' descending, then 'total_income' ascending, so
#      each user's most recent / lowest-income row comes first;
#   2. keep only that first row for every 'user_id';
#   3. renumber the index from zero, discarding the old one.
data = (
    data
    .sort_values(by=['yearmo', 'total_income'], ascending=[False, True])
    .drop_duplicates(subset='user_id', keep='first')
    .reset_index(drop=True)
)

In [11]:
# Number of distinct user_id values after deduplication.
data["user_id"].nunique()

133752

In [12]:
# Row count after deduplication; it should equal the number of distinct users.
data.shape[0]

133752

In [13]:
# Re-check user_id 3166625, which had multiple rows before deduplication.
data[data["user_id"] == 3166625]

Unnamed: 0,user_id,employment_type,tier_of_employment,industry,role,work_experience,total_income,gender,married,dependents,...,interest_received,number_of_loans,emi_1_dpd,emi_2_dpd,emi_3_dpd,emi_4_dpd,emi_5_dpd,emi_6_dpd,max_dpd,yearmo
6922,3166625,Salaried,C,mLVIVxoGY7TUDJ1FyFoSIZi1SFcaBmO01AydRchaEiGYtU...,KHMbckjadbckIFGHISEWdkcndwkcnCCM,10+,45000.0,Female,Yes,3,...,550.89,0,0,0,0,0,0,0,0,202205
