# Libraries

In [3]:
import pandas as pd
import inflection
import missingno as msno
import seaborn as sns
import numpy as np
import re

import scipy.stats as stats
from scipy import stats as ss
from scipy.stats import chi2_contingency
from matplotlib import pyplot as plt
from IPython.display import Image
from utils import *

import ipywidgets as widgets
from ipywidgets import interact
import plotly.express as px

from bokeh.plotting import figure, show, output_file
from bokeh.models import ColumnDataSource, HoverTool, CDSView, IndexFilter
from bokeh.layouts import row, column
from scipy.stats import gaussian_kde
from bokeh.models import RangeSlider

from sklearn import preprocessing as pp
from imblearn import under_sampling as us
from imblearn import over_sampling as oversamp
from pandas_profiling import ProfileReport
from category_encoders import TargetEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn import model_selection as ms
from boruta import BorutaPy
from sklearn.ensemble import RandomForestRegressor

from sklearn.cluster import KMeans
import matplotlib.gridspec as gridspec
from sklearn.cluster import DBSCAN

import random
from sklearn import metrics as m
from sklearn.metrics import classification_report, cohen_kappa_score, roc_curve, brier_score_loss
from sklearn.metrics import accuracy_score, classification_report, cohen_kappa_score, recall_score, f1_score, roc_auc_score, precision_score, roc_curve
#from sklearn.metrics import plot_precision_recall_curve

from scikitplot import metrics as mt
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, cross_val_predict, RandomizedSearchCV
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn import svm
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.decomposition import PCA
import category_encoders

from datetime import datetime, timedelta
import pickle

# Loading Data

In [1]:
pwd

'/Users/lucasquemelli/Documents/repos/credit_risk_assessment'

In [4]:
# Load the dataset via a path relative to the repository root (the notebook's
# working directory shown by `pwd`), so the notebook is not tied to one
# machine's absolute home-directory path.
data = pd.read_csv("credit_risk_dataset.csv")
data.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [5]:
data.columns

Index(['person_age', 'person_income', 'person_home_ownership',
       'person_emp_length', 'loan_intent', 'loan_grade', 'loan_amnt',
       'loan_int_rate', 'loan_status', 'loan_percent_income',
       'cb_person_default_on_file', 'cb_person_cred_hist_length'],
      dtype='object')

# Duplication Analysis

Since this analysis should be at the user level, is it acceptable to have repeated rows?
        
        * Ideally, we would have a key column, such as a user ID, to identify each specific user. 

In [7]:
data.shape[0]

32581

In [8]:
# The dataset has no user identifier column, so rows that repeat another row
# are treated as duplicates and removed in place.
data.drop_duplicates(inplace=True)

In [9]:
data.shape[0]

32416

# Helper Functions

In [10]:
def cramers_v(x, y):
    """Return the bias-corrected Cramér's V between two categorical variables.

    Parameters
    ----------
    x, y : pandas.Series or 1-D array
        Categorical observations of equal length. They are cross-tabulated
        with ``pd.crosstab`` to build the contingency table.

    Returns
    -------
    float
        The bias-corrected Cramér's V. ``0.0`` is returned when the statistic
        is undefined: the contingency table has fewer than two rows or
        columns (e.g. one variable is constant), or the corrected denominator
        is not positive (very small samples). Previously these cases produced
        NaN together with a divide-by-zero RuntimeWarning.
    """
    contingency = pd.crosstab(x, y).values
    n = contingency.sum()
    r, k = contingency.shape

    # With a single row or column there is no variation to associate with.
    if min(r, k) < 2:
        return 0.0

    # NOTE(review): chi2_contingency is called with its default settings;
    # check the SciPy docs for the continuity correction it applies to
    # 2x2 tables if exact values for binary pairs matter.
    chi2 = chi2_contingency(contingency)[0]
    phi2 = chi2 / n

    # Bias correction: shrink phi^2 and the effective table dimensions by
    # terms that vanish as the sample size grows.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min(kcorr - 1, rcorr - 1)
    # The corrected dimension can collapse to 1 on tiny samples (e.g. a 2x2
    # table with n == 2), which would otherwise be a 0/0 division.
    if denominator <= 0:
        return 0.0

    return np.sqrt(phi2corr / denominator)

# 1. Data Description

- person_age: Age of the individual applying for the loan.

- person_income: Annual income of the individual.

- person_home_ownership: Type of home ownership of the individual.

        - rent: The individual is currently renting a property.
        - mortgage: The individual has a mortgage on the property they own.
        - own: The individual owns their home outright.
        - other: Other categories of home ownership that may be specific to the dataset.
        
- person_emp_length: Employment length of the individual in years.

- loan_intent: The intent behind the loan application.

- loan_grade: The grade assigned to the loan based on the creditworthiness of the borrower.

        - A: The borrower has a high creditworthiness, indicating low risk.
        - B: The borrower is relatively low-risk, but not as creditworthy as Grade A.
        - C: The borrower's creditworthiness is moderate.
        - D: The borrower is considered to have higher risk compared to previous grades.
        - E: The borrower's creditworthiness is lower, indicating a higher risk.
        - F: The borrower poses a significant credit risk.
        - G: The borrower's creditworthiness is the lowest, signifying the highest risk.
        
- loan_amnt: The loan amount requested by the individual.

- loan_int_rate: The interest rate associated with the loan.

- loan_status: Loan status, where 0 indicates non-default and 1 indicates default.

        - 0: Non-default - The borrower successfully repaid the loan as agreed, and there was no default.
        - 1: Default - The borrower failed to repay the loan according to the agreed-upon terms and defaulted on the loan.
        
- loan_percent_income: The loan amount as a fraction of the individual's annual income (e.g., 0.59 means the loan equals 59% of income).

- cb_person_default_on_file: Historical default of the individual as per credit bureau records.

        - Y: The individual has a history of defaults on their credit file.
        - N: The individual does not have any history of defaults.
        
- cb_person_cred_hist_length: The length of credit history for the individual.