### Overview

This is the main step that involves creating a credit risk predictive model using historical data to train a machine learning model that can assess the risk associated with lending money to individuals or businesses. This model predicts the likelihood of a borrower defaulting on a loan or credit obligation.

### Data Preprocessing

In [6]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Load the dataset
df = pd.read_csv(r'resources/loan_data_cleaned.csv')

In [3]:
# View the dataset head rows
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,next_pymnt_month,last_pymnt_year,last_pymnt_month,last_credit_pull_year,last_credit_pull_month,loan_term,int_rate_diff,dti_ratio,credit_util_ratio,loan_status_binary
0,1077501,1296599,5000,5000,4975.0,36 months,10.65,162.87,B,B2,...,,2015.0,1.0,2016.0,1.0,36,-3.179236,0.001152,0.449256,0
1,1077430,1314167,2500,2500,2500.0,60 months,15.27,59.83,C,C4,...,,2013.0,4.0,2013.0,9.0,60,1.440764,3.3e-05,0.055532,1
2,1077175,1313524,2400,2400,2400.0,36 months,15.96,84.33,C,C5,...,,2014.0,6.0,2016.0,1.0,36,2.130764,0.000712,0.097304,0
3,1076863,1277178,10000,10000,10000.0,36 months,13.49,339.31,C,C1,...,,2015.0,1.0,2015.0,1.0,36,-0.339236,0.000407,0.184271,0
4,1075358,1311748,3000,3000,3000.0,60 months,12.69,67.79,B,B5,...,2.0,2016.0,1.0,2016.0,1.0,60,-1.139236,0.000224,0.914544,0


In [4]:
# View the types of each columns in dataset in order to encode categorical column
for col, dtype in zip(df.columns, df.dtypes):
    print(f'{col}: {dtype}')

id: int64
member_id: int64
loan_amnt: int64
funded_amnt: int64
funded_amnt_inv: float64
term: object
int_rate: float64
installment: float64
grade: object
sub_grade: object
emp_title: object
emp_length: object
home_ownership: object
annual_inc: float64
verification_status: object
issue_d: object
loan_status: object
pymnt_plan: object
desc: object
purpose: object
title: object
zip_code: object
addr_state: object
dti: float64
delinq_2yrs: float64
earliest_cr_line: object
inq_last_6mths: float64
mths_since_last_delinq: float64
mths_since_last_record: float64
open_acc: float64
pub_rec: float64
revol_bal: int64
revol_util: float64
total_acc: float64
initial_list_status: object
out_prncp: float64
out_prncp_inv: float64
total_pymnt: float64
total_pymnt_inv: float64
total_rec_prncp: float64
total_rec_int: float64
total_rec_late_fee: float64
recoveries: float64
collection_recovery_fee: float64
last_pymnt_d: object
last_pymnt_amnt: float64
next_pymnt_d: object
last_credit_pull_d: object
collectio

In [23]:
# Count the number of approved/rejected loans binary
df['loan_status_binary'].value_counts()

loan_status_binary
0    423810
1     42475
Name: count, dtype: int64

: 

In [5]:
# Drop irrelevant columns
df = df[['loan_status_binary', 'loan_amnt', 'funded_amnt', 'term', 'int_rate', 'installment', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'annual_inc', 'verification_status', 'purpose', 'dti', 'delinq_2yrs', 'inq_last_6mths', 'open_acc', 'pub_rec', 'revol_bal', 'revol_util', 'total_acc']]

# Handle missing values
df.dropna(inplace=True)

# Convert categorical variables into dummy/indicator variables
df = pd.get_dummies(df, columns=['term', 'grade', 'sub_grade', 'emp_length', 'home_ownership', 'verification_status', 'purpose'])

# Split data into features and target variable
X = df.drop('loan_status_binary', axis=1)
y = df['loan_status_binary']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
# Initialize Random Forest Classifier
rf_classifier = RandomForestClassifier()

# Train the model
rf_classifier.fit(X_train, y_train)

In [8]:
# Predict on the test set
y_pred = rf_classifier.predict(X_test)

In [9]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9091757187128044


In [20]:
from sklearn.naive_bayes import GaussianNB

gfc = GaussianNB()
gfc.fit(X_train, y_train)
pred1 = gfc.predict(X_test)

pred1

array([0, 0, 0, ..., 0, 0, 0])

In [19]:
from sklearn.metrics import accuracy_score

def loss(y_test, y_pred):
    acc = accuracy_score(y_test, y_pred)
    print(acc)

loss(y_test, y_pred)

0.9091757187128044


loan_status_binary
0    423810
1     42475
Name: count, dtype: int64

In [22]:
df.head()

Unnamed: 0,loan_status_binary,loan_amnt,funded_amnt,int_rate,installment,annual_inc,dti,delinq_2yrs,inq_last_6mths,open_acc,...,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding
0,0,5000,5000,10.65,162.87,24000.0,27.65,0.0,1.0,3.0,...,False,False,False,False,False,False,False,False,False,False
1,1,2500,2500,15.27,59.83,30000.0,1.0,0.0,5.0,3.0,...,False,False,False,False,False,False,False,False,False,False
2,0,2400,2400,15.96,84.33,12252.0,8.72,0.0,2.0,2.0,...,False,False,False,False,False,False,False,True,False,False
3,0,10000,10000,13.49,339.31,49200.0,20.0,0.0,1.0,10.0,...,False,False,False,False,False,True,False,False,False,False
4,0,3000,3000,12.69,67.79,80000.0,17.94,0.0,0.0,15.0,...,False,False,False,False,False,True,False,False,False,False
