In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter

In [3]:
df = pd.read_csv('lending_data.csv')

In [4]:
df.head()

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


In [5]:
X = df.iloc[:,:-1]

In [6]:
from sklearn.preprocessing import LabelEncoder
label = LabelEncoder()

In [7]:
X['home_owner'] = label.fit_transform(X['homeowner']) 

In [8]:
X = X.drop(columns = ['homeowner'])

In [9]:
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,home_owner
0,10700.0,7.672,52800,0.431818,5,1,22800,1
1,8400.0,6.692,43600,0.311927,3,0,13600,1
2,9000.0,6.963,46100,0.349241,3,0,16100,2
3,10700.0,7.664,52700,0.43074,5,1,22700,1
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [10]:
y = df['loan_status']

In [11]:
y

0         low_risk
1         low_risk
2         low_risk
3         low_risk
4         low_risk
           ...    
77531    high_risk
77532    high_risk
77533    high_risk
77534    high_risk
77535    high_risk
Name: loan_status, Length: 77536, dtype: object

In [12]:
X.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,home_owner
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804,0.606144
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077,0.667811
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0,0.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0,1.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0,1.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0,2.0


In [13]:
y.value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test =train_test_split(X,y,random_state = 1)

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [16]:
x_scaler =scaler.fit(X_train)

In [17]:
x_train_scaled = x_scaler.transform(X_train)
x_test_scaled = x_scaler.transform(X_test)

In [18]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver = 'lbfgs' , random_state =1)

In [19]:
model.fit(x_train_scaled,y_train)

LogisticRegression(random_state=1)

In [21]:
predictions = model.predict(x_test_scaled)

In [23]:
from sklearn.metrics import balanced_accuracy_score , confusion_matrix
balanced_accuracy_score(y_test,predictions)

0.9889115309798473

In [25]:
print(confusion_matrix(y_test,predictions))

[[  609    10]
 [  113 18652]]


In [27]:
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test,predictions))

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.84      0.98      0.99      0.91      0.99      0.98       619
   low_risk       1.00      0.99      0.98      1.00      0.99      0.98     18765

avg / total       0.99      0.99      0.98      0.99      0.99      0.98     19384

