In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the lending dataset; the path is relative to the notebook's working directory.
file_path = Path('Resources/lending_data.csv')
df = pd.read_csv(filepath_or_buffer=file_path)
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,loan_size,interest_rate,homeowner,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,own,52800,0.431818,5,1,22800,low_risk
1,8400.0,6.692,own,43600,0.311927,3,0,13600,low_risk
2,9000.0,6.963,rent,46100,0.349241,3,0,16100,low_risk
3,10700.0,7.664,own,52700,0.43074,5,1,22700,low_risk
4,10800.0,7.698,mortgage,53000,0.433962,5,1,23000,low_risk


In [4]:
# Summarize the raw frame: row count, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   loan_size         77536 non-null  float64
 1   interest_rate     77536 non-null  float64
 2   homeowner         77536 non-null  object 
 3   borrower_income   77536 non-null  int64  
 4   debt_to_income    77536 non-null  float64
 5   num_of_accounts   77536 non-null  int64  
 6   derogatory_marks  77536 non-null  int64  
 7   total_debt        77536 non-null  int64  
 8   loan_status       77536 non-null  object 
dtypes: float64(3), int64(4), object(2)
memory usage: 5.3+ MB


In [5]:
# Peek at the raw `homeowner` values (a string/object column) before it is encoded.
df["homeowner"]

0             own
1             own
2            rent
3             own
4        mortgage
           ...   
77531         own
77532    mortgage
77533        rent
77534    mortgage
77535    mortgage
Name: homeowner, Length: 77536, dtype: object

In [6]:
# One-hot encode the categorical `homeowner` column with pandas: every category
# becomes its own 0/1 indicator column (homeowner_<category>) and the original
# column is dropped. All other columns are passed through unchanged.
# The indicator dtype is pinned to uint8 so the columns stay numeric on every
# pandas version (pandas >= 2.0 defaults to bool, which `describe()` skips by
# default and which would change the `info()` output).
df1 = pd.get_dummies(df, columns=["homeowner"], dtype="uint8")
df1.head()


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status,homeowner_mortgage,homeowner_own,homeowner_rent
0,10700.0,7.672,52800,0.431818,5,1,22800,low_risk,0,1,0
1,8400.0,6.692,43600,0.311927,3,0,13600,low_risk,0,1,0
2,9000.0,6.963,46100,0.349241,3,0,16100,low_risk,0,0,1
3,10700.0,7.664,52700,0.43074,5,1,22700,low_risk,0,1,0
4,10800.0,7.698,53000,0.433962,5,1,23000,low_risk,1,0,0


In [7]:
# Check the encoded frame: `homeowner` should be replaced by its indicator columns.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77536 entries, 0 to 77535
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   loan_size           77536 non-null  float64
 1   interest_rate       77536 non-null  float64
 2   borrower_income     77536 non-null  int64  
 3   debt_to_income      77536 non-null  float64
 4   num_of_accounts     77536 non-null  int64  
 5   derogatory_marks    77536 non-null  int64  
 6   total_debt          77536 non-null  int64  
 7   loan_status         77536 non-null  object 
 8   homeowner_mortgage  77536 non-null  uint8  
 9   homeowner_own       77536 non-null  uint8  
 10  homeowner_rent      77536 non-null  uint8  
dtypes: float64(3), int64(4), object(1), uint8(3)
memory usage: 5.0+ MB


In [8]:
# Creating an instance of label encoder
# le = LabelEncoder()

In [9]:
# Fitting and encoding the columns with the LabelEncoder
# le.fit(df1["loan_status"])
# df1["loan_status"] = le.transform(df1["loan_status"])
# df1.head()



In [10]:
# List the classes identified by the label encoder
# list(le.classes_)

In [11]:
# Feature matrix: every column of the encoded frame except the target label.
# `drop` returns a new frame, so df1 itself is left untouched.
X = df1.drop(columns="loan_status")
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,homeowner_mortgage,homeowner_own,homeowner_rent
0,10700.0,7.672,52800,0.431818,5,1,22800,0,1,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0,1,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0,0,1
3,10700.0,7.664,52700,0.43074,5,1,22700,0,1,0
4,10800.0,7.698,53000,0.433962,5,1,23000,1,0,0


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) for each feature column.
X.describe()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,homeowner_mortgage,homeowner_own,homeowner_rent
count,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0,77536.0
mean,9805.562577,7.292333,49221.949804,0.377318,3.82661,0.392308,19221.949804,0.497472,0.398911,0.103616
std,2093.223153,0.889495,8371.635077,0.081519,1.904426,0.582086,8371.635077,0.499997,0.489678,0.304764
min,5000.0,5.25,30000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,8700.0,6.825,44800.0,0.330357,3.0,0.0,14800.0,0.0,0.0,0.0
50%,9500.0,7.172,48100.0,0.376299,4.0,0.0,18100.0,0.0,0.0,0.0
75%,10400.0,7.528,51400.0,0.416342,4.0,1.0,21400.0,1.0,1.0,0.0
max,23800.0,13.235,105200.0,0.714829,16.0,3.0,75200.0,1.0,1.0,1.0


In [13]:
# Target vector: the loan_status label of every row in the encoded frame.
y = df1.loc[:, "loan_status"]
y.head()

0    low_risk
1    low_risk
2    low_risk
3    low_risk
4    low_risk
Name: loan_status, dtype: object

In [14]:
# Check the balance of our target values: number of rows per loan_status class.
y.value_counts()

low_risk     75036
high_risk     2500
Name: loan_status, dtype: int64

In [15]:
# Split features and target into train and test partitions.
# No test_size is passed, so the library default split ratio applies;
# stratify=y asks the splitter to stratify on the class labels.
from sklearn.model_selection import train_test_split

split_options = {"random_state": 1, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Rows and columns of the training partition.
X_train.shape

(58152, 10)

In [16]:
# Create the StandardScaler instance.
# (StandardScaler is imported again here although the first cell already imports it.)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [17]:
# Fit the Standard Scaler with the training data.
# When fitting scaling functions, only train on the training dataset so that
# nothing from the test partition influences the scaling parameters.
# X_scaler holds the return value of fit() and is the object used to transform the data.
X_scaler = scaler.fit(X_train)

In [18]:
# Scale the training and testing data with the scaler that was fitted on X_train.
# NOTE(review): none of the visible model cells use X_train_scaled / X_test_scaled;
# they resample, fit and predict on the unscaled X_train / X_test. Confirm that is intended.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# SMOTE

In [27]:
# Oversample the training partition with SMOTE.
from collections import Counter
from imblearn.over_sampling import SMOTE

# sampling_strategy=1.0 is passed explicitly; the Counter below shows the resulting class counts.
smote = SMOTE(random_state=1, sampling_strategy=1.0)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Tally the class labels after resampling.
Counter(y_resampled)

Counter({'low_risk': 56277, 'high_risk': 56277})

In [28]:
# Fit a logistic-regression classifier on the current resampled training data.
from sklearn.linear_model import LogisticRegression

model_smote = LogisticRegression(random_state=1, solver='lbfgs')
model_smote.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [29]:
# Confusion matrix of the SMOTE-trained model on the test partition,
# wrapped in a DataFrame so rows and columns carry readable captions.
from sklearn.metrics import confusion_matrix

y_pred_smote = model_smote.predict(X_test)
cm_smote = confusion_matrix(y_test, y_pred_smote)

row_captions = ["Actual high_risk", "Actual low_risk"]
column_captions = ["Predicted high_risk", "Predicted low_risk"]
cm_smote_df = pd.DataFrame(cm_smote, index=row_captions, columns=column_captions)
cm_smote_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,622,3
Actual low_risk,104,18655


In [30]:
# Calculate the balanced accuracy score of the SMOTE model's test-set predictions.
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred_smote)

0.9948279972279972

In [32]:
# Per-class imbalanced classification report for the SMOTE model's test-set predictions.
from imblearn.metrics import classification_report_imbalanced

report_smote = classification_report_imbalanced(y_test, y_pred_smote)
print(report_smote)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.86      1.00      0.99      0.92      0.99      0.99       625
   low_risk       1.00      0.99      1.00      1.00      0.99      0.99     18759

avg / total       1.00      0.99      1.00      0.99      0.99      0.99     19384



# Naive Random Oversampling (Focus HERE)

In [84]:
# Oversample the training partition with RandomOverSampler.
# NOTE: X_resampled / y_resampled are the same names the SMOTE section assigns,
# so running this cell replaces the SMOTE-resampled data.
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=1)
resampled_pair = ros.fit_resample(X_train, y_train)
X_resampled, y_resampled = resampled_pair

# Tally the class labels after resampling.
Counter(y_resampled)

Counter({'low_risk': 56277, 'high_risk': 56277})

In [85]:
# Fit a logistic-regression classifier on the current resampled training data.
from sklearn.linear_model import LogisticRegression

model_nos = LogisticRegression(random_state=1, solver='lbfgs')
model_nos.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [87]:
# Display the confusion matrix of the random-oversampling model on the test partition.
from sklearn.metrics import confusion_matrix

y_pred_nos = model_nos.predict(X_test)

# Pin the class order explicitly and build the captions from the same list, so
# the row/column labels are guaranteed to line up with the matrix entries.
# (The matrix is computed once; an earlier duplicate call whose result was
# discarded has been removed.)
class_order = ["high_risk", "low_risk"]
cm_nos = confusion_matrix(y_test, y_pred_nos, labels=class_order)
cm_nos_df = pd.DataFrame(
    cm_nos,
    index=[f"Actual {label}" for label in class_order],
    columns=[f"Predicted {label}" for label in class_order],
)
cm_nos_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,622,3
Actual low_risk,106,18653


In [88]:
# Calculate the balanced accuracy score of the random-oversampling model's test-set predictions.
from sklearn.metrics import balanced_accuracy_score

balanced_accuracy_score(y_test, y_pred_nos)

0.9947746894823818

In [89]:
# Per-class imbalanced classification report for the random-oversampling model's test-set predictions.
from imblearn.metrics import classification_report_imbalanced

report_nos = classification_report_imbalanced(y_test, y_pred_nos)
print(report_nos)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.85      1.00      0.99      0.92      0.99      0.99       625
   low_risk       1.00      0.99      1.00      1.00      0.99      0.99     18759

avg / total       1.00      0.99      1.00      0.99      0.99      0.99     19384



# Simple Logistic Regression

In [56]:
# Baseline: logistic regression fitted directly on the original (not resampled) training partition.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(random_state=1, solver='lbfgs')
model.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [57]:
# Predict the test partition with the baseline model, then score the
# predictions with balanced accuracy.
from sklearn.metrics import balanced_accuracy_score

y_pred_slr = model.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred_slr)

0.9543211898288821

In [60]:
# Confusion matrix of the baseline model on the test partition,
# wrapped in a DataFrame so rows and columns carry readable captions.
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred_slr)

row_captions = ["Actual high_risk", "Actual low_risk"]
column_captions = ["Predicted high_risk", "Predicted low_risk"]
cm_df = pd.DataFrame(cm, index=row_captions, columns=column_captions)
cm_df

Unnamed: 0,Predicted high_risk,Predicted low_risk
Actual high_risk,571,54
Actual low_risk,93,18666


In [59]:
# Per-class imbalanced classification report for the baseline model's test-set predictions.
from imblearn.metrics import classification_report_imbalanced

report_slr = classification_report_imbalanced(y_test, y_pred_slr)
print(report_slr)

                   pre       rec       spe        f1       geo       iba       sup

  high_risk       0.86      0.91      1.00      0.89      0.95      0.90       625
   low_risk       1.00      1.00      0.91      1.00      0.95      0.92     18759

avg / total       0.99      0.99      0.92      0.99      0.95      0.92     19384

