# Prediction: I believe the Random Forest Classifier model will perform better than the Logistic Regression model in this case, based on the fact that we are looking for the probability of whether or not a loan will be approved. We are not seeking a classified result, such as 0 or 1.

In [22]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [23]:
# Import the data
# Read the lending dataset into a DataFrame. The path is relative, so the CSV
# must be reachable from the notebook's working directory.
file_path = Path("lending_data.csv")
df = pd.read_csv(file_path)
# Bare last expression: the notebook renders the first five rows as a preview.
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [24]:
# List column/feature names
# Note: the listing also contains the label column ('loan_status'), so not
# every name shown here is used as a model feature.
df.columns

Index(['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income',
       'num_of_accounts', 'derogatory_marks', 'total_debt', 'loan_status'],
      dtype='object')

In [25]:
# Separate the predictors (X) from the label (y).
# The seven feature columns are spelled out explicitly so the model inputs do
# not silently change if the CSV gains extra columns.
feature_columns = [
    'loan_size',
    'interest_rate',
    'borrower_income',
    'debt_to_income',
    'num_of_accounts',
    'derogatory_marks',
    'total_debt',
]
target_column = 'loan_status'
X = df.loc[:, feature_columns]
y = df.loc[:, target_column]
# Sanity check: both objects should have the same number of rows.
print("Shape: ", X.shape, y.shape)

Shape:  (77536, 7) (77536,)


In [26]:
# Split the data into X_train, X_test, y_train, y_test
# train_size=0.8 keeps 80% of the rows for training and leaves the remaining
# 20% for testing; random_state=42 fixes the shuffle so the split is
# reproducible across runs.
# NOTE(review): no `stratify` argument is passed. If loan_status is heavily
# imbalanced, stratify=y would keep the class ratio equal in both splits --
# confirm the class balance before changing this.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.8)

In [27]:
# Create a Logistic Regression model
# No arguments are passed, so scikit-learn's default hyperparameters apply.
classifier = LogisticRegression()
# Bare last expression: displays the estimator's repr as the cell output.
classifier

LogisticRegression()

In [28]:
# Fit (train) the model by using the training data
# X_train is passed as-is (unscaled); the StandardScaler in this notebook is
# only fit in a later cell, and its output is used for the random forest only.
# NOTE(review): logistic regression is fit with an iterative solver and is
# generally more sensitive to feature scale than a tree ensemble -- consider
# fitting it on the scaled features (or raising max_iter) if a
# ConvergenceWarning shows up; confirm by re-running on a fresh kernel.
classifier.fit(X_train, y_train)

LogisticRegression()

In [30]:
# Report the logistic regression's mean accuracy on each split.
# Comparing the two numbers gives a quick read on over- or under-fitting.
lr_train_score = classifier.score(X_train, y_train)
lr_test_score = classifier.score(X_test, y_test)
print(f"Training Data Score: {lr_train_score}")
print(f"Testing Data Score: {lr_test_score}")

Training Data Score: 0.9915522022312504
Testing Data Score: 0.9928424039205571


In [34]:
# Standardize the features with StandardScaler.
# RandomForestClassifier and StandardScaler are imported in the notebook's
# import cell at the top, so a Restart & Run All shows every dependency up
# front instead of hiding imports mid-notebook.
# The scaler is fit on the training split only and the same fitted transform
# is then applied to both splits, so no test-set statistics feed into the
# preprocessing.
# NOTE(review): the scaled arrays are consumed only by the random forest
# below, while the logistic regression above was trained on the unscaled
# frames. Tree ensembles are typically insensitive to feature scaling, so
# the two models are not compared on identical inputs -- confirm whether
# that is intended.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [36]:
# Train a Random Forest Classifier model and print the model score.
# 50 trees with a fixed seed (42) so repeated runs build the same forest.
clf = RandomForestClassifier(n_estimators=50, random_state=42)
clf.fit(X_train_scaled, y_train)

# Mean accuracy on the scaled training and testing splits.
rf_train_score = clf.score(X_train_scaled, y_train)
rf_test_score = clf.score(X_test_scaled, y_test)
print(f"Training Score: {rf_train_score}")
print(f"Testing Score: {rf_test_score}")

Training Score: 0.996969110724189
Testing Score: 0.9923265411400567
