In [27]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Prediction: The Logistical Regression model will work better with the numeric values provdided in the dataset than the Random Forest model; my prediction is based upon google searches comparing the strengths of the two models. 

# -----------------------------
## Logistical Regression Model
# -----------------------------

In [28]:
# Import the data
# Load dataset
file_path = Path("./Resources/lending_data.csv")
df_lending_data = pd.read_csv(file_path)
df_lending_data.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [29]:
# Define the X (features) and y (target) sets
y = df_lending_data["derogatory_marks"].values
X = df_lending_data.drop("derogatory_marks", axis=1)

In [30]:
# Split the data into X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [31]:
# Train a Logistic Regression model print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=10000)
classifier

In [32]:
classifier.fit(X_train, y_train)

### The testing score indicates that the model is a good fit for the data

In [33]:
# Validate the model using the test data
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9964575594992434
Testing Data Score: 0.9957697069748246


In [34]:
# Scaling the X data by using StandardScaler()
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[-0.57708952, -0.56367666, -0.5652314 , ..., -0.43489843,
        -0.5652314 , -0.18283195],
       [-0.95927354, -0.98302549, -0.98332378, ..., -0.96014741,
        -0.98332378, -0.18283195],
       [ 0.09173251,  0.10413354,  0.10371642, ...,  0.09035056,
         0.10371642, -0.18283195],
       ...,
       [ 0.18727852,  0.19070153,  0.18733489, ...,  0.09035056,
         0.18733489, -0.18283195],
       [ 0.61723554,  0.61229888,  0.61737277, ...,  0.61559954,
         0.61737277, -0.18283195],
       [ 0.37837052,  0.3807014 ,  0.37846284, ...,  0.61559954,
         0.37846284, -0.18283195]])

In [35]:
# Validate the model using the scaled test data
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6010455358371165
Testing Data Score: 0.9957697069748246




In [36]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = classifier.predict(X_test)
confusion_matrix(y_true, y_pred)

array([[12603,     0,     0,     0],
       [   78,  5981,     0,     0],
       [    0,     0,   607,     0],
       [    0,     0,     4,   111]], dtype=int64)

In [37]:
confusion_matrix(y, classifier.predict(X))


array([[50497,     0,     0,     0],
       [  281, 23893,     0,     0],
       [    0,     0,  2351,     0],
       [    0,     0,     7,   507]], dtype=int64)

In [38]:
classifier.coef_


array([[-9.02055363e-03,  1.61809878e-04,  4.93055486e-01,
        -3.11541242e-06, -2.31514801e-04, -1.10076459e+00,
        -1.43647293e-05],
       [ 7.43730516e-03,  1.00823456e-05,  5.31815862e-02,
         9.78930583e-06, -4.57974498e-05, -9.30924098e-03,
        -4.94962911e-05],
       [ 2.30909505e-03, -7.59593511e-05, -2.37364453e-01,
        -1.94380038e-06,  1.26051577e-04,  5.00904345e-01,
         4.31506061e-05],
       [-7.25846580e-04, -9.59328720e-05, -3.08872619e-01,
        -4.73009304e-06,  1.51260673e-04,  6.09169489e-01,
         2.07104143e-05]])

# Random Forest Regression Analysis

In [39]:

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)