In [13]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Prediction: The Logistical Regression model will work better with the numeric values provdided in the dataset than the Random Forest model; my prediction is based upon google searches comparing the strengths of the two models. 

# -----------------------------
## Logistical Regression Model
# -----------------------------

In [14]:
# Import the data
# Load dataset
file_path = Path("../Resources/lending_data.csv")
df_lending_data = pd.read_csv(file_path)
df_lending_data.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [15]:
# Define the X (features) and y (target) sets
y = df_lending_data["derogatory_marks"].values
X = df_lending_data.drop("derogatory_marks", axis=1)

In [16]:
# Split the data into X_train, X_test, y_train, y_test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [17]:
# Train a Logistic Regression model print the model score
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=100)
classifier

In [18]:
classifier.fit(X_train, y_train)

### Without the testing scores were stonger without scaling than with scaling leaving doubt about the strength of the model.

In [19]:
# Validate the model using the test data
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9964747558123538
Testing Data Score: 0.9960276516714817


In [20]:
# Scaling the X data by using StandardScaler()
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[-0.43187896, -0.44814146, -0.4438063 , ..., -0.43366145,
        -0.4438063 , -0.18418365],
       [-0.38434183, -0.40451293, -0.40814802, ..., -0.43366145,
        -0.40814802, -0.18418365],
       [-0.43187896, -0.44478542, -0.4438063 , ..., -0.43366145,
        -0.4438063 , -0.18418365],
       ...,
       [ 4.13168534,  4.12949813,  4.13233907, ...,  4.27085157,
         4.13233907,  5.42936345],
       [-0.14665619, -0.13379231, -0.13476791, ...,  0.08906222,
        -0.13476791, -0.18418365],
       [ 0.13856658,  0.1548272 ,  0.1504983 , ...,  0.08906222,
         0.1504983 , -0.18418365]])

In [21]:
# Validate the model using the scaled test data
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.6019569404319713
Testing Data Score: 0.9960276516714817




In [22]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = classifier.predict(X_test)
confusion_matrix(y_true, y_pred)

array([[12507,    76,     0,     0],
       [    0,  6118,     0,     0],
       [    0,     0,   572,     0],
       [    0,     0,     1,   110]], dtype=int64)

In [26]:
confusion_matrix(y, classifier.predict(X))


array([[50222,   275,     0,     0],
       [    0, 24174,     0,     0],
       [    0,     0,  2351,     0],
       [    0,     0,     7,   507]], dtype=int64)

In [24]:
classifier.coef_


array([[-8.40102790e-03,  1.38574261e-04,  4.20302157e-01,
        -2.87429724e-06, -2.03815750e-04, -9.46573154e-01,
        -1.20462970e-05],
       [ 6.42016755e-03,  3.22362693e-06,  2.83322842e-02,
         7.91420892e-06, -2.63204664e-05,  2.59438748e-02,
        -3.91278897e-05],
       [ 2.47537470e-03, -6.07239536e-05, -1.87850056e-01,
        -1.16015471e-06,  1.01764191e-04,  4.05119680e-01,
         3.08000198e-05],
       [-4.94514350e-04, -8.10739341e-05, -2.60784386e-01,
        -3.87975697e-06,  1.28372025e-04,  5.15509599e-01,
         2.03741669e-05]])