In [1]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [2]:
# Load the lending data CSV into a Pandas DataFrame.
# NOTE(review): the step heading mentions a `Resources` folder, but this path
# points at `../Credit_Risk` — confirm which location is intended.
file_path = Path("../Credit_Risk/lending_data.csv")
orig_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows of the DataFrame
orig_df.head(5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [3]:
# Keep only the rows whose loan_status equals 1, then display them
high_risk = orig_df[orig_df["loan_status"].eq(1)]
high_risk

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
75036,18700.0,11.083,84900,0.646643,12,2,54900,1
75037,17900.0,10.734,81600,0.632353,11,2,51600,1
75038,18300.0,10.902,83200,0.639423,11,2,53200,1
75039,18700.0,11.089,85000,0.647059,12,2,55000,1
75040,17000.0,10.346,78000,0.615385,10,2,48000,1
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [4]:
# Refs: Module 20/Day1/Act-04
# Split the DataFrame into the labels (y) and the features (X)

# y: the labels, taken from the "loan_status" column
y = orig_df.loc[:, "loan_status"]

# X: the features, i.e. every column except "loan_status"
X = orig_df.drop(columns="loan_status")

In [5]:
# Refs: Module 20/Day1/Act-01
# Preview the first ten entries of the y (labels) Series
y.head(10)

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
9    0
Name: loan_status, dtype: int64

In [6]:
# Review the X variable DataFrame (the features: orig_df without "loan_status").
# Leaving `X` as the last expression makes the notebook render the frame.
X

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.430740,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000
...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600
77532,17700.0,10.662,80900,0.629172,11,2,50900
77533,17600.0,10.595,80300,0.626401,11,2,50300
77534,16300.0,10.068,75300,0.601594,10,2,45300


### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [7]:
# Ref: Module 20/Day1/Act-04
# Import train_test_split from scikit-learn's model_selection module
# NOTE(review): consider moving this import into the notebook's first (imports) cell.
from sklearn.model_selection import train_test_split

# Split the features and labels into training and testing sets.
# random_state=1 makes the split reproducible, and stratify=y keeps the
# loan_status class proportions the same in both sets. No test_size is
# passed, so the library default applies.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1
)

# Show the (rows, columns) shape of the training features
X_train.shape

(58152, 7)

In [8]:
# X_train
# X_test
# y_train
# y_test

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [9]:
# Ref: Module 20/Day1/Act-04

# Import LogisticRegression from scikit-learn's linear_model module
from sklearn.linear_model import LogisticRegression

# Create the logistic regression model: lbfgs solver, at most 200
# iterations, and a random_state of 1
classifier = LogisticRegression(random_state=1, solver="lbfgs", max_iter=200)

# Train the model on the training split
classifier.fit(X_train, y_train)

In [10]:
# Score the fitted model on the training split and on the testing split
training_score = classifier.score(X_train, y_train)
testing_score = classifier.score(X_test, y_test)

# Report both scores so they can be compared side by side
print(f"Training Data Score: {training_score}")
print(f"Testing Data Score: {testing_score}")

Training Data Score: 0.9914878250103177
Testing Data Score: 0.9924164259182832


### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [11]:
# Ref: Module 20/Day1/Act-04
# Predict a label for every row of the testing features
y_predictions = classifier.predict(X_test)

# Place the predictions beside the actual labels; the index is reset and the
# old one dropped, so rows are numbered from 0
results = pd.DataFrame(
    {"Prediction": y_predictions, "Actual": y_test}
).reset_index(drop=True)

# Show the first ten prediction/actual pairs
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,1
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [12]:
# Refs: Module 20/Day1/Act-05
# Print a heading, then the confusion matrix comparing the actual testing
# labels (first argument) with the model's predictions (second argument)
print("Confusion Matrix:")
test_confusion = confusion_matrix(y_test, y_predictions)
print(test_confusion)

Confusion Matrix:
[[18679    80]
 [   67   558]]


In [13]:
# Refs: Module 20/Day1/Act-05
# Build the classification report with readable names for the two classes,
# then print it
target_names = ["Healthy Loan [0]", "High-Risk Loan [1]"]
report_text = classification_report(
    y_test,
    y_predictions,
    target_names=target_names,
)
print(report_text)

                    precision    recall  f1-score   support

  Healthy Loan [0]       1.00      1.00      1.00     18759
High-Risk Loan [1]       0.87      0.89      0.88       625

          accuracy                           0.99     19384
         macro avg       0.94      0.94      0.94     19384
      weighted avg       0.99      0.99      0.99     19384



In [14]:
# Count how many loans in the original DataFrame are Healthy Loans (0)
# versus High-Risk Loans (1)
loan_status_counts = orig_df["loan_status"].value_counts()
loan_status_counts

loan_status
0    75036
1     2500
Name: count, dtype: int64

### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

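**Answer:** The purpose of the analysis is to build a machine learning model that can identify whether or not a borrower is creditworthy. The model demonstrates an overall accuracy of 99%, indicating that the logistic regression model predicts the labels very well overall. However, the original DataFrame is heavily imbalanced: out of 77,536 loans, just 2,500 were high-risk, so accuracy alone overstates performance and the per-class metrics are more informative. For healthy loans (`0`), precision and recall both round to 1.00, so the model identifies them almost perfectly. For high-risk loans (`1`), precision is 0.87 (87% of the loans predicted as high-risk really are high-risk) and recall is 0.89 (the model catches 89% of the actual high-risk loans), so this class is predicted well but less reliably than the healthy class.<br><br>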
True Negatives: 18,679<br>
False Positives: 80<br>
False Negatives: 67<br>
True Positives: 558<br>

---