In [4]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [5]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
# YOUR CODE HERE!
# Review the DataFrame
# YOUR CODE HERE!

# Load the data into a Pandas DataFrame
df_lending_data = pd.read_csv(
    "lending_data.csv",
    index_col="loan_size")

# Display sample data
df_lending_data.head(10)


Unnamed: 0_level_0,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
loan_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10700.0,7.672,52800,0.431818,5,1,22800,0
8400.0,6.692,43600,0.311927,3,0,13600,0
9000.0,6.963,46100,0.349241,3,0,16100,0
10700.0,7.664,52700,0.43074,5,1,22700,0
10800.0,7.698,53000,0.433962,5,1,23000,0
10100.0,7.438,50600,0.407115,4,1,20600,0
10300.0,7.49,51100,0.412916,4,1,21100,0
8800.0,6.857,45100,0.334812,3,0,15100,0
9300.0,7.096,47400,0.367089,3,0,17400,0
9700.0,7.248,48800,0.385246,4,0,18800,0


### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [8]:
# Separate the data into labels and features
# Separate the y variable, the labels
# Separate the X variable, the features
# -----------------------------------------------------
# Step 2: Create the labels set (y) from the “loan_status” column, and then create the features (X) DataFrame from the remaining columns.
# Separate the data into labels and features

# Separate the y variable, the labels
y = df_lending_data["loan_status"]

# Separate the X variable, the features
X = df_lending_data.drop(columns=["loan_status"])

In [11]:
# Review the y variable Series
y.head(10)  

loan_size
10700.0    0
8400.0     0
9000.0     0
10700.0    0
10800.0    0
10100.0    0
10300.0    0
8800.0     0
9300.0     0
9700.0     0
Name: loan_status, dtype: int64

In [10]:
# Review the X variable DataFrame
X.head(10)  # This will display the first few rows of the feature DataFrame

Unnamed: 0_level_0,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
loan_size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
10700.0,7.672,52800,0.431818,5,1,22800
8400.0,6.692,43600,0.311927,3,0,13600
9000.0,6.963,46100,0.349241,3,0,16100
10700.0,7.664,52700,0.43074,5,1,22700
10800.0,7.698,53000,0.433962,5,1,23000


### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [12]:
# Step 3: Check the balance of the labels variable (y) by using the value_counts function
label_balance = y.value_counts()
print(label_balance)

0    75036
1     2500
Name: loan_status, dtype: int64


### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [13]:
# Step 4: Split the data into training and testing datasets by using train_test_split.
from sklearn.model_selection import train_test_split

# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# The data is split into X_train, X_test, y_train, and y_test

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [14]:
# Create a Logistic Regression Model with the Original Data
# Step 1: Fit a logistic regression model by using the training data (X_train and y_train).
from sklearn.linear_model import LogisticRegression

# Initialize the logistic regression model
model = LogisticRegression(random_state=1)

# Fit the model with the training data
model.fit(X_train, y_train)

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [16]:
# Step 2: Save the predictions on the testing data labels by using the testing feature data (X_test) and the fitted model.
# Make a prediction using the testing data
y_pred = model.predict(X_test)

print(y_pred)

[0 0 0 ... 0 0 0]


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [28]:
# Calculate the Accuracy Score and Print Balanced Accuracy Score
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Calculate the accuracy score
accuracy = accuracy_score(y_test, y_pred)

# Print the balanced accuracy score
balanced_acc = balanced_accuracy_score(y_test, y_pred)

print(f"Accuracy Score: {accuracy:.2f}")
print(f"Balanced Accuracy Score: {balanced_acc:.2f}")

Accuracy Score: 0.99
Balanced Accuracy Score: 0.95


In [29]:
# Generate a confusion matrix for the model
from sklearn.metrics import confusion_matrix

# Generate a confusion matrix
confusion = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(confusion)

Confusion Matrix:
[[14926    75]
 [   46   461]]


In [19]:
# Print the classification report for the model
from sklearn.metrics import classification_report

# Print the classification report
report = classification_report(y_test, y_pred)

print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     15001
           1       0.86      0.91      0.88       507

    accuracy                           0.99     15508
   macro avg       0.93      0.95      0.94     15508
weighted avg       0.99      0.99      0.99     15508



### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regression model is doing a great job at predicting whether loans are healthy (0) or high-risk (1).

Accuracy Score: The model has an accuracy score of 0.99, meaning the model is correct about 99% of the time when determining loan status.

Balanced Accuracy Score: The balanced accuracy score is 0.95. This score is 95%, which means the model is good at distinguishing between healthy and high-risk loans.

Confusion Matrix: The confusion matrix shows that the model is making very few misclassifications. It correctly identifies a high proportion of both healthy loans (15001 out of 15001) and high-risk loans (461 out of 507).

Classification Report: The classification report provides more detailed insights into the model's performance, including precision, recall, and F1-score for both classes. It's demonstrating that the model has high precision, recall, and F1-score for both classes, with slightly lower values for high-risk loans due to smaller sample size.

In summary, the model is accurate and can tell healthy and high-risk loans apart. It's a good choice for the company to use because it can effectively distinguish between the two types of loans.

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [20]:
# Step 1: Use the RandomOverSampler module from the imbalanced-learn library to resample the data.
# Import the RandomOverSampler module form imbalanced-learn
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
# Assign a random_state parameter of 1 to the model
random_oversampler = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
X_resampled, y_resampled = random_oversampler.fit_resample(X_train, y_train)

In [21]:
# Count the distinct values of the resampled labels data
unique_labels = set(y_resampled)
distinct_values_count = len(unique_labels)
print(f"Distinct values count of the resampled labels: {distinct_values_count}")

Distinct values count of the resampled labels: 2


### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [23]:
# Step 2: Use the LogisticRegression classifier and the resampled data to fit the model and make predictions.
from sklearn.linear_model import LogisticRegression

# Instantiate the Logistic Regression model
# Assign a random_state parameter of 1 to the model
logistic_model_resampled = LogisticRegression(random_state=1)

# Fit the model using the resampled training data
logistic_model_resampled.fit(X_resampled, y_resampled)

# Make a prediction using the testing data
y_pred_resampled = logistic_model_resampled.predict(X_test)

print(y_pred_resampled)

[0 0 0 ... 0 0 0]


### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [24]:
# Calculate Accuracy and Print Balanced Accuracy
from sklearn.metrics import accuracy_score, balanced_accuracy_score

# Calculate the accuracy score
accuracy_resampled = accuracy_score(y_test, y_pred_resampled)

# Print the balanced accuracy score
balanced_acc_resampled = balanced_accuracy_score(y_test, y_pred_resampled)

print(f"Accuracy Score (Resampled Model): {accuracy_resampled:.2f}")
print(f"Balanced Accuracy Score (Resampled Model): {balanced_acc_resampled:.2f}")

Accuracy Score (Resampled Model): 0.99
Balanced Accuracy Score (Resampled Model): 0.99


In [25]:
# Generate Confusion Matrix
from sklearn.metrics import confusion_matrix

# Generate a confusion matrix for the resampled model
confusion_resampled = confusion_matrix(y_test, y_pred_resampled)

print("Confusion Matrix (Resampled Model):")
print(confusion_resampled)

Confusion Matrix (Resampled Model):
[[14915    86]
 [    3   504]]


In [26]:
# Print Classification Report
from sklearn.metrics import classification_report

# Print the classification report for the resampled model
report_resampled = classification_report(y_test, y_pred_resampled)

print("Classification Report (Resampled Model):")
print(report_resampled)

Classification Report (Resampled Model):
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     15001
           1       0.85      0.99      0.92       507

    accuracy                           0.99     15508
   macro avg       0.93      0.99      0.96     15508
weighted avg       1.00      0.99      0.99     15508



### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The logistic regression model, fit with oversampled data, predicts both the 0 (healthy loan) and 1 (high-risk loan) labels exceptionally well.

**Accuracy and Balanced Accuracy:** The model demonstrates a remarkable accuracy score of 0.99, indicating that it correctly predicts loan status in 99% of cases. The balanced accuracy score, also at 0.99, highlights the model's outstanding ability to effectively distinguish between healthy and high-risk loans.

**Confusion Matrix:** The model's performance is exemplified by a confusion matrix that shows very few misclassifications. It correctly identifies the majority of both healthy loans (14915 out of 15001) and high-risk loans (504 out of 507).

**Classification Report:** The detailed classification report emphasizes that the model maintains high precision, recall, and F1-score for both classes. Although high-risk loans have a smaller sample size, the model still achieves strong performance in identifying them.

In summary, the logistic regression model, trained with oversampled data, excels in predicting both healthy and high-risk loan labels. Its balanced accuracy and low misclassification rates make it a highly reliable model for the company's use.