In [30]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [2]:
conda install -c conda-forge imbalanced-learn

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Retrieving notices: ...working... done

Note: you may need to restart the kernel to use updated packages.


In [31]:
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE

In [32]:
# Load the lending data CSV (resolved relative to the notebook's working
# directory) into a Pandas DataFrame
lending_csv_path = Path("lending_data.csv")
lending_df = pd.read_csv(lending_csv_path)

# Preview the first ten rows
lending_df.head(n=10)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
5,10100.0,7.438,50600,0.407115,4,1,20600,0
6,10300.0,7.49,51100,0.412916,4,1,21100,0
7,8800.0,6.857,45100,0.334812,3,0,15100,0
8,9300.0,7.096,47400,0.367089,3,0,17400,0
9,9700.0,7.248,48800,0.385246,4,0,18800,0


In [33]:
# Split the DataFrame into the target (labels) and the predictors (features)
target_column = "loan_status"

# X: every column except the target
X = lending_df.drop(columns=[target_column])

# y: the target column on its own
y = lending_df[target_column]


In [34]:
# Preview the first five labels
y.head(n=5)

0    0
1    0
2    0
3    0
4    0
Name: loan_status, dtype: int64

In [35]:
# Preview the first five rows of the feature matrix
X.head(n=5)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [36]:
# Count how many loans fall into each class, most frequent class first
y.value_counts(sort=True, ascending=False)


0    75036
1     2500
Name: loan_status, dtype: int64

In [37]:
# Bring in scikit-learn's train/test splitting helper
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing (this is scikit-learn's default when
# no size is given); the fixed seed keeps the split reproducible.
# NOTE(review): the target is heavily imbalanced, so stratify=y may be worth
# considering - it would change the split and every downstream number.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [38]:
# Bring in scikit-learn's logistic regression classifier
from sklearn.linear_model import LogisticRegression

# Build the classifier with a fixed seed (random_state=1)
lr_model = LogisticRegression(random_state=1)

# Train on the original training split; fit() is left as the final
# expression so the fitted estimator is displayed by the cell
lr_model.fit(X=X_train, y=y_train)


LogisticRegression(random_state=1)

In [39]:
# Predict loan_status labels for the held-out test features
y_pred = lr_model.predict(X=X_test)


In [40]:
# Balanced accuracy of the baseline model on the test split
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

0.9520479254722232

In [41]:
# Confusion matrix for the baseline model (rows: actual class, columns: predicted class)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[18663,   102],
       [   56,   563]], dtype=int64)

In [42]:
# Build the imbalanced-learn classification report for the baseline model, then print it
baseline_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred)
print(baseline_report)

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.91      1.00      0.95      0.91     18765
          1       0.85      0.91      0.99      0.88      0.95      0.90       619

avg / total       0.99      0.99      0.91      0.99      0.95      0.91     19384



In [45]:
#Question: How well does the logistic regression model predict both the 0 (healthy loan) and 1 (high-risk loan) labels?

#Answer: For the healthy (0) loans the model's precision is 1.00, meaning nearly every loan it labeled as healthy really was healthy. Its recall is 0.99, meaning it correctly identified 99% of the actual healthy loans.

#For the high-risk (1) loans, the model's precision was about 85% and its recall was 91%. Overall, the model performed well, although it is less reliable on high-risk loans than on healthy loans.

In [46]:
# Bring in imbalanced-learn's random oversampler
from imblearn.over_sampling import RandomOverSampler

# Build the oversampler with a fixed seed (random_state=1)
resample_model = RandomOverSampler(random_state=1)

# Resample the training split only; the test split is not passed to the sampler
X_resampled, y_resampled = resample_model.fit_resample(X=X_train, y=y_train)


In [47]:
# Count how many resampled training labels fall into each class
y_resampled.value_counts(sort=True, ascending=False)

0    56271
1    56271
Name: loan_status, dtype: int64

In [48]:
# Build a second logistic regression classifier with the same fixed seed (random_state=1)
resampled_lr_model = LogisticRegression(random_state=1)

# Train it on the oversampled training data; fit() is left as the final
# expression so the fitted estimator is displayed by the cell
resampled_lr_model.fit(X=X_resampled, y=y_resampled)

LogisticRegression(random_state=1)

In [49]:
# Predict loan_status labels for the held-out test features with the resampled-data model
y_pred_resampled = resampled_lr_model.predict(X=X_test)

In [50]:
# Balanced accuracy of the resampled-data model on the test split
balanced_accuracy_score(y_true=y_test, y_pred=y_pred_resampled)


0.9936781215845847

In [51]:
# Confusion matrix for the resampled-data model (rows: actual class, columns: predicted class)
confusion_matrix(y_true=y_test, y_pred=y_pred_resampled)


array([[18649,   116],
       [    4,   615]], dtype=int64)

In [52]:
# Build the imbalanced-learn classification report for the resampled-data model, then print it
resampled_report = classification_report_imbalanced(y_true=y_test, y_pred=y_pred_resampled)
print(resampled_report)


                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.99      0.99      1.00      0.99      0.99     18765
          1       0.84      0.99      0.99      0.91      0.99      0.99       619

avg / total       0.99      0.99      0.99      0.99      0.99      0.99     19384



In [None]:
#Question: How well does the logistic regression model, fit with oversampled data, predict both the 0 (healthy loan) and 1 (high-risk loan) labels?

#Answer: The model fit with the resampled data performed better than the model fit with the original data. Both the healthy (0) and high-risk (1) loans had a recall of 99%, meaning nearly all of the loans in each class were correctly identified. In terms of precision, the model worked very well for the "0" (healthy) loans and was about 84% precise when predicting the "1" (high-risk) loans.