In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# Dataset:  lending_data.csv



In [2]:
# Read the lending dataset from the Resources folder and preview its first ten rows.
lending_csv_path = os.path.join("Resources", "lending_data.csv")
lending_data = pd.read_csv(lending_csv_path)
lending_data.head(n=10)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
5,10100.0,7.438,50600,0.407115,4,1,20600,0
6,10300.0,7.49,51100,0.412916,4,1,21100,0
7,8800.0,6.857,45100,0.334812,3,0,15100,0
8,9300.0,7.096,47400,0.367089,3,0,17400,0
9,9700.0,7.248,48800,0.385246,4,0,18800,0


## Data inspection

In [3]:
# loan_status looks like the binary target column; list its distinct values to confirm.
pd.unique(lending_data["loan_status"])


array([0, 1])

In [4]:
# Distinct values taken by the derogatory_marks feature.
pd.unique(lending_data["derogatory_marks"])

array([1, 0, 2, 3])

In [5]:
# Check the whole dataset for missing values. The feature columns are fed to the
# model as well as the target, so a NaN in any column matters, not only in loan_status.
# Evaluates to a single boolean: True if at least one cell is NaN.
lending_data.isnull().values.any()

False

## Building the model

In [6]:
# Target vector: the binary loan_status column.
y = lending_data.loc[:, "loan_status"]


In [7]:
# Feature matrix: every column except the target.
# "Unnamed: 0" is the name pandas gives an unlabeled CSV column; in this file it
# simply repeats the row position (0, 1, 2, ...), so it is dropped as well. Left
# in, the model would train on row order, which says nothing about the borrower
# and would leak the label if the file happens to be sorted by outcome.
# errors="ignore" keeps this cell working when that column is not present.
X = (
    lending_data
    .drop(columns="loan_status")
    .drop(columns="Unnamed: 0", errors="ignore")
)
X.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [8]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)


In [9]:
# (rows, columns) of the training feature split.
X_train.shape

(51949, 7)

In [10]:
# (rows, columns) of the test feature split.
X_test.shape

(25587, 7)

In [11]:
# Fit a logistic regression classifier (default settings) on the training split,
# then report its score on both the training and the test split.
clf = LogisticRegression().fit(X_train, y_train)
train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print(f"Training score: {train_accuracy}")
print(f"Test score: {test_accuracy}")

Training score: 0.9917804000076998
Test score: 0.9926134365107281


## Logistic Regression Confusion Matrix

In [12]:
# Confusion matrix of the held-out labels against the model's predictions
# (rows are actual classes, columns are predicted classes).
y_true, y_pred = y_test, clf.predict(X_test)
cm = confusion_matrix(y_true=y_true, y_pred=y_pred)
cm

array([[24691,   121],
       [   68,   707]])

In [13]:
# Predicted class label for each row of the test split.
clf.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0])

In [14]:
# Boolean mask of test rows whose predicted probability for class 1
# (column 1 of predict_proba) exceeds 0.5.
positive_probability = clf.predict_proba(X_test)[:, 1]
positive_probability > 0.5

array([False, False, False, ..., False, False, False])

In [15]:
# Unpack the 2x2 confusion matrix `cm` row by row:
#   first row  -> true negatives, false positives
#   second row -> false negatives, true positives
(tn, fp), (fn, tp) = cm

In [16]:
# Precision: the share of predicted positives that are actually positive.
predicted_positives = tp + fp
precision = tp / predicted_positives

In [17]:
# Recall (sensitivity): the share of actual positives the model identified.
actual_positives = tp + fn
recall = tp / actual_positives

In [18]:
# F1 score: the harmonic mean of precision and recall.
f1_numerator = 2 * precision * recall
f1_score = f1_numerator / (precision + recall)

In [21]:
# Print the precision, recall and F1 score computed by hand from the confusion matrix.
# The classifier in this notebook is LogisticRegression, so the output is labelled
# "logistic regression".
print(f"The logistic regression precision is: {precision}")
print(f"The logistic regression recall is:{recall}")
print(f"The logistic regression f1_scores is:{f1_score}")

The random forest precision is: 0.8538647342995169
The random forest recall is:0.912258064516129
The random forest f1_scores is:0.8820960698689957


In [22]:
# scikit-learn's classification report tabulates per-class precision, recall,
# F1 and support for the same test predictions.
print(classification_report(y_true=y_true, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     24812
           1       0.85      0.91      0.88       775

    accuracy                           0.99     25587
   macro avg       0.93      0.95      0.94     25587
weighted avg       0.99      0.99      0.99     25587

