In [None]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import confusion_matrix, classification_report

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [None]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [None]:
# Load the lending dataset into a Pandas DataFrame.
# NOTE(review): the step heading mentions a `Resources` folder, but this path
# is resolved relative to the working directory — confirm the file location.
lending_data_df = pd.read_csv(Path("lending_data.csv"))

# Preview the first and the last rows of the DataFrame
display(lending_data_df.head(), lending_data_df.tail())

In [None]:
# Summary statistics produced by DataFrame.describe(), kept for later reference
description = lending_data_df.describe()

# Show the (rows, columns) dimensions first, then the summary statistics
print(lending_data_df.shape)

print(description)

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [None]:
# Labels (y): the "loan_status" column.
# Per the original author's note: loan_status = 0 means the loan is healthy,
# loan_status = 1 means the loan has a high risk of defaulting.
y = lending_data_df["loan_status"]

# Features (X): every remaining column, i.e. a copy of the DataFrame
# with the label column dropped
X = lending_data_df.drop("loan_status", axis=1)


In [None]:
# Preview the first five entries of the label Series
print(y.head(5))

In [None]:
# Preview the first five rows of the feature DataFrame
print(X.head(5))

### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [None]:
# Import train_test_split from sklearn.model_selection
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing (test_size=0.2).
# random_state=1 makes the shuffle reproducible, and stratify=y asks for the
# split to be stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    stratify=y,
)

In [None]:
# Report the dimensions of each of the four splits, one per line.
# A generator expression is used so no helper names leak into the notebook
# namespace.
print(
    *(
        f"{label} {split.shape}"
        for label, split in (
            ("X_train shape:", X_train),
            ("X_test shape:", X_test),
            ("y_train shape:", y_train),
            ("y_test shape:", y_test),
        )
    ),
    sep="\n",
)

---

## Create a Logistic Regression Model with the Original Data

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [None]:
# Import the LogisticRegression estimator from scikit-learn
from sklearn.linear_model import LogisticRegression

# Create the model with random_state=1 for reproducibility
logistic_regression_model = LogisticRegression(random_state=1)

# Train the model on the training split
logistic_regression_model.fit(X_train, y_train)

# `fit` returns the estimator itself, so `lr_model` is simply a second name
# for the same fitted object
lr_model = logistic_regression_model

###  Step 2: Save predictions for testing data labels by using testing feature data (X_test) and fitted model

In [None]:
# Predict labels for the held-out test features with the fitted model
# (`lr_model` refers to the same object as `logistic_regression_model`)
testing_predictions = lr_model.predict(X_test)


In [None]:
# Extra sanity check: overall accuracy on the test split
from sklearn.metrics import accuracy_score

# The bare expression is the cell's last line, so its value is displayed
accuracy_score(y_true=y_test, y_pred=testing_predictions)

### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Import the confusion_matrix helper from sklearn.metrics
from sklearn.metrics import confusion_matrix

# Compare the true test labels against the model's predictions
conf_matrix = confusion_matrix(y_true=y_test, y_pred=testing_predictions)

print(conf_matrix)

# How to read the matrix when class 0 is treated as the "positive" outcome
# and class 1 as the "negative" outcome. If the roles of the classes are
# flipped, the interpretation of the cells flips accordingly.
# NOTE(review): the counts below were recorded by the original author from an
# earlier run — re-check them against the printed matrix.
#
#                        Predicted Positive (0)      Predicted Negative (1)
# Actual Positive (0)    True Positive  (14,951)     False Negative (57)
# Actual Negative (1)    False Positive (59)         True Negative  (441)
#
# Precision = True Positives / (True Positives + False Positives)
# Recall    = True Positives / (True Positives + False Negatives)

In [None]:
# Import classification_report from sklearn.metrics
from sklearn.metrics import classification_report

# Regenerate the test-set predictions from the fitted model
testing_predictions = lr_model.predict(X_test)

# Per-class precision / recall / f1 summary for the test split
print(classification_report(y_true=y_test, y_pred=testing_predictions))

In [None]:
#Precision = True Positives / (True Positives + False Positives)
#Recall = True Positives / (True Positives + False Negatives)

### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:**

The logistic regression model does an excellent job at predicting the healthy loans and a decent job at predicting the high-risk loans. 

The model's accuracy is 99%. This means that the model correctly classified the vast majority of the loan applications.

For the healthy loan predictions, the model has a 100% score in both precision and recall, which is the highest possible. The 100% precision score means that (after rounding) essentially every loan the model predicted to be healthy was actually healthy, so it produced (nearly) no false positives (high-risk loans misclassified as healthy). The 100% recall score means the model correctly identified (after rounding) essentially all of the truly healthy loans as healthy. It made a negligible number of false negatives, which means it classified (almost) none of the healthy loans as high-risk.

The model was less precise and had poorer recall when it came to predicting high-risk loans, with scores of 89% and 88%, respectively. This precision score means that only 89% of the loans the model predicted to be high-risk were actually high-risk. In other words, 11% of its high-risk predictions were loans that were actually healthy (false positives). The recall score of 88% means the model correctly identified 88% of all the truly high-risk loans as high-risk (true positives). The other 12% it misclassified as healthy (false negatives).

---