In [None]:
# Import the modules
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [None]:
# Read the CSV file from the Resources folder into a Pandas DataFrame.
# The path is relative, so the notebook must be run from the directory that
# contains the `Resources` folder.
data = Path('Resources/lending_data.csv')
lending_data_df = pd.read_csv(data)


# Review the DataFrame: `info()` prints the column summary, and `head()`
# (the cell's last expression) is rendered as the cell output.
lending_data_df.info()
lending_data_df.head()

In [None]:


# Count how many rows carry each `loan_status` label (a quick class-balance check)
# and keep the counts as a one-column DataFrame.
# NOTE: the variable name is kept exactly as spelled because other cells may refer to it.
observartions = lending_data_df['loan_status'].value_counts().to_frame()

# Display the per-label counts
observartions.head()

### Step 2: Create the labels set (`y`) from the `loan_status` column, and then create the features (`X`) DataFrame from the remaining columns.

In [None]:
# Separate the data into labels and features.

# Labels (y): the `loan_status` column.
y = lending_data_df["loan_status"]

# Features (X): every remaining column. `drop` returns a new DataFrame, so
# `lending_data_df` is left untouched and re-running this cell is safe; no
# explicit copy followed by an in-place drop is needed.
X = lending_data_df.drop(columns="loan_status")


In [None]:
# Review the y variable Series: preview the first rows of the labels
y.head()

In [None]:
# Review the X variable DataFrame: preview the first rows of the features
# (`loan_status` should no longer appear among the columns)
X.head()

### Step 3: Split the data into training and testing datasets by using `train_test_split`.

In [None]:
# Split features and labels into training and testing subsets.
# `stratify=y` asks train_test_split to keep the label proportions of `y` in
# both subsets; the fixed `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, stratify=y
)

# Show the dimensions of the training features
X_train.shape

---

## Create a Logistic Regression Model with the Original Data

### Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [None]:
# LogisticRegression is already imported in the notebook's imports cell at the
# top, so it is not imported a second time here.

# Instantiate the Logistic Regression model. The fixed random_state keeps runs
# repeatable; max_iter is left at the library default (raise it only if the
# solver reports that it failed to converge).
classifier = LogisticRegression(solver='lbfgs', random_state=1)

# Fit (train) the model using the training data
classifier.fit(X_train, y_train)

In [None]:
## Validate the model using the test data
# Score the model on the data it was fitted on and on the held-out test data
training_score = classifier.score(X_train, y_train)
testing_score = classifier.score(X_test, y_test)

print(f"Training Data Score: {training_score}")
print(f"Testing Data Score: {testing_score}")

### Step 2: Save the predictions on the testing data labels by using the testing feature data (`X_test`) and the fitted model.

In [None]:
# Predict a label for every row of the testing features
test_predictions = classifier.predict(X_test)

# Put each prediction next to its actual label; the original row index is
# discarded so the rows are simply numbered from 0
test_results = pd.DataFrame(
    {"Prediction": test_predictions, "Actual": y_test}
).reset_index(drop=True)

# Preview the first ten prediction/actual pairs
test_results.head(10)


### Step 3: Evaluate the model’s performance by doing the following:

* Generate a confusion matrix.

* Print the classification report.

In [None]:
## // Test MODEL //
# Compare the actual test labels with the model's predictions
test_conf_matrix = confusion_matrix(y_true=y_test, y_pred=test_predictions)

print("------------ Confusion Matrix (Test)")
print(test_conf_matrix)

In [None]:
# Reading of the test confusion matrix (counts noted from an earlier run;
# re-check them after re-running the notebook):
# 18679 True Positives , 80 False Positives,
# 29 False Negatives, 110 True Negatives
# NOTE(review): sklearn's confusion_matrix puts actual labels on the rows and
# predicted labels on the columns, in sorted label order (0, then 1).
# Assuming the printed matrix was [[18679, 80], [29, 110]] (TODO confirm),
# calling 18679 "true positives" treats `0` (healthy loan) as the positive
# class, in which case 80 would be false negatives and 29 false positives.
# With `1` (high-risk loan) as the positive class, 18679 are true negatives,
# 80 false positives, 29 false negatives and 110 true positives.
# Pick one convention and relabel the counts accordingly.

In [None]:
# Classification report for the model's predictions on the test data

print("------------ Class Report (Test)")
test_class_report = classification_report(y_true=y_test, y_pred=test_predictions)
print(test_class_report)

In [None]:
## // TRAINING MODEL //
# Predict labels for the training features (the data the model was fitted on)
train_predictions = classifier.predict(X_train)

# Put each prediction next to its actual label, renumbering the rows from 0
train_results = pd.DataFrame(
    {"Prediction": train_predictions, "Actual": y_train}
).reset_index(drop=True)

# Compare the actual training labels with the model's predictions
train_conf_matrix = confusion_matrix(y_true=y_train, y_pred=train_predictions)

print("------------ Confusion Matrix (Train)")
print(train_conf_matrix)



In [None]:
# Classification report for the model's predictions on the TRAINING data
# (kept for comparison against the test-data report)
train_class_report = classification_report(
    y_true=y_train, y_pred=train_predictions
)

print("------------ Class Report (Train)")
print(train_class_report)

### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** WRITE YOUR ANSWER HERE!

---

Looking at the two classification reports for the training and test data, model performance declined, albeit slightly, on the test data. This is to be expected: the test scores show how well the model performs on data it has not seen before. If precision and recall remain strong on the test dataset, that is a good indication of how well the model is likely to perform in real life.