In [164]:
# Import the modules
import dask.dataframe as dd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from imblearn.over_sampling import RandomOverSampler

In [154]:
# create variables
cols = ['loan_size','interest_rate','borrower_income','debt_to_income','num_of_accounts','derogatory_marks','total_debt', 'loan_status']
report_names=['Healthy Loan', 'High Risk Loan']

##  kNN variables
oversample = ['True', 'False']
neighbor_nums = [1,5]

---

## Split the Data into Training and Testing Sets

### Step 1: Read the `lending_data.csv` data from the `Resources` folder into a Pandas DataFrame.

In [130]:
# Read the CSV file from the Resources folder into a Pandas DataFrame
ddf = dd.read_csv("Resources/lending_data.csv")

# Review the DataFrame
ddf.dtypes

loan_size           float64
interest_rate       float64
borrower_income       int64
debt_to_income      float64
num_of_accounts       int64
derogatory_marks      int64
total_debt            int64
loan_status           int64
dtype: object

In [131]:
# compute the DataFrame values to create the pandas DataFrame
df = ddf.compute()
target_values = df['loan_status'].unique()

# if the target is letters:
##  if true, set = to default which is 1; otherwise make 0
# df['loan_status'] = (df['loan_status'] == 'g').astype(int)

print(f'df contains {df.shape[0]} rows and {df.shape[1]} columns.')
print(f'df target values are {target_values}')
print('_' * 50)

df

df contains 77536 rows and 8 columns.
df target values are [0 1]
__________________________________________________


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.430740,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
...,...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600,1
77532,17700.0,10.662,80900,0.629172,11,2,50900,1
77533,17600.0,10.595,80300,0.626401,11,2,50300,1
77534,16300.0,10.068,75300,0.601594,10,2,45300,1


In [132]:
## graphically view the probabilities

# for col in cols[:-1]:
#     plt.hist(df[df['loan_status']==1][col], color='blue', label='Healthy Loan', alpha=1, density='True')
#     plt.hist(df[df['loan_status']==0][col], color='red', label='High Risk Loan', alpha=1, density='True')
#     plt.title(col)
#     plt.ylabel('Probability')
#     plt.xlabel(col)
#     plt.legend()
#     plt.show()

### Step 2: Create the labels set (`y`)  from the “loan_status” column, and then create the features (`X`) DataFrame from the remaining columns.

In [133]:
# Separate the data into labels and features
# Separate the y variable, the labels
y = df['loan_status']      ## labels

# Separate the X variable, the features
X = df.drop(columns='loan_status')   ## features

In [134]:
y.head(), y.shape[0]

(0    0
 1    0
 2    0
 3    0
 4    0
 Name: loan_status, dtype: int64,
 77536)

In [135]:
# Review the X variable DataFrame
X.head(), X.shape[0]

(   loan_size  interest_rate  borrower_income  debt_to_income  num_of_accounts  \
 0    10700.0          7.672            52800        0.431818                5   
 1     8400.0          6.692            43600        0.311927                3   
 2     9000.0          6.963            46100        0.349241                3   
 3    10700.0          7.664            52700        0.430740                5   
 4    10800.0          7.698            53000        0.433962                5   
 
    derogatory_marks  total_debt  
 0                 1       22800  
 1                 0       13600  
 2                 0       16100  
 3                 1       22700  
 4                 1       23000  ,
 77536)

### Step 3: Check the balance of the labels variable (`y`) by using the `value_counts` function.

In [136]:
# Check the balance of our target values
## loan_status
## 0 = healthy loan, 1 = high-risk-loan
y.value_counts(dropna=False), y.value_counts(dropna=True)

(0    75036
 1     2500
 Name: loan_status, dtype: int64,
 0    75036
 1     2500
 Name: loan_status, dtype: int64)

In [137]:
#  percentage of results, ## 0 = healthy loan, 1 = high-risk-loan
df['loan_status'].value_counts(normalize = True).to_frame().style.format('{:.3%}')

Unnamed: 0,loan_status
0,96.776%
1,3.224%


### Step 4: Split the data into training and testing datasets by using `train_test_split`.

In [138]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1)
# create datasets
train = X_train.join(y_train)
test = X_test.join(y_test)

In [139]:
train

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
29175,8600.0,6.792,44500,0.325843,3,0,14500,0
23020,7800.0,6.419,41000,0.268293,2,0,11000,0
31269,10000.0,7.386,50100,0.401198,4,1,20100,0
35479,9300.0,7.093,47300,0.365751,3,0,17300,0
13470,9200.0,7.045,46900,0.360341,3,0,16900,0
...,...,...,...,...,...,...,...,...
20609,7200.0,6.177,38700,0.224806,1,0,8700,0
21440,10000.0,7.389,50100,0.401198,4,1,20100,0
73349,10200.0,7.463,50800,0.409449,4,1,20800,0
50057,11100.0,7.838,54400,0.448529,5,1,24400,0


In [140]:
test

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
60914,12600.0,8.469,60300,0.502488,6,1,30300,0
36843,9800.0,7.289,49200,0.390244,4,0,19200,0
1966,10900.0,7.770,53700,0.441341,5,1,23700,0
70137,10700.0,7.666,52700,0.430740,5,1,22700,0
27237,9900.0,7.353,49800,0.397590,4,0,19800,0
...,...,...,...,...,...,...,...,...
45639,9900.0,7.328,49600,0.395161,4,0,19600,0
11301,9900.0,7.317,49500,0.393939,4,0,19500,0
51614,8000.0,6.520,42000,0.285714,2,0,12000,0
4598,11500.0,8.001,55900,0.463327,5,1,25900,0


In [141]:
# print number of records for testing and training purposes
print(f' Number of train records: X: {len(X_train)} y: {len(y_train)}')
print(f' Number of test  records: X: {len(X_test)} y: {len(y_test)}')

 Number of train records: X: 58152 y: 58152
 Number of test  records: X: 19384 y: 19384


In [142]:
## Scale (normalize) the dataset to avoid skewing caused by numbers of various sizes
def scale_dataset(train, oversample=False):
    X = df[df.columns[:-1]].values
    y = df[df.columns[-1]].values

    # Instantiate a StandardScaler instance
    scaler = StandardScaler()

    # fit the data and transform
    X = scaler.fit_transform(X)
    
    # if we decide to use the RandomOverSampler module due to a large variance in record numbers
    ## to take more of a less class and increase the size of the dataset
    if oversample:
        ros = RandomOverSampler()
        X, y = ros.fit_resample(X, y)
        
    # create on huge 2d numpy array (hstack, horizontal stack side by side)
    data = np.hstack((X, np.reshape(y, (-1,1))))
    
    # return fit and transformed data
    return train, X, y

In [143]:
train

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
29175,8600.0,6.792,44500,0.325843,3,0,14500,0
23020,7800.0,6.419,41000,0.268293,2,0,11000,0
31269,10000.0,7.386,50100,0.401198,4,1,20100,0
35479,9300.0,7.093,47300,0.365751,3,0,17300,0
13470,9200.0,7.045,46900,0.360341,3,0,16900,0
...,...,...,...,...,...,...,...,...
20609,7200.0,6.177,38700,0.224806,1,0,8700,0
21440,10000.0,7.389,50100,0.401198,4,1,20100,0
73349,10200.0,7.463,50800,0.409449,4,1,20800,0
50057,11100.0,7.838,54400,0.448529,5,1,24400,0


In [144]:
test

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
60914,12600.0,8.469,60300,0.502488,6,1,30300,0
36843,9800.0,7.289,49200,0.390244,4,0,19200,0
1966,10900.0,7.770,53700,0.441341,5,1,23700,0
70137,10700.0,7.666,52700,0.430740,5,1,22700,0
27237,9900.0,7.353,49800,0.397590,4,0,19800,0
...,...,...,...,...,...,...,...,...
45639,9900.0,7.328,49600,0.395161,4,0,19600,0
11301,9900.0,7.317,49500,0.393939,4,0,19500,0
51614,8000.0,6.520,42000,0.285714,2,0,12000,0
4598,11500.0,8.001,55900,0.463327,5,1,25900,0


In [145]:
# NO oversample
# train, X_train, y_train = scale_dataset(train, oversample=True)
train, X_train, y_train = scale_dataset(train, oversample=False)
test, X_test, y_test = scale_dataset(test, oversample=False)

# print number of records for testing and training purposes
print(f' Number of train records: X: {len(X_train)} y: {len(y_train)}')
print(f' Number of test  records: X: {len(X_test)} y: {len(y_test)}')

 Number of train records: X: 150072 y: 150072
 Number of test  records: X: 77536 y: 77536


In [146]:
# should be rebalanced
sum(y_train == 0), sum(y_train == 1)

(75036, 75036)

In [105]:
## classification report details
## https://medium.com/@kohlishivam5522/understanding-a-classification-report-for-your-machine-learning-model-88815e2ce397#:~:text=The%20classification%20report%20visualizer%20displays,was%20positive%20and%20predicted%20positive

# PRECISION - What percent of your predictions were correct?
##  accuracy of positive predictions
# RECALL - What percent of the positive cases did you catch?
##  fraction of positives that were correctly identified.
# F1-SCORE - What percent of positive predictions were correct?
##  a weighted harmonic mean of precision and recall

---

## Create a kNN Model

###  Step 1: Fit a logistic regression model by using the training data (`X_train` and `y_train`).

In [147]:
# import the module
from sklearn.neighbors import KNeighborsClassifier

In [148]:
# use variables


# Fit the model using the resampled training data,
#   number of neighborhood points to classify a given point
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=1)

In [158]:
# Make a prediction using the testing data & print classification_report
y_pred = knn_model.predict(X_test)
print("kNN Model Classification Report for 1 n_neighbors")
print(classification_report(y_test, y_pred, target_names=report_names))

kNN Model Classification Report for 1 n_neighbors
                precision    recall  f1-score   support

  Healthy Loan       1.00      0.99      1.00     75036
High Risk Loan       0.86      0.99      0.92      2500

      accuracy                           0.99     77536
     macro avg       0.93      0.99      0.96     77536
  weighted avg       1.00      0.99      0.99     77536



In [150]:
# use n_neighbors = 5
# Fit the model using the resampled training data,
#   number of neighborhood points to classify a given point
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

KNeighborsClassifier()

In [151]:
# Make a prediction using the testing data & print classification_report
y_pred = knn_model.predict(X_test)
print("kNN Model Classification Report for 5 n_neighbors")
print(classification_report(y_test, y_pred, target_names=report_names))

kNN Model Classification Report for 5 n_neighbors
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     75036
           1       0.86      0.99      0.92      2500

    accuracy                           0.99     77536
   macro avg       0.93      0.99      0.96     77536
weighted avg       1.00      0.99      0.99     77536



## Create Naive Bayes Model

In [119]:
# import the module
from sklearn.naive_bayes import GaussianNB

In [120]:
# Fit the model using the resampled training data,
#   number of neighborhood points to classify a given point
nb_model = GaussianNB()
nb_model = nb_model.fit(X_train, y_train)

In [121]:
# Make a prediction using the testing dataa & print classification_report
y_pred = nb_model.predict(X_test)
print("Naive Bayes Model Classification Report")
print(classification_report(y_test, y_pred, target_names=report_names))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     75036
           1       0.86      0.99      0.92      2500

    accuracy                           0.99     77536
   macro avg       0.93      0.99      0.96     77536
weighted avg       1.00      0.99      0.99     77536



## Create Logistic Regression Model

In [123]:
# import the module
from sklearn.linear_model import LogisticRegression

In [124]:
# Fit the model using the resampled training data,
#   number of neighborhood points to classify a given point
lg_model = LogisticRegression()
lg_model = lg_model.fit(X_train, y_train)

In [125]:
# Make a prediction using the testing dataa & print classification_report
y_pred = lg_model.predict(X_test)
print("Logistic Regression Model Classification Report")
print(classification_report(y_test, y_pred, target_names=report_names))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00     75036
           1       0.84      1.00      0.91      2500

    accuracy                           0.99     77536
   macro avg       0.92      0.99      0.95     77536
weighted avg       0.99      0.99      0.99     77536



## Create Support Vector Machines (SVM) Model

In [159]:
# import the module
from sklearn.svm import SVC

In [160]:
# Fit the model using the resampled training data,
#   number of neighborhood points to classify a given point
svm_model = SVC()
svm_model = svm_model.fit(X_train, y_train)

In [None]:
# Make a prediction using the testing dataa & print classification_report
y_pred = svm_model.predict(X_test)
print("Support Vector Machines (SVM) Model Classification Report")
print(classification_report(y_test, y_pred, target_names=report_names))

## Create Tensorflow Neural Networks Model

In [165]:
# import the module
import tensorflow as tf

RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

ImportError: initialization failed

In [None]:
# Fit the model and complile using the resampled training data,

nn_model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(10,)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

nn_model.compile(optimizer=tf.keras.optimizers.Adam(0.001)), loss='binary_crossentropy',
                 metricvs=['accuracy'])

In [None]:
# Display the accuracy score for the test dataset.
ODaccuracy_score = accuracy_score(y_test, ODpredictions)
f'The accuracy score for the original data is {ODaccuracy_score:.3%}'
# accuracy_score(y_test, predictions)

In [None]:
# Generate a confusion matrix for the model
cm_original = confusion_matrix(y_test, ODpredictions)
cm_original_df = pd.DataFrame(
                cm_original, index=['Actual Healthy Loan', 'Actual High-Risk Loan'],
                columns=['Predicted Healthy Loan', 'Predicted High-Risk Loan']
)
# true positives 18663, 563
# true negatives 102, 56
cm_original_df

In [None]:
# Print the classification report for the model
from sklearn.metrics import classification_report
ODclassification_report = classification_report(y_test, ODpredictions, target_names=['Healthy Loan', 'High-Risk Loan'])
print(ODclassification_report)

In [None]:
# Displaying combined results to answer the next question
print("Linear Regression Calculated on the Original Data")
print("Confusion Matrix")
display(cm_original_df)
print(f"Accuracy Score : {ODaccuracy_score:.3%}\n")
print("Classification Report")
print(ODclassification_report)

### Step 4: Answer the following question.

**Question:** How well does the logistic regression model predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** The Linear Regression model run on the original data is a good prediction if a loan will default due to the high precision, recall, and f1-scores. All the data was included in the calculations due to the data being 'clean' which helped with the accuracy in making these calculations. Data that is 'clean' is not only normalized but also has correct value types where expected (such as no null values or strings type values where integers are expected).

---

## Predict a Logistic Regression Model with Resampled Training Data

### Step 1: Use the `RandomOverSampler` module from the imbalanced-learn library to resample the data. Be sure to confirm that the labels have an equal number of data points. 

In [None]:
# Import the RandomOverSampler module form imbalanced-learn (pip install imbalanced-learn)
from imblearn.over_sampling import RandomOverSampler

# Instantiate the random oversampler model
## define dataset
# X, y = make_classification(n_samples=5000, weights=[0.99], flip_y=0)

# # Assign a random_state parameter of 1 to the model
## used sampling_strategy to over sample the lessor group
# ros = RandomOverSampler(sampling_strategy='minority',random_state=1)
ros = RandomOverSampler(random_state=1)

# Fit the original training data to the random_oversampler model
## this step samples the training sample...
X_res, y_res = ros.fit_resample(X, y)


In [None]:
# Count the distinct values of the resampled labels data
X_res.shape, y_res.shape

In [None]:
print(X_res)

In [None]:
print(y_res)

In [None]:
# Split the data using train_test_split
# Assign a random_state of 1 to the function
X_res, X_testROS, y_res, y_testROS = train_test_split(X, 
                                                      y, 
                                                      random_state=1)

In [None]:
X_res, X_testROS, y_res, y_testROS

In [None]:
## Scale (normalize) the dataset to avoid skewing caused by numbers of various sizes
# Instantiate a StandardScaler instance
scalerROS = StandardScaler()
# Fit the training data to the standard scaler
X_scaler = scalerROS.fit(X_res)

# Transform the training data using the scaler
X_res_scaled = X_scaler.transform(X_res)

# Transform the testing data using the scaler
X_test_scaledROS = X_scaler.transform(X_testROS)

In [None]:
# scaledROS_df = pd.DataFrame(
#                 X_test_scaledROS
# )

# scaledROS_df.info()
# scaledROS_df

### Step 2: Use the `LogisticRegression` classifier and the resampled data to fit the model and make predictions.

In [None]:
# Instantiate the Logistic Regression model
from sklearn.linear_model import LogisticRegression
from collections import Counter
from sklearn.datasets import make_classification

# Assign a random_state parameter of 1 to the model
ROSclassifier = LogisticRegression(solver='lbfgs',random_state=1)
# ROSclassifier

# Fit the model using the resampled training data
ROSclassifier.fit(X_res_scaled, y_res)

# # Make a prediction using the testing data
ROStest_predictions = ROSclassifier.predict(X_test_scaledROS)

# # show test predictions
ROSresults = pd.DataFrame({"Prediction": ROStest_predictions, "Actual": y_testROS}).reset_index(drop=True)
ROSresults.info()
ROSresults.head(10)

In [None]:
# Score the model
print(f"RandomOverSample Data Score: {ROSclassifier.score(X_res_scaled, y_res):.3%}")
print(f"         Testing Data Score: {ROSclassifier.score(X_test_scaledROS, y_testROS):.3%}")

In [None]:
# Make a prediction using the testing data
ROSpredictions = ROSclassifier.predict(X_test_scaledROS)
pd.DataFrame({"Prediction": ROStest_predictions, "Actual": y_testROS}).tail(10)

### Step 3: Evaluate the model’s performance by doing the following:

* Calculate the accuracy score of the model.

* Generate a confusion matrix.

* Print the classification report.

In [None]:
# Print the balanced_accuracy score of the model 
f'The Logistic Regression Model RandomOverSample balanced accuracy score is {balanced_accuracy_score(y_testROS, ROStest_predictions):.3%}'

In [None]:
# accuracy score
ROSaccuracy_score = accuracy_score(y_testROS, ROStest_predictions)
f'The Logistic Regression Model RandomOverSample accuracy score is {ROSaccuracy_score:.3%}'
# accuracy_score(y_test, predictions)

In [None]:
# Generate a confusion matrix for the model
ROStest_matrix = confusion_matrix(y_testROS, ROSpredictions)
ROStest_matrix
# # true positives 18652, 113
# # true negatives 10, 609

cm_ROS = confusion_matrix(y_testROS, ROSpredictions)
cm_ROS_df = pd.DataFrame(
                cm_ROS, index=['Actual Healthy Loan', 'Actual High-Risk Loan'],
                columns=['Predicted Healthy Loan', 'Predicted High-Risk Loan']
)

cm_ROS_df

In [None]:
# understanding the confusion matrix:
print(f"The true positives in the Logistic Regression Model are: {ROStest_matrix[0][0]} and {ROStest_matrix[1][1]}")
print(f"The true negatives in the Logistic Regression Model are: {ROStest_matrix[0][1]} and {ROStest_matrix[1][0]}")

In [None]:
# X.shape, y.shape, len(X_test_scaled), y_res.shape

In [None]:
# Print the classification report for the model
ROSclassification_report = classification_report(y_testROS, ROSpredictions, target_names=['Healthy Loan', 'High-Risk Loan'])
print(ROSclassification_report)

In [None]:
# Displaying combined results to answer the next question
print("Linear Regression Calculated on the RandomOverSampler Data")
print("Confusion Matrix")
display(cm_ROS_df)
print(f"Accuracy Score : {ROSaccuracy_score:.3%}\n")
print("Classification Report")
print(ROSclassification_report)

### Step 4: Answer the following question

**Question:** How well does the logistic regression model, fit with oversampled data, predict both the `0` (healthy loan) and `1` (high-risk loan) labels?

**Answer:** YOUR ANSWER HERE!