In [42]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

from matplotlib import pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# Make a prediction

Prediction: 

    Random forest Classifer will perform better than Logistic regression because the dataset consists of data that expands beyond just yes or no answers. Random Forest models tend to work best with categorical data, since this data set has multiple groups that can be linked together that are more complex then yes/no. Logistic Regression, on the other hand, performs best with linearly separable datasets.

models: 

    Logistic regression: is an example of supervised learning. It is used to calculate or predict the probability of a binary (yes/no) event occurring.
    
    Random forest Classifer: is a classification algorithm consisting of many decisions trees. It uses bagging and feature randomness when building each individual tree to try to create an uncorrelated forest of trees whose prediction by committee is more accurate than that of any individual tree.

In [33]:
# Import the data
df = pd.read_csv(Path('Resources/lending_data.csv'))

In [34]:
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


# Split the Data into Training and Testing

In [35]:
# High Risk vs Low Risk
y = df["loan_status"]
X = df.drop(columns = ["loan_status"])
X .head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.43074,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000


In [36]:
# Check the balance of our target values 
y.value_counts()

0    75036
1     2500
Name: loan_status, dtype: int64

In [37]:
# Split the X and y into X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)
X_train.shape

(58152, 7)

# Logistic Regression model

In [29]:
# Create a logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression()

In [30]:
# Fit (train) our model by using the training data
classifier.fit(X_train, y_train)

LogisticRegression()

In [31]:
# Validate the model by using the test data
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9921240885954051
Testing Data Score: 0.9918489475856377


In [38]:
# Make predictions
predictions = classifier.predict(X_test)
print(f"First 10 predictions:   {predictions[:10]}")
print(f"First 10 actual labels: {y_test[:10].tolist()}")

First 10 predictions:   [0 0 0 0 0 0 0 0 0 0]
First 10 actual labels: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [39]:
# Create the confusion matrix
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = classifier.predict(X_test)
confusion_matrix(y_true, y_pred)

array([[18663,   102],
       [   56,   563]], dtype=int64)

In [40]:
# Manually calculate the accuracy of the model
TP = 149
FP = 2
TN = 191
FN = 1
accuracy = (TP + TN) / (TP + FP + TN + FN)
print(accuracy)

0.9912536443148688


# Random Forest Classifier model 

In [43]:
# Create data
X, y = make_classification(random_state=1, n_features=50, n_informative=5, n_redundant=0)
X = pd.DataFrame(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [44]:
clf = RandomForestClassifier(random_state=1, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.76


In [45]:
from sklearn.feature_selection import SelectFromModel
sel = SelectFromModel(clf)
sel.fit(X_train_scaled, y_train)

SelectFromModel(estimator=RandomForestClassifier(n_estimators=500,
                                                 random_state=1))

In [46]:
X_selected_train, X_selected_test, y_train, y_test = train_test_split(sel.transform(X), y, random_state=1)
scaler = StandardScaler().fit(X_selected_train)
X_selected_train_scaled = scaler.transform(X_selected_train)
X_selected_test_scaled = scaler.transform(X_selected_test)

In [47]:
clf = LogisticRegression().fit(X_train_scaled, y_train)
print(f'Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_test_scaled, y_test)}')

Training Score: 1.0
Testing Score: 0.68


In [48]:
clf = LogisticRegression()
clf.fit(X_selected_train_scaled, y_train)
print(f'Training Score: {clf.score(X_selected_train_scaled, y_train)}')
print(f'Testing Score: {clf.score(X_selected_test_scaled, y_test)}')

Training Score: 0.9066666666666666
Testing Score: 0.84


In [9]:
# Import the data

In [111]:
# Split the data into X_train, X_test, y_train, y_test

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
32815,8200.0,6.598,42700,0.297424,2,0,12700
59572,10500.0,7.591,52000,0.423077,4,1,22000
42325,7600.0,6.351,40400,0.257426,2,0,10400
39070,9700.0,7.254,48900,0.386503,4,0,18900
42524,11800.0,8.155,57300,0.47644,6,1,27300


In [109]:
# Train a Logistic Regression model print the model score

0.9908171687990095

In [110]:
# Train a Random Forest Classifier model and print the model score

0.9910751134956666