# Credit Risk Evaluator

In [9]:
# Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
import warnings
warnings.filterwarnings('ignore')


## Retrieve the Data

The data is located in the Challenge Files Folder:

* `lending_data.csv`

Import the data using Pandas. Display the resulting dataframe to confirm the import was successful.

In [10]:
# Import the data
file_path = Path('Resources/lending_data.csv')
credit_risk = pd.read_csv(file_path)
credit_risk.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [11]:
# Column names
credit_risk.columns

Index(['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income',
       'num_of_accounts', 'derogatory_marks', 'total_debt', 'loan_status'],
      dtype='object')

In [12]:
#Count of rows with null values
credit_risk.isnull().sum()

loan_size           0
interest_rate       0
borrower_income     0
debt_to_income      0
num_of_accounts     0
derogatory_marks    0
total_debt          0
loan_status         0
dtype: int64

In [13]:
# Seperate the data set into data and target - define the X (features) and y (Target) sets

y = credit_risk["loan_size"].values
X = credit_risk.drop("loan_size", axis=1)


In [14]:
# Standardize the data
X_scaled = StandardScaler().fit_transform(X)
print(X_scaled[0:5])


[[ 0.4268375   0.42740435  0.66857141  0.61614258  1.04399575  0.42740435
  -0.18253038]
 [-0.67491817 -0.67155173 -0.80216566 -0.43404935 -0.67397306 -0.67155173
  -0.18253038]
 [-0.370249   -0.37292236 -0.34442423 -0.43404935 -0.67397306 -0.37292236
  -0.18253038]
 [ 0.41784357  0.41545918  0.65534557  0.61614258  1.04399575  0.41545918
  -0.18253038]
 [ 0.45606775  0.4512947   0.69487335  0.61614258  1.04399575  0.4512947
  -0.18253038]]


## Predict Model Performance

You will be creating and comparing two models on this data: a Logistic Regression, and a Random Forests Classifier. Before you create, fit, and score the models, make a prediction as to which model you think will perform better. You do not need to be correct! 

Write down your prediction in the designated cells in your Jupyter Notebook, and provide justification for your educated guess.

*After reviewing the data, my prediction is the logistical Regression Model will be more accurate*

## Split the Data into Training and Testing Sets

In [15]:
# Split the data into X_train, X_test, y_train, y_test
y = credit_risk['loan_status'].values
X = credit_risk.drop('loan_status', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)


In [16]:
y

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [17]:
X 

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.430740,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000
...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600
77532,17700.0,10.662,80900,0.629172,11,2,50900
77533,17600.0,10.595,80300,0.626401,11,2,50300
77534,16300.0,10.068,75300,0.601594,10,2,45300


In [18]:
# Create a scaler to standardize the data
scaler = StandardScaler()


In [19]:
# Train the scaler with the X_train data.
scaler.fit(X_train)


In [20]:
# Transform X_train and X_test.
# Note that the scaler used to transform X_train and X_test was trained on X_train.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


## Create prediction with KNN

In [21]:
# Instantiate KNN model and make predictions
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)


In [22]:
# Assess the accuracy score
accuracy_score(y_test, y_pred)


0.994428394552208

## Create, Fit and Compare Models

Create a Logistic Regression model, fit it to the data, and print the model's score. Do the same for a Random Forest Classifier. You may choose any starting hyperparameters you like. 

Which model performed better? How does that compare to your prediction? Write down your results and thoughts in the designated markdown cell.

In [23]:
# Creating Logistical Regression model
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

classifier = LogisticRegression()
classifier


In [24]:
# Train (Fit) the model using training data
classifier.fit(X_train, y_train)


In [25]:
# Create a logistical regression model
X_selected_train, X_selected_test, y_train, y_test = train_test_split(X, y, random_state=1)
scaler = StandardScaler().fit(X_selected_train)
X_selected_train_scaled = scaler.transform(X_selected_train)
X_selected_test_scaled = scaler.transform(X_selected_test)


In [26]:
# Validate the modle using the test data
print(f"Logistical Regression Model Training Score: {classifier.score(X_train, y_train)}")
print(f"Logistical Regression Model Testing Score: {classifier.score(X_test, y_test)}")


Logistical Regression Model Training Score: 0.9921240885954051
Logistical Regression Model Testing Score: 0.9918489475856377


In [27]:
# Train a Random Forest Classifier model and print the model score
X, y = make_classification(
    random_state=1, n_features=50, n_informative=5, n_redundant=0)
X = pd.DataFrame(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [28]:
# Get results of Random Forest Classifier
clf = RandomForestClassifier(
    random_state=12, n_estimators=500).fit(X_train_scaled, y_train)
print(f'Random Forest Classifer Training Score: {clf.score(X_train_scaled, y_train)}')
print(f'Random Forest Classifer Testing Score: {clf.score(X_test_scaled, y_test)}')


Random Forest Classifer Training Score: 1.0
Random Forest Classifer Testing Score: 0.72


### *Which model performed better? The Logistical Regression Model performed much better with greater accuracy than the Random Forest Classifer Model.  In my opinion, this is because the Logistical Regression Model performs better with categorical data.*