In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from pathlib import Path

In [3]:
#Function for getting model scores
def test_model(model, data):
    X_train_scaled, X_test_scaled, y_train, y_test = data
    reg = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(reg).__name__}')
    print(f'Train score: {reg.score(X_train_scaled, y_train)}')
    print(f'Test Score: {reg.score(X_test_scaled, y_test)}\n')
    return reg.score(X_test_scaled, y_test)

In [4]:
# Read the CSV file into a Pandas DataFrame
file = Path('Resources/lending_data.csv')
df = pd.read_csv(file)
df.head()

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [5]:
#Check for null values
null = df.isna().sum()
null

loan_size           0
interest_rate       0
borrower_income     0
debt_to_income      0
num_of_accounts     0
derogatory_marks    0
total_debt          0
loan_status         0
dtype: int64

## I think the  random tree clasifier will give better accuracy on this case.

In [6]:
#Define the features set X and the target y
X = df.drop('loan_status', axis=1)
y = df['loan_status']

In [7]:
#Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
#Create a StandardScaler instance
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
data = [X_train_scaled, X_test_scaled, y_train, y_test]

In [8]:
#Dependencies
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor

In [46]:
#Logistic Regression Score
test_model(LogisticRegression(), data)

Model: LogisticRegression
Train score: 0.9942908240473243
Test Score: 0.9936545604622369



In [57]:
#Random Forest instance and parameters
rfc = RandomForestRegressor()
parameters = {
    "n_estimators":[100,250,500],
    "max_depth":[2,4],
    "max_features":["auto","sqrt","log2"]#,
    #"min_sample_leaf":[1,2,3,4,5]
}

In [58]:
#GridSearchCV to find the best parameters
from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(rfc,parameters,cv=5)
cv.fit(X_train_scaled,y_train.values.ravel())

GridSearchCV(cv=5, estimator=RandomForestRegressor(),
             param_grid={'max_depth': [2, 4],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [100, 250, 500]})

In [63]:
#Function to display the results
def display(results):
    print(f'Best parameters are: {results.best_params_}')
    print("\n")
    mean_score = results.cv_results_['mean_test_score']
    std_score = results.cv_results_['std_test_score']
    params = results.cv_results_['params']
    for mean,std,params in zip(mean_score,std_score,params):
        print(f'{round(mean,3)} + or -{round(std,3)} for the {params}')

In [60]:
display(cv)

Best parameters are: {'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 100}


0.85 + or -0.017 for the {'max_depth': 2, 'max_features': 'auto', 'n_estimators': 100}
0.85 + or -0.017 for the {'max_depth': 2, 'max_features': 'auto', 'n_estimators': 250}
0.85 + or -0.017 for the {'max_depth': 2, 'max_features': 'auto', 'n_estimators': 500}
0.85 + or -0.017 for the {'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 100}
0.85 + or -0.017 for the {'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 250}
0.85 + or -0.017 for the {'max_depth': 2, 'max_features': 'sqrt', 'n_estimators': 500}
0.85 + or -0.016 for the {'max_depth': 2, 'max_features': 'log2', 'n_estimators': 100}
0.85 + or -0.017 for the {'max_depth': 2, 'max_features': 'log2', 'n_estimators': 250}
0.85 + or -0.017 for the {'max_depth': 2, 'max_features': 'log2', 'n_estimators': 500}
0.849 + or -0.017 for the {'max_depth': 4, 'max_features': 'auto', 'n_estimators': 100}
0.849 + or -0.017 for the {'max_depth': 4, '

In [62]:
#Random Forest Score with the best parameters
cv.score(X_test_scaled,y_test)

0.8312362185731026