# Logistic Regression

Logistic regression is a classification algorithm used to predict which of two known classes or categories an observation belongs to, such as yes/no, 0/1, or True/False.



In [1]:
# Dependencies

%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os

In [2]:
# Load dataset

# Pass the path components separately so os.path.join inserts the separator
# for the current OS; with a single argument it returns the string unchanged.
# The path is relative, so it is resolved against the kernel's working directory.
file_path = os.path.join("Resources", "lending_data.csv")
df = pd.read_csv(file_path)
# Preview the first 26 rows to sanity-check the columns and values.
df.head(26)

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0
5,10100.0,7.438,50600,0.407115,4,1,20600,0
6,10300.0,7.49,51100,0.412916,4,1,21100,0
7,8800.0,6.857,45100,0.334812,3,0,15100,0
8,9300.0,7.096,47400,0.367089,3,0,17400,0
9,9700.0,7.248,48800,0.385246,4,0,18800,0


In [3]:
# Print the plain-text representation of the full frame; pandas abbreviates
# the middle rows with "..." and wraps wide frames, as seen in the output below.
print(df)

       loan_size  interest_rate  borrower_income  debt_to_income  \
0        10700.0          7.672            52800        0.431818   
1         8400.0          6.692            43600        0.311927   
2         9000.0          6.963            46100        0.349241   
3        10700.0          7.664            52700        0.430740   
4        10800.0          7.698            53000        0.433962   
...          ...            ...              ...             ...   
77531    19100.0         11.261            86600        0.653580   
77532    17700.0         10.662            80900        0.629172   
77533    17600.0         10.595            80300        0.626401   
77534    16300.0         10.068            75300        0.601594   
77535    15600.0          9.742            72300        0.585062   

       num_of_accounts  derogatory_marks  total_debt  loan_status  
0                    5                 1       22800            0  
1                    3                 0       

In [4]:
# Separate the data into the feature matrix X (every column except the label)
# and the target vector y (the "loan_status" column as a NumPy array).

X = df.drop(columns="loan_status")
y = df["loan_status"].to_numpy()

In [5]:
# Display the feature frame to confirm "loan_status" is no longer among the columns.
X

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
0,10700.0,7.672,52800,0.431818,5,1,22800
1,8400.0,6.692,43600,0.311927,3,0,13600
2,9000.0,6.963,46100,0.349241,3,0,16100
3,10700.0,7.664,52700,0.430740,5,1,22700
4,10800.0,7.698,53000,0.433962,5,1,23000
...,...,...,...,...,...,...,...
77531,19100.0,11.261,86600,0.653580,12,2,56600
77532,17700.0,10.662,80900,0.629172,11,2,50900
77533,17600.0,10.595,80300,0.626401,11,2,50300
77534,16300.0,10.068,75300,0.601594,10,2,45300


### Split our data into two sets: training and testing data

In [7]:
from sklearn.model_selection import train_test_split

# Partition features and labels into train/test subsets. The split proportion
# is left at the library default; random_state=1 pins the shuffle so that
# re-running the cell reproduces the same partition.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    random_state=1,
)

In [8]:
# Inspect the training features; the shuffled row labels in the output show the split is random.
X_train

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
29175,8600.0,6.792,44500,0.325843,3,0,14500
23020,7800.0,6.419,41000,0.268293,2,0,11000
31269,10000.0,7.386,50100,0.401198,4,1,20100
35479,9300.0,7.093,47300,0.365751,3,0,17300
13470,9200.0,7.045,46900,0.360341,3,0,16900
...,...,...,...,...,...,...,...
20609,7200.0,6.177,38700,0.224806,1,0,8700
21440,10000.0,7.389,50100,0.401198,4,1,20100
73349,10200.0,7.463,50800,0.409449,4,1,20800
50057,11100.0,7.838,54400,0.448529,5,1,24400


In [13]:
# Inspect the held-out test features produced by the split.
X_test

Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt
60914,12600.0,8.469,60300,0.502488,6,1,30300
36843,9800.0,7.289,49200,0.390244,4,0,19200
1966,10900.0,7.770,53700,0.441341,5,1,23700
70137,10700.0,7.666,52700,0.430740,5,1,22700
27237,9900.0,7.353,49800,0.397590,4,0,19800
...,...,...,...,...,...,...,...
45639,9900.0,7.328,49600,0.395161,4,0,19600
11301,9900.0,7.317,49500,0.393939,4,0,19500
51614,8000.0,6.520,42000,0.285714,2,0,12000
4598,11500.0,8.001,55900,0.463327,5,1,25900


In [9]:
# Inspect the training labels (a NumPy array, since y was built from .values).
y_train

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [14]:
# Inspect the held-out test labels that the model will be scored against.
y_test

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

### Create a logistic regression model

In [10]:
# Create the model by instantiating the LogisticRegression class.
# No arguments are passed, so every hyperparameter keeps its library default.

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
# Echo the (still unfitted) estimator so the cell output shows its configuration.
classifier


LogisticRegression()

### Fit (train) our model by using the training data

In [11]:
# Fit (train) the model on the training split only; the test split is kept
# aside so it can be used for validation afterwards.
classifier.fit(X_train, y_train)

LogisticRegression()

### Validate the model by using the test data

In [12]:
# Validate the fitted model by scoring it on both splits. A score close to 1.0
# on the training set and on the test set is the ideal outcome.
print(
    f"Training Data Score: {classifier.score(X_train, y_train)}",
    f"Testing Data Score: {classifier.score(X_test, y_test)}",
    sep="\n",
)

Training Data Score: 0.9921240885954051
Testing Data Score: 0.9918489475856377
