In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('/Credit-Risk-Modelling/Datasets/loan_data_clean.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32574 entries, 0 to 32573
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32574 non-null  int64  
 1   person_income               32574 non-null  int64  
 2   person_home_ownership       32574 non-null  object 
 3   person_emp_length           32574 non-null  float64
 4   loan_intent                 32574 non-null  object 
 5   loan_grade                  32574 non-null  object 
 6   loan_amnt                   32574 non-null  int64  
 7   loan_int_rate               32574 non-null  float64
 8   loan_status                 32574 non-null  int64  
 9   loan_percent_income         32574 non-null  float64
 10  cb_person_default_on_file   32574 non-null  object 
 11  cb_person_cred_hist_length  32574 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
person_age,32574.0,27.718426,6.204987,20.0,23.0,26.0,30.0,94.0
person_income,32574.0,65878.480813,52531.938809,4000.0,38500.0,55000.0,79200.0,2039784.0
person_emp_length,32574.0,4.760576,3.981181,0.0,2.0,4.0,7.0,41.0
loan_amnt,32574.0,9588.018051,6320.249598,500.0,5000.0,8000.0,12200.0,35000.0
loan_int_rate,32574.0,11.00947,3.081664,5.42,8.49,10.99,13.11,23.22
loan_status,32574.0,0.21818,0.413017,0.0,0.0,0.0,0.0,1.0
loan_percent_income,32574.0,0.170202,0.106755,0.0,0.09,0.15,0.23,0.83
cb_person_cred_hist_length,32574.0,5.804108,4.053873,2.0,3.0,4.0,8.0,30.0


### Simple Logistic Regression using only one feature

In [9]:
X = df[['loan_int_rate']]
y = df[['loan_status']]

In [14]:
one_param = LogisticRegression(random_state=2).fit(X, np.ravel(y))

In [16]:
one_param.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 2,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [17]:
one_param.intercept_

array([-4.42067035])

##### Add more parameters

In [18]:
X = df[['loan_int_rate', 'person_emp_length']]
y = df[['loan_status']]

In [19]:
two_param = LogisticRegression(random_state=2).fit(X, np.ravel(y))

In [20]:
two_param.intercept_

array([-4.17793465])

this model has an .intercept_ value closer to zero. 
<br>This means the log odds of a non-default is approaching zero.

## Use train, test and add more parameters

In [22]:
X = df[['loan_int_rate', 'person_emp_length', 'person_income']]
y = df[['loan_status']]

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=2)

In [26]:
multi_param = LogisticRegression(random_state=2).fit(X_train, np.ravel(y_train))

In [27]:
multi_param.coef_

array([[ 8.20256019e-02, -6.46164736e-02, -3.21720652e-05]])

In [28]:
multi_param.intercept_

array([-0.01330487])

there are three values in .coef_? This tells you how important each column, or feature, was for predicting. The more positive the value, the more it predicts defaults.

#### One hot encoding for object variables

In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32574 entries, 0 to 32573
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32574 non-null  int64  
 1   person_income               32574 non-null  int64  
 2   person_home_ownership       32574 non-null  object 
 3   person_emp_length           32574 non-null  float64
 4   loan_intent                 32574 non-null  object 
 5   loan_grade                  32574 non-null  object 
 6   loan_amnt                   32574 non-null  int64  
 7   loan_int_rate               32574 non-null  float64
 8   loan_status                 32574 non-null  int64  
 9   loan_percent_income         32574 non-null  float64
 10  cb_person_default_on_file   32574 non-null  object 
 11  cb_person_cred_hist_length  32574 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


In [33]:
one_hot = pd.get_dummies(df)

In [34]:
one_hot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32574 entries, 0 to 32573
Data columns (total 27 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   person_age                      32574 non-null  int64  
 1   person_income                   32574 non-null  int64  
 2   person_emp_length               32574 non-null  float64
 3   loan_amnt                       32574 non-null  int64  
 4   loan_int_rate                   32574 non-null  float64
 5   loan_status                     32574 non-null  int64  
 6   loan_percent_income             32574 non-null  float64
 7   cb_person_cred_hist_length      32574 non-null  int64  
 8   person_home_ownership_MORTGAGE  32574 non-null  uint8  
 9   person_home_ownership_OTHER     32574 non-null  uint8  
 10  person_home_ownership_OWN       32574 non-null  uint8  
 11  person_home_ownership_RENT      32574 non-null  uint8  
 12  loan_intent_DEBTCONSOLIDATION   

In [36]:
one_hot['loan_intent_HOMEIMPROVEMENT'].value_counts()

0    28969
1     3605
Name: loan_intent_HOMEIMPROVEMENT, dtype: int64

In [35]:
df['loan_intent'].value_counts()

EDUCATION            6451
MEDICAL              6071
VENTURE              5716
PERSONAL             5519
DEBTCONSOLIDATION    5212
HOMEIMPROVEMENT      3605
Name: loan_intent, dtype: int64

In [38]:
X = one_hot.drop('loan_status', axis=1)
y = one_hot[['loan_status']]

In [41]:
X_test, X_train, y_test, y_train = train_test_split(X, y, test_size=0.4, random_state=2)

In [42]:
prob = LogisticRegression(random_state=2).fit(X_train, np.ravel(y_train))

In [43]:
prob.predict_proba(X_test)  # prob show as two arrays: non-default, default

array([[0.60340781, 0.39659219],
       [0.92954828, 0.07045172],
       [0.60185769, 0.39814231],
       ...,
       [0.80319276, 0.19680724],
       [0.73876939, 0.26123061],
       [0.79818205, 0.20181795]])

In [44]:
prob.score(X_test, y_test)

0.8022922636103151