In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# For modeling
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Use seaborn's "whitegrid" theme for every plot in this notebook
sns.set_theme(style='whitegrid')
pd.options.mode.chained_assignment = None  # Suppress the chained-assignment warning, default='warn'

# Read Data

In [2]:
# Load the preprocessed loan data from disk and report its dimensions
preprocessed_csv = '../data/preprocessed/accepted_2007_to_2018Q4_preprocessed.csv'
df = pd.read_csv(preprocessed_csv)
print(f"Shape of df: {df.shape}")
# Preview the first five rows
df.head(5)

Shape of df: (1345310, 101)


Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,emp_length,home_ownership,...,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,hardship_flag,disbursement_method,debt_settlement_flag
0,3600.0,3600.0,3600.0,36,13.99,123.03,C,C4,10.0,MORTGAGE,...,0.0,0.0,0.0,178050.0,7746.0,2400.0,13734.0,N,Cash,N
1,24700.0,24700.0,24700.0,36,11.99,820.28,C,C1,10.0,MORTGAGE,...,7.7,0.0,0.0,314017.0,39475.0,79300.0,24667.0,N,Cash,N
2,20000.0,20000.0,20000.0,60,10.78,432.66,B,B4,10.0,MORTGAGE,...,50.0,0.0,0.0,218418.0,18696.0,6200.0,14877.0,N,Cash,N
3,10400.0,10400.0,10400.0,60,22.45,289.91,F,F1,3.0,MORTGAGE,...,60.0,0.0,0.0,439570.0,95768.0,20300.0,88097.0,N,Cash,N
4,11950.0,11950.0,11950.0,36,13.44,405.18,C,C3,4.0,RENT,...,100.0,0.0,0.0,16900.0,12798.0,9400.0,4000.0,N,Cash,N


In [3]:
# Show the dtype of each column of df
df.dtypes

loan_amnt                     float64
funded_amnt                   float64
funded_amnt_inv               float64
term                            int64
int_rate                      float64
                               ...   
total_bc_limit                float64
total_il_high_credit_limit    float64
hardship_flag                  object
disbursement_method            object
debt_settlement_flag           object
Length: 101, dtype: object

## Take Sample to Work with
TODO: Delete this sampling step later and run the notebook on the full data.

In [4]:
# Take only a fraction (33%) of the rows to keep iteration fast.
# A fixed random_state makes the draw reproducible: without it every run
# selects a different subset, so all downstream results change between runs.
df = df.sample(frac=0.33, random_state=1997)
print(df.shape)

(443952, 101)


# Encode Categorical / Nominal Columns as Numeric with One-Hot Encoding (OHE)

In [5]:
# List the columns stored with object dtype (the categorical candidates)
column_dtypes = df.dtypes
column_dtypes[column_dtypes == "object"]

grade                   object
sub_grade               object
home_ownership          object
verification_status     object
pymnt_plan              object
purpose                 object
addr_state              object
initial_list_status     object
application_type        object
hardship_flag           object
disbursement_method     object
debt_settlement_flag    object
dtype: object

In [6]:
# Remove the categorical columns considered to have too many distinct values for OHE
high_cardinality_columns = ["sub_grade", "purpose", "grade", "addr_state"]
df = df.drop(columns=high_cardinality_columns)
# Show the object-dtype columns that remain after the drop
remaining_dtypes = df.dtypes
remaining_dtypes[remaining_dtypes == "object"]

home_ownership          object
verification_status     object
pymnt_plan              object
initial_list_status     object
application_type        object
hardship_flag           object
disbursement_method     object
debt_settlement_flag    object
dtype: object

# One-Hot Encoding (OHE) of Categorical Data

In [11]:
# Get the names of the categorical (object-dtype) columns
categorical_columns = df.dtypes[df.dtypes == "object"].index.tolist()
# Transform those categorical columns using One Hot Encoding.
# The dtype is pinned to an unsigned integer: pandas >= 2.0 defaults to bool
# dummies, which a later numeric-only column selection (np.number) would
# silently discard. uint8 yields 0/1 indicator columns on every pandas version.
df = pd.get_dummies(data=df, columns=categorical_columns, dtype=np.uint8)
# Preview the encoded frame
df.head(10)

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,annual_inc,loan_status,dti,...,pymnt_plan_n,initial_list_status_f,initial_list_status_w,application_type_Individual,application_type_Joint App,hardship_flag_N,disbursement_method_Cash,disbursement_method_DirectPay,debt_settlement_flag_N,debt_settlement_flag_Y
606699,12000.0,12000.0,12000.0,36,10.75,391.45,3.0,75000.0,0,21.39,...,1,0,1,1,0,1,1,0,1,0
1215951,11900.0,11900.0,11900.0,36,15.59,415.97,5.0,46500.0,0,8.29,...,1,0,1,1,0,1,1,0,1,0
413101,20000.0,20000.0,20000.0,36,12.62,670.23,2.0,69888.0,0,13.43,...,1,0,1,1,0,1,1,0,1,0
1232864,30000.0,30000.0,30000.0,60,24.49,871.6,5.0,60000.0,1,16.44,...,1,0,1,1,0,1,1,0,1,0
1007993,5000.0,5000.0,5000.0,36,7.9,156.46,2.0,72500.0,0,14.17,...,1,1,0,1,0,1,1,0,1,0
411717,25000.0,25000.0,25000.0,60,13.59,576.41,0.0,118000.0,1,38.99,...,1,0,1,0,1,1,1,0,1,0
426776,24000.0,24000.0,24000.0,36,9.44,768.12,10.0,70000.0,1,18.69,...,1,1,0,1,0,1,1,0,1,0
401848,40000.0,40000.0,39950.0,36,15.05,1387.6,4.0,106000.0,0,38.67,...,1,1,0,0,1,1,1,0,1,0
502266,12000.0,12000.0,12000.0,36,5.32,361.38,2.0,82000.0,0,11.21,...,1,0,1,1,0,1,1,0,1,0
1274627,14000.0,14000.0,14000.0,36,7.07,432.73,10.0,96000.0,0,8.13,...,1,0,1,1,0,1,1,0,1,0


In [14]:
# Count how many columns there are of each dtype
df.dtypes.value_counts()

float64    87
object      8
int64       2
dtype: int64

# Preprocessing

In [3]:
# Keep the numeric columns only; every non-numeric column is discarded here
# TODO: categorical values must be encoded to numeric first, otherwise they are lost at this step
numeric_df = df.select_dtypes(include=[np.number])

# Replace each missing value with the median of its column
# NOTE(review): the medians are computed on the whole frame, before the
# train/test split below - consider imputing after the split to avoid leakage.
column_medians = numeric_df.median()
df = numeric_df.fillna(column_medians)

# Preview
df.head(5)

Unnamed: 0,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,emp_length,annual_inc,loan_status,dti,...,num_tl_90g_dpd_24m,num_tl_op_past_12m,pct_tl_nvr_dlq,percent_bc_gt_75,pub_rec_bankruptcies,tax_liens,tot_hi_cred_lim,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit
0,3600.0,3600.0,3600.0,36,13.99,123.03,10.0,55000.0,0,5.91,...,0.0,3.0,76.9,0.0,0.0,0.0,178050.0,7746.0,2400.0,13734.0
1,24700.0,24700.0,24700.0,36,11.99,820.28,10.0,65000.0,0,16.06,...,0.0,2.0,97.4,7.7,0.0,0.0,314017.0,39475.0,79300.0,24667.0
2,20000.0,20000.0,20000.0,60,10.78,432.66,10.0,63000.0,0,10.78,...,0.0,0.0,100.0,50.0,0.0,0.0,218418.0,18696.0,6200.0,14877.0
3,10400.0,10400.0,10400.0,60,22.45,289.91,3.0,104433.0,0,25.37,...,0.0,4.0,96.6,60.0,0.0,0.0,439570.0,95768.0,20300.0,88097.0
4,11950.0,11950.0,11950.0,36,13.44,405.18,4.0,34000.0,0,10.2,...,0.0,0.0,100.0,100.0,0.0,0.0,16900.0,12798.0,9400.0,4000.0


## Data Split

In [4]:
# Share of the rows held out for testing
TEST_SIZE = 0.3

# Separate the target column from the feature columns
y = df["loan_status"]
X = df.drop(columns="loan_status")

# Stratified, seeded split so both parts keep the class proportions of y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=1997,
    stratify=y,
)

# Report the dimensions of every part of the split
for part_name, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {part_name}: {part.shape}")

Shape of X_train: (941717, 88)
Shape of X_test: (403593, 88)
Shape of y_train: (941717,)
Shape of y_test: (403593,)


## Scale Data

In [5]:
# Create the scaler; it is fitted on the training features only
scaler = StandardScaler()
# Fit on X_train, standardise it, and wrap the resulting array in a DataFrame
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train))
# Preview the scaled training features
X_train_scaled.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,78,79,80,81,82,83,84,85,86,87
0,-1.152367,-1.151919,-1.149012,-0.563661,-0.026841,-1.110779,0.287232,-0.485986,-0.892698,-0.362623,...,-0.171852,1.018006,0.659137,0.614594,-0.35461,-0.131363,-0.839936,-0.696632,-0.320656,-0.796593
1,2.153819,2.155715,2.157652,1.774117,0.6943,1.44907,0.56643,0.862696,0.612028,-0.362623,...,-0.171852,0.46219,0.659137,1.205706,-0.35461,-0.131363,-0.152515,1.087189,0.283468,1.491959
2,1.557387,1.559021,1.561133,1.774117,0.817984,0.993616,1.124826,1.014078,-0.22571,1.920589,...,3.91279,-1.205259,-0.672513,-0.334597,-0.35461,-0.131363,3.038517,0.948475,-0.797347,1.686703
3,-0.507187,-0.506457,-0.50374,-0.563661,-1.060335,-0.473924,0.008034,-0.362127,1.306586,-0.362623,...,-0.171852,0.46219,0.226935,0.341772,-0.35461,2.380595,-0.573706,-0.373701,-0.188504,-0.282918
4,-1.043403,-1.042908,-1.040033,-0.563661,0.434354,-0.966392,1.124826,-0.499748,1.329708,-0.362623,...,-0.171852,-1.205259,0.659137,0.992565,-0.35461,-0.131363,-0.682125,-0.285584,-0.438648,-0.166748


## Fit Model

In [6]:
# Fit a logistic regression on the scaled training features.
# The "saga" solver shuffles the data randomly, so random_state is fixed to
# make the fitted intercept and coefficients reproducible between runs.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt -
# if training is too slow, consider a different solver or a smaller sample.
lr = LogisticRegression(
    fit_intercept=True,
    max_iter=300,
    solver="saga",
    random_state=1997,
)
lr.fit(X_train_scaled, y_train)

# Print coefficients and intercept
print(f"Intercept: {lr.intercept_}")
print(f"Coefficients: {lr.coef_}")

KeyboardInterrupt: 

In [None]:
# Standardise the test features with the scaler fitted on the training data
X_test_scaled = scaler.transform(X_test)

# Predict labels for the held-out test samples
y_pred = lr.predict(X_test_scaled)

# Confusion matrix of actual vs. predicted labels
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

# Draw the confusion matrix as an annotated heatmap
fig, ax = plt.subplots(figsize=(7, 5))
ax.set_title("Confusion matrix")
sns.heatmap(cm, annot=True, fmt='d', ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

# Print the classification report, framed by separator lines, with a short legend
separator = "_" * 80
print(separator)
print(classification_report(y_true=y_test, y_pred=y_pred))
legend_lines = (
    "Information:",
    "Precision: Percentage of correct positive predictions relative to total positive predictions",
    "Recall: Percentage of correct positive predictions relative to total actual positives.",
    "F1 Score: A weighted harmonic mean of precision and recall. The closer to 1, the better the model.",
)
for legend_line in legend_lines:
    print(legend_line)
print(separator)