# [ ML ]  대출연체 여부 예측

In [1]:
import pandas as pd 
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings("ignore")

### (0) 문제정의하기
- 전처리 작업을 마친 데이터셋을 이용하여 대출 연체 여부를 예측해보세요

### (1) 데이터 불러오기

In [2]:
df = pd.read_csv('loan_data_preprocessed.csv')

### (2) 데이터 확인하기

In [3]:
df.head()

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,past_due_days,age,Gender,loan_duration,check_duration,effective_month,effective_day,effective_weekday,Principal_log,education_High School or Below,education_Master or Above,education_college,education,Gender_label
0,xqd20160234,PAIDOFF,1000,30,2016-09-12,2016-10-11,0.0,36,male,30,True,9,12,0,6.908755,True,False,False,High School or Below,0
1,xqd20160329,COLLECTION,1000,30,2016-09-11,2016-10-10,59.0,24,female,30,True,9,11,6,6.908755,True,False,False,High School or Below,1
2,xqd20160179,PAIDOFF,1000,30,2016-09-12,2016-10-11,0.0,21,male,30,True,9,12,0,6.908755,True,False,False,High School or Below,0
3,xqd20160454,COLLECTION_PAIDOFF,1000,30,2016-09-12,2016-10-11,3.0,26,male,30,True,9,12,0,6.908755,True,False,False,High School or Below,0
4,xqd20320403,COLLECTION_PAIDOFF,1000,30,2016-09-09,2016-11-07,13.0,39,male,60,False,9,9,4,6.908755,False,False,True,college,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 379 entries, 0 to 378
Data columns (total 20 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Loan_ID                         379 non-null    object 
 1   loan_status                     379 non-null    object 
 2   Principal                       379 non-null    int64  
 3   terms                           379 non-null    int64  
 4   effective_date                  379 non-null    object 
 5   due_date                        379 non-null    object 
 6   past_due_days                   379 non-null    float64
 7   age                             379 non-null    int64  
 8   Gender                          379 non-null    object 
 9   loan_duration                   379 non-null    int64  
 10  check_duration                  379 non-null    bool   
 11  effective_month                 379 non-null    int64  
 12  effective_day                   379 

In [5]:
# numeric 형
df.describe()

Unnamed: 0,Principal,terms,past_due_days,age,loan_duration,effective_month,effective_day,effective_weekday,Principal_log,Gender_label
count,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0,379.0
mean,999.472296,25.274406,14.147757,30.643799,27.862797,9.0,11.390501,3.564644,6.908199,0.153034
std,7.254712,7.383928,24.46711,5.795664,10.907604,0.0,1.09376,2.683068,0.007636,0.360496
min,900.0,7.0,0.0,18.0,7.0,9.0,8.0,0.0,6.803505,0.0
25%,1000.0,15.0,0.0,26.0,15.0,9.0,11.0,0.0,6.908755,0.0
50%,1000.0,30.0,0.0,30.0,30.0,9.0,11.0,5.0,6.908755,0.0
75%,1000.0,30.0,14.0,34.0,30.0,9.0,12.0,6.0,6.908755,0.0
max,1000.0,30.0,76.0,50.0,60.0,9.0,14.0,6.0,6.908755,1.0


In [6]:
# object 형
df.describe(include='object')

Unnamed: 0,Loan_ID,loan_status,effective_date,due_date,Gender,education
count,379,379,379,379,379,379
unique,379,3,7,24,2,4
top,xqd20160490,PAIDOFF,2016-09-11,2016-10-10,male,High School or Below
freq,1,222,176,121,321,165


In [7]:
# 모든 열
df.describe(include='all')

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,past_due_days,age,Gender,loan_duration,check_duration,effective_month,effective_day,effective_weekday,Principal_log,education_High School or Below,education_Master or Above,education_college,education,Gender_label
count,379,379,379.0,379.0,379,379,379.0,379.0,379,379.0,379,379.0,379.0,379.0,379.0,379,379,379,379,379.0
unique,379,3,,,7,24,,,2,,2,,,,,2,2,2,4,
top,xqd20160490,PAIDOFF,,,2016-09-11,2016-10-10,,,male,,True,,,,,False,False,False,High School or Below,
freq,1,222,,,176,121,,,321,,341,,,,,214,378,216,165,
mean,,,999.472296,25.274406,,,14.147757,30.643799,,27.862797,,9.0,11.390501,3.564644,6.908199,,,,,0.153034
std,,,7.254712,7.383928,,,24.46711,5.795664,,10.907604,,0.0,1.09376,2.683068,0.007636,,,,,0.360496
min,,,900.0,7.0,,,0.0,18.0,,7.0,,9.0,8.0,0.0,6.803505,,,,,0.0
25%,,,1000.0,15.0,,,0.0,26.0,,15.0,,9.0,11.0,0.0,6.908755,,,,,0.0
50%,,,1000.0,30.0,,,0.0,30.0,,30.0,,9.0,11.0,5.0,6.908755,,,,,0.0
75%,,,1000.0,30.0,,,14.0,34.0,,30.0,,9.0,12.0,6.0,6.908755,,,,,0.0


### (3) 데이터 탐색하기

In [8]:
# loan_status 열
df['loan_status'].value_counts()

loan_status
PAIDOFF               222
COLLECTION_PAIDOFF     80
COLLECTION             77
Name: count, dtype: int64

In [9]:
# 정상 대출과 연체 대출을 0과 1로 변환
df['loan_status'] = df['loan_status'].map({'PAIDOFF':0, 'COLLECTION_PAIDOFF':1, 'COLLECTION':1}) 

df['loan_status'].value_counts()

loan_status
0    222
1    157
Name: count, dtype: int64

In [10]:
# 성별에 따른 대출 상태
df.groupby(['Gender'])['loan_status'].mean()

Gender
female    0.310345
male      0.433022
Name: loan_status, dtype: float64

In [11]:
# 교육 수준에 따른 대출 상태
df.groupby(['education'])['loan_status'].mean()

education
Bechalor                0.360000
High School or Below    0.430303
Master or Above         1.000000
college                 0.411043
Name: loan_status, dtype: float64

In [12]:
# 성별에 따른 대출 연체일수 평균
df.groupby(['Gender'])['past_due_days'].mean()

Gender
female     9.965517
male      14.903427
Name: past_due_days, dtype: float64

In [13]:
# 교육 수준에 따른 대출 연체일수 평균
df.groupby(['education'])['past_due_days'].mean()

education
Bechalor                11.400000
High School or Below    15.739394
Master or Above         74.000000
college                 13.012270
Name: past_due_days, dtype: float64

### (4) 학습용 데이터셋 만들기 

In [14]:
df.head(1)

Unnamed: 0,Loan_ID,loan_status,Principal,terms,effective_date,due_date,past_due_days,age,Gender,loan_duration,check_duration,effective_month,effective_day,effective_weekday,Principal_log,education_High School or Below,education_Master or Above,education_college,education,Gender_label
0,xqd20160234,0,1000,30,2016-09-12,2016-10-11,0.0,36,male,30,True,9,12,0,6.908755,True,False,False,High School or Below,0


In [15]:
# feature data 와 target data 분리
X = df[['Principal', 'terms', 'age', 'loan_duration',
       'education_High School or Below', 'education_Master or Above',
       'education_college', 'Gender_label']]

y = df['loan_status']

X.shape, y.shape

((379, 8), (379,))

In [16]:
# feature data
X.head(1)

Unnamed: 0,Principal,terms,age,loan_duration,education_High School or Below,education_Master or Above,education_college,Gender_label
0,1000,30,36,30,True,False,False,0


In [17]:
# target data 결측치 확인
X.isnull().sum()

Principal                         0
terms                             0
age                               0
loan_duration                     0
education_High School or Below    0
education_Master or Above         0
education_college                 0
Gender_label                      0
dtype: int64

In [18]:
# target data 구성비율 확인
y.value_counts(normalize=True)

loan_status
0    0.585752
1    0.414248
Name: proportion, dtype: float64

### (5) 모델링 & 예측

In [19]:
# train, test 데이터 분리  
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((303, 8), (76, 8), (303,), (76,))

In [20]:
# Logistic Regression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

acc_log = logreg.score(X_test, y_test)
print(acc_log)

0.5921052631578947


In [21]:
# Support Vector Machines

svc = SVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)

acc_svc = svc.score(X_test, y_test)
print(acc_svc)

0.5921052631578947


In [22]:
# Random Forest

random_forest = RandomForestClassifier(n_estimators=100)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

acc_random_forest = random_forest.score(X_test, y_test)
print(acc_random_forest)

0.5


### (6) 모델 예측 결과 정리

In [23]:
models = pd.DataFrame({
    'Model': ['Support Vector Machines', 'Logistic Regression', 'Random Forest',],
    'Score': [acc_svc,acc_log, acc_random_forest, ]})
    
models.sort_values(by='Score', ascending=False)

Unnamed: 0,Model,Score
0,Support Vector Machines,0.592105
1,Logistic Regression,0.592105
2,Random Forest,0.5
