In [1]:
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px
import scipy.stats as stats

mpl.rc('font', family='Malgun Gothic')

In [2]:
df1 = pd.read_csv('01_Data.csv')
df1

Unnamed: 0,Index,Member_ID,Sales_Type,Contract_Type,Channel,Datetime,Term,Payment_Type,Product_Type,Amount_Month,Customer_Type,Age,Address1,Address2,State,Overdue_count,Overdue_Type,Gender,Credit_Rank,Bank
0,1,66758234,렌탈,일반계약,영업방판,2019-05-06,60,CMS,DES-1,96900,개인,42.0,경기도,경기도,계약확정,0,없음,여자,9.0,새마을금고
1,2,66755948,렌탈,교체계약,영업방판,2020-02-20,60,카드이체,DES-1,102900,개인,39.0,경기도,경기도,계약확정,0,없음,남자,2.0,현대카드
2,3,66756657,렌탈,일반계약,홈쇼핑/방송,2019-02-28,60,CMS,DES-1,96900,개인,48.0,경기도,경기도,계약확정,0,없음,여자,8.0,우리은행
3,4,66423450,멤버십,멤버십3유형,재계약,2019-05-13,12,CMS,DES-1,66900,개인,39.0,경기도,경기도,계약확정,0,없음,남자,5.0,농협회원조합
4,5,66423204,멤버십,멤버십3유형,재계약,2019-05-10,12,CMS,DES-1,66900,개인,60.0,경기도,경기도,기간만료,12,있음,남자,8.0,농협회원조합
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51296,51298,66579515,렌탈,프로모션계약,대형마트A,2019-03-01,60,CMS,DES-3A,96900,개인,47.0,경기도,경기도,계약확정,0,없음,남자,,기업은행
51297,51299,66799558,렌탈,일반계약,대형마트A,2019-04-01,60,CMS,DES-1,96900,개인,42.0,경기도,경기도,계약확정,0,없음,여자,8.0,새마을금고
51298,51300,66799197,렌탈,프로모션계약,영업방판,2019-04-01,39,카드이체,ERA,120900,개인,65.0,서울특별시,서울특별시,계약확정,0,없음,여자,1.0,롯데카드
51299,51301,66792778,렌탈,일반계약,홈쇼핑/방송,2020-02-06,60,카드이체,DES-1,96900,개인,54.0,서울특별시,서울특별시,계약확정,0,없음,여자,2.0,롯데카드


In [3]:
df1['Target'] = df1['State'].replace({'계약확정':'정상', '기간만료':'정상', 
                                      '해약확정':'해약', '해약진행중':'해약'})

In [4]:
df1.columns

Index(['Index', 'Member_ID', 'Sales_Type', 'Contract_Type', 'Channel',
       'Datetime', 'Term', 'Payment_Type', 'Product_Type', 'Amount_Month',
       'Customer_Type', 'Age', 'Address1', 'Address2', 'State',
       'Overdue_count', 'Overdue_Type', 'Gender', 'Credit_Rank', 'Bank',
       'Target'],
      dtype='object')

In [5]:
X = df1[['Sales_Type', 'Term', 'Product_Type', 
         'Amount_Month', 'Age', 'Gender', 'Credit_Rank']]
Y = df1['Target']

In [6]:
from sklearn.model_selection import train_test_split  # 학습 검증 데이터 분할
from imblearn.pipeline import make_pipeline  # 학습 + 특성공학 파이프 구축
from sklearn.compose import make_column_transformer  # 데이터 타입별 특성공학
from sklearn.impute import SimpleImputer  # 결측값 대치
from sklearn.preprocessing import MinMaxScaler  # 숫자 데이터 스케일링
from sklearn.preprocessing import OneHotEncoder  # 문자 데이터 인코딩
from sklearn.model_selection import GridSearchCV  # 교차검증 + 매개변수 튜닝
from sklearn.tree import DecisionTreeClassifier  # 학습 수행 알고리즘
from sklearn.metrics import classification_report  # 평가 수행
import pickle  # 학습 모델을 파일로 저장

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1234)

In [8]:
numeric_pipe = make_pipeline((SimpleImputer(strategy='mean')), 
                              (MinMaxScaler()))
category_pipe = make_pipeline((SimpleImputer(strategy='most_frequent')), 
                              (OneHotEncoder()))

In [9]:
numeric_list = X.describe().columns.tolist()
category_list = X.describe(include='object').columns.tolist()
preprocessing_pipe = make_column_transformer((numeric_pipe, numeric_list), 
                                             (category_pipe, category_list))
model_pipe = make_pipeline(preprocessing_pipe, DecisionTreeClassifier())

In [10]:
# max_depth 값을 5부터 9까지 값으로 번갈아가며 대입
hyperparameter_dict = {'decisiontreeclassifier__max_depth':range(5, 10)}
grid_model = GridSearchCV(model_pipe, cv=5, 
                          param_grid=hyperparameter_dict, scoring='f1', 
                          n_jobs=-1)  # scoring default = accuracy
                                    # n_jobs를 사용하여 CPU 개수를 조절해 머신러닝 시간을 줄일 수 있다.
# 5번의 교차검증 + 5번의 hyperparameter tuning = 총 25번의 모델
grid_model.fit(X_train, Y_train)



In [11]:
grid_model.best_params_ 

{'decisiontreeclassifier__max_depth': 5}

In [12]:
best_model = grid_model.best_estimator_ 

In [13]:
# 평가
def evaluatuon_func(best_model):
    Y_train_pred = best_model.predict(X_train)
    Y_test_pred = best_model.predict(X_test)
    print('학습성능')
    print(classification_report(Y_train, Y_train_pred))
    print('일반화성능')
    print(classification_report(Y_test, Y_test_pred))

In [14]:
evaluatuon_func(best_model)

학습성능
              precision    recall  f1-score   support

          정상       0.99      1.00      0.99     37984
          해약       0.80      0.01      0.02       491

    accuracy                           0.99     38475
   macro avg       0.89      0.50      0.50     38475
weighted avg       0.98      0.99      0.98     38475

일반화성능
              precision    recall  f1-score   support

          정상       0.99      1.00      0.99     12681
          해약       0.00      0.00      0.00       145

    accuracy                           0.99     12826
   macro avg       0.49      0.50      0.50     12826
weighted avg       0.98      0.99      0.98     12826



# Imbalanced Data Sampling

In [15]:
# Under Samping
# Random Under Sampling
from imblearn.under_sampling import RandomUnderSampler

In [16]:
under_sampler = RandomUnderSampler()
under_sampler.fit(X, Y)  # 데이터를 Under Sampling할 수식을 구성

In [17]:
X_under, Y_under = under_sampler.fit_resample(X, Y)

In [18]:
Y_under.value_counts()

정상    636
해약    636
Name: Target, dtype: int64

In [19]:
# Tomek’s Link
from imblearn.under_sampling import TomekLinks

In [20]:
df1_clean = df1.dropna()

In [21]:
df1_clean['Target'].value_counts()

정상    40137
해약      510
Name: Target, dtype: int64

In [22]:
under_sampler = TomekLinks()
under_sampler.fit(df1_clean[['Age', 'Term']], df1_clean['Target'])

In [23]:
X_under, Y_under = under_sampler.fit_resample(df1_clean[['Age', 'Term']], df1_clean['Target'])
Y_under.value_counts()
# 인접한 데이터가 없어 적용이 안됨

정상    40137
해약      510
Name: Target, dtype: int64

In [24]:
# Over Samping
# SMOTE
from imblearn.over_sampling import SMOTE

In [25]:
# 모델에 적용할 시
model_pipe = make_pipeline(preprocessing_pipe, SMOTE(), DecisionTreeClassifier())

In [30]:
hyperparameter_dict = {'decisiontreeclassifier__max_depth':range(5, 10), 
                       'decisiontreeclassifier__min_samples_leaf':range(5, 10), 
                       'decisiontreeclassifier__min_samples_split':range(5, 10)}
grid_model = GridSearchCV(model_pipe, cv=5, 
                          param_grid=hyperparameter_dict, scoring='f1', 
                          n_jobs=-1)
grid_model.fit(X_train, Y_train)

 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


In [31]:
best_model = grid_model.best_estimator_ 

In [32]:
evaluatuon_func(best_model)

학습성능
              precision    recall  f1-score   support

          정상       0.99      0.95      0.97     37984
          해약       0.09      0.38      0.15       491

    accuracy                           0.94     38475
   macro avg       0.54      0.67      0.56     38475
weighted avg       0.98      0.94      0.96     38475

일반화성능
              precision    recall  f1-score   support

          정상       0.99      0.95      0.97     12681
          해약       0.07      0.34      0.12       145

    accuracy                           0.94     12826
   macro avg       0.53      0.64      0.55     12826
weighted avg       0.98      0.94      0.96     12826

