### 1-1. 데이터 로더

In [None]:
import pandas as pd

# Load the UniversalBank CSV from the notebook's working directory.
bank_df = pd.read_csv(filepath_or_buffer='UniversalBank.csv')
# Show the first five rows (the DataFrame.head default) as the cell output.
bank_df.head(n=5)

Unnamed: 0,ID,Age,Experience,Income,ZIPCode,Family,CCAvg,Education,Mortgage,PersonalLoan,SecuritiesAccount,CDAccount,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1


### 1-2. 학습에 사용할 특성변수 선택

In [None]:
# Target vector: the PersonalLoan column.
y = bank_df['PersonalLoan']
# Feature matrix: all remaining columns once ID, ZIPCode and the target
# itself have been removed.
X = bank_df.drop(columns=['ID', 'ZIPCode', 'PersonalLoan'])


### 2. 데이터 분할

In [None]:
from sklearn.model_selection import train_test_split 
# Hold out 30% of the rows for testing. The split is stratified on y so both
# partitions keep the same class proportions, and the seed is fixed so the
# split is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=1,
)

### 3-1. 학습에 사용할 모델 개별 정의

In [None]:
from sklearn.tree import DecisionTreeClassifier # 결정 트리

# Baseline decision tree: Gini impurity, no depth limit, fixed seed.
tree = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    random_state=1,
)
# Fit on the training split; as the last expression of the cell this also
# displays the fitted estimator.
tree.fit(X_train, y_train)


DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=1, splitter='best')

### 3-2. 모델 검정

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score # 정확도, 민감도 등

# Predict the held-out test split with the baseline tree.
y_pred = tree.predict(X_test)

# Number of test samples whose predicted label differs from the true label.
print('잘못 분류된 샘플 개수: %d' % (y_test != y_pred).sum())

# Accuracy, precision, recall and F1, each printed with three decimals.
for label_fmt, metric in (
    ('정확도: %.3f', accuracy_score),
    ('정밀도: %.3f', precision_score),
    ('재현율: %.3f', recall_score),
    ('F1: %.3f', f1_score),
):
    print(label_fmt % metric(y_true=y_test, y_pred=y_pred))


잘못 분류된 샘플 개수: 28
정확도: 0.981
정밀도: 0.914
재현율: 0.889
F1: 0.901


### 4. 교차검증

In [None]:
from sklearn.model_selection import cross_validate # 교차타당도
from sklearn.pipeline import make_pipeline # 파이프라인 구축
import numpy as np

# 10-fold cross-validation of the baseline tree on the training split only,
# scored by accuracy and run on all available cores.
scores = cross_validate(
    estimator=tree,
    X=X_train,
    y=y_train,
    scoring=['accuracy'],
    cv=10,
    n_jobs=-1,
    return_train_score=False,
)

# Per-fold test accuracies, followed by their mean and standard deviation.
fold_accuracy = scores['test_accuracy']
print('CV 정확도 점수: %s' % fold_accuracy)
print('CV 정확도: %.3f +/- %.3f' % (fold_accuracy.mean(), fold_accuracy.std()))

CV 정확도 점수: [0.99428571 0.98       0.97714286 0.98285714 0.97714286 0.97714286
 0.99428571 0.98571429 0.96857143 0.98      ]
CV 정확도: 0.982 +/- 0.008


### 5-1. 파이프라인 학습

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV

# Earlier variant kept for reference (scaling + PCA in front of the tree); the
# original note next to it reads "98.514" -- presumably the score it reached,
# TODO confirm:
# pipe_tree = make_pipeline(StandardScaler(), PCA(n_components=10), DecisionTreeClassifier())

# Single-step pipeline wrapping the decision tree that the grid search tunes.
# random_state is fixed to the same seed as the standalone tree: scikit-learn
# randomly permutes the features at every split, so an unseeded tree can give
# different grid-search scores and a different final model on each run.
pipe_tree = make_pipeline(DecisionTreeClassifier(random_state=1))


In [None]:

# Candidate hyper-parameter values explored by the grid search.
param_range1 = list(range(1, 11))       # max_depth: 1, 2, ..., 10
param_range2 = list(range(10, 51, 10))  # min_samples_leaf: 10, 20, ..., 50

# make_pipeline names each step after its lower-cased class name, hence the
# 'decisiontreeclassifier__' prefix on the parameter keys.
param_grid = [
    {
        'decisiontreeclassifier__max_depth': param_range1,
        'decisiontreeclassifier__min_samples_leaf': param_range2,
    }
]

# Exhaustive search over the grid, scored by 10-fold cross-validated accuracy
# on the training split, using all available cores.
gs = GridSearchCV(
    estimator=pipe_tree,
    param_grid=param_grid,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
)
gs.fit(X_train, y_train)

# Best mean cross-validated accuracy and the parameter combination behind it.
print(gs.best_score_)
print(gs.best_params_)


### 5-2. 파이프라인 학습 모델 검정

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# GridSearchCV defaults to refit=True, so after gs.fit(X_train, y_train) the
# best_estimator_ has already been refitted on the whole training split with
# the winning hyper-parameters. Fitting it again would only repeat that work
# and overwrite the search object's estimator in place, so it is used as-is.
best_tree = gs.best_estimator_

# Evaluate the selected pipeline on the held-out test split.
y_pred = best_tree.predict(X_test)

print('Classification Report')
print(classification_report(y_test, y_pred))

Classification Report
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1356
           1       0.93      0.87      0.90       144

    accuracy                           0.98      1500
   macro avg       0.96      0.93      0.94      1500
weighted avg       0.98      0.98      0.98      1500

