# Personal loan Data 를 활용한 Review 

지금까지 배워온 머신러닝 방법론 중에서 분류 문제에 사용 가능한 방법론들을 각각 최적화한 후 그 성능을 비교해 보도록 하겠습니다.

1. Logistic Regression
2. Logistic Regression(forward)
3. Logistic Regression(backward)
4. Logistic Regression(stepwise)
5. Decision Tree(Default)
6. Decision Tree(parameter_optimal)



#### **필요 패키지 불러오기**


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve,confusion_matrix,f1_score
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
import itertools
import time



#### **변수선택함수 불러오기**

In [None]:
def processSubset(X,y, feature_set):
    """Fit a logistic regression on a subset of columns and report its AIC.

    Args:
        X: Table of candidate regressors, indexed by column label.
        y: Response handed to ``sm.Logit`` as the endogenous variable.
        feature_set: Iterable of column labels of ``X`` to use as regressors.

    Returns:
        dict: the fitted results object under ``"model"`` and its ``aic``
        attribute under ``"AIC"``.
    """
    columns = list(feature_set)
    fitted = sm.Logit(y, X[columns]).fit()
    return {"model": fitted, "AIC": fitted.aic}
        
'''
전진선택법
'''
def forward(X, y, predictors):
    """Run one forward-selection step and return the best one-variable extension.

    Every column of ``X`` (other than 'const') that is not yet in
    ``predictors`` is added in turn, the model ``predictors + [p] + ['const']``
    is fitted with ``processSubset``, and the fit with the lowest AIC wins.

    Args:
        X: Table of regressors; must contain a 'const' column.
        y: Response variable.
        predictors: List of column labels already selected (without 'const').

    Returns:
        pandas.Series: the winning row, with entries ``'model'`` (fitted
        results) and ``'AIC'``.
    """
    # Candidates are the columns that have not been selected yet.
    remaining_predictors = [p for p in X.columns.difference(['const']) if p not in predictors]
    tic = time.time()
    results = [
        processSubset(X=X, y=y, feature_set=predictors + [p] + ['const'])
        for p in remaining_predictors
    ]
    models = pd.DataFrame(results)

    # Pick the row with the lowest AIC; idxmin returns a label, which is what .loc expects.
    best_model = models.loc[models['AIC'].idxmin()]
    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors)+1, "predictors in", (toc-tic))
    # Read the AIC by label: position 0 of the row is the 'model' entry, not the AIC.
    print('Selected predictors:',best_model['model'].model.exog_names,' AIC:',best_model['AIC'] )
    return best_model

def forward_model(X,y):
    """Build a logistic model by repeated forward selection on AIC.

    Starting from no predictors, ``forward`` is called once per step. From
    the second step on, selection stops as soon as the new step's AIC is
    higher than the previous step's AIC.

    Args:
        X: Table of regressors; must contain a 'const' column.
        y: Response variable.

    Returns:
        The fitted results object of the last accepted step.
    """
    started = time.time()
    history = pd.DataFrame(columns=["AIC", "model"])
    selected = []
    previous_aic = None  # no reference value exists before the first step
    n_candidates = len(X.columns.difference(['const']))
    for step in range(1, n_candidates + 1):
        candidate = forward(X=X,y=y,predictors=selected)
        if previous_aic is not None and candidate['AIC'] > previous_aic:
            break
        history.loc[step] = candidate
        previous_aic = history.loc[step]["AIC"]
        # Carry the accepted variables (minus the intercept) into the next step.
        selected = [name for name in history.loc[step]["model"].model.exog_names
                    if name != 'const']
    finished = time.time()
    print("Total elapsed time:", (finished - started), "seconds.")

    # Rows are labelled 1..k, so the row count is the label of the last one.
    return history['model'][len(history['model'])]


'''
후진소거법
'''
def backward(X,y,predictors):
    """Run one backward-elimination step and return the best reduced model.

    Every subset of ``predictors`` with exactly one variable removed is
    fitted (together with 'const') via ``processSubset``; the fit with the
    lowest AIC wins.

    Args:
        X: Table of regressors; must contain a 'const' column.
        y: Response variable.
        predictors: Iterable of currently selected column labels
            (without 'const').

    Returns:
        pandas.Series: the winning row, with entries ``'model'`` (fitted
        results) and ``'AIC'``.
    """
    tic = time.time()

    # All combinations that drop exactly one of the current predictors.
    results = [
        processSubset(X=X, y=y, feature_set=list(combo) + ['const'])
        for combo in itertools.combinations(predictors, len(predictors) - 1)
    ]
    models = pd.DataFrame(results)

    # Pick the row with the lowest AIC; idxmin returns a label, which is what .loc expects.
    best_model = models.loc[models['AIC'].idxmin()]
    toc = time.time()
    print("Processed ", models.shape[0], "models on", len(predictors) - 1, "predictors in",
          (toc - tic))
    # Read the AIC by label: position 0 of the row is the 'model' entry, not the AIC.
    print('Selected predictors:',best_model['model'].model.exog_names,' AIC:',best_model['AIC'] )
    return best_model


def backward_model(X, y):
    """Build a logistic model by repeated backward elimination on AIC.

    Starts from the model with every column of ``X`` and calls ``backward``
    to drop one variable at a time. Elimination stops when the best reduced
    model has a higher AIC than the current one, or when a single predictor
    is left.

    Args:
        X: Table of regressors; must contain a 'const' column.
        y: Response variable.

    Returns:
        The fitted results object of the last accepted model. If no removal
        is accepted, this is the full model.
    """
    tic = time.time()
    predictors = list(X.columns.difference(['const']))
    # Baseline: the full model, fitted WITH the intercept so that its AIC is
    # comparable to the candidates produced by backward().
    best = processSubset(X, y, predictors + ['const'])
    while (len(predictors) > 1):
        # Use the function's own arguments, not notebook-level globals.
        Backward_result = backward(X=X, y=y, predictors=predictors)
        if Backward_result['AIC'] > best['AIC']:
            break
        best = Backward_result
        predictors = [k for k in best['model'].model.exog_names if k != 'const']

    toc = time.time()
    print("Total elapsed time:", (toc - tic), "seconds.")
    return best['model']


'''
단계적 선택법
'''
def Stepwise_model(X,y):
    """Select a logistic model by alternating forward and backward AIC steps.

    Each round adds one variable with ``forward`` and then checks with
    ``backward`` whether dropping one variable from the enlarged set gives a
    lower AIC than the forward result. The round's outcome is accepted only
    if its AIC is strictly lower than the best AIC so far; otherwise the
    search stops. The first reference value is the intercept-only model.

    Args:
        X: Table of regressors; must contain a 'const' column.
        y: Response variable.

    Returns:
        The fitted results object of the best model found. If no round
        improves on it, this is the intercept-only model.
    """
    tic = time.time()
    predictors = []
    # Intercept-only fit: the baseline the first round has to beat.
    best = processSubset(X, y, predictors + ['const'])
    # At most one round per candidate variable.
    for _ in range(len(X.columns.difference(['const']))):
        Forward_result = forward(X=X, y=y, predictors=predictors) # constant added
        print('forward')
        step_result = Forward_result
        grown = [k for k in Forward_result['model'].model.exog_names if k != 'const']
        Backward_result = backward(X=X, y=y, predictors=grown)
        if Backward_result['AIC'] < Forward_result['AIC']:
            step_result = Backward_result
            print('backward')
        # Compare against the best model BEFORE accepting this round, so a
        # non-improving model is never kept or returned. A tie means the round
        # only reproduced the AIC already reached, i.e. the search has converged.
        if step_result['AIC'] >= best['AIC']:
            break
        best = step_result
        predictors = [k for k in best['model'].model.exog_names if k != 'const']
    toc = time.time()
    print("Total elapsed time:", (toc - tic), "seconds.")
    return best['model']

In [None]:
### Load the data
# NOTE(review): the underscore-only names in this cell look like exercise
# blanks to be filled in by the reader; they are left untouched here.
ploan = pd.______("./data/personal_loan.csv")


# Drop the columns that carry no meaning for the model: ['ID','ZIP Code']
ploan_processed = ploan.dropna().______(['ID','ZIP Code'], axis=1, inplace=False)


# Add the constant (intercept) column
ploan_processed = sm._______(ploan_processed, has_constant='add')

# Every column except the target 'Personal Loan' is used as a feature.
feature_columns = list(ploan_processed.columns.difference(["Personal Loan"]))
X = ploan_processed[feature_columns]
y = ploan_processed['Personal Loan'] 


### Metric Calculation
def perf_eval(cm):
    """Compute summary metrics from a confusion matrix.

    Args:
        cm: 2-D NumPy array indexed as ``cm[actual, predicted]``, where
            index 1 is the positive class and index 0 the negative class.

    Returns:
        list: ``[TPR, TNR, ACC, BCR, F1]`` -- recall, true negative rate,
        simple accuracy, the square root of TPR * TNR, and the F1-measure.
    """
    true_pos = cm[1, 1]
    true_neg = cm[0, 0]

    recall = true_pos / cm[1].sum()          # TPR: share of row 1 predicted as 1
    specificity = true_neg / cm[0].sum()     # TNR: share of row 0 predicted as 0
    accuracy = (true_neg + true_pos) / cm.sum()
    balanced_rate = np.sqrt(recall * specificity)
    precision = true_pos / cm[:, 1].sum()    # share of column 1 that is truly 1
    f1 = 2 * recall * precision / (recall + precision)

    return [recall, specificity, accuracy, balanced_rate, f1]

def cut_off(y,threshold):
    """Turn scores into 0/1 labels using a threshold.

    Args:
        y: Scores supporting ``copy()``, comparison and boolean-mask
            assignment (for example a pandas Series or NumPy array).
        threshold: Values strictly greater than this become 1; values less
            than or equal to it become 0.

    Returns:
        A copy of ``y`` with the labels cast to ``int``. ``y`` itself is not
        modified.
    """
    # Both masks are taken from the untouched input, so assignment order is irrelevant.
    is_positive = y > threshold
    is_negative = y <= threshold
    labels = y.copy()
    labels[is_negative] = 0
    labels[is_positive] = 1
    return labels.astype(int)




In [None]:
# Split the data into training and test sets at a 7:3 ratio

train_x, test_x, train_y, test_y = ____________(X, y, stratify=y,train_size=0.7,test_size=0.3,random_state=5959)
print(train_x.shape, test_x.shape, train_y.shape, test_y.shape)


### **1.Logistic Regression(full)**

In [None]:
# Fit the logistic regression on the training data with all columns
lr_model = sm._____(train_y, train_x)

lrbase_results = lr_model.____(method='newton')


### **2.Logistic Regression(forward)**

In [None]:
# Forward selection on the training data; returns the selected fitted model
Forward_lr_model = forward_model(X=train_x, y= train_y)


### **3.Logistic Regression(backward)**

In [None]:
# Backward elimination on the training data; returns the selected fitted model
Backward_lr_model = backward_model(X=train_x, y= train_y)

### **4.Logistic Regression(stepwise)**

In [None]:
# Stepwise selection on the training data; returns the selected fitted model
Stepwise_lr_model = Stepwise_model(X=train_x, y= train_y)

#### 로지스틱 회귀 분석 모형들 예측하기


In [None]:
# Score the test set with each model; the selected models only receive the
# columns they were fitted on (their exog_names).
pred_y_lr = lrbase_results.______(test_x)
pred_y_lrforward = Forward_lr_model.______(test_x[Forward_lr_model.model.exog_names])
pred_y_lrbackward = Backward_lr_model.______(test_x[Backward_lr_model.model.exog_names])
pred_y_lrstepwise = Stepwise_lr_model.______(test_x[Stepwise_lr_model.model.exog_names])


In [None]:
# Apply the same 0.5 cut-off to every model's predictions.
# Each label vector is built from the predictions of its own model; the
# forward labels use pred_y_lrforward, not the backward model's predictions.
pred_Y_full= __________(pred_y_lr,0.5)
pred_Y_forward = __________(pred_y_lrforward,0.5)
pred_Y_backward = __________(pred_y_lrbackward,0.5)
pred_Y_stepwise = __________(pred_y_lrstepwise,0.5)

## **혼동행렬로 표현**

In [None]:
# One confusion matrix per logistic model: actual test labels vs. cut-off labels
cfmat_full = __________(test_y, pred_Y_full)
cfmat_forward = __________(test_y, pred_Y_forward)
cfmat_backward = __________(test_y, pred_Y_backward)
cfmat_stepwise = __________(test_y, pred_Y_stepwise)

## **성능비교**

In [None]:
# Performance table for the logistic models: one row per model, one column
# per metric returned by perf_eval.
lr_confusion_matrices = {
    'LR_FULL': cfmat_full,
    'LR_FORWARD': cfmat_forward,
    'LR_BACKWARD': cfmat_backward,
    'LR_STEPWISE': cfmat_stepwise,
}

perf_mat = pd.DataFrame(columns=["TPR", "TNR", "ACC", "BCR","F1"],
                        index=list(lr_confusion_matrices))

for model_label, model_cfmat in lr_confusion_matrices.items():
    perf_mat.loc[model_label] = perf_eval(model_cfmat)

In [None]:
# Bare expression: displays the table as the notebook cell's output
perf_mat

## **Default DecisionTree 적합하기**

In [None]:
# Create a DecisionTreeClassifier with its default parameters
# (only the split criterion and the random seed are set explicitly)
defualt_dt = ______________(criterion='gini',#'entropy'
                                random_state=3355)

In [None]:
# Fit the default tree on the training data
defualt_dt.fit(train_x, train_y)

In [None]:
# Predict the test set with the fitted model
df_default_pred = defualt_dt.predict(test_x)

In [1]:
# Build the confusion matrix for the default tree
cfmat_dt_defualt = _________(test_y,df_default_pred)

NameError: name '_________' is not defined

In [None]:
# Show the default tree's confusion matrix
print(cfmat_dt_defualt)

## **max_depth를 최적화한 DecisionTree 적합하기**

In [None]:
# Search for the best 'max_depth' hyperparameter value
max_depths = list(range(1, 15, 1))
test_accs = []   # accuracy of each depth, measured on (test_x, test_y)
test_f1 = []     # F1 score of each depth, measured on (test_x, test_y)
dt_models = []   # fitted tree for each depth, in the same order as max_depths
for max_depth in max_depths:
    clf_dt = DecisionTreeClassifier(criterion='gini', max_depth=max_depth, random_state=3355)
    clf_dt.fit(train_x, train_y)
    dt_models.append(clf_dt)
    # NOTE(review): despite the "valid" name these predictions are made on
    # test_x, so a depth chosen from these scores is tuned on the test split
    # itself -- consider a separate validation split.
    y_valid_pred = clf_dt.predict(test_x)
    test_accs.append(accuracy_score(test_y, y_valid_pred))
    test_f1.append(f1_score(test_y, y_valid_pred))

In [None]:
# Plot accuracy and F1 against 'max_depth' to find the best hyperparameter value
fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(max_depths, test_accs, color='red', marker='o', label='test acc')
ax.plot(max_depths, test_f1, color='blue', marker='s', label='test f1')
ax.legend(loc='best')
# plt.show() displays all open figures and accepts only the keyword argument
# `block`; a figure must not be passed to it.
plt.show()

In [None]:
# Position and value of the highest F1 score; max() returns the first such
# entry when several depths tie.
best_model_idx, best_model_f1 = max(enumerate(test_f1), key=lambda p: p[1])
# Predict the test set with the tree fitted at that depth
df_opt_pred = dt_models[best_model_idx].predict(test_x)

In [None]:
# Confusion matrix of the depth-tuned tree on the test set
cfmat_dt_opt = confusion_matrix(test_y,df_opt_pred)

In [None]:
# Show the tuned tree's confusion matrix
print(cfmat_dt_opt)

In [None]:
# Final comparison table: the four logistic models plus the two decision
# trees, one row per model and one column per metric returned by perf_eval.
summary_confusion_matrices = {
    'LR_FULL': cfmat_full,
    'LR_FORWARD': cfmat_forward,
    'LR_BACKWARD': cfmat_backward,
    'LR_STEPWISE': cfmat_stepwise,
    'DT_default': cfmat_dt_defualt,
    'DT_tuned': cfmat_dt_opt,
}

perf_mat_summary = pd.DataFrame(columns=["TPR", "TNR", "ACC", "BCR","F1"],
                                index=list(summary_confusion_matrices))

for summary_label, summary_cfmat in summary_confusion_matrices.items():
    perf_mat_summary.loc[summary_label] = perf_eval(summary_cfmat)



In [None]:
# Print the final comparison table
print(perf_mat_summary)