# Ensemble Code 1 - Cross Validation

### 필요한 모듈을 불러온다

In [1]:
import numpy as np
import copy
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import collections
print("Module Ready!")

Module Ready!


#### 1. FOLD_VALUE는 CV Fold 개수를 의미함
#### 2. RANDOM_STATE는 실습 시 모델의 결과를 동일하게 재현하기 위한 난수 시드임

In [2]:
#############################
# Number of folds used for cross-validation.
FOLD_VALUE = 5
# Seed handed to the models as random_state so repeated runs give the same results.
RANDOM_STATE = 1026
#############################

## 사용할 Personal Loan 데이터셋을 불러옴
> 1. 난수를 고정하여 8:2 = Training data : Test data로 나눔
> 2. Training dataset의 column별 std와 mean을 이용하여 Train/Test dataset standardization 수행

In [3]:
# Load the Personal Loan dataset.
Rawdata = pd.read_csv('dataset/Personal Loan.csv')
# Print Column names
print("'Personal Loan' data column name : ", list(Rawdata.columns.values))
print("ID와 ZIP Code는 사용하지 않습니다. 또한 Personal Loan을 분류하는 binary classification 문제 입니다.")
# Allocate column index based on Input and Output variables:
# columns 1-3, 5-8 and 10-13 are the inputs, column 9 is the target.
# Columns 0 and 4 are left out (ID and ZIP Code per the printed column list).
Input_Column_Index = np.concatenate((range(1,4),range(5,9),range(10,14)))
Target_Column_Index = np.array([9])


# Fix the random seed so that the same split is drawn on every run.
np.random.seed(150)
# Draw 80% of the row indices, without replacement, for the training set.
Train_Index = np.random.choice(np.shape(Rawdata)[0],int(np.shape(Rawdata)[0]*0.8),replace=False)

# Convert the input and output variables to NumPy arrays.
Rawdata_Input = np.array(Rawdata)[:,Input_Column_Index]
Rawdata_Output = np.array(Rawdata)[:,Target_Column_Index]


# Split into training data and test data; the test rows are every row
# index that was not drawn into Train_Index.
Train_Input = Rawdata_Input[Train_Index,:]
Train_Output = Rawdata_Output[Train_Index,:]
Test_Input = Rawdata_Input[np.delete(range(np.shape(Rawdata)[0]),Train_Index),:]
Test_Output = Rawdata_Output[np.delete(range(np.shape(Rawdata)[0]),Train_Index),:]
print('Data partition complete! \nTrain_Input shape :',np.shape(Train_Input),'\nTrain_Output shape :',np.shape(Train_Output))
print('Test_Input shape :',np.shape(Test_Input),'\nTest_Output shape :',np.shape(Test_Output))

# Input variable standardization based on Training data

def standardization(Data,Data2):
    """Standardize ``Data`` column-wise using the statistics of ``Data2``.

    Args:
        Data: Array to transform.
        Data2: Reference array whose per-column mean and standard deviation
            (computed along axis 0) define the transformation.

    Returns:
        ``(Data - mean(Data2)) / std(Data2)`` as a new array.  A reference
        column with zero standard deviation is divided by 1 instead, so a
        constant column maps to 0 rather than NaN/inf.
    """
    center = np.mean(Data2, axis=0)
    scale = np.std(Data2, axis=0)
    # A constant reference column has std == 0; dividing by it would give
    # 0/0 = NaN (or x/0 = inf for unseen data), which poisons model fitting.
    safe_scale = np.where(scale == 0, 1.0, scale)
    return ((Data - center) / safe_scale)

# Both splits are scaled with the training split's statistics.
# standardization() already returns a new array, so no extra copy is needed.
Train_Input_Normalized = standardization(Train_Input, Train_Input)
Test_Input_Normalized = standardization(Test_Input, Train_Input)
print("standardization complete!")

'Personal Loan' data column name :  ['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg', 'Education', 'Mortgage', 'Personal Loan', 'Securities Account', 'CD Account', 'Online', 'CreditCard']
ID와 ZIP Code는 사용하지 않습니다. 또한 Personal Loan을 분류하는 binary classification 문제 입니다.
Data partition complete! 
Train_Input shape : (2000, 11) 
Train_Output shape : (2000, 1)
Test_Input shape : (500, 11) 
Test_Output shape : (500, 1)
standardization complete!


## Best hyperparameter를 찾기위하여 5-Fold 로 데이터를 나누며 CV할 함수를 생성

In [4]:
# Best Hyperparameter를 찾기위하여 5-Fold Cross Validation을 한다
def k_Fold_Maker(InputData,OutputData,Partition_Number):
    """Cut the rows of the input/output arrays into consecutive folds.

    Every fold holds ``floor(n_rows / Partition_Number)`` rows, except the
    last one, which also takes whatever rows remain.

    Args:
        InputData: 2-D array of input rows.
        OutputData: 2-D array of target rows, row-aligned with ``InputData``.
        Partition_Number: Number of folds to produce.

    Returns:
        Tuple ``(input_folds, output_folds)`` of two lists with
        ``Partition_Number`` arrays each.
    """
    n_rows = np.shape(InputData)[0]
    fold_size = int(np.floor(n_rows / Partition_Number))
    input_folds = []
    output_folds = []
    for fold in range(Partition_Number):
        first_row = fold_size * fold
        # The final fold runs to the end so no row is dropped.
        if fold == Partition_Number - 1:
            end_row = n_rows
        else:
            end_row = first_row + fold_size
        rows = np.arange(first_row, end_row)
        input_folds.append(InputData[rows, :])
        output_folds.append(OutputData[rows, :])
    return (input_folds, output_folds)

# Make the FOLD_VALUE-fold dataset for cross validation from the
# standardized training inputs and the training targets.
Fold_Input, Fold_Output = k_Fold_Maker(Train_Input_Normalized, Train_Output, FOLD_VALUE)

# Cross validation의 결과를 위한 함수
def CV_Result_Each_Model(Hyper_Para,Model):
    """Run cross-validation of ``Model`` over the module-level folds.

    Each fold in ``Fold_Input`` / ``Fold_Output`` serves once as the
    validation set while the remaining folds are stacked into the training
    set.

    Args:
        Hyper_Para: Hyperparameter value forwarded untouched to ``Model``.
        Model: Callable invoked as ``Model(Hyper_Para, train_input,
            train_output, valid_input, valid_output)``; it must return an
            array that can be concatenated along axis 0.

    Returns:
        The per-fold results of ``Model`` concatenated along axis 0.
    """
    per_fold_results = []
    for held_out in range(FOLD_VALUE):
        # Every fold except the held-out one goes into the training set.
        training_folds = [fold for fold in range(FOLD_VALUE) if fold != held_out]
        stacked_input = np.concatenate([Fold_Input[fold] for fold in training_folds])
        stacked_output = np.concatenate([Fold_Output[fold] for fold in training_folds])
        fold_result = Model(Hyper_Para, stacked_input, stacked_output,
                            Fold_Input[held_out], Fold_Output[held_out])
        per_fold_results.append(fold_result)
    print("CV Complete!")
    return np.concatenate(per_fold_results, axis=0)

# 평가지표를 위한 함수를 생성한다.
def Valid_Index(Data,NAME):
    """Summarise a 2x2 confusion matrix as a one-row metrics table.

    The matrix is read with rows as the actual class and columns as the
    predicted class, class 1 being the positive class.

    Args:
        Data: 2x2 confusion matrix indexable as ``Data[row, col]``.
        NAME: Label stored in the ``Model`` column.

    Returns:
        DataFrame with columns ``Model, Accuracy, F1, BCR, Precision, TPR,
        TNR`` (in that order) and a single row.
    """
    true_negative = Data[0,0]
    true_positive = Data[1,1]

    Accuracy = (true_negative + true_positive) / np.sum(Data)
    TPR = true_positive / np.sum(Data[1,:])        # recall of the positive class
    TNR = true_negative / np.sum(Data[0,:])        # recall of the negative class
    Precision = true_positive / np.sum(Data[:,1])
    BCR = np.sqrt(TPR * TNR)                       # geometric mean of TPR and TNR
    F1 = (2 * TPR * Precision) / (TPR + Precision)

    column_order = ['Model','Accuracy','F1','BCR','Precision','TPR','TNR']
    metrics = {
        'Model': NAME,
        'Accuracy': [Accuracy],
        'F1': [F1],
        'BCR': [BCR],
        'Precision': [Precision],
        'TPR': [TPR],
        'TNR': [TNR],
    }
    return pd.DataFrame(metrics, columns=column_order)

![NN_Description](dataset/NeuralNetwork_Description.png)

# Model1. Neural network
## Neuralnetwork 함수와 5-Fold CV를 위해 함수를 생성 후 이행

In [5]:
###############################################
# Neural Network Hyperparameter Set
###############################################
ACTIVATION = 'tanh'        # passed as MLPClassifier(activation=...)
SOLVER = 'adam'            # passed as MLPClassifier(solver=...)
BATCH_SIZE = 32            # passed as MLPClassifier(batch_size=...)
HIDDEN_LAYER= [10,10,10]   # NOTE(review): shadowed by the HIDDEN_LAYER parameter of the model functions
# NOTE(review): TR_INPUT is the un-standardized Train_Input, and neither
# TR_INPUT nor TR_OUTPUT is read by the code shown here (the model functions
# take their own TR_OUTPUT parameter) - confirm whether they are still needed.
TR_INPUT = Train_Input     
TR_OUTPUT = Train_Output[:,0]
Iterlation = 3000          # passed as MLPClassifier(max_iter=...)
L2_Penalty = 0.001         # passed as MLPClassifier(alpha=...)
Visualization = False      # passed as MLPClassifier(verbose=...)
Validation_Percent = 0.0   # passed as MLPClassifier(validation_fraction=...)
Decay_Method = 'invscaling'  # passed as MLPClassifier(learning_rate=...)
Power_Value = 0.5          # passed as MLPClassifier(power_t=...)
Tolerence_Value = 1e-04    # passed as MLPClassifier(tol=...)
###############################################

def NeuralNetwork(HIDDEN_LAYER,Train_Input,TR_OUTPUT,Val_Input,Val_Output):
    """Fit an MLP on one CV split and pair its predictions with the truth.

    All settings other than the hidden-layer layout come from the
    module-level hyperparameter constants.

    Args:
        HIDDEN_LAYER: Hidden layer sizes, forwarded as ``hidden_layer_sizes``.
        Train_Input: Training inputs.
        TR_OUTPUT: Training targets as a 2-D array; column 0 is used.
        Val_Input: Validation inputs to predict on.
        Val_Output: Validation targets as a 2-D array.

    Returns:
        2-D array whose column 0 holds the predictions and whose remaining
        columns are ``Val_Output``.
    """
    classifier = MLPClassifier(
        hidden_layer_sizes=HIDDEN_LAYER,
        activation=ACTIVATION,
        solver=SOLVER,
        alpha=L2_Penalty,
        batch_size=BATCH_SIZE,
        max_iter=Iterlation,
        learning_rate=Decay_Method,
        power_t=Power_Value,
        tol=Tolerence_Value,
        early_stopping=False,
        validation_fraction=Validation_Percent,
        verbose=Visualization,
        random_state=RANDOM_STATE,
    )
    classifier.fit(Train_Input, TR_OUTPUT[:, 0])
    predicted_column = classifier.predict(Val_Input).reshape(-1, 1)
    return np.concatenate((predicted_column, Val_Output), axis=1)


# Run 5-fold CV for each neural-network hidden-layer configuration.
One_Layer_Node_10 = CV_Result_Each_Model([10],NeuralNetwork)
One_Layer_Node_20 = CV_Result_Each_Model([20],NeuralNetwork)
Two_Layer_Node_10 = CV_Result_Each_Model([10,10],NeuralNetwork)
Two_Layer_Node_20 = CV_Result_Each_Model([20,20],NeuralNetwork)
Three_Layer_Node_10 = CV_Result_Each_Model([10,10,10],NeuralNetwork)
Three_Layer_Node_20 = CV_Result_Each_Model([20,20,20],NeuralNetwork)

# Each CV result has the prediction in column 0 and the true label in column 1.
# pd.concat is used because DataFrame.append was removed in pandas 2.0; the
# resulting rows and index are the same as with the old append call.
ANN_5CV_Results = pd.concat([
    Valid_Index(confusion_matrix(cv_result[:,1], cv_result[:,0]), label)
    for label, cv_result in (
        ("One_Layer_Node_10", One_Layer_Node_10),
        ("One_Layer_Node_20", One_Layer_Node_20),
        ("Two_Layer_Node_10", Two_Layer_Node_10),
        ("Two_Layer_Node_20", Two_Layer_Node_20),
        ("Three_Layer_Node_10", Three_Layer_Node_10),
        ("Three_Layer_Node_20", Three_Layer_Node_20),
    )
])

# Best F1 first.
ANN_5CV_Results = ANN_5CV_Results.sort_values(by=['F1'],ascending=False)
pd.DataFrame(ANN_5CV_Results)

CV Complete!
CV Complete!
CV Complete!
CV Complete!
CV Complete!
CV Complete!


Unnamed: 0,Model,Accuracy,F1,BCR,Precision,TPR,TNR
0,One_Layer_Node_20,0.9735,0.860892,0.901252,0.906077,0.82,0.990556
0,Two_Layer_Node_20,0.9705,0.849873,0.90716,0.865285,0.835,0.985556
0,One_Layer_Node_10,0.9715,0.848,0.887656,0.908571,0.795,0.991111
0,Three_Layer_Node_10,0.9695,0.844784,0.904185,0.860104,0.83,0.985
0,Three_Layer_Node_20,0.969,0.837696,0.888944,0.879121,0.8,0.987778
0,Two_Layer_Node_10,0.968,0.833333,0.888444,0.869565,0.8,0.986667


# Model2. RandomForest
## RandomForest 함수 5-Fold CV를 위해 함수 실행

In [6]:
def RandomForest(NUMBER,Train_Input,TR_OUTPUT,Val_Input,Val_Output):
    """Fit a random forest on one CV split and pair predictions with truth.

    Args:
        NUMBER: Number of trees, forwarded as ``n_estimators``.
        Train_Input: Training inputs.
        TR_OUTPUT: Training targets as a 2-D array; column 0 is used.
        Val_Input: Validation inputs to predict on.
        Val_Output: Validation targets as a 2-D array.

    Returns:
        2-D array whose column 0 holds the predictions and whose remaining
        columns are ``Val_Output``.
    """
    forest = RandomForestClassifier(
        n_estimators=NUMBER,
        max_features="sqrt",
        random_state=RANDOM_STATE,
    )
    forest.fit(Train_Input, TR_OUTPUT[:, 0])
    predicted_column = forest.predict(Val_Input).reshape(-1, 1)
    return np.concatenate((predicted_column, Val_Output), axis=1)

# Run 5-fold CV for each forest size.
RF50_CV = CV_Result_Each_Model(50,RandomForest)
RF100_CV = CV_Result_Each_Model(100,RandomForest)
RF150_CV = CV_Result_Each_Model(150,RandomForest)
RF200_CV = CV_Result_Each_Model(200,RandomForest)

# The table gets its own name: it used to be stored in ANN_5CV_Results,
# which silently replaced the neural-network results.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
RF_5CV_Results = pd.concat([
    Valid_Index(confusion_matrix(cv_result[:,1], cv_result[:,0]), label)
    for label, cv_result in (
        ("RF50_CV", RF50_CV),
        ("RF100_CV", RF100_CV),
        ("RF150_CV", RF150_CV),
        ("RF200_CV", RF200_CV),
    )
])

# Best F1 first.
RF_5CV_Results = RF_5CV_Results.sort_values(by=['F1'],ascending=False)
pd.DataFrame(RF_5CV_Results)

CV Complete!
CV Complete!
CV Complete!
CV Complete!


Unnamed: 0,Model,Accuracy,F1,BCR,Precision,TPR,TNR
0,RF150_CV,0.984,0.914439,0.923891,0.982759,0.855,0.998333
0,RF200_CV,0.9835,0.911528,0.921186,0.982659,0.85,0.998333
0,RF100_CV,0.983,0.909091,0.920929,0.977011,0.85,0.997778
0,RF50_CV,0.983,0.908602,0.918472,0.982558,0.845,0.998333


# Model3. Adaboost
## Adaboost함수와 5-Fold CV를 위해 함수를 생성 후 이행

In [7]:
def AdaBoost(NUMBER,Train_Input,TR_OUTPUT,Val_Input,Val_Output):
    """Fit AdaBoost over depth-1 trees on one CV split.

    Args:
        NUMBER: Number of boosting rounds, forwarded as ``n_estimators``.
        Train_Input: Training inputs.
        TR_OUTPUT: Training targets as a 2-D array; column 0 is used.
        Val_Input: Validation inputs to predict on.
        Val_Output: Validation targets as a 2-D array.

    Returns:
        2-D array whose column 0 holds the predictions and whose remaining
        columns are ``Val_Output``.
    """
    stump = DecisionTreeClassifier(max_depth=1)
    booster = AdaBoostClassifier(
        stump,
        n_estimators=NUMBER,
        algorithm="SAMME",
        random_state=RANDOM_STATE,
    )
    booster.fit(Train_Input, TR_OUTPUT[:, 0])
    predicted_column = booster.predict(Val_Input).reshape(-1, 1)
    return np.concatenate((predicted_column, Val_Output), axis=1)

# Run 5-fold CV for each number of boosting rounds.
Adaboost_50 = CV_Result_Each_Model(50,AdaBoost)
Adaboost_100 = CV_Result_Each_Model(100,AdaBoost)
Adaboost_200 = CV_Result_Each_Model(200,AdaBoost)
Adaboost_300 = CV_Result_Each_Model(300,AdaBoost)
Adaboost_1000 = CV_Result_Each_Model(1000,AdaBoost)

# Each CV result has the prediction in column 0 and the true label in column 1.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
ADA_5CV_Results = pd.concat([
    Valid_Index(confusion_matrix(cv_result[:,1], cv_result[:,0]), label)
    for label, cv_result in (
        ("Ada_50", Adaboost_50),
        ("Ada_100", Adaboost_100),
        ("Ada_200", Adaboost_200),
        ("Ada_300", Adaboost_300),
        ("Ada_1000", Adaboost_1000),
    )
])

# Best F1 first.
ADA_5CV_Results = ADA_5CV_Results.sort_values(by=['F1'],ascending=False)
ADA_5CV_Results

CV Complete!
CV Complete!
CV Complete!
CV Complete!
CV Complete!


Unnamed: 0,Model,Accuracy,F1,BCR,Precision,TPR,TNR
0,Ada_300,0.9575,0.776903,0.852311,0.81768,0.74,0.981667
0,Ada_50,0.958,0.774194,0.841903,0.837209,0.72,0.984444
0,Ada_1000,0.9565,0.774026,0.854459,0.805405,0.745,0.98
0,Ada_200,0.9565,0.770449,0.846532,0.815642,0.73,0.981667
0,Ada_100,0.957,0.767568,0.836036,0.835294,0.71,0.984444


# Model4. Bagging ANN
## Bagging ANN함수와 5-Fold CV를 위해 함수를 생성 후 이행
> 30번 반복 연산하며 오래 걸리므로 (2 Layer/20 Node), (1 Layer/20 Node) 2가지만 시행해봄

In [8]:
def B_NeuralNetwork(HIDDEN_LAYER,Train_Input,TR_OUTPUT,Val_Input,Val_Output):
    """Fit a bagging ensemble of 30 MLPs on one CV split.

    The MLP template uses the module-level neural-network hyperparameter
    constants, including ``Iterlation`` for ``max_iter``.

    Args:
        HIDDEN_LAYER: Hidden layer sizes, forwarded as ``hidden_layer_sizes``.
        Train_Input: Training inputs.
        TR_OUTPUT: Training targets as a 2-D array; column 0 is used.
        Val_Input: Validation inputs to predict on.
        Val_Output: Validation targets as a 2-D array.

    Returns:
        2-D array whose column 0 holds the ensemble predictions and whose
        remaining columns are ``Val_Output``.
    """
    MLP=MLPClassifier(activation=ACTIVATION,solver=SOLVER,alpha=L2_Penalty,
                      hidden_layer_sizes=HIDDEN_LAYER,max_iter=Iterlation,
                      batch_size=BATCH_SIZE,verbose=Visualization,early_stopping=False,power_t=Power_Value,
                      validation_fraction=Validation_Percent,learning_rate=Decay_Method,tol=Tolerence_Value,random_state =RANDOM_STATE)
    # The template estimator is passed positionally: its keyword was
    # `base_estimator` in older scikit-learn and is `estimator` in newer
    # releases, so the positional form works with both.
    BMLP = BaggingClassifier(MLP, n_estimators=30, random_state=RANDOM_STATE, n_jobs=-1).fit(Train_Input, TR_OUTPUT[:, 0])
    Predict_Value = BMLP.predict(Val_Input)
    return(np.concatenate((Predict_Value[:,np.newaxis],Val_Output),axis=1))

# Cross-validate the bagged MLP for two hidden-layer layouts:
# two layers of 20 nodes, and a single layer of 20 nodes.
B_ANN_Two_20 = CV_Result_Each_Model([20,20],B_NeuralNetwork)
B_ANN_One_20 = CV_Result_Each_Model([20],B_NeuralNetwork) 

CV Complete!
CV Complete!


In [9]:
# One metrics row per bagged-MLP configuration; column 1 of each CV result
# is the true label and column 0 the prediction.
B_ANN_Results = pd.concat([
    Valid_Index(confusion_matrix(cv_result[:,1], cv_result[:,0]), label)
    for label, cv_result in (
        ("B_ANN_Two_20", B_ANN_Two_20),
        ("B_ANN_One_20", B_ANN_One_20),
    )
])

# Best F1 first.
B_ANN_Results = B_ANN_Results.sort_values(by=['F1'],ascending=False)
B_ANN_Results

Unnamed: 0,Model,Accuracy,F1,BCR,Precision,TPR,TNR
0,B_ANN_Two_20,0.9735,0.859416,0.896242,0.915254,0.81,0.991667
0,B_ANN_One_20,0.973,0.854839,0.888402,0.924419,0.795,0.992778


# Model5. Bagging Decision Tree
## Bagging Decision Tree함수와 5-Fold CV를 위해 함수를 생성 후 이행
>  depth=5, depth=6 2가지만 시행해봄

In [10]:
def B_Tree(NUMBER,Train_Input,TR_OUTPUT,Val_Input,Val_Output):
    """Fit a bagging ensemble of 30 decision trees on one CV split.

    Args:
        NUMBER: Maximum tree depth, forwarded as ``max_depth``.
        Train_Input: Training inputs.
        TR_OUTPUT: Training targets as a 2-D array; column 0 is used.
        Val_Input: Validation inputs to predict on.
        Val_Output: Validation targets as a 2-D array.

    Returns:
        2-D array whose column 0 holds the ensemble predictions and whose
        remaining columns are ``Val_Output``.
    """
    # Only an unfitted template is needed: the bagging ensemble trains its
    # own copies, so fitting this tree beforehand was wasted work.
    Tree = DecisionTreeClassifier(max_depth=NUMBER, random_state=RANDOM_STATE)
    # The template estimator is passed positionally: its keyword was
    # `base_estimator` in older scikit-learn and is `estimator` in newer
    # releases, so the positional form works with both.
    BMLP = BaggingClassifier(Tree, n_estimators=30, random_state=RANDOM_STATE, n_jobs=-1).fit(Train_Input, TR_OUTPUT[:, 0])
    Predict_Value = BMLP.predict(Val_Input)
    return(np.concatenate((Predict_Value[:,np.newaxis],Val_Output),axis=1))

# Cross-validate the bagged decision trees for max_depth 6 and 5.
B_Tree_D6=CV_Result_Each_Model(6,B_Tree)
B_Tree_D5=CV_Result_Each_Model(5,B_Tree)

CV Complete!
CV Complete!


In [11]:
# One metrics row per bagged-tree configuration; column 1 of each CV result
# is the true label and column 0 the prediction.
B_Tree_Results = pd.concat([
    Valid_Index(confusion_matrix(cv_result[:,1], cv_result[:,0]), label)
    for label, cv_result in (
        ("B_Tree_D6", B_Tree_D6),
        ("B_Tree_D5", B_Tree_D5),
    )
])

# Best F1 first.
B_Tree_Results = B_Tree_Results.sort_values(by=['F1'],ascending=False)
B_Tree_Results

Unnamed: 0,Model,Accuracy,F1,BCR,Precision,TPR,TNR
0,B_Tree_D6,0.982,0.90625,0.930143,0.945652,0.87,0.994444
0,B_Tree_D5,0.982,0.905263,0.925299,0.955556,0.86,0.995556


# Model6. Gradient Boosting Machine
## Gradient Boosting Machine 함수와 5-Fold CV를 위해 함수를 생성 후 이행

In [12]:
def GBM(NUMBER,Train_Input,TR_OUTPUT,Val_Input,Val_Output):
    """Fit a gradient boosting classifier on one CV split.

    Args:
        NUMBER: Number of boosting stages, forwarded as ``n_estimators``.
        Train_Input: Training inputs.
        TR_OUTPUT: Training targets as a 2-D array; column 0 is used.
        Val_Input: Validation inputs to predict on.
        Val_Output: Validation targets as a 2-D array.

    Returns:
        2-D array whose column 0 holds the predictions and whose remaining
        columns are ``Val_Output``.
    """
    booster = GradientBoostingClassifier(
        n_estimators=NUMBER,
        random_state=RANDOM_STATE,
    )
    booster.fit(Train_Input, TR_OUTPUT[:, 0])
    predicted_column = booster.predict(Val_Input).reshape(-1, 1)
    return np.concatenate((predicted_column, Val_Output), axis=1)

# Run 5-fold CV for each number of boosting stages.
GBM_100 = CV_Result_Each_Model(100,GBM)
GBM_150 = CV_Result_Each_Model(150,GBM)
GBM_200 = CV_Result_Each_Model(200,GBM)
GBM_250 = CV_Result_Each_Model(250,GBM)
GBM_300 = CV_Result_Each_Model(300,GBM)

# Each CV result has the prediction in column 0 and the true label in column 1.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
GBM_5CV_Results = pd.concat([
    Valid_Index(confusion_matrix(cv_result[:,1], cv_result[:,0]), label)
    for label, cv_result in (
        ("GBM_100", GBM_100),
        ("GBM_150", GBM_150),
        ("GBM_200", GBM_200),
        ("GBM_250", GBM_250),
        ("GBM_300", GBM_300),
    )
])

# Best F1 first.
GBM_5CV_Results = GBM_5CV_Results.sort_values(by=['F1'],ascending=False)
print(GBM_5CV_Results)

CV Complete!
CV Complete!
CV Complete!
CV Complete!
CV Complete!
     Model  Accuracy        F1       BCR  Precision    TPR       TNR
0  GBM_100    0.9860  0.927461  0.944203   0.962366  0.895  0.996111
0  GBM_300    0.9850  0.922280  0.941299   0.956989  0.890  0.995556
0  GBM_150    0.9845  0.919897  0.941037   0.951872  0.890  0.995000
0  GBM_250    0.9845  0.919897  0.941037   0.951872  0.890  0.995000
0  GBM_200    0.9845  0.919481  0.938652   0.956757  0.885  0.995556
