In [1]:
# 数据读取与计算
import pandas as  pd
import matplotlib.pyplot as plt
import numpy as np

# 数据预处理与模型选择
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report
import itertools

# 随机森林与SVM
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from scipy import stats

import warnings
warnings.filterwarnings("ignore")

In [2]:

# 一些基本参数设定
mode = 2
ratio = 1
iteration1 = 100
show_best_c = True
show_bdry = True

##读取数据
data=pd.read_csv('creditcard.csv')
data.drop('Time',axis=1,inplace=True)
data.head(10)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,-0.371407,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,-0.099254,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,1.249376,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,-0.41043,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,-0.366846,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


In [3]:
##归一化
def normalize_feature(data,amount_only=False):
    if amount_only:
        data['Amount']=StandardScaler().fit_transform(data['Amount'].values.reshape(-1,1))
    else:
        for feature in data.columns.values.tolist():
            if feature!='Class':
                data[feature]=StandardScaler().fit_transform(data[feature].values.reshape(-1,1))
    return data

data=normalize_feature(data)
data.head(10)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-0.694242,-0.044075,1.672773,0.973366,-0.245117,0.347068,0.193679,0.082637,0.331128,0.083386,...,-0.024923,0.382854,-0.176911,0.110507,0.246585,-0.39217,0.330892,-0.063781,0.244964,0
1,0.608496,0.161176,0.109797,0.316523,0.043483,-0.06182,-0.0637,0.071253,-0.232494,-0.15335,...,-0.307377,-0.880077,0.162201,-0.561131,0.320694,0.261069,-0.022256,0.044608,-0.342475,0
2,-0.6935,-0.811578,1.169468,0.268231,-0.364572,1.351454,0.639776,0.207373,-1.378675,0.1907,...,0.337632,1.063358,1.45632,-1.138092,-0.628537,-0.288447,-0.137137,-0.181021,1.160686,0
3,-0.493325,-0.112169,1.182516,-0.609727,-0.007469,0.93615,0.192071,0.316018,-1.262503,-0.050468,...,-0.147443,0.007267,-0.304777,-1.941027,1.241904,-0.460217,0.155396,0.186189,0.140534,0
4,-0.59133,0.531541,1.021412,0.284655,-0.295015,0.071999,0.479302,-0.22651,0.744326,0.691625,...,-0.012839,1.100011,-0.220123,0.23325,-0.395202,1.041611,0.54362,0.651816,-0.073403,0
5,-0.217475,0.581675,0.752585,-0.118833,0.305009,-0.022313,0.384936,0.217955,-0.517619,-0.341101,...,-0.283522,-0.771427,-0.042273,-0.613273,-0.446584,0.219637,0.6289,0.245636,-0.338556,0
6,0.627795,0.085389,0.029923,0.849383,0.13902,0.204695,-0.00417,0.067998,0.423218,-0.091155,...,-0.228334,-0.373032,-0.24678,-1.287973,1.439037,-0.533436,0.085492,0.015656,-0.333279,0
7,-0.328928,0.858692,0.708576,-0.347631,0.687512,0.321345,0.90586,-3.188229,0.560129,1.14743,...,2.645889,-1.399276,0.092085,-1.072754,-0.796633,-0.107075,-2.990154,-3.288083,-0.190107,0
8,-0.456573,0.173291,-0.074653,-0.191774,1.934149,2.793594,0.299206,0.712592,-0.356851,-0.37694,...,-0.099963,-0.369425,-0.327055,1.670269,0.715943,-0.796633,0.029104,0.43142,0.019392,0
9,-0.172698,0.678005,0.688781,-0.156927,0.361792,-0.185219,0.526706,0.058223,-0.670587,-0.336912,...,-0.336156,-0.873298,-0.193438,-0.635767,-0.133773,0.195342,0.61001,0.251681,-0.338516,0


In [4]:
# 数据被切分成训练集和测试集
def split_train_test(fraud_indices, normal_indices, test_size = 0.3):
    number_records_fraud = len(fraud_indices)
    number_records_normal = len(normal_indices)
    test_fraud_end = int(number_records_fraud * test_size)
    test_normal_end = int(number_records_normal  * test_size)

    test_fraud_indices = fraud_indices[0:test_fraud_end]
    train_fraud_indices = fraud_indices[test_fraud_end:]

    test_normal_indices = normal_indices[0:test_normal_end]
    train_normal_indices = normal_indices[test_normal_end:]

    return train_normal_indices, train_fraud_indices, test_normal_indices, test_fraud_indices


In [None]:
# indices存储的是数据的下标
def getTrainingSample(train_fraud_indices, train_normal_indices, data, train_normal_pos,ratio):
    train_number_records_fraud= int(ratio*len(train_fraud_indices))
    train_number_records_normal= len(train_normal_indices)
    
    # 数据下采样
    if train_normal_pos + train_number_records_fraud <= train_number_records_normal:
        small_train_normal_indices = train_normal_indics[train_normal_pos: train_normal_pos+train_number_records_fraud]
        train_normal_pos = train_normal_pos + train_number_records_fraud
        
    # 数据上采样
    else:
        small_train_normal_indices = np.concatenate([train_normal_indices[train_normal_pos: train_number_records_normal], 
                                            train_normal_indices[0: train_normal_pos + train_number_records_fraud - train_number_records_normal]])
        train_normal_pos = train_normal_pos+train_number_records_fraud - train_number_records_normal
    
    # 进行数据下标合并，并打乱
    under_train_sample_indices = np.concatenate([train_fraud_indices, small_train_normal_indices])
    np.random.shuffle(under_train_sample_indices)
    
    #下采样
    under_train_sample_data = data.iloc[under_train_sample_indices,:]
    
    x_train_undersample = under_train_sample_data.ix[:,under_train_sample_data.columns != 'Class']
    y_train_undersample = under_train_sample_data.ix[:,under_train_sample_data.columns == 'Class']
    
    # 返回的是已经进行过采样的特征和目标特征
    return x_train_undersample,y_train_undersample,train_normal_pos

In [None]:
# predict_proba  https://blog.csdn.net/anqijiayou/article/details/80295237
def knn_module(x,y,indices, c_param, bdry=None):
    knn=KNeighborsClassifier(n_neighbors=c_param)
    #ravel把数组变平
    knn.fit(x.iloc[indices[0],:], y.iloc[indices[0],:].values.ravel())
    y_pred_undersample = knn.predict(x.iloc[indices[1],:].values)
    
    return y_pred_undersample
    
def svm_rbf_module(x, y, indices, c_param, bdry= 0.5):
    svm_rbf = SVC(C=c_param, probability=True)
    svm_rbf.fit(x.iloc[indices[0],:], y.iloc[indices[0],:].values.ravel())
    y_pred_undersample = svm_rbf.predict_proba(x.iloc[indices[1],:].values)[:,1] >= bdry
    return y_pred_undersample

def svm_poly_module(x,y, indices, c_param, bdry=0.5):
    svm_poly=SVC(C=c_param[0], kernel='poly', degree= c_param[1], probability=True)
    svm_poly.fit(x.iloc[indices[0],:], y.iloc[indices[0],:].values.ravel())
    y_pred_undersample = svm_poly.predict_proba(x.iloc[indices[1],:].values)[:,1] >= bdry
    return y_pred_undersample

def lr_module(x,y, indices, c_param, bdry=0.5):
    # penalty惩罚系数
    lr = LogisticRegression(C=c_param,penalty='11')
    lr.fit(X.iloc[indices[0],:], y.iloc[indices[0],:].values.ravel())
    y_pred_undersample= lr.predict_proba(X.iloc[indices[1],:].values)[:,1]>=bdry
    return y_pred_undersample
    
def rf_module(x,y, indices, c_param, bdry=0.5):
    # 参数设置 https://www.cnblogs.com/harvey888/p/6512312.html
    rf= RandomForestClassifier(n_jobs=-1,n_estimators=100, criterion='entropy', max_features= 'auto',
                               max_depth=None,min_samples_split= c_param, random_state=0)
    rf.fit(X.iloc[indices[0],:], y.iloc[indices[0],:].values.ravel())
    y_pred_undersample = rf.predict_proba(X.iloc[indices[1],:].values)[:,1]>=bdry
    return y_pred_undersample

![TIM截图20181229221010.png](https://i.loli.net/2018/12/29/5c278050b4c47.png)

In [None]:
#https://www.cnblogs.com/zhixingheyi/p/8097782.html
#https://blog.csdn.net/xierhacker/article/details/70903617
#计算召回率和auc
#y_t是真实值，y_p是预测值
def compute_recall_and_auc(y_t, y_p):
    #混淆矩阵
    cnf_matrix=confusion_matrix(y_t,y_p)
    #设置numpy的打印精度
    np.set_printoptions(precision=2)
    recall_score = cnf_matrix[0,0]/(cnf_matrix[1,0]+cnf_matrix[0,0])
    
    #Roc曲线
    # https://www.cnblogs.com/gatherstars/p/6084696.html
    fpr, tpr,thresholds = roc_curve(y_t,y_p)
    roc_auc= auc(fpr,tpr)
    return recall_score , roc_auc
    

In [None]:
#自己实现寻找最优超参数
def cross_validation_recall(x_train_data, y_train_data, c_param_range, models_dict, model_name):
    #使用K折交叉验证来寻找最优超参数
    fold=KFold(5,shuffle=False)
    # 构造超参数得分列表
    results_table = pd.DataFrame(index= range(len(c_param_range),2), columns = ['C_parameter','Mean recall score'])
    results_table['C_parameter'] = c_param_range
    
    recall_mean=[]
    # 循环使用每个超参数
    for c_param in c_param_range:
        recall_aucs=[]
        
        # 循环交叉集
        for i,train_index in enumerate(fold.split(y_train_data)):
            # 模型训练
            y_pred_undersample= models_dict[model_name](x_train_data,y_train_data, train_index, c_param)
            
            # 计算召回率和ROC曲线
            recall_auc, _=compute_recall_and_auc(y_train_data.iloc[train_index[1],:].values,y_pred_undersample)
            print(model_name,'第',i,'次：',recall_auc)
            recall_aucs.append(recall_auc)
        
        # auc取平均值作为这组超参数的分数
        recall_mean.append(np.mean(recall_aucs))
    
    results_table['Mean recall score'] = recall_mean
    # 得分最大的一组作为最优超参数，并返回
    best_c = results_table.loc[results_table['Mean recall score'].idxmax()]['C_parameter']
    return best_c

In [None]:

# 不同的决策边界阈值
# 也是通过遍历调参的方式确定
def decision_boundary(x_train_data, y_train_data, fold, best_c, bdry_dict, models_dict, model_name):
    bdry_range= [0.3,0.35,0.4,0.45,0.5]
    results_table = pd.DataFrame(index = range(len(bdry_ranges),2) , columns = ['Bdry_params','Mean recall score * auc'])
    results_table['Bdry_params']= bdry_ranges
    
    recall_mean=[]
    for bdry in bdry_ranges:
        recall_accs_aucs = []
        for iteration, indices in enumerate(fold.split(y_train_data)):
            y_pred_undersample = models_dict[model_name](x_train_data, y_train_data, indices, best_c, bdry)
            recall_acc, roc_auc = compute_recall_and_auc(y_train_data.iloc[indices[1],:].values, y_pred_undersample)
            
            # bdry_dict[model_name]是调用不同模型的计算公式
            recall_accs_aucs.append(bdry_dict[model_name](recall_acc, roc_auc))
        recall_mean.append(np.mean(recall_accs_aucs))

    results_table['Mean recall score * auc'] = recall_mean
    best_bdry = results_table.loc[results_table['Mean recall score * auc'].idxmax()]['Bdry_params']

    return best_bdry

In [None]:
def model(x,y,train, bdry_dict = None, best_c=None, best_bdry=None, models= None, mode=None):
    #训练阶段
    if train:
        #用不同的模型进行训练
        models_dict = {'knn' : knn_module, 'svm_rbf': svm_rbf_module, 'svm_poly': svm_poly_module,
                        'lr': lr_module, 'rf': rf_module}
        
        #knn中取不同的k值(超参数)
        c_param_range_knn=[3,5,7,9]
        #自定义cross_validation_recall，使用循环找出最适合的超参数。
        best_c_knn=cross_validation_recall(x,y, c_param_range_knn,models_dict, 'knn')
        
        # SVM-RBF中不同的参数
        c_param_range_svm_rbf=[0.01,0.1,1,10,100]
        best_c_svm_rbf = cross_validation_recall(x,y,c_param_range_svm_rbf, models_dict, 'svm_rbf')
        
        c_param_range_svm_poly = [[0.01, 2], [0.01, 3], [0.01, 4], [0.01, 5], [0.01, 6], [0.01, 7], [0.01, 8], [0.01, 9],
                                  [0.1, 2], [0.1, 3], [0.1, 4], [0.1, 5], [0.1, 6], [0.1, 7], [0.1, 8], [0.1, 9],
                                  [1, 2], [1, 3], [1, 4], [1, 5], [1, 6], [1, 7], [1, 8], [1, 9],
                                  [10, 2], [10, 3], [10, 4], [10, 5], [10, 6], [10, 7], [10, 8], [10, 9],
                                  [100, 2], [100, 3], [100, 4], [100, 5], [100, 6], [100, 7], [100, 8], [100, 9]]
        
        best_c_svm_poly = cross_validation_recall(x,y, c_param_range_svm_poly, models_dict, 'svm_poly')
        
        # 逻辑回归当中的正则化强度
        c_param_range_lr=[0.01,0.1,1,10,100]
        best_c_lr = cross_validation_recall(x,y, c_param_range_lr, models_dict, 'lr')
        
        # 随机森林里调参
        c_param_range_rf = [2,5,10,15,20]
        best_c_rf= cross_validation_recall(X, y, c_param_range_rf, models_dict, 'rf')
        
        # 合并超参数
        best_c = [best_c_knn, best_c_svm_rbf, best_c_svm_poly, best_c_lr, best_c_rf, best_c]
        
        # 交叉验证确定合适的决策边界阈值
        fold = KFold(4,shuffle=True)
        
        # decision_boundary是一个计算决策边界的函数
        best_bdry_svm_rbf= decision_boundary(x, y, fold, best_c_svm_rbf, bdry_dict, models_dict, 'svm_rbf')
        best_bdry_svm_poly = decision_boundary(x, y, fold, best_c_svm_poly, bdry_dict, models_dict, 'svm_poly')
        best_bdry_lr = decision_boundary(x, y, fold, best_c_lr, bdry_dict, models_dict, 'lr')
        best_bdry_rf = decision_boundary(x, y, fold, best_c_rf, bdry_dict, models_dict, 'rf')
        best_bdry = [0.5, best_bdry_svm_rbf, best_bdry_svm_poly, best_bdry_lr, best_bdry_rf]
        
        # 最优参数建模
        knn = KNeighborsClassifier(n_neighbors = int(best_c_knn))
        knn.fit(x.values, y.values.ravel())
        
        svm_rbf = SVC(C=best_c_svm_rbf, probability = True)
        svm_rbf.fit(x.values, y.values.ravel())
        
        svm_poly = SVC(C=best_c_svm_poly[0], kernel = 'poly', degree = best_c_svm_poly[1], probability = True)
        svm_poly.fit(x.values, y.values.ravel())

        lr = LogisticRegression(C = best_c_lr, penalty ='l1', warm_start = False)
        lr.fit(x.values, y.values.ravel())

        rf = RandomForestClassifier(n_jobs=-1, n_estimators = 100, criterion = 'entropy', 
                                    max_features = 'auto', max_depth = None, 
                                    min_samples_split  = int(best_c_rf), random_state=0)
        rf.fit(x.values, y.values.ravel())
        
        models = [knn,svm_rbf,svm_poly, lr, rf]
        return best_c,best_bdry,models
    else:
        #预测阶段
        [knn, svm_rbf, svm_poly, lr, rf] = models
        [_, best_bdry_svm_rbf, best_bdry_svm_poly, best_bdry_lr, best_bdry_rf] = best_bdry
        
        # KNN
        y_pred_knn = knn.predict(x.values)
        # 用rbf核的SVM
        y_pred_svm_rbf = svm_rbf.predict_proba(x.values)[:,1] >= best_bdry_svm_rbf
        # 用多项式核的SVM
        y_pred_svm_poly = svm_poly.predict_proba(x.values)[:,1] >= best_bdry_svm_poly
        # LR
        y_pred_lr= lr.predict_proba(x.values)[:,1] >= best_bdry_lr
        # 随机森林
        y_pred_rf = rf.predict_proba(x.values)[:,1] >= best_bdry_rf
        
        x_of_three_models = {'knn' : y_pred_knn, 'svm_rbf' : y_pred_svm_rbf, 'svm_poly' : y_pred_svm_poly, 'lr' : y_pred_lr, 'rf': y_pred_rf}
        
        #得到5个模型的预测结果
        X_5_data = pd.DataFrame(data = x_of_three_models)
        
        # 进行投票机制，大于2票的为正样本
        y_prd= np.sum(x_5_data,axis=1)>=2
        
        y_pred_lr_controls = []
        params = [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

        # 投票器去产出最终结果
        for param in params:
            y_pred_lr_controls.append(lr.predict_proba(X.values)[:,1] >= param)
        return y_pred, y_pred_lr_controls, params
        
        
        
        

In [None]:
def run(data,mode,ratio,iteration1,bdry_dict):
    recall_score_list =[]
    auc_list = []
    recall_score_lr_list =[]
    auc_lr_list = []
    best_c = None
    best_bdry = None
    for itr1 in range(iteration1):
        #欺诈类的样本
        fraud_indices=np.array(data[data.Class==1].index)
        #进行随机排列
        np.random.shuffle(fraud_indices)
        
        #获取正常样本
        normal_indices=np.array(data[data.Class==0].index)
        np.random.shuffle(normal_indices)
        
        
        #划分训练集和测试集
        train_normal_indices, train_fraud_indices, test_normal_indices, test_fraud_indices=split_train_test(
            normal_indices,fraud_indices)
        
        ##合并测试集
        test_indices=np.concatenate([test_normal_indices,test_fraud_indices])
        
        #通过下标选取测试集数据，[表示选取行,表示选取列]
        test_data=data.iloc[test_indices,:]
        x_test=test_data.ix[:,test_data.columns != 'Class']
        y_test=test_data.ix[:,test_data.columns == 'Class']
        
        #数据下采样
        x_train_undersample,y_train_undersample,train_normal_pos=getTrainingSample(
            train_fraud_indices,train_normal_indices,data,0,ratio)
        
        #模型训练
        best_c,best_bdry,models=model(x_train_undersample, y_train_undersample,train=True,
                                         bdry_dict= bdry_dict, best_c=best_c, best_bdry=best_bdry)
        
        if show_best_c:
            print("超参数值:")
            print("k-nearest nbd: %.2f, svm (rbf kernel): [%.2f, %.2f], svm (poly kernel): %.2f, logistic reg: %.2f, random forest: %.2f"
                  %(best_c[0], best_c[1], best_c[2][0], best_c[2][1], best_c[3], best_c[4]))

        if show_bdry:
            print("决策边界阈值:")
            print("k-nearest nbd: %.2f, svm (rbf kernel): %.2f, svm (poly kernel): %.2f, logistic reg: %.2f, random forest: %.2f"
                  %(best_bdry[0], best_bdry[1], best_bdry[2], best_bdry[3], best_bdry[4]))
            
        
         # 预测
        y_pred, y_pred_lr_controls, params = model(x_test, y_test, train = False, bdry_dict = None,
                                                   best_c = best_c, best_bdry = best_bdry, models = models, mode = mode)
        
        #记录指标
        recall_score, roc_auc= compute_recall_and_auc(y_test,y_pred)
        recall_score_list.append(recall_score)
        auc_list.append(roc_auc)
        
        control_recall_all_param = []
        control_roc_all_param = []
        for i in range(len(params)):
            recall_score_lr, roc_auc_lr = compute_recall_and_auc(y_test, y_pred_lr_controls[i]) # for control
            control_recall_all_param.append(recall_score_lr)
            control_roc_all_param.append(roc_auc_lr)

        recall_score_lr_list.append(control_recall_all_param)
        auc_lr_list.append(control_roc_all_param)
        
        
    #计算平均得分
    mean_recall_score = np.mean(recall_score_list)
    std_recall_score = np.std(recall_score_list)
    
    mean_auc= np.mean(auc_list)
    std_auc = np.std(auc_list)
        
    mean_recall_score_lr = np.mean(recall_score_lr_list, axis = 0)
    std_recall_score_lr = np.std(recall_score_lr_list, axis = 0)
    mean_auc_lr= np.mean(auc_lr_list, axis = 0)
    std_auc_lr = np.std(auc_lr_list, axis = 0)
        
    result = [mean_recall_score, std_recall_score, mean_auc, std_auc]
    control = [mean_recall_score_lr, std_recall_score_lr, mean_auc_lr, std_auc_lr]
    return result, control, params    

In [None]:

def lr_bdry_module(recall_acc, roc_auc):
    return 0.9*recall_acc+0.1*roc_auc
def svm_rbf_bdry_module(recall_acc, roc_auc):
    return recall_acc*roc_auc
def svm_poly_bdry_module(recall_acc, roc_auc):
    return recall_acc*roc_auc
def rf_bdry_module(recall_acc, roc_auc):
    return 0.5*recall_acc+0.5*roc_auc

bdry_dict = {'lr': lr_bdry_module,'svm_rbf': svm_rbf_bdry_module,
             'svm_poly': svm_poly_bdry_module, 'rf': rf_bdry_module}

result, control, params =run(data = data, mode = mode, ratio = ratio, iteration1 = iteration1, bdry_dict = bdry_dict)
print("超参数值:")
print("比率为: ", ratio, " 模式为: ", mode)
print("knn, svm_rbf, svm_poly, lr 和 rf 投票产出的结果是:")
print("平均召回率为 ", result[0], " 召回率标准差为 ", result[1])
print("平均auc为 ", result[2], " auc标准差为 ", result[3])
print()
print("调整逻辑回归不同的阈值")
print("我们把超过阈值的样本判定为positive(欺诈)")
for i, param in enumerate(params):
    print("阈值", param)
    print("平均召回率 ", control[0][i], " 召回率标准差 ", control[1][i])
    print("平均auc为 ", control[2][i], " auc标准差 ", control[3][i])
    print()

knn 第 0 次： 0.9993719381987187
knn 第 1 次： 0.9995227569577012
knn 第 2 次： 0.9991210447011551
knn 第 3 次： 0.9994224588188028
knn 第 4 次： 0.9994473889126121
knn 第 0 次： 0.9993219998493333
knn 第 1 次： 0.9995730788548468
knn 第 2 次： 0.999095999799111
knn 第 3 次： 0.999472772463659
knn 第 4 次： 0.9994474860615802
knn 第 0 次： 0.9992718141870685
knn 第 1 次： 0.9995479658463083
knn 第 2 次： 0.9991210667738128
knn 第 3 次： 0.9993974996234373
knn 第 4 次： 0.9994725870859181
knn 第 0 次： 0.9992969063881076
knn 第 1 次： 0.9995479658463083
knn 第 2 次： 0.9990959316926168
knn 第 3 次： 0.9994225603173408
knn 第 4 次： 0.9994976893711071
svm_rbf 第 0 次： 0.9983442877928854
svm_rbf 第 1 次： 0.9986201013598274
svm_rbf 第 2 次： 0.9974177050166722
svm_rbf 第 3 次： 0.9983450351053159
svm_rbf 第 4 次： 0.9988454684637201
svm_rbf 第 0 次： 0.9989457037427517
svm_rbf 第 1 次： 0.9987453893056984
svm_rbf 第 2 次： 0.9983437476473688
svm_rbf 第 3 次： 0.9988208730556949
svm_rbf 第 4 次： 0.9986449864498645
svm_rbf 第 0 次： 0.9994474583082178
svm_rbf 第 1 次： 0.99969859848