In [3]:
from model.data_util import *
from sklearn.metrics import *
from imbens.metrics import *
from imbens.ensemble import *
from sklearn.model_selection import train_test_split, StratifiedKFold
from imbens.datasets import fetch_datasets
from collections import Counter

In [4]:
# Load data
# X, y, dataset_name = get_ecoli1()
dataset = fetch_datasets()['us_crime']
X, y = dataset['data'], dataset['target']
print(Counter(y))
dataset_name = 'us_crime'
# 将-1类别转换为0
y = np.where(y == -1, 0, y)
print(Counter(y))

Counter({-1: 1844, 1: 150})
Counter({0: 1844, 1: 150})


In [5]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = []
f1s = []
precs = []
recs = []
gmeans = []
aucs = []
auprs = []
sens = []
spes = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = SelfPacedEnsembleClassifier(n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred, average='macro'))
    precs.append(precision_score(y_test, y_pred, average='macro'))
    recs.append(recall_score(y_test, y_pred, average='macro'))
    gmeans.append(geometric_mean_score(y_test, y_pred))
    aucs.append(roc_auc_score(y_test, y_pred_proba))
    auprs.append(average_precision_score(y_test, y_pred_proba))
    sens.append(sensitivity_score(y_test, y_pred))
    spes.append(specificity_score(y_test, y_pred))
    
print('model:', model.__class__)
print('dataset:', dataset_name)
print('Accuracy:', np.mean(accs))
print('F1:', np.mean(f1s))
print('Precision:', np.mean(precs))
print('Recall:', np.mean(recs))
print('G-mean:', np.mean(gmeans))
print('AUC:', np.mean(aucs))
print('AUPR:', np.mean(auprs))
print('Sensitivity:', np.mean(sens))
print('Specificity:', np.mean(spes))

Majority class detected: 0
Majority class detected: 0
Majority class detected: 0
Majority class detected: 0
Majority class detected: 0
model: <class 'imbens.ensemble._under_sampling.self_paced_ensemble.SelfPacedEnsembleClassifier'>
dataset: us_crime
Accuracy: 0.910734121736502
F1: 0.7415072785151271
Precision: 0.7114650920362949
Recall: 0.8016911158242017
G-mean: 0.7888548307883955
AUC: 0.91511981560033
AUPR: 0.5694553793150475
Sensitivity: 0.6733333333333333
Specificity: 0.9300488983150702


In [6]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = []
f1s = []
precs = []
recs = []
gmeans = []
aucs = []
auprs = []
sens = []
spes = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = BalanceCascadeClassifier(n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred, average='macro'))
    precs.append(precision_score(y_test, y_pred, average='macro'))
    recs.append(recall_score(y_test, y_pred, average='macro'))
    gmeans.append(geometric_mean_score(y_test, y_pred))
    aucs.append(roc_auc_score(y_test, y_pred_proba))
    auprs.append(average_precision_score(y_test, y_pred_proba))
    sens.append(sensitivity_score(y_test, y_pred))
    spes.append(specificity_score(y_test, y_pred))
    
print('model:', model.__class__)
print('dataset:', dataset_name)
print('Accuracy:', np.mean(accs))
print('F1:', np.mean(f1s))
print('Precision:', np.mean(precs))
print('Recall:', np.mean(recs))
print('G-mean:', np.mean(gmeans))
print('AUC:', np.mean(aucs))
print('AUPR:', np.mean(auprs))
print('Sensitivity:', np.mean(sens))
print('Specificity:', np.mean(spes))

model: <class 'imbens.ensemble._under_sampling.balance_cascade.BalanceCascadeClassifier'>
dataset: us_crime
Accuracy: 0.9027014773113688
F1: 0.709956291540802
Precision: 0.6847589502660177
Recall: 0.7606041593024625
G-mean: 0.7356733596970557
AUC: 0.8946668679549115
AUPR: 0.5518082762950063
Sensitivity: 0.5933333333333333
Specificity: 0.9278749852715918


In [7]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = []
f1s = []
precs = []
recs = []
gmeans = []
aucs = []
auprs = []
sens = []
spes = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = UnderBaggingClassifier(n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred, average='macro'))
    precs.append(precision_score(y_test, y_pred, average='macro'))
    recs.append(recall_score(y_test, y_pred, average='macro'))
    gmeans.append(geometric_mean_score(y_test, y_pred))
    aucs.append(roc_auc_score(y_test, y_pred_proba))
    auprs.append(average_precision_score(y_test, y_pred_proba))
    sens.append(sensitivity_score(y_test, y_pred))
    spes.append(specificity_score(y_test, y_pred))
    
print('model:', model.__class__)
print('dataset:', dataset_name)
print('Accuracy:', np.mean(accs))
print('F1:', np.mean(f1s))
print('Precision:', np.mean(precs))
print('Recall:', np.mean(recs))
print('G-mean:', np.mean(gmeans))
print('AUC:', np.mean(aucs))
print('AUPR:', np.mean(auprs))
print('Sensitivity:', np.mean(sens))
print('Specificity:', np.mean(spes))

model: <class 'imbens.ensemble._under_sampling.under_bagging.UnderBaggingClassifier'>
dataset: us_crime
Accuracy: 0.8620936764020606
F1: 0.7018938720407076
Precision: 0.6642049390188748
Recall: 0.855008247908566
G-mean: 0.853183253531242
AUC: 0.9115876487569224
AUPR: 0.4801817544532755
Sensitivity: 0.8466666666666667
Specificity: 0.8633498291504654


In [8]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = []
f1s = []
precs = []
recs = []
gmeans = []
aucs = []
auprs = []
sens = []
spes = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = EasyEnsembleClassifier(n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred, average='macro'))
    precs.append(precision_score(y_test, y_pred, average='macro'))
    recs.append(recall_score(y_test, y_pred, average='macro'))
    gmeans.append(geometric_mean_score(y_test, y_pred))
    aucs.append(roc_auc_score(y_test, y_pred_proba))
    auprs.append(average_precision_score(y_test, y_pred_proba))
    sens.append(sensitivity_score(y_test, y_pred))
    spes.append(specificity_score(y_test, y_pred))
    
print('model:', model.__class__)
print('dataset:', dataset_name)
print('Accuracy:', np.mean(accs))
print('F1:', np.mean(f1s))
print('Precision:', np.mean(precs))
print('Recall:', np.mean(recs))
print('G-mean:', np.mean(gmeans))
print('AUC:', np.mean(aucs))
print('AUPR:', np.mean(auprs))
print('Sensitivity:', np.mean(sens))
print('Specificity:', np.mean(spes))

model: <class 'imbens.ensemble._under_sampling.easy_ensemble.EasyEnsembleClassifier'>
dataset: us_crime
Accuracy: 0.8525635697283409
F1: 0.6924169959793665
Precision: 0.6574552364628645
Recall: 0.8559794391422175
G-mean: 0.8538971894710349
AUC: 0.9268088449000432
AUPR: 0.6081564370021083
Sensitivity: 0.8600000000000001
Specificity: 0.851958878284435


In [9]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = []
f1s = []
precs = []
recs = []
gmeans = []
aucs = []
auprs = []
sens = []
spes = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = RUSBoostClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred, average='macro'))
    precs.append(precision_score(y_test, y_pred, average='macro'))
    recs.append(recall_score(y_test, y_pred, average='macro'))
    gmeans.append(geometric_mean_score(y_test, y_pred))
    aucs.append(roc_auc_score(y_test, y_pred_proba))
    auprs.append(average_precision_score(y_test, y_pred_proba))
    sens.append(sensitivity_score(y_test, y_pred))
    spes.append(specificity_score(y_test, y_pred))
    
print('model:', model.__class__)
print('dataset:', dataset_name)
print('Accuracy:', np.mean(accs))
print('F1:', np.mean(f1s))
print('Precision:', np.mean(precs))
print('Recall:', np.mean(recs))
print('G-mean:', np.mean(gmeans))
print('AUC:', np.mean(aucs))
print('AUPR:', np.mean(auprs))
print('Sensitivity:', np.mean(sens))
print('Specificity:', np.mean(spes))

model: <class 'imbens.ensemble._under_sampling.rus_boost.RUSBoostClassifier'>
dataset: us_crime
Accuracy: 0.8139582624903967
F1: 0.5863985956331289
Precision: 0.5764402309226915
Recall: 0.6574992635795922
G-mean: 0.6281426387367932
AUC: 0.7621840756451042
AUPR: 0.266620362535101
Sensitivity: 0.4733333333333333
Specificity: 0.8416651938258513


In [10]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = []
f1s = []
precs = []
recs = []
gmeans = []
aucs = []
auprs = []
sens = []
spes = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = BalancedRandomForestClassifier(n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred, average='macro'))
    precs.append(precision_score(y_test, y_pred, average='macro'))
    recs.append(recall_score(y_test, y_pred, average='macro'))
    gmeans.append(geometric_mean_score(y_test, y_pred))
    aucs.append(roc_auc_score(y_test, y_pred_proba))
    auprs.append(average_precision_score(y_test, y_pred_proba))
    sens.append(sensitivity_score(y_test, y_pred))
    spes.append(specificity_score(y_test, y_pred))
    
print('model:', model.__class__)
print('dataset:', dataset_name)
print('Accuracy:', np.mean(accs))
print('F1:', np.mean(f1s))
print('Precision:', np.mean(precs))
print('Recall:', np.mean(recs))
print('G-mean:', np.mean(gmeans))
print('AUC:', np.mean(aucs))
print('AUPR:', np.mean(auprs))
print('Sensitivity:', np.mean(sens))
print('Specificity:', np.mean(spes))

model: <class 'imbens.ensemble._under_sampling.balanced_random_forest.BalancedRandomForestClassifier'>
dataset: us_crime
Accuracy: 0.8430447979244594
F1: 0.6838631417908656
Precision: 0.6520783297130108
Recall: 0.8569587310003535
G-mean: 0.8547280648872416
AUC: 0.9209093073720591
AUPR: 0.5245437444198744
Sensitivity: 0.8733333333333333
Specificity: 0.8405841286673736


In [11]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = []
f1s = []
precs = []
recs = []
gmeans = []
aucs = []
auprs = []
sens = []
spes = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = AdaCostClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred, average='macro'))
    precs.append(precision_score(y_test, y_pred, average='macro'))
    recs.append(recall_score(y_test, y_pred, average='macro'))
    gmeans.append(geometric_mean_score(y_test, y_pred))
    aucs.append(roc_auc_score(y_test, y_pred_proba))
    auprs.append(average_precision_score(y_test, y_pred_proba))
    sens.append(sensitivity_score(y_test, y_pred))
    spes.append(specificity_score(y_test, y_pred))
    
print('model:', model.__class__)
print('dataset:', dataset_name)
print('Accuracy:', np.mean(accs))
print('F1:', np.mean(f1s))
print('Precision:', np.mean(precs))
print('Recall:', np.mean(recs))
print('G-mean:', np.mean(gmeans))
print('AUC:', np.mean(aucs))
print('AUPR:', np.mean(auprs))
print('Sensitivity:', np.mean(sens))
print('Specificity:', np.mean(spes))

model: <class 'imbens.ensemble._reweighting.adacost.AdaCostClassifier'>
dataset: us_crime
Accuracy: 0.7547700910567877
F1: 0.6058129769802194
Precision: 0.6102703518112912
Recall: 0.8306654294803817
G-mean: 0.8237132009838211
AUC: 0.8993123060759594
AUPR: 0.48010208098213114
Sensitivity: 0.9200000000000002
Specificity: 0.7413308589607636


In [12]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = []
f1s = []
precs = []
recs = []
gmeans = []
aucs = []
auprs = []
sens = []
spes = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = AdaUBoostClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred, average='macro'))
    precs.append(precision_score(y_test, y_pred, average='macro'))
    recs.append(recall_score(y_test, y_pred, average='macro'))
    gmeans.append(geometric_mean_score(y_test, y_pred))
    aucs.append(roc_auc_score(y_test, y_pred_proba))
    auprs.append(average_precision_score(y_test, y_pred_proba))
    sens.append(sensitivity_score(y_test, y_pred))
    spes.append(specificity_score(y_test, y_pred))
    
print('model:', model.__class__)
print('dataset:', dataset_name)
print('Accuracy:', np.mean(accs))
print('F1:', np.mean(f1s))
print('Precision:', np.mean(precs))
print('Recall:', np.mean(recs))
print('G-mean:', np.mean(gmeans))
print('AUC:', np.mean(aucs))
print('AUPR:', np.mean(auprs))
print('Sensitivity:', np.mean(sens))
print('Specificity:', np.mean(spes))

model: <class 'imbens.ensemble._reweighting.adauboost.AdaUBoostClassifier'>
dataset: us_crime
Accuracy: 0.8896663769977708
F1: 0.7015328281536665
Precision: 0.6716576228659632
Recall: 0.7719247083775185
G-mean: 0.7570863131333768
AUC: 0.8924658300930837
AUPR: 0.5282907834991026
Sensitivity: 0.6333333333333333
Specificity: 0.9105160834217039


In [13]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
accs = []
f1s = []
precs = []
recs = []
gmeans = []
aucs = []
auprs = []
sens = []
spes = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = AsymBoostClassifier()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    accs.append(accuracy_score(y_test, y_pred))
    f1s.append(f1_score(y_test, y_pred, average='macro'))
    precs.append(precision_score(y_test, y_pred, average='macro'))
    recs.append(recall_score(y_test, y_pred, average='macro'))
    gmeans.append(geometric_mean_score(y_test, y_pred))
    aucs.append(roc_auc_score(y_test, y_pred_proba))
    auprs.append(average_precision_score(y_test, y_pred_proba))
    sens.append(sensitivity_score(y_test, y_pred))
    spes.append(specificity_score(y_test, y_pred))
    
print('model:', model.__class__)
print('dataset:', dataset_name)
print('Accuracy:', np.mean(accs))
print('F1:', np.mean(f1s))
print('Precision:', np.mean(precs))
print('Recall:', np.mean(recs))
print('G-mean:', np.mean(gmeans))
print('AUC:', np.mean(aucs))
print('AUPR:', np.mean(auprs))
print('Sensitivity:', np.mean(sens))
print('Specificity:', np.mean(spes))

model: <class 'imbens.ensemble._reweighting.asymmetric_boost.AsymBoostClassifier'>
dataset: us_crime
Accuracy: 0.9177453684462412
F1: 0.7259897899168758
Precision: 0.7218746721494678
Recall: 0.7411754742547425
G-mean: 0.7096823647545796
AUC: 0.8981502101252896
AUPR: 0.5441736331983217
Sensitivity: 0.5333333333333334
Specificity: 0.9490176151761517


In [12]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, f1_score, confusion_matrix
from imbens.ensemble import (
    BalanceCascadeClassifier,
    SelfPacedEnsembleClassifier,
    UnderBaggingClassifier,
    EasyEnsembleClassifier,
    RUSBoostClassifier,
    BalancedRandomForestClassifier,
    AdaCostClassifier,
    AdaUBoostClassifier,
    AsymBoostClassifier
)
from UADF import DualGranularBalancedDeepForest
from demo import get_config

# 生成棋盘数据集
def generate_checkerboard_data(n_samples=11000):
    X, y = make_classification(
        n_samples=n_samples,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_clusters_per_class=1,
        weights=[0.9, 0.1],
        class_sep=1.0,
        random_state=42
    )
    return X, y

# 定义评估函数
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    auc = average_precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    g_mean = np.sqrt((tp / (tp + fn)) * (tn / (tn + fp)))
    return {'AUCPRC': auc, 'F1': f1, 'G-Mean': g_mean}

# 配置不平衡数据处理模型
ensemble_methods = {
    'BalanceCascade': BalanceCascadeClassifier(),
    'SelfPacedEnsemble': SelfPacedEnsembleClassifier(),
    'UnderBagging': UnderBaggingClassifier(),
    'EasyEnsemble': EasyEnsembleClassifier(),
    'RUSBoost': RUSBoostClassifier(),
    'BalancedRandomForest': BalancedRandomForestClassifier(),
    'AdaCost': AdaCostClassifier(),
    'AdaUBoost': AdaUBoostClassifier(),
    'AsymBoost': AsymBoostClassifier(),
    'DualGranularBalancedDeepForest': DualGranularBalancedDeepForest(get_config())
}

# 加载数据集
X, y = generate_checkerboard_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 对每个集成方法进行训练和评估
results = {}
for method_name, model in ensemble_methods.items():
    model.fit(X_train, y_train)
    results[method_name] = evaluate_model(model, X_test, y_test)

# 输出结果
for method_name, metrics in results.items():
    print(f"Method: {method_name}")
    print(f"  AUCPRC: {metrics['AUCPRC']:.3f}, F1: {metrics['F1']:.3f}, G-Mean: {metrics['G-Mean']:.3f}")


Majority class detected: 0
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49


Begin to train.... - 2024-11-24 19:46:16,452 - DualGranularBalancedDeepForest
the shape of training samples: (7700, 2) - 2024-11-24 19:46:16,453 - DualGranularBalancedDeepForest
use f1_macro as training evaluation - 2024-11-24 19:46:16,453 - DualGranularBalancedDeepForest
stacking: False, save model: False - 2024-11-24 19:46:16,454 - DualGranularBalancedDeepForest
-----------------------------------------layer-0-------------------------------------------- - 2024-11-24 19:46:16,454 - DualGranularBalancedDeepForest
The shape of x_train is (7700, 2) - 2024-11-24 19:46:16,454 - DualGranularBalancedDeepForest
layer_0, estimator_0, BalancedEnsembleClassifier, n_fold_0,Accuracy=0.9747, f1_score=0.9336, auc=0.9647, gmean=0.9543, sen=0.9295, spe=0.9798, aupr=0.9116 - 2024-11-24 19:46:20,413 - KFoldWrapper
layer_0, estimator_0, BalancedEnsembleClassifier, n_fold_1,Accuracy=0.9740, f1_score=0.9335, auc=0.9708, gmean=0.9656, sen=0.9551, spe=0.9762, aupr=0.9112 - 2024-11-24 19:46:20,417 - KFoldWrap

Final enhanced_vector_cur_layer type: <class 'numpy.ndarray'>
enhanced_vector_cur_layer shape: (7700, 10)
num_layers_before_append: 0
num_layers: 1


layer_1, estimator_0, BalancedEnsembleClassifier, n_fold_0,Accuracy=0.9851, f1_score=0.9582, auc=0.9680, gmean=0.9480, sen=0.9038, spe=0.9942, aupr=0.9169 - 2024-11-24 19:46:38,549 - KFoldWrapper
layer_1, estimator_0, BalancedEnsembleClassifier, n_fold_1,Accuracy=0.9760, f1_score=0.9294, auc=0.9575, gmean=0.8964, sen=0.8077, spe=0.9949, aupr=0.8877 - 2024-11-24 19:46:38,554 - KFoldWrapper
layer_1, estimator_0, BalancedEnsembleClassifier, n_fold_2,Accuracy=0.9825, f1_score=0.9523, auc=0.9769, gmean=0.9555, sen=0.9231, spe=0.9892, aupr=0.9137 - 2024-11-24 19:46:38,560 - KFoldWrapper
layer_1, estimator_0, BalancedEnsembleClassifier, n_fold_3,Accuracy=0.9721, f1_score=0.9227, auc=0.9678, gmean=0.9141, sen=0.8471, spe=0.9863, aupr=0.8912 - 2024-11-24 19:46:38,565 - KFoldWrapper
layer_1, estimator_0, BalancedEnsembleClassifier, n_fold_4,Accuracy=0.9766, f1_score=0.9331, auc=0.9490, gmean=0.9101, sen=0.8344, spe=0.9928, aupr=0.8765 - 2024-11-24 19:46:38,569 - KFoldWrapper
layer_1, estimator_0

KeyboardInterrupt: 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, f1_score, confusion_matrix
from imbens.ensemble import (
    BalanceCascadeClassifier,
    SelfPacedEnsembleClassifier,
    UnderBaggingClassifier,
    EasyEnsembleClassifier,
    RUSBoostClassifier,
    BalancedRandomForestClassifier,
    AdaCostClassifier,
    AdaUBoostClassifier,
    AsymBoostClassifier
)
from UADF import DualGranularBalancedDeepForest
from demo import get_config

# 生成棋盘数据集
def generate_checkerboard_data(n_samples=11000):
    X, y = make_classification(
        n_samples=n_samples,
        n_features=2,
        n_informative=2,
        n_redundant=0,
        n_clusters_per_class=1,
        weights=[0.9, 0.1],
        class_sep=1.0,
        random_state=42
    )
    return X, y

# 定义评估函数
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    auc = average_precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    g_mean = np.sqrt((tp / (tp + fn)) * (tn / (tn + fp)))
    return {'AUCPRC': auc, 'F1': f1, 'G-Mean': g_mean}

# 可视化函数，展示每个方法在不同阶段的分类结果
def visualize_model_stages(model, X_train, y_train, X_test, y_test, method_name, n_stages=3):
    fig, axes = plt.subplots(1, n_stages, figsize=(15, 5))
    for stage in range(n_stages):
        # 模拟每个阶段的结果，通过部分训练数据集来可视化
        sample_indices = np.random.choice(len(X_train), size=int(len(X_train) * (stage + 1) / n_stages), replace=False)
        X_stage, y_stage = X_train[sample_indices], y_train[sample_indices]
        
        model.fit(X_stage, y_stage)  # 部分训练数据拟合
        y_pred = model.predict(X_test)
        
        # 绘制测试集的预测结果
        scatter = axes[stage].scatter(X_test[:, 0], X_test[:, 1], c=y_pred, cmap='coolwarm', alpha=0.6)
        axes[stage].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap='coolwarm', edgecolor='k', alpha=0.3)
        axes[stage].set_title(f"{method_name} - Stage {stage + 1}")
        fig.colorbar(scatter, ax=axes[stage], orientation='vertical')

    plt.suptitle(f"Classification Stages for {method_name}")
    plt.show()

# 配置不平衡数据处理模型
ensemble_methods = {
    'BalanceCascade': BalanceCascadeClassifier(),
    'SelfPacedEnsemble': SelfPacedEnsembleClassifier(),
    'UnderBagging': UnderBaggingClassifier(),
    'EasyEnsemble': EasyEnsembleClassifier(),
    'RUSBoost': RUSBoostClassifier(),
    'BalancedRandomForest': BalancedRandomForestClassifier(),
    'AdaCost': AdaCostClassifier(),
    'AdaUBoost': AdaUBoostClassifier(),
    'AsymBoost': AsymBoostClassifier(),
    'DualGranularBalancedDeepForest': DualGranularBalancedDeepForest(get_config())
}

# 加载数据集
X, y = generate_checkerboard_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 对每个集成方法进行训练、评估和可视化
results = {}
for method_name, model in ensemble_methods.items():
    model.fit(X_train, y_train)
    results[method_name] = evaluate_model(model, X_test, y_test)
    # 可视化该模型在不同阶段的结果
    visualize_model_stages(model, X_train, y_train, X_test, y_test, method_name)

# 输出结果
for method_name, metrics in results.items():
    print(f"Method: {method_name}")
    print(f"  AUCPRC: {metrics['AUCPRC']:.3f}, F1: {metrics['F1']:.3f}, G-Mean: {metrics['G-Mean']:.3f}")


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, f1_score, confusion_matrix
from imbens.ensemble import (
    BalanceCascadeClassifier,
    SelfPacedEnsembleClassifier,
    UnderBaggingClassifier,
    EasyEnsembleClassifier,
    RUSBoostClassifier,
    BalancedRandomForestClassifier,
    AdaCostClassifier,
    AdaUBoostClassifier,
    AsymBoostClassifier
)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NeighbourhoodCleaningRule

# 生成棋盘数据集
def generate_checkerboard_data(n_samples=1600):
    centers = [(i, j) for i in range(4) for j in range(4)]
    X, y = make_blobs(n_samples=n_samples, centers=centers, cluster_std=0.5, random_state=42)
    y = (y % 2).astype(int)  # 交替将类标签设为0和1，形成棋盘格子
    return X, y

# 定义评估函数
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    auc = average_precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    g_mean = np.sqrt((tp / (tp + fn)) * (tn / (tn + fp)))
    return {'AUCPRC': auc, 'F1': f1, 'G-Mean': g_mean}

# 绘制数据集和预测结果
def plot_checkerboard(X, y, title, ax):
    scatter = ax.scatter(X[:, 0], X[:, 1], c=y, cmap='coolwarm', alpha=0.6, edgecolor='k')
    ax.set_title(title)
    plt.colorbar(scatter, ax=ax, orientation='vertical')

# 可视化方法的不同阶段
def visualize_method_stages(X_train, y_train, method_name, stages):
    fig, axes = plt.subplots(1, len(stages), figsize=(15, 5))
    for i, (title, X_stage, y_stage) in enumerate(stages):
        plot_checkerboard(X_stage, y_stage, f"{method_name} - {title}", axes[i])
    plt.show()

# 配置数据集和方法
X, y = generate_checkerboard_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 不平衡处理方法的不同阶段可视化
# 1. Clean 方法
ncr = NeighbourhoodCleaningRule()
X_clean, y_clean = ncr.fit_resample(X_train, y_train)
visualize_method_stages(X_train, y_train, "Clean", [("Original", X_train, y_train), ("After Cleaning", X_clean, y_clean)])

# 2. SMOTE 方法
smote = SMOTE()
X_smote, y_smote = smote.fit_resample(X_train, y_train)
visualize_method_stages(X_train, y_train, "SMOTE", [("Original", X_train, y_train), ("After SMOTE", X_smote, y_smote)])

# 3. Easy Ensemble 方法
easy_ensemble = EasyEnsembleClassifier(n_estimators=3)
easy_ensemble.fit(X_train, y_train)
stages_easy = [("Stage 1", X_train, y_train)]
for i, estimator in enumerate(easy_ensemble.estimators_):
    X_easy, y_easy = estimator[0].fit_resample(X_train, y_train)
    stages_easy.append((f"Stage {i + 2}", X_easy, y_easy))
visualize_method_stages(X_train, y_train, "EasyEnsemble", stages_easy)

# 4. Cascade 方法
cascade = BalanceCascadeClassifier(n_estimators=3)
cascade.fit(X_train, y_train)
stages_cascade = [("Stage 1", X_train, y_train)]
for i, estimator in enumerate(cascade.estimators_):
    X_cascade, y_cascade = estimator[0].fit_resample(X_train, y_train)
    stages_cascade.append((f"Stage {i + 2}", X_cascade, y_cascade))
visualize_method_stages(X_train, y_train, "Cascade", stages_cascade)

# 5. Self-Paced Ensemble (SPE) 方法
spe = SelfPacedEnsembleClassifier(n_estimators=3)
spe.fit(X_train, y_train)
stages_spe = [("Stage 1", X_train, y_train)]
for i, estimator in enumerate(spe.estimators_):
    X_spe, y_spe = estimator[0].fit_resample(X_train, y_train)
    stages_spe.append((f"Stage {i + 2}", X_spe, y_spe))
visualize_method_stages(X_train, y_train, "SelfPacedEnsemble", stages_spe)


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# 生成严格交替分布的棋盘数据集
def generate_strict_checkerboard_data(n_samples_per_blob=500, grid_size=4, cluster_std=0.05):
    centers = []
    labels = []
    for i in range(grid_size):
        for j in range(grid_size):
            centers.append((i, j))
            labels.append((i + j) % 2)  # 交替标签，确保相邻格子颜色不同
    
    X = []
    y = []
    for center, label in zip(centers, labels):
        X_blob = np.random.multivariate_normal(center, np.eye(2) * cluster_std, n_samples_per_blob)
        X.extend(X_blob)
        y.extend([label] * n_samples_per_blob)
    
    return np.array(X), np.array(y)

# 加载数据集
X, y = generate_strict_checkerboard_data()

# 可视化生成的棋盘数据集
plt.figure(figsize=(6, 6))
scatter = plt.scatter(X[:, 0], X[:, 1], c=y, cmap='coolwarm', alpha=0.6, edgecolor='k')
plt.title("Generated Checkerboard Dataset (Strict Alternating Pattern)")
plt.colorbar(scatter, orientation='vertical', label='Class Label')
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import average_precision_score, f1_score, confusion_matrix
from imbens.ensemble import (
    BalanceCascadeClassifier,
    SelfPacedEnsembleClassifier,
    UnderBaggingClassifier,
    EasyEnsembleClassifier,
    RUSBoostClassifier,
    BalancedRandomForestClassifier,
    AdaCostClassifier,
    AdaUBoostClassifier,
    AsymBoostClassifier
)
from UADF import DualGranularBalancedDeepForest
from demo import get_config
import os

# 定义评估函数
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    auc = average_precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    g_mean = np.sqrt((tp / (tp + fn)) * (tn / (tn + fp)))
    return {'AUCPRC': auc, 'F1': f1, 'G-Mean': g_mean}, y_pred

# 配置不平衡数据处理模型
ensemble_methods = {
    'BalanceCascade': BalanceCascadeClassifier(),
    'SelfPacedEnsemble': SelfPacedEnsembleClassifier(),
    'UnderBagging': UnderBaggingClassifier(),
    'EasyEnsemble': EasyEnsembleClassifier(),
    'RUSBoost': RUSBoostClassifier(),
    'BalancedRandomForest': BalancedRandomForestClassifier(),
    'AdaCost': AdaCostClassifier(),
    'AdaUBoost': AdaUBoostClassifier(),
    'AsymBoost': AsymBoostClassifier(),
    'DualGranularBalancedDeepForest': DualGranularBalancedDeepForest(get_config())
}

# 加载不平衡的棋盘数据集
def generate_imbalanced_checkerboard_data(n_samples_class0=10000, n_samples_class1=1000, grid_size=3, cluster_std=0.03):
    centers = []
    labels = []
    for i in range(grid_size):
        for j in range(grid_size):
            centers.append((i, j))
            labels.append((i + j) % 2)  # 交替标签，确保相邻格子颜色不同
    
    X = []
    y = []
    for center, label in zip(centers, labels):
        n_samples = n_samples_class0 if label == 0 else n_samples_class1  # 根据标签分配不同的样本数
        X_blob = np.random.multivariate_normal(center, np.eye(2) * cluster_std, n_samples)
        X.extend(X_blob)
        y.extend([label] * n_samples)
    
    return np.array(X), np.array(y)

# 创建用于保存图像和数据的文件夹
output_dir = "pred_results2"
os.makedirs(output_dir, exist_ok=True)

# 生成数据并划分训练集和测试集
X, y = generate_imbalanced_checkerboard_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# 保存生成的原始数据集
np.save(os.path.join(output_dir, "X.npy"), X)
np.save(os.path.join(output_dir, "y.npy"), y)

# 可视化生成的棋盘数据集并显示和保存为 PNG
plt.figure(figsize=(6, 6))
scatter = plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap='coolwarm', alpha=0.6, edgecolor='k')
plt.title("Generated Checkerboard Dataset (Strict Alternating Pattern)")
plt.colorbar(scatter, orientation='vertical', label='Class Label')
plt.savefig(os.path.join(output_dir, "Generated_Checkerboard_Dataset.png"), format="png")
plt.show()

# 对每个集成方法进行训练、评估、可视化并保存结果
results = {}
for method_name, model in ensemble_methods.items():
    print(f"Training {method_name}...")
    model.fit(X_train, y_train)
    metrics, y_pred = evaluate_model(model, X_test, y_test)
    results[method_name] = metrics

    # 保存预测结果
    np.save(os.path.join(output_dir, f"{method_name}_y_pred.npy"), y_pred)

    # 可视化分类结果并保存为 PNG
    plt.figure(figsize=(6, 6))
    scatter = plt.scatter(X_test[:, 0], X_test[:, 1], c=y_pred, cmap='coolwarm', alpha=0.6, edgecolor='k')
    plt.title(f"{method_name} - Classification Result")
    plt.colorbar(scatter, orientation='vertical', label='Predicted Class')
    plt.savefig(os.path.join(output_dir, f"{method_name}_Classification_Result.png"), format="png")
    plt.show()

# 输出结果
for method_name, metrics in results.items():
    print(f"Method: {method_name}")
    print(f"  AUCPRC: {metrics['AUCPRC']:.3f}, F1: {metrics['F1']:.3f}, G-Mean: {metrics['G-Mean']:.3f}")


In [None]:
# 可视化深度森林采样的过程，和样本的Loss、Uncertainty
