In [1]:
import sys

# Absolute location of the project checkout whose packages the notebook imports.
# Written as a raw string: in a normal literal "\P" and "\V" are invalid escape
# sequences, which Python already warns about and will eventually reject.
PROJECT_ROOT = r'D:\PyCharmProjects\VFPUMC02'

# Guarded so that re-running this cell does not keep appending duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Make the project checkout importable even when this cell is run on its own.
# Raw string: "\P" and "\V" are invalid escape sequences in a normal literal.
PROJECT_ROOT = r'D:\PyCharmProjects\VFPUMC02'

# Skip the append when the entry is already present so re-runs stay idempotent.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
from datasets.BankDataset import BankDataset
from consts.Constants import DATASETS_PATH
from enums.SplitRatio import SplitRatio
from enums.HideRatio import HideRatio
from configs.Config import VFPUConfig

In [3]:
# Instantiate the project's bank dataset wrapper.
bankDataset = BankDataset()

# Request the RATIO_50_50 split; the call returns three values that are unpacked
# below (presumably two feature partitions and the original labels -- TODO confirm
# against BankDataset.get_data_by_split_ratio).
split_result = bankDataset.get_data_by_split_ratio(SplitRatio.RATIO_50_50)
dfA, dfB, origY = split_result

# Labels returned for the RATIO_0_2 hide ratio.
hiddenY = bankDataset.get_hidden_labels(HideRatio.RATIO_0_2)

In [22]:
import numpy as np
import pandas as pd
from sklearn.semi_supervised import LabelSpreading, LabelPropagation, SelfTrainingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, recall_score, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split

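# Assumes HideRatio is an enum class and bankDataset is an object exposing the df, hidden_y and y attributes
# bankDataset.df: feature data, shape (41188, 64)
# bankDataset.hidden_y: label data containing 0, 1 and -1, where -1 marks an unlabeled sample
# bankDataset.y: original labels of the samples, only 0 and 1, used for validation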

# 评估函数
def evaluate_model(y_true, y_pred, y_prob):
    """Score one set of predictions against the reference labels.

    Args:
        y_true: Reference labels.
        y_pred: Predicted class labels.
        y_prob: Predicted score for the positive class, used for the AUC.

    Returns:
        A tuple ``(accuracy, recall, auc, f1)``.

    Note:
        NaN entries in any of the three inputs are replaced with 0 before the
        metrics are computed.
    """
    # Zero out NaNs in every input so the metric functions receive finite values.
    truth, labels, scores = (
        np.nan_to_num(values, nan=0) for values in (y_true, y_pred, y_prob)
    )
    return (
        accuracy_score(truth, labels),
        recall_score(truth, labels),
        roc_auc_score(truth, scores),
        f1_score(truth, labels),
    )

# Seed shared by the sub-sampling split and by the SVC probability estimates, so
# that a fresh run of this cell reproduces the same numbers.
RANDOM_STATE = 42
# Share of the samples discarded by train_test_split; the remaining 10% is what
# Label Propagation and Self-training are fitted on.
SUBSAMPLE_HOLDOUT = 0.9

# Column layout of the result table: one (Method, Metric) pair per column.
# The order of `metrics` must match the tuple returned by evaluate_model.
methods = ['Label Spreading', 'Label Propagation', 'Self-training']
metrics = ['Accuracy', 'Recall', 'AUC', 'F1']
index = pd.MultiIndex.from_product([methods, metrics], names=['Method', 'Metric'])

# Rows are collected here and turned into a DataFrame once, after the loop.
# Filling an empty frame cell-by-cell with .loc leaves every column as object dtype.
row_labels = []
row_values = []

# Run the three semi-supervised methods once per hide ratio.
for member in HideRatio:
    print(f"当前的 member 为 {member.name}")

    # Called for its effect on bankDataset.hidden_y, which is read just below.
    bankDataset.get_hidden_labels(member)

    X = bankDataset.df
    hidden_y = bankDataset.hidden_y
    y_true = bankDataset.y

    # Only the samples whose label is hidden (-1) are predicted and scored.
    unlabeled_mask = hidden_y == -1
    X_unlabeled = X[unlabeled_mask]
    y_true_unlabeled = y_true[unlabeled_mask]

    # Random subset used to fit Label Propagation and Self-training.
    X_small, _, hidden_y_small, _ = train_test_split(
        X, hidden_y, test_size=SUBSAMPLE_HOLDOUT, random_state=RANDOM_STATE
    )

    # method name -> (unfitted model, training features, training labels)
    experiments = {
        'Label Spreading': (
            LabelSpreading(kernel='knn', n_neighbors=5),
            X,
            hidden_y,
        ),
        # NOTE(review): the recorded output of this cell shows repeated
        # "probabilities /= normalizer" warnings, which suggests this model yields
        # NaN probabilities for some samples; evaluate_model then turns those NaNs
        # into 0. Feature scaling or a smaller gamma may be needed -- confirm.
        'Label Propagation': (
            LabelPropagation(kernel='rbf', gamma=20),
            X_small,
            hidden_y_small,
        ),
        # The estimator is passed positionally because its keyword name differs
        # between scikit-learn releases (base_estimator is deprecated in favour of
        # estimator). random_state makes the SVC probability estimates repeatable.
        'Self-training': (
            SelfTrainingClassifier(
                SVC(probability=True, random_state=RANDOM_STATE),
                threshold=0.5,
            ),
            X_small,
            hidden_y_small,
        ),
    }

    scores = {}
    for method in methods:
        model, X_fit, y_fit = experiments[method]
        model.fit(X_fit, y_fit)

        # Predict the hidden samples; column 1 of predict_proba is the
        # positive-class score consumed by the AUC.
        y_pred = model.predict(X_unlabeled)
        y_prob = model.predict_proba(X_unlabeled)[:, 1]

        values = evaluate_model(y_true_unlabeled, y_pred, y_prob)
        for metric, value in zip(metrics, values):
            scores[(method, metric)] = value

    row_labels.append(member.name)
    # Order the values exactly like the MultiIndex columns.
    row_values.append([scores[column] for column in index])

# One row per hide ratio, one column per (Method, Metric) pair.
results_df = pd.DataFrame(row_values, index=row_labels, columns=index)

# Print the final table.
print(results_df)

当前的 member 为 RATIO_0_1
当前的 member 为 RATIO_0_2
当前的 member 为 RATIO_0_3


  probabilities /= normalizer
  probabilities /= normalizer


当前的 member 为 RATIO_0_4


  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer


当前的 member 为 RATIO_0_5


  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer


当前的 member 为 RATIO_0_6


  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer


当前的 member 为 RATIO_0_7


  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer


当前的 member 为 RATIO_0_8


  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer


当前的 member 为 RATIO_0_9


  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer
  probabilities /= normalizer


Method    Label Spreading                               Label Propagation  \
Metric           Accuracy    Recall       AUC        F1          Accuracy   
RATIO_0_1        0.941719  0.589011  0.958417  0.690722          0.917678   
RATIO_0_2        0.940755  0.611714  0.950479   0.69802          0.919874   
RATIO_0_3         0.94011   0.61316  0.948955  0.696223          0.913564   
RATIO_0_4        0.937299  0.598413  0.945325  0.686495          0.913323   
RATIO_0_5         0.93333  0.574361  0.933179  0.666343           0.90774   
RATIO_0_6        0.931976  0.582095   0.93505  0.656448          0.903488   
RATIO_0_7        0.927543  0.551166  0.930662   0.62915          0.906628   
RATIO_0_8        0.917724  0.503864  0.914033  0.582473          0.900698   
RATIO_0_9        0.907794  0.479403  0.898931  0.537983          0.894656   

Method                                  Self-training                      \
Metric       Recall       AUC        F1      Accuracy    Recall       AUC  

In [24]:
from consts.Constants import DATASETS_PATH
import os
# Write the metric table to a CSV file under DATASETS_PATH.
output_csv_path = os.path.join(DATASETS_PATH, "SklearnSemiMethodBank.csv")
results_df.to_csv(output_csv_path)