In [139]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors

# Generate an imbalanced 2-D dataset: the minority class is concentrated around the
# origin and part of the majority class intrudes into that central region.
def generate_sparse_data_with_overlap(n_samples=300, imbalance_ratio=0.3, noise_level=0.4,
                                      random_state=None):
    """Generate a two-class, two-feature dataset whose classes overlap near the origin.

    Minority points are drawn in polar coordinates (radius uniform in [0, 1.5),
    angle uniform in [0, 2*pi)) and converted to Cartesian coordinates.
    Majority points are drawn uniformly from the square [-5, 5) x [-5, 5) by
    rejection sampling: a candidate farther than 2.0 from the origin is always
    kept, a closer one is kept with probability 0.1. Gaussian noise with
    standard deviation ``noise_level`` is then added to both classes.

    Parameters
    ----------
    n_samples : int
        Total number of points to generate.
    imbalance_ratio : float
        Fraction of points assigned to the minority class; must lie in [0, 1].
    noise_level : float
        Standard deviation of the Gaussian noise added to every coordinate.
    random_state : None, int, or numpy.random.Generator, optional
        Seed (or generator) passed to ``numpy.random.default_rng``. ``None``
        (the default) draws fresh entropy, so every call differs, as before.

    Returns
    -------
    X : numpy.ndarray of shape (n_samples, 2)
        Coordinates; majority rows come first, followed by minority rows.
    y : numpy.ndarray of shape (n_samples,)
        Float labels: 0 for the majority class, 1 for the minority class.

    Raises
    ------
    ValueError
        If ``n_samples`` is negative or ``imbalance_ratio`` is outside [0, 1].
    """
    if n_samples < 0:
        raise ValueError(f"n_samples must be non-negative, got {n_samples!r}")
    if not 0 <= imbalance_ratio <= 1:
        raise ValueError(f"imbalance_ratio must be in [0, 1], got {imbalance_ratio!r}")

    # A local generator makes the data reproducible on request without
    # reseeding (and thereby clobbering) NumPy's global random state.
    rng = np.random.default_rng(random_state)

    n_majority = int(n_samples * (1 - imbalance_ratio))
    n_minority = n_samples - n_majority

    # Minority class: concentrated in a disc around the origin, plus noise.
    radius_minority = rng.uniform(0, 1.5, size=n_minority)
    angle_minority = rng.uniform(0, 2 * np.pi, size=n_minority)
    minority_x = np.column_stack([
        radius_minority * np.cos(angle_minority),
        radius_minority * np.sin(angle_minority),
    ])
    minority_x += rng.normal(scale=noise_level, size=minority_x.shape)

    # Majority class: mostly outside the radius-2 circle, but a candidate that
    # falls inside it is still accepted 10% of the time.
    majority_points = []
    while len(majority_points) < n_majority:
        px, py = rng.uniform(-5, 5, size=2)
        if np.hypot(px, py) > 2.0 or rng.random() < 0.1:
            majority_points.append([px, py])
    # reshape(-1, 2) keeps the array two-dimensional even when no majority
    # point was requested; a bare np.array([]) is 1-D and breaks np.vstack.
    majority_x = np.array(majority_points, dtype=float).reshape(-1, 2)
    majority_x += rng.normal(scale=noise_level, size=majority_x.shape)

    # Stack both classes: majority (label 0) first, minority (label 1) after.
    X = np.vstack([majority_x, minority_x])
    y = np.hstack([np.zeros(len(majority_x)), np.ones(len(minority_x))])
    return X, y

# Build the original dataset (200 points, imbalance_ratio=0.1).
X, y = generate_sparse_data_with_overlap(n_samples=200, imbalance_ratio=0.1, noise_level=0.4)

# Split the rows by class label: 1 marks the minority class, 0 the majority class.
minority_indices = np.flatnonzero(y == 1)
majority_indices = np.flatnonzero(y == 0)
minority_samples = X[minority_indices]
majority_samples = X[majority_indices]

# For every majority point, look up its single nearest minority point.
nbrs = NearestNeighbors(n_neighbors=1)
nbrs.fit(minority_samples)
distances, indices = nbrs.kneighbors(majority_samples)

# The target is the majority point with the smallest distance to any minority
# point; it is kept as a single-row 2-D array.
target_index = distances.argmin()
target_sample = majority_samples[target_index][np.newaxis, :]

# Persist the dataset and the target sample as .npy files in the working directory.
for filename, array in (
    ("X_original.npy", X),
    ("y_original.npy", y),
    ("target_sample.npy", target_sample),
):
    np.save(filename, array)

# Report what was written.
for message in (
    "数据保存完成：",
    "X_original.npy：原始数据集的特征",
    "y_original.npy：原始数据集的标签",
    "target_sample.npy：靠近少数类区域的多数类样本坐标",
):
    print(message)


数据保存完成：
X_original.npy：原始数据集的特征
y_original.npy：原始数据集的标签
target_sample.npy：靠近少数类区域的多数类样本坐标


In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from imblearn.under_sampling import RandomUnderSampler

# Load the arrays written by the data-generation cell.
try:
    X_loaded = np.load("X_original.npy")
    y_loaded = np.load("y_original.npy")
    target_loaded = np.load("target_sample.npy")
except FileNotFoundError as exc:
    # The paths are relative, so the files are only found when the
    # data-generation cell has already run in the current working directory.
    raise FileNotFoundError(
        f"Missing input file ({exc}). Run the data-generation cell first so that "
        "X_original.npy, y_original.npy and target_sample.npy exist in the "
        "current working directory."
    ) from exc

# Rows of the original data whose coordinates equal the target sample.
target_rows = np.flatnonzero(np.all(X_loaded == target_loaded, axis=1))
if target_rows.size == 0:
    # Without this check the resampling loop below could never succeed.
    raise ValueError("target_sample.npy does not match any row of X_original.npy")

# Redraw the random undersample until it contains the target sample. The
# attempt cap turns a would-be endless loop into an explicit error.
MAX_RESAMPLE_ATTEMPTS = 10_000
found_target = False
for _ in range(MAX_RESAMPLE_ATTEMPTS):
    rus = RandomUnderSampler(random_state=None)  # fresh random draw every attempt
    X_resampled, y_resampled = rus.fit_resample(X_loaded, y_loaded)

    # sample_indices_ holds the original row indices that were kept.
    if np.isin(target_rows, rus.sample_indices_).any():
        found_target = True
        break
if not found_target:
    raise RuntimeError(
        f"Target sample was not selected in {MAX_RESAMPLE_ATTEMPTS} undersampling attempts"
    )

# Majority-class rows before and after undersampling.
majority_indices = np.where(y_loaded == 0)[0]  # indices into the original data
resampled_majority_indices = np.where(y_resampled == 0)[0]  # indices into the resampled data
X_majority = X_loaded[majority_indices]
resampled_majority_points = X_resampled[resampled_majority_indices]

# Majority points that were dropped, selected by index so the result is always
# a 2-D array (even when empty) and no point-by-point comparison is needed.
was_sampled = np.zeros(len(X_loaded), dtype=bool)
was_sampled[rus.sample_indices_] = True
unsampled_majority_points = X_majority[~was_sampled[majority_indices]]

# Query one extra neighbour: the target itself is part of X_resampled and comes
# back first at distance 0, so it is dropped again below.
N_NEIGHBORS = 5
nbrs = NearestNeighbors(n_neighbors=min(N_NEIGHBORS + 1, len(X_resampled))).fit(X_resampled)
distances, indices = nbrs.kneighbors(target_loaded)

# Coordinates and labels of the nearest neighbours, excluding the target itself.
nearest_points = X_resampled[indices[0][1:]]
nearest_labels = y_resampled[indices[0][1:]]

# Visualise the result.
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title("RUS Resampled Data Visualization with Nearest Neighbors")

# Majority class, not sampled: hollow blue markers.
ax.scatter(unsampled_majority_points[:, 0], unsampled_majority_points[:, 1],
           facecolors='none', edgecolors='blue', label="Unsampled Majority Class")

# Majority class, sampled: solid blue markers.
ax.scatter(resampled_majority_points[:, 0], resampled_majority_points[:, 1],
           color='blue', label="Sampled Majority Class")

# Minority class: solid red squares.
minority_points = X_resampled[y_resampled == 1]
ax.scatter(minority_points[:, 0], minority_points[:, 1],
           color='red', marker='s', label="Minority Class")

# Target sample: black marker with a yellow edge.
ax.scatter(target_loaded[:, 0], target_loaded[:, 1], color='black', s=100,
           label="Target Sample", edgecolor='yellow')

# Nearest neighbours, coloured by class. One scatter call per class also yields
# the legend entry, including when that class has no neighbour.
is_majority_neighbor = nearest_labels == 0
for neighbor_mask, face_color, legend_label in (
    (is_majority_neighbor, 'cyan', "Nearest Neighbor (Majority)"),
    (~is_majority_neighbor, 'orange', "Nearest Neighbor (Minority)"),
):
    class_neighbors = nearest_points[neighbor_mask]
    ax.scatter(class_neighbors[:, 0], class_neighbors[:, 1],
               color=face_color, edgecolor='black', s=100, label=legend_label)

# Axis labels and legend.
ax.set_xlabel("Feature 1")
ax.set_ylabel("Feature 2")
ax.legend(loc="upper right")
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'X_original.npy'