In [1]:
LINK_DATASET = "/media/namvq/Data/chest_xray"
# LINK_DATASET = "/kaggle/input/chest-xray-pneumonia/chest_xray"
NUM_WORKERS = 6
# BASE_FOLDER_NOISE = "/kaggle/input/chest-xray-noise-60-partitions"
BASE_FOLDER_NOISE = "/media/namvq/Data/code_chinh_sua/fedavg"

In [2]:
"""Partition the data and create the dataloaders."""

from typing import List, Optional, Tuple

import torch
from omegaconf import DictConfig

from torch.utils.data import DataLoader, random_split, Subset
from torchvision import transforms
from torchvision.datasets import MNIST
from torchvision.transforms import Compose, Normalize, ToTensor
import os
from torchvision.datasets import ImageFolder
from torchvision.transforms import Compose, Resize, Grayscale, ToTensor
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')  # Chuyển sang backend không cần GUI

print('BACKEND: ', matplotlib.get_backend())
# def get_custom_dataset(data_path: str = "/media/namvq/Data/chest_xray"):
#     """Load custom dataset and apply transformations."""
#     transform = Compose([
#         Resize((100, 100)),
#         Grayscale(num_output_channels=1),
#         ToTensor()
#     ])
#     trainset = ImageFolder(os.path.join(data_path, 'train'), transform=transform)
#     testset = ImageFolder(os.path.join(data_path, 'test'), transform=transform)
#     return trainset, testset

# def get_custom_dataset(data_path: str = "/kaggle/input/chest-xray-pneumonia/chest_xray"):
#     """Load custom dataset and apply transformations."""
#     transform = transforms.Compose([
#         transforms.Resize((224, 224)),  # Kích thước ảnh cho EfficientNet
#         transforms.RandomHorizontalFlip(),
#         transforms.ToTensor(),
#         transforms.Normalize([0.485, 0.456, 0.406],  # Mean chuẩn của ImageNet
#                              [0.229, 0.224, 0.225])  # Std chuẩn của ImageNet
#     ])
#     trainset = ImageFolder(os.path.join(data_path, 'train'), transform=transform)
#     testset = ImageFolder(os.path.join(data_path, 'test'), transform=transform)
#     return trainset, testset

# def get_custom_dataset(data_path: str = "/media/namvq/Data/chest_xray"):
#     """Load custom dataset and apply transformations."""
#     train_transform = transforms.Compose([
#         transforms.Resize((224, 224)),  # Kích thước ảnh cho EfficientNet
#         transforms.RandomHorizontalFlip(),
#         transforms.ToTensor(),
#         transforms.Normalize([0.485, 0.456, 0.406],  # Mean chuẩn của ImageNet
#                              [0.229, 0.224, 0.225])  # Std chuẩn của ImageNet
#     ])
#     test_transform = transforms.Compose([
#         transforms.Resize((224, 224)),  # Kích thước ảnh cho EfficientNet
#         transforms.ToTensor(),
#         transforms.Normalize([0.485, 0.456, 0.406],  # Mean chuẩn của ImageNet
#                              [0.229, 0.224, 0.225])  # Std chuẩn của ImageNet
#     ])
#     trainset = ImageFolder(os.path.join(data_path, 'train'), transform=train_transform)
#     testset = ImageFolder(os.path.join(data_path, 'test'), transform=test_transform)
#     return trainset, testset
def get_custom_dataset(data_path: str = LINK_DATASET):
    """Load custom dataset and apply transformations."""
    train_transform = transforms.Compose([
        transforms.Resize(256),  # Kích thước ảnh cho VGG
        transforms.RandomAffine(degrees=0, shear=10),
        transforms.RandomHorizontalFlip(),
        transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
        transforms.RandomAffine(degrees=0, translate=(0.2, 0)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],  # Mean chuẩn của ImageNet
                             [0.229, 0.224, 0.225])  # Std chuẩn của ImageNet
    ])
    test_transform = transforms.Compose([
        transforms.Resize((150, 150)),  # Kích thước ảnh cho VGG
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406],  # Mean chuẩn của ImageNet
                             [0.229, 0.224, 0.225])  # Std chuẩn của ImageNet
    ])
    trainset = ImageFolder(os.path.join(data_path, 'train'), transform=train_transform)
    testset = ImageFolder(os.path.join(data_path, 'test'), transform=test_transform)
    return trainset, testset

#Lay tap val goc co 16 anh thoi
def get_val_dataloader(batch_size: int = 10):
    val_transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    valset = ImageFolder(os.path.join(LINK_DATASET, 'val'), transform=val_transform)
    valloader = DataLoader(valset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)
    return valloader

def prepare_dataset_for_centralized_train(batch_size: int, val_ratio: float = 0.1, seed: int = 42):
    trainset, testset = get_custom_dataset()
    # Split trainset into trainset and valset
    num_train = int((1 - val_ratio) * len(trainset))
    num_val = len(trainset) - num_train
    trainset, valset = random_split(trainset, [num_train, num_val], torch.Generator().manual_seed(seed))

    trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS)
    valloader = DataLoader(valset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)

    print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')
    return trainloader, valloader, testloader


def prepare_dataset(num_partitions: int, batch_size: int, val_ratio: float = 0.1, alpha: float = 100, seed: int = 42):
    """Load custom dataset and generate non-IID partitions using Dirichlet distribution."""
    trainset, testset = get_custom_dataset()
    
    # Split trainset into trainset and valset
    num_train = int((1 - val_ratio) * len(trainset))
    num_val = len(trainset) - num_train
    trainset, valset = random_split(trainset, [num_train, num_val], torch.Generator().manual_seed(seed))
    
    # Get labels for the entire trainset
    train_labels = np.array([trainset.dataset.targets[i] for i in trainset.indices])
    
    # Generate Dirichlet distribution for each class
    class_indices = [np.where(train_labels == i)[0] for i in range(len(np.unique(train_labels)))]
    partition_indices = [[] for _ in range(num_partitions)]
    
    for class_idx in class_indices:
        np.random.shuffle(class_idx)
        proportions = np.random.dirichlet(np.repeat(alpha, num_partitions))
        proportions = (np.cumsum(proportions) * len(class_idx)).astype(int)[:-1]
        class_partitions = np.split(class_idx, proportions)
        for i in range(num_partitions):
            partition_indices[i].extend(class_partitions[i])
    
    # Create Subsets for each partition
    trainsets = [Subset(trainset.dataset, indices) for indices in partition_indices]
    
    # Split valset into partitions
    partition_len_val = [len(valset) // num_partitions] * num_partitions
    for i in range(len(valset) % num_partitions):
        partition_len_val[i] += 1

    valsets = random_split(valset, partition_len_val, torch.Generator().manual_seed(seed))
    
    # Create DataLoaders for each partition
    trainloaders = [DataLoader(ts, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS) for ts in trainsets]
    valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS) for vs in valsets]
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)

    # Calculate class distribution for each partition in trainloaders
    class_distributions = []
    for i, trainloader in enumerate(trainloaders):
        class_counts = Counter()
        for _, labels in trainloader:
            class_counts.update(labels.numpy())
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')

    # Plot class distribution
    partitions = range(num_partitions)
    class_0_counts = [class_distributions[i][0] for i in partitions]
    class_1_counts = [class_distributions[i][1] for i in partitions]

    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    plt.bar(partitions, class_0_counts, bar_width, label='Class 0', color='blue')
    plt.bar(partitions, class_1_counts, bar_width, bottom=class_0_counts, label='Class 1', color='red')
    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    plt.show()

    print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')
    return trainloaders, valloaders, testloader

def prepare_partitioned_dataset(num_partitions: int, batch_size: int, val_ratio: float = 0.1, num_labels_each_party: int = 1, seed: int = 42):
    """Load custom dataset and generate partitions where each party has a fixed number of labels."""
    trainset, testset = get_custom_dataset()  # Load datasets

    # Split the trainset into trainset and valset based on the validation ratio
    num_train = int((1 - val_ratio) * len(trainset))
    num_val = len(trainset) - num_train
    trainset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))

    # Get labels for the entire trainset
    train_labels = np.array([trainset.dataset.targets[i] for i in trainset.indices])

    # Define partitions: each party has k labels
    num_labels = len(np.unique(train_labels))  # Assuming labels are 0 and 1 for binary classification
    times = [0 for i in range(num_labels)]
    contain = []
    #Phan label cho cac client
    for i in range(num_partitions):
        current = [i%num_labels]
        times[i%num_labels] += 1
        if num_labels_each_party > 1:
            current.append(1-i%num_labels)
            times[1-i%num_labels] += 1
        contain.append(current)
    print(times)
    print(contain)
    # Create Subsets for each partition

    partition_indices = [[] for _ in range(num_partitions)]
    for i in range(num_labels):
        idx_i = np.where(train_labels == i)[0]  # Get indices of label i in train_labels
        idx_i = [trainset.indices[j] for j in idx_i]  # Convert indices to indices in trainset
        # #print label of idx_i
        # print("Label of idx: ", i)
        # for j in range(len(idx_i)):
        #     idx_in_dataset = trainset.indices[idx_i[j]]
        #     print(trainset.dataset.targets[idx_in_dataset])
        np.random.shuffle(idx_i)
        split = np.array_split(idx_i, times[i])
        ids = 0
        for j in range(num_partitions):
            if i in contain[j]:
                partition_indices[j].extend(split[ids])
                ids += 1
    
    trainsets = [Subset(trainset.dataset, indices) for indices in partition_indices]

    # #print label of client 0
    # print("Client 0")
    # for i in range(len(trainsets[0])):
    #     print(trainsets[0][i][1])

    # Split valset into partitions
    partition_len_val = [len(valset) // num_partitions] * num_partitions
    for i in range(len(valset) % num_partitions):
        partition_len_val[i] += 1
    
    valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

    # Create DataLoaders for each partition
    trainloaders = [DataLoader(ts, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS) for ts in trainsets]
    valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS) for vs in valsets]
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)

    # Calculate class distribution for each partition in trainloaders
    class_distributions = []
    for i, trainloader in enumerate(trainloaders):
        class_counts = Counter()
        for _, labels in trainloader:
            class_counts.update(labels.numpy())
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')
    # Plot class distribution
    partitions = range(num_partitions)
    class_0_counts = [class_distributions[i][0] for i in partitions]
    class_1_counts = [class_distributions[i][1] for i in partitions]

    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    plt.bar(partitions, class_0_counts, bar_width, label='Class 0', color='blue')
    plt.bar(partitions, class_1_counts, bar_width, bottom=class_0_counts, label='Class 1', color='red')
    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    # plt.show()

    #  Lưu đồ thị vào thư mục running_outputs với tên data_partition
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'data_partition.png'))
    plt.close()



    print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')
    return trainloaders, valloaders, testloader

def prepare_imbalance_label_dirichlet(num_partitions: int, batch_size: int, val_ratio: float = 0.1, beta: float = 0.5, seed: int = 42):
    """Load custom dataset and generate partitions where each party has a fixed number of labels."""
    trainset, testset = get_custom_dataset()  # Load datasets

    # Split the trainset into trainset and valset based on the validation ratio
    num_train = int((1 - val_ratio) * len(trainset))
    num_val = len(trainset) - num_train
    trainset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))

    # Get labels for the entire trainset
    train_labels = np.array([trainset.dataset.targets[i] for i in trainset.indices])

    # Define partitions: each party has k labels
    num_labels = len(np.unique(train_labels))  # Assuming labels are 0 and 1 for binary classification
    min_size = 0
    min_require_size = 2

    N = len(trainset)


    while(min_size < min_require_size):
        partition_indices = [[] for _ in range(num_partitions)]
        for label in range(num_labels):
            idx_label = np.where(train_labels == label)[0]
            idx_label = [trainset.indices[j] for j in idx_label]
            np.random.shuffle(idx_label)

            proportions = np.random.dirichlet(np.repeat(beta, num_partitions))
            # proportions = np.array( [p * len(idx_j) < N/num_partitions] for p, idx_j in zip(proportions, partition_indices))
            proportions = np.array([p if p * len(idx_j) < N / num_partitions else 0 for p, idx_j in zip(proportions, partition_indices)])

            proportions = proportions / np.sum(proportions)
            proportions = (np.cumsum(proportions) * len(idx_label)).astype(int)[:-1]

            partition_indices = [idx_j + idx.tolist() for idx_j, idx in zip(partition_indices, np.split(idx_label, proportions))]
            min_size = min([len(idx_j) for idx_j in partition_indices])
        
    trainsets = [Subset(trainset.dataset, indices) for indices in partition_indices]

    partition_len_val = [len(valset) // num_partitions] * num_partitions
    for i in range(len(valset) % num_partitions):
        partition_len_val[i] += 1
    
    valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

    trainloaders = [DataLoader(ts, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS) for ts in trainsets]
    valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS) for vs in valsets]
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)

    class_distributions = []
    for i, trainloader in enumerate(trainloaders):
        class_counts = Counter()
        for _, labels in trainloader:
            class_counts.update(labels.numpy())
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')
    # Plot class distribution
    partitions = range(num_partitions)
    class_0_counts = [class_distributions[i][0] for i in partitions]
    class_1_counts = [class_distributions[i][1] for i in partitions]

    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    plt.bar(partitions, class_0_counts, bar_width, label='Class 0', color='blue')
    plt.bar(partitions, class_1_counts, bar_width, bottom=class_0_counts, label='Class 1', color='red')
    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    #  Lưu đồ thị vào thư mục running_outputs với tên data_partition
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'data_partition.png'))
    plt.close()

    print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')

    return trainloaders, valloaders, testloader



def apply_gaussian_noise(tensor, std_dev):
    noise = torch.randn_like(tensor) * std_dev
    return tensor + noise

# Hàm đảo ngược chuẩn hóa
def unnormalize_image(image_tensor, mean, std):
    # Đảo ngược Normalize: (image * std) + mean
    for t, m, s in zip(image_tensor, mean, std):
        t.mul_(s).add_(m)  # Thực hiện từng kênh
    return image_tensor

# Hàm hiển thị ảnh từ một tensor
def display_image(image_tensor, mean, std):
    # Đảo ngược chuẩn hóa
    image_tensor = unnormalize_image(image_tensor, mean, std)
    # Chuyển tensor thành NumPy array và điều chỉnh thứ tự kênh màu (CHW -> HWC)
    image_numpy = image_tensor.permute(1, 2, 0).numpy()
    # Cắt giá trị ảnh về phạm vi [0, 1] để hiển thị đúng
    image_numpy = image_numpy.clip(0, 1)
    # Trả về ảnh NumPy
    return image_numpy



def prepare_noise_based_imbalance(num_partitions: int, batch_size: int, val_ratio: float = 0.1, sigma: float = 0.05, seed: int = 42):
    """
    Chia dữ liệu ngẫu nhiên và đều cho các bên, sau đó thêm noise vào các bên.
    Mỗi bên i có noise khác nhau Gauss(0, sigma*i/N). Nếu dữ liệu đã tồn tại, tải từ thư mục dataset_noise_{sigma}.
    """
    # noise_dir = f'chest_xray_noise_{sigma}'
    # noise_dir = f'/kaggle/input/chest-xray-noise-60-partitions/chest_xray_noise_{sigma}'
    noise_dir = f"{BASE_FOLDER_NOISE}/chest_xray_noise_{sigma}"
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    noisy_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    
    if os.path.exists(noise_dir):
        print(f"Loading noisy dataset from {noise_dir}...")
        # Sử dụng ImageFolder để tải dữ liệu đã được thêm nhiễu với transform phù hợp
        train_partitions = [ImageFolder(os.path.join(noise_dir, f'partition_{i}'), transform=noisy_transform) for i in range(num_partitions)]
        
        # Tải val và test set như bình thường
        trainset, testset = get_custom_dataset()
        num_train = int((1 - val_ratio) * len(trainset))
        num_val = len(trainset) - num_train
        _, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))
        
        # Chia valset thành các partition
        partition_len_val = [len(valset) // num_partitions] * num_partitions
        for i in range(len(valset) % num_partitions):
            partition_len_val[i] += 1
        valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))
        
        # Tạo DataLoaders
        trainloaders = [DataLoader(part, batch_size=batch_size, shuffle=True, num_workers=6) for part in train_partitions]
        valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=6) for vs in valsets]
        testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=6)
        
        print("Dữ liệu đã được tải thành công từ thư mục lưu trữ.")
    else:
        print(f"Creating noisy dataset and saving to {noise_dir}...")
        os.makedirs(noise_dir, exist_ok=True)
        
        trainset, testset = get_custom_dataset()
        num_train = int((1 - val_ratio) * len(trainset))
        num_val = len(trainset) - num_train
        trainset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))
    
        indices = trainset.indices
        np.random.shuffle(indices)
        partition_indices = np.array_split(indices, num_partitions)
    
        # Mean và std từ Normalize đã được định nghĩa trước
        for i, part_indices in enumerate(partition_indices):
            partition_std_dev = sigma * (i + 1) / num_partitions
            partition_set = Subset(trainset.dataset, part_indices)
            
            # Tạo thư mục cho partition và các lớp
            partition_dir = os.path.join(noise_dir, f'partition_{i}')
            os.makedirs(partition_dir, exist_ok=True)
            class_dirs = {}
            for _, label in partition_set:
                if label not in class_dirs:
                    class_dirs[label] = os.path.join(partition_dir, f'class_{label}')
                    os.makedirs(class_dirs[label], exist_ok=True)
            
            for j, (image, label) in enumerate(partition_set):
                noisy_image = apply_gaussian_noise(image, partition_std_dev)
                # Đảo ngược chuẩn hóa để lưu ảnh đúng định dạng
                noisy_image = unnormalize_image(noisy_image, mean, std)
                # Chuyển tensor thành PIL Image
                noisy_image_pil = transforms.ToPILImage()(noisy_image.clamp(0, 1))
                # Lưu ảnh với tên duy nhất
                image_filename = f'image_{j}.png'
                noisy_image_pil.save(os.path.join(class_dirs[label], image_filename))
        
        # Tải dữ liệu từ thư mục đã lưu với transform phù hợp
        train_partitions = [ImageFolder(os.path.join(noise_dir, f'partition_{i}'), transform=noisy_transform) for i in range(num_partitions)]
        
        # Chia valset thành các partition
        partition_len_val = [len(valset) // num_partitions] * num_partitions
        for i in range(len(valset) % num_partitions):
            partition_len_val[i] += 1
        valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))
        
        # Tạo DataLoaders
        trainloaders = [DataLoader(part, batch_size=batch_size, shuffle=True, num_workers=6) for part in train_partitions]
        valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=6) for vs in valsets]
        testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=6)
    
    # Phân tích phân bố lớp
    class_distributions = []
    for i, trainloader in enumerate(trainloaders):
        class_counts = Counter()
        for _, labels in trainloader:
            class_counts.update(labels.numpy())
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')
    
    partitions = range(num_partitions)
    class_0_counts = [class_distributions[i].get(0, 0) for i in partitions]
    class_1_counts = [class_distributions[i].get(1, 0) for i in partitions]
    
    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    plt.bar(partitions, class_0_counts, bar_width, label='Class 0', color='blue')
    plt.bar(partitions, class_1_counts, bar_width, bottom=class_0_counts, label='Class 1', color='red')
    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    # Lưu đồ thị vào thư mục running_outputs với tên data_partition
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'data_partition.png'))
    plt.close()
    
    # Lưu ảnh nhiễu vào running_outputs
    # Tạo thư mục lưu ảnh nếu chưa tồn tại
    output_dir = "running_outputs"
    os.makedirs(output_dir, exist_ok=True)
    
    # Khởi tạo một lưới 10x6 để hiển thị ảnh
    fig, axes = plt.subplots(10, 6, figsize=(15, 25))
    
    # Duyệt qua trainloaders và hiển thị ảnh đầu tiên từ mỗi partition
    for i, trainloader in enumerate(trainloaders[:min(num_partitions, 60)]):
        if len(trainloader.dataset) == 0:
            continue
        # Lấy ảnh đầu tiên từ trainloader
        image_tensor, label = trainloader.dataset[0]
        
        # Tìm vị trí hàng, cột trong lưới
        row, col = divmod(i, 6)
        if row >= 10:
            break  # Chỉ hiển thị tối đa 60 ảnh
        # Hiển thị ảnh
        image_numpy = unnormalize_image(image_tensor.clone(), mean, std).permute(1, 2, 0).numpy().clip(0, 1)
        axes[row, col].imshow(image_numpy)
        axes[row, col].axis('off')
    # Điều chỉnh layout để không bị chồng lấn
    plt.tight_layout()
    
    # Lưu ảnh thay vì hiển thị
    output_path = os.path.join(output_dir, "image_noise.png")
    plt.savefig(output_path, dpi=300)  # Lưu ảnh với chất lượng cao
    
    plt.close()  # Đóng figure
    
    print(f"Ảnh minh họa đã được lưu tại {output_path}")
    print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')
    
    return trainloaders, valloaders, testloader


def prepare_quantity_skew_dirichlet(num_partitions: int, batch_size: int, val_ratio: float = 0.1, beta: float = 10, seed: int = 42):
    trainset, testset = get_custom_dataset()
    num_train = int((1 - val_ratio) * len(trainset))
    num_val = len(trainset) - num_train
    trainset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))

    all_indices = trainset.indices

    min_size = 0
    while min_size < 1:
        proportions = np.random.dirichlet(np.repeat(beta, num_partitions))
        proportions = (np.cumsum(proportions) * len(all_indices)).astype(int)[:-1]

        partition_indices = np.split(all_indices, proportions)

        min_size = min([len(partition) for partition in partition_indices])
        print('Partition sizes:', [len(partition) for partition in partition_indices])
        print('Min partition size:', min_size)

    trainsets = [Subset(trainset.dataset, indices) for indices in partition_indices]

    partition_len_val = [len(valset) // num_partitions] * num_partitions
    for i in range(len(valset) % num_partitions):
        partition_len_val[i] += 1
    
    valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

    trainloaders = [DataLoader(ts, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS) for ts in trainsets]
    valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS) for vs in valsets]
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)

    class_distributions = []
    for i, trainloader in enumerate(trainloaders):
        class_counts = Counter()
        for _, labels in trainloader:
            class_counts.update(labels.numpy())
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')
    
    partitions = range(num_partitions)
    class_0_counts = [class_distributions[i][0] for i in partitions]
    class_1_counts = [class_distributions[i][1] for i in partitions]

    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    plt.bar(partitions, class_0_counts, bar_width, label='Class 0', color='blue')
    plt.bar(partitions, class_1_counts, bar_width, bottom=class_0_counts, label='Class 1', color='red')
    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    # plt.show()
    #  Lưu đồ thị vào thư mục running_outputs với tên data_partition
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'data_partition.png'))
    plt.close()

    print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')

    return trainloaders, valloaders, testloader


def load_datasets(
    config: DictConfig,
    num_clients: int,
    val_ratio: float = 0.1,
    seed: Optional[int] = 42,
) -> Tuple[List[DataLoader], List[DataLoader], DataLoader]:
    """Create the dataloaders to be fed into the model.

    Parameters
    ----------
    config: DictConfig
        Parameterises the dataset partitioning process
    num_clients : int
        The number of clients that hold a part of the data
    val_ratio : float, optional
        The ratio of training data that will be used for validation (between 0 and 1),
        by default 0.1
    seed : int, optional
        Used to set a fix seed to replicate experiments, by default 42

    Returns
    -------
    Tuple[DataLoader, DataLoader, DataLoader]
        The DataLoaders for training, validation, and testing.
    """
    print(f"Dataset partitioning config: {config}")
    batch_size = -1
    print('config:' , config)
    if "batch_size" in config:
        batch_size = config.batch_size
    elif "batch_size_ratio" in config:
        batch_size_ratio = config.batch_size_ratio
    else:
        raise ValueError
    partitioning = ""
    
    if "partitioning" in config:
        partitioning = config.partitioning

    # partition the data
    if partitioning == "imbalance_label":
        return prepare_partitioned_dataset(num_clients, batch_size, val_ratio, config.labels_per_client, config.seed)

    if partitioning == "imbalance_label_dirichlet":
        return prepare_imbalance_label_dirichlet(num_clients, batch_size, val_ratio, config.alpha, config.seed)

    if partitioning == "noise_based_imbalance":
        return prepare_noise_based_imbalance(num_clients, batch_size, val_ratio, config.sigma, config.seed)

    if partitioning == "quantity_skew_dirichlet":
        return prepare_quantity_skew_dirichlet(num_clients, batch_size, val_ratio, config.alpha, config.seed)
    











  from .autonotebook import tqdm as notebook_tqdm


BACKEND:  Agg


In [3]:
# prepare_noise_based_imbalance(4, 10, 0, 0.1, 42)

In [4]:
def prepare_imbalanced_and_noisy_data(num_partitions: int, batch_size: int, val_ratio: float = 0.1, beta: float = 0.5, sigma: float = 0.05, seed: int = 42):
    """Combine label-imbalanced partitioning and Gaussian noise application."""
    trainset, testset = get_custom_dataset()

    # Split the trainset into trainset and valset based on the validation ratio
    num_train = int((1 - val_ratio) * len(trainset))
    num_val = len(trainset) - num_train
    trainset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))

    # Get labels for the entire trainset
    train_labels = np.array([trainset.dataset.targets[i] for i in trainset.indices])

    # Define partitions: each party has k labels (Dirichlet distribution)
    num_labels = len(np.unique(train_labels))
    min_size = 0
    min_require_size = 2
    N = len(trainset)

    while min_size < min_require_size:
        partition_indices = [[] for _ in range(num_partitions)]
        for label in range(num_labels):
            idx_label = np.where(train_labels == label)[0]
            idx_label = [trainset.indices[j] for j in idx_label]
            np.random.shuffle(idx_label)

            proportions = np.random.dirichlet(np.repeat(beta, num_partitions))
            proportions = (np.cumsum(proportions) * len(idx_label)).astype(int)[:-1]

            partition_indices = [idx_j + idx.tolist() for idx_j, idx in zip(partition_indices, np.split(idx_label, proportions))]
        min_size = min([len(idx_j) for idx_j in partition_indices])

    trainsets = [Subset(trainset.dataset, indices) for indices in partition_indices]

    # Add Gaussian noise to each partition
    noisy_partitions = []
    for i, train_partition in enumerate(trainsets):
        partition_std_dev = sigma * (i + 1) / num_partitions
        noisy_data = []
        for image, label in train_partition:
            noisy_image = apply_gaussian_noise(image, partition_std_dev)
            noisy_data.append((noisy_image, label))
        noisy_partitions.append(noisy_data)

    # Partition validation set equally
    partition_len_val = [len(valset) // num_partitions] * num_partitions
    for i in range(len(valset) % num_partitions):
        partition_len_val[i] += 1
    valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

    # Create DataLoaders
    trainloaders = [DataLoader(part, batch_size=batch_size, shuffle=True, num_workers=6) for part in noisy_partitions]
    valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=6) for vs in valsets]
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=6)

    # Analyze class distributions
    class_distributions = []
    for i, trainloader in enumerate(trainloaders):
        class_counts = Counter()
        for _, labels in trainloader:
            class_counts.update(labels.numpy())
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')

    # Plot class distributions
    partitions = range(num_partitions)
    class_counts = [{cls: class_distributions[i].get(cls, 0) for cls in range(num_labels)} for i in partitions]

    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    for cls in range(num_labels):
        counts = [class_counts[i].get(cls, 0) for i in partitions]
        plt.bar(partitions, counts, bar_width, label=f'Class {cls}')

    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'data_partition_combined.png'))
    plt.close()

    print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')

    return trainloaders, valloaders, testloader


In [5]:
# prepare_imbalanced_and_noisy_data(4, 10, 0, 0.5, 0.1, 42)

In [6]:
# prepare_combined_imbalance(4, 10, 0, 0.5, 0.1, 42)

In [7]:
# def prepare_imbalanced_and_noisy_data(num_partitions: int, batch_size: int, 
#                                       val_ratio: float = 0.1, beta: float = 0.5, 
#                                       sigma: float = 0.05, seed: int = 42):
#     """
#     Phân chia dữ liệu với phân phối không cân bằng và thêm nhiễu Gaussian.
#     Nếu dữ liệu đã được lưu, tải từ thư mục lưu trữ; nếu chưa, thực hiện phân chia và lưu.
#     """
#     # Định nghĩa thư mục lưu trữ dựa trên các tham số
#     noise_dir = f"data_partition_combined_{num_partitions}_beta_{beta}_sigma_{sigma}"
#     mean = [0.485, 0.456, 0.406]
#     std = [0.229, 0.224, 0.225]
#     noisy_transform = transforms.Compose([
#         transforms.ToTensor(),
#         transforms.Normalize(mean, std)
#     ])
    
#     if os.path.exists(noise_dir):
#         print(f"Loading partitioned and noisy dataset from {noise_dir}...")
#         # Tải các partition đã lưu bằng ImageFolder
#         train_partitions = [ImageFolder(os.path.join(noise_dir, f'partition_{i}'), transform=noisy_transform) 
#                             for i in range(num_partitions)]
        
#         # Tải val và test set như bình thường
#         trainset, testset = get_custom_dataset()
#         num_train = int((1 - val_ratio) * len(trainset))
#         num_val = len(trainset) - num_train
#         _, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))
        
#         # Chia valset thành các partition
#         partition_len_val = [len(valset) // num_partitions] * num_partitions
#         for i in range(len(valset) % num_partitions):
#             partition_len_val[i] += 1
#         valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))
        
#         # Tạo DataLoaders
#         trainloaders = [DataLoader(part, batch_size=batch_size, shuffle=True, num_workers=6) 
#                        for part in train_partitions]
#         valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=6) 
#                       for vs in valsets]
#         testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=6)
        
#         print("Dữ liệu đã được tải thành công từ thư mục lưu trữ.")
#     else:
#         print(f"Creating partitioned and noisy dataset and saving to {noise_dir}...")
#         os.makedirs(noise_dir, exist_ok=True)
        
#         trainset, testset = get_custom_dataset()
#         num_train = int((1 - val_ratio) * len(trainset))
#         num_val = len(trainset) - num_train
#         train_subset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))
        
#         # Lấy nhãn của toàn bộ trainset
#         train_labels = np.array([train_subset.dataset.targets[i] for i in train_subset.indices])
#         num_labels = len(np.unique(train_labels))
#         min_size = 0
#         min_require_size = 2
#         N = len(train_subset)
        
#         # Phân chia dữ liệu theo phân phối Dirichlet
#         while min_size < min_require_size:
#             partition_indices = [[] for _ in range(num_partitions)]
#             for label in range(num_labels):
#                 idx_label = np.where(train_labels == label)[0]
#                 idx_label = [train_subset.indices[j] for j in idx_label]
#                 np.random.shuffle(idx_label)
                
#                 proportions = np.random.dirichlet(np.repeat(beta, num_partitions))
#                 proportions = (np.cumsum(proportions) * len(idx_label)).astype(int)[:-1]
                
#                 splits = np.split(idx_label, proportions)
#                 partition_indices = [idx_j + idx.tolist() for idx_j, idx in zip(partition_indices, splits)]
#             min_size = min([len(idx_j) for idx_j in partition_indices])
        
#         # Lưu các partition với nhiễu Gaussian
#         for i, indices in enumerate(partition_indices):
#             partition_dir = os.path.join(noise_dir, f'partition_{i}')
#             os.makedirs(partition_dir, exist_ok=True)
#             class_dirs = {}
#             for idx in indices:
#                 _, label = train_subset.dataset[idx]
#                 if label not in class_dirs:
#                     class_dirs[label] = os.path.join(partition_dir, f'class_{label}')
#                     os.makedirs(class_dirs[label], exist_ok=True)
            
#             # Thêm nhiễu và lưu ảnh
#             partition_std_dev = sigma * (i + 1) / num_partitions
#             for j, idx in enumerate(indices):
#                 image, label = train_subset.dataset[idx]
#                 noisy_image = apply_gaussian_noise(image, partition_std_dev)
#                 noisy_image = unnormalize_image(noisy_image, mean, std)
#                 noisy_image_pil = transforms.ToPILImage()(noisy_image.clamp(0, 1))
#                 image_filename = f'image_{j}.png'
#                 noisy_image_pil.save(os.path.join(class_dirs[label], image_filename))
        
#         # Tải các partition đã lưu bằng ImageFolder
#         train_partitions = [ImageFolder(os.path.join(noise_dir, f'partition_{i}'), transform=noisy_transform) 
#                             for i in range(num_partitions)]
        
#         # Chia valset thành các partition
#         partition_len_val = [len(valset) // num_partitions] * num_partitions
#         for i in range(len(valset) % num_partitions):
#             partition_len_val[i] += 1
#         valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))
        
#         # Tạo DataLoaders
#         trainloaders = [DataLoader(part, batch_size=batch_size, shuffle=True, num_workers=6) 
#                        for part in train_partitions]
#         valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=6) 
#                       for vs in valsets]
#         testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=6)
        
#         print(f"Dữ liệu đã được phân chia và lưu tại {noise_dir}")
    
#     # Phân tích phân bố lớp
#     class_distributions = []
#     for i, trainloader in enumerate(trainloaders):
#         class_counts = Counter()
#         for _, labels in trainloader:
#             class_counts.update(labels.numpy())
#         class_distributions.append(class_counts)
#         print(f'Partition {i} class distribution: {dict(class_counts)}')
    
#     # Vẽ biểu đồ phân bố lớp
#     partitions = range(num_partitions)
#     class_counts = [{cls: class_distributions[i].get(cls, 0) for cls in range(num_labels)} for i in partitions]
    
#     bar_width = 0.5
#     plt.figure(figsize=(12, 8))
#     for cls in range(num_labels):
#         counts = [class_counts[i].get(cls, 0) for i in partitions]
#         plt.bar(partitions, counts, bar_width, label=f'Class {cls}')
    
#     plt.xlabel('Partition')
#     plt.ylabel('Number of Samples')
#     plt.title('Class Distribution in Each Partition')
#     plt.legend()
#     plt.grid(True)
#     output_dir = 'running_outputs'
#     os.makedirs(output_dir, exist_ok=True)
#     plt.savefig(os.path.join(output_dir, 'data_partition_combined.png'))
#     plt.close()
    
#     print(f'Number of train samples: {len(train_subset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')
    
#     return trainloaders, valloaders, testloader
def prepare_imbalanced_and_noisy_data(num_partitions: int, batch_size: int, 
                                      val_ratio: float = 0.1, beta: float = 0.5, 
                                      sigma: float = 0.05, seed: int = 42):
    """
    Phân chia dữ liệu với phân phối không cân bằng và thêm nhiễu Gaussian.
    Nếu dữ liệu đã được lưu, tải từ thư mục lưu trữ; nếu chưa, thực hiện phân chia và lưu.
    """
    # Định nghĩa thư mục lưu trữ dựa trên các tham số
    # noise_dir = f"data_partition_combined_{num_partitions}_beta_{beta}_sigma_{sigma}"
    noise_dir = f"{BASE_FOLDER_NOISE}/chest_xray_noise_drl_label{beta}_{sigma}"

    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    noisy_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    
    if os.path.exists(noise_dir):
        print(f"Loading partitioned and noisy dataset from {noise_dir}...")
        # Tải các partition đã lưu bằng ImageFolder
        train_partitions = [ImageFolder(os.path.join(noise_dir, f'partition_{i}'), transform=noisy_transform) 
                            for i in range(num_partitions)]
        
        # Lấy số lượng lớp từ một trong các partition
        if len(train_partitions) > 0:
            num_labels = len(train_partitions[0].classes)
        else:
            raise ValueError("Không tìm thấy partition nào trong thư mục lưu trữ.")
        
        # Tải val và test set như bình thường
        trainset, testset = get_custom_dataset()
        num_train = int((1 - val_ratio) * len(trainset))
        num_val = len(trainset) - num_train
        _, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))
        
        # Chia valset thành các partition
        partition_len_val = [len(valset) // num_partitions] * num_partitions
        for i in range(len(valset) % num_partitions):
            partition_len_val[i] += 1
        valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))
        
        # Tạo DataLoaders
        trainloaders = [DataLoader(part, batch_size=batch_size, shuffle=True, num_workers=6) 
                       for part in train_partitions]
        valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=6) 
                      for vs in valsets]
        testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=6)
        
        print("Dữ liệu đã được tải thành công từ thư mục lưu trữ.")
    else:
        print(f"Creating partitioned and noisy dataset and saving to {noise_dir}...")
        os.makedirs(noise_dir, exist_ok=True)
        
        trainset, testset = get_custom_dataset()
        num_train = int((1 - val_ratio) * len(trainset))
        num_val = len(trainset) - num_train
        train_subset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))
        
        # Lấy nhãn của toàn bộ trainset
        train_labels = np.array([train_subset.dataset.targets[i] for i in train_subset.indices])
        num_labels = len(np.unique(train_labels))
        min_size = 0
        min_require_size = 2
        N = len(train_subset)
        
        # Phân chia dữ liệu theo phân phối Dirichlet
        while min_size < min_require_size:
            partition_indices = [[] for _ in range(num_partitions)]
            for label in range(num_labels):
                idx_label = np.where(train_labels == label)[0]
                idx_label = [train_subset.indices[j] for j in idx_label]
                np.random.shuffle(idx_label)
                
                proportions = np.random.dirichlet(np.repeat(beta, num_partitions))
                proportions = (np.cumsum(proportions) * len(idx_label)).astype(int)[:-1]
                
                splits = np.split(idx_label, proportions)
                partition_indices = [idx_j + idx.tolist() for idx_j, idx in zip(partition_indices, splits)]
            min_size = min([len(idx_j) for idx_j in partition_indices])
        
        # Lưu các partition với nhiễu Gaussian
        for i, indices in enumerate(partition_indices):
            partition_dir = os.path.join(noise_dir, f'partition_{i}')
            os.makedirs(partition_dir, exist_ok=True)
            class_dirs = {}
            for idx in indices:
                _, label = train_subset.dataset[idx]
                if label not in class_dirs:
                    class_dirs[label] = os.path.join(partition_dir, f'class_{label}')
                    os.makedirs(class_dirs[label], exist_ok=True)
            
            # Thêm nhiễu và lưu ảnh
            partition_std_dev = sigma * (i + 1) / num_partitions
            for j, idx in enumerate(indices):
                image, label = train_subset.dataset[idx]
                noisy_image = apply_gaussian_noise(image, partition_std_dev)
                noisy_image = unnormalize_image(noisy_image, mean, std)
                noisy_image_pil = transforms.ToPILImage()(noisy_image.clamp(0, 1))
                image_filename = f'image_{j}.png'
                noisy_image_pil.save(os.path.join(class_dirs[label], image_filename))
        
        # Tải các partition đã lưu bằng ImageFolder
        train_partitions = [ImageFolder(os.path.join(noise_dir, f'partition_{i}'), transform=noisy_transform) 
                            for i in range(num_partitions)]
        
        # Chia valset thành các partition
        partition_len_val = [len(valset) // num_partitions] * num_partitions
        for i in range(len(valset) % num_partitions):
            partition_len_val[i] += 1
        valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))
        
        # Tạo DataLoaders
        trainloaders = [DataLoader(part, batch_size=batch_size, shuffle=True, num_workers=6) 
                       for part in train_partitions]
        valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=6) 
                      for vs in valsets]
        testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=6)
        
        print(f"Dữ liệu đã được phân chia và lưu tại {noise_dir}")
    
    # Phân tích phân bố lớp
    class_distributions = []
    for i, trainloader in enumerate(trainloaders):
        class_counts = Counter()
        for _, labels in trainloader:
            class_counts.update(labels.numpy())
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')
    
    # Vẽ biểu đồ phân bố lớp
    partitions = range(num_partitions)
    class_counts_list = []
    for i in partitions:
        counts = {cls: class_distributions[i].get(cls, 0) for cls in range(num_labels)}
        class_counts_list.append(counts)
    
    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    bottom = np.zeros(num_partitions)
    colors = plt.cm.tab10.colors  # Sử dụng bảng màu có sẵn
    
    for cls in range(num_labels):
        counts = [class_counts_list[i].get(cls, 0) for i in partitions]
        plt.bar(partitions, counts, bar_width, bottom=bottom, label=f'Class {cls}', color=colors[cls % len(colors)])
        bottom += counts
    
    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'data_partition_combined.png'))
    plt.close()
    
    print(f'Number of train samples: {sum(len(loader.dataset) for loader in trainloaders)}, '
          f'val samples: {len(valset)}, test samples: {len(testloader.dataset)}')
    
    return trainloaders, valloaders, testloader

In [8]:
# prepare_imbalanced_and_noisy_data(4, 10, 0, 0.5, 1, 42)

In [15]:
def prepare_quantity_drl_and_noisy_data(num_partitions: int, batch_size: int, 
                                      val_ratio: float = 0.1, beta: float = 10, 
                                      sigma: float = 0.05, seed: int = 42):
    """
    Phân chia dữ liệu với phân phối không cân bằng theo quantity_skew_dirichlet và thêm nhiễu Gaussian.
    """
    noise_dir = f"{BASE_FOLDER_NOISE}/chest_xray_noise_drl_quantity_skew_{beta}_{sigma}"

    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    noisy_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])
    
    if os.path.exists(noise_dir):
        print(f"Loading partitioned and noisy dataset from {noise_dir}...")
        train_partitions = [ImageFolder(os.path.join(noise_dir, f'partition_{i}'), transform=noisy_transform) 
                            for i in range(num_partitions)]
        
        trainloaders = [DataLoader(part, batch_size=batch_size, shuffle=True, num_workers=6) 
                       for part in train_partitions]
        
        trainset, testset = get_custom_dataset()
        num_train = int((1 - val_ratio) * len(trainset))
        num_val = len(trainset) - num_train
        _, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))
        
        partition_len_val = [len(valset) // num_partitions] * num_partitions
        for i in range(len(valset) % num_partitions):
            partition_len_val[i] += 1
        valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))
        
        valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=6) 
                      for vs in valsets]
        testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=6)
        
        class_distributions = []
        for i, trainloader in enumerate(trainloaders):
            class_counts = Counter()
            for _, labels in trainloader:
                class_counts.update(labels.numpy())
            class_distributions.append(class_counts)
            print(f'Partition {i} class distribution: {dict(class_counts)}')
        
        partitions = range(num_partitions)
        class_0_counts = [class_distributions[i][0] for i in partitions]
        class_1_counts = [class_distributions[i][1] for i in partitions]

        bar_width = 0.5
        plt.figure(figsize=(12, 8))
        plt.bar(partitions, class_0_counts, bar_width, label='Class 0', color='blue')
        plt.bar(partitions, class_1_counts, bar_width, bottom=class_0_counts, label='Class 1', color='red')
        plt.xlabel('Partition')
        plt.ylabel('Number of Samples')
        plt.title('Class Distribution in Each Partition')
        plt.legend()
        plt.grid(True)
        # plt.show()
        #  Lưu đồ thị vào thư mục running_outputs với tên data_partition
        output_dir = 'running_outputs'
        os.makedirs(output_dir, exist_ok=True)
        plt.savefig(os.path.join(output_dir, 'data_partition.png'))
        plt.close()

        print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')

        print("Dữ liệu đã được tải thành công từ thư mục lưu trữ.")
    else:
        print(f"Creating partitioned and noisy dataset and saving to {noise_dir}...")
        os.makedirs(noise_dir, exist_ok=True)
        
        trainloaders, valloaders, testloader = prepare_quantity_skew_dirichlet(
            num_partitions=num_partitions, batch_size=batch_size, val_ratio=val_ratio, beta=beta, seed=seed)
        
        # Thêm nhiễu Gaussian và lưu dữ liệu
        for i, trainloader in enumerate(trainloaders):
            partition_dir = os.path.join(noise_dir, f'partition_{i}')
            os.makedirs(partition_dir, exist_ok=True)
            for images, labels in trainloader:
                for j, (image, label) in enumerate(zip(images, labels)):
                    class_dir = os.path.join(partition_dir, f'class_{label.item()}')
                    os.makedirs(class_dir, exist_ok=True)
                    
                    # Thêm nhiễu Gaussian
                    partition_std_dev = sigma * (i + 1) / num_partitions
                    noisy_image = apply_gaussian_noise(image, partition_std_dev)
                    noisy_image = unnormalize_image(noisy_image, mean, std)
                    noisy_image_pil = transforms.ToPILImage()(noisy_image.clamp(0, 1))
                    image_filename = f'image_{j}.png'
                    noisy_image_pil.save(os.path.join(class_dir, image_filename))
        
        print(f"Dữ liệu đã được phân chia và lưu tại {noise_dir}")

       
    return trainloaders, valloaders, testloader


In [22]:
prepare_quantity_drl_and_noisy_data(60, 10, 0, 0.5, 0.1, 42)

Loading partitioned and noisy dataset from /media/namvq/Data/code_chinh_sua/fedavg/chest_xray_noise_drl_quantity_skew_0.5_0.1...
Partition 0 class distribution: {0: 10, 1: 10}
Partition 1 class distribution: {1: 7, 0: 3}
Partition 2 class distribution: {0: 10, 1: 10}
Partition 3 class distribution: {0: 2}
Partition 4 class distribution: {1: 1, 0: 1}
Partition 5 class distribution: {1: 10, 0: 7}
Partition 6 class distribution: {0: 9, 1: 10}
Partition 7 class distribution: {0: 7}
Partition 8 class distribution: {0: 10, 1: 10}
Partition 9 class distribution: {1: 10, 0: 10}
Partition 10 class distribution: {1: 8, 0: 6}
Partition 11 class distribution: {0: 10, 1: 10}
Partition 12 class distribution: {0: 4}
Partition 13 class distribution: {0: 8, 1: 10}
Partition 14 class distribution: {1: 10, 0: 7}
Partition 15 class distribution: {1: 10, 0: 10}
Partition 16 class distribution: {1: 10, 0: 8}
Partition 17 class distribution: {1: 10, 0: 6}
Partition 18 class distribution: {0: 2, 1: 3}
Partiti

([<torch.utils.data.dataloader.DataLoader at 0x7dde47188100>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde471881c0>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47188280>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47188370>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47188460>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47188550>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47188640>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47188730>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47188820>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47188910>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47188a00>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47188af0>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47188be0>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47188cd0>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47188dc0>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47188eb0>,
  <torch

In [21]:
prepare_quantity_drl_and_noisy_data(60, 10, 0, 0.5, 0.3, 42)

Creating partitioned and noisy dataset and saving to /media/namvq/Data/code_chinh_sua/fedavg/chest_xray_noise_drl_quantity_skew_0.5_0.3...
Partition sizes: [111, 400, 210, 0, 151, 59, 2, 162, 3, 382, 46, 102, 2, 7, 8, 6, 4, 431, 9, 1, 2, 241, 119, 43, 30, 112, 8, 12, 1, 9, 29, 29, 24, 2, 106, 110, 0, 70, 134, 86, 28, 121, 154, 6, 420, 1, 15, 7, 233, 10, 175, 2, 190, 0, 112, 249, 164, 24, 20, 22]
Min partition size: 0
Partition sizes: [4, 1, 20, 80, 92, 228, 39, 172, 3, 389, 229, 5, 53, 1, 421, 6, 5, 49, 9, 1, 0, 22, 51, 57, 267, 1, 1, 407, 167, 13, 101, 49, 64, 24, 14, 10, 1, 208, 45, 58, 0, 159, 57, 0, 28, 160, 32, 27, 38, 11, 21, 11, 22, 21, 234, 36, 30, 141, 6, 815]
Min partition size: 0
Partition sizes: [66, 73, 5, 0, 166, 28, 28, 13, 94, 64, 176, 6, 328, 145, 4, 33, 23, 47, 9, 9, 305, 323, 2, 92, 410, 95, 5, 570, 42, 0, 2, 0, 95, 38, 10, 175, 30, 38, 10, 20, 13, 549, 15, 20, 282, 6, 0, 19, 31, 42, 1, 4, 85, 62, 334, 2, 59, 100, 12, 1]
Min partition size: 0
Partition sizes: [20, 17

KeyboardInterrupt: 

In [None]:
prepare_quantity_drl_and_noisy_data(60, 10, 0, 0.5, 0.5, 42)

In [23]:
def prepare_quantity_skew_dirichlet(num_partitions: int, batch_size: int, val_ratio: float = 0.1, beta: float = 10, seed: int = 42):
    trainset, testset = get_custom_dataset()
    num_train = int((1 - val_ratio) * len(trainset))
    num_val = len(trainset) - num_train
    trainset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))

    all_indices = np.array(trainset.indices)
    train_labels = np.array(trainset.dataset.targets)[all_indices]  # Nhãn của toàn bộ dữ liệu trainset
    num_labels = len(np.unique(train_labels))  # Số lượng nhãn (classes)

    # Đảm bảo mỗi partition có ít nhất một mẫu của mỗi lớp
    partition_indices = [[] for _ in range(num_partitions)]
    for label in range(num_labels):
        label_indices = np.where(train_labels == label)[0]
        np.random.shuffle(label_indices)

        # Gán một mẫu của lớp `label` cho mỗi partition
        if len(label_indices) < num_partitions:
            raise ValueError(f"Không đủ mẫu cho lớp {label} để phân phối cho tất cả partitions.")
        for i in range(num_partitions):
            partition_indices[i].append(label_indices[i])
        
        # Phân phối phần còn lại của lớp `label` theo Dirichlet
        remaining_indices = label_indices[num_partitions:]
        if len(remaining_indices) > 0:
            proportions = np.random.dirichlet(np.repeat(beta, num_partitions))
            proportions = (proportions * len(remaining_indices)).astype(int)

            # Điều chỉnh proportions để đảm bảo tổng đúng bằng số mẫu còn lại
            while proportions.sum() < len(remaining_indices):
                proportions[np.argmax(proportions)] += 1
            while proportions.sum() > len(remaining_indices):
                proportions[np.argmax(proportions)] -= 1

            splits = np.split(remaining_indices, np.cumsum(proportions)[:-1])
            for i, split in enumerate(splits):
                partition_indices[i].extend(split.tolist())

    # Kiểm tra kích thước tối thiểu của các partition
    min_size = min([len(part) for part in partition_indices])
    if min_size < 1:
        raise ValueError("Một partition không đủ mẫu sau khi phân phối.")

    # Tạo các tập train và val từ partition_indices
    trainsets = [Subset(trainset.dataset, indices) for indices in partition_indices]

    # Phân chia valset thành các partitions
    partition_len_val = [len(valset) // num_partitions] * num_partitions
    for i in range(len(valset) % num_partitions):
        partition_len_val[i] += 1
    valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

    # Tạo DataLoaders
    trainloaders = [DataLoader(ts, batch_size=batch_size, shuffle=True, num_workers=6) for ts in trainsets]
    valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=6) for vs in valsets]
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=6)

    # Phân tích phân bố lớp
    class_distributions = []
    for i, trainloader in enumerate(trainloaders):
        class_counts = Counter()
        for _, labels in trainloader:
            class_counts.update(labels.numpy())
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')

    # Vẽ biểu đồ phân bố lớp
    partitions = range(num_partitions)
    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    bottom = np.zeros(num_partitions)
    colors = plt.cm.tab10.colors

    for cls in range(num_labels):
        counts = [class_distributions[i].get(cls, 0) for i in partitions]
        plt.bar(partitions, counts, bar_width, bottom=bottom, label=f'Class {cls}', color=colors[cls % len(colors)])
        bottom += counts

    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'data_partition_quantity_skew.png'))
    plt.close()

    print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')

    return trainloaders, valloaders, testloader


In [27]:
def prepare_quantity_skew_dirichlet_with_noise(num_partitions: int, batch_size: int, val_ratio: float = 0.1, 
                                               beta: float = 10, sigma: float = 0.05, seed: int = 42):
    """
    Phân phối dữ liệu theo quantity skew Dirichlet, thêm nhiễu Gaussian vào từng partition.
    """
    trainset, testset = get_custom_dataset()
    num_train = int((1 - val_ratio) * len(trainset))
    num_val = len(trainset) - num_train
    trainset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))

    all_indices = np.array(trainset.indices)
    train_labels = np.array(trainset.dataset.targets)[all_indices]
    num_labels = len(np.unique(train_labels))

    # Đảm bảo mỗi partition có ít nhất một mẫu của mỗi lớp
    partition_indices = [[] for _ in range(num_partitions)]
    for label in range(num_labels):
        label_indices = np.where(train_labels == label)[0]
        np.random.shuffle(label_indices)

        # Gán ít nhất 1 mẫu của mỗi lớp cho từng partition
        if len(label_indices) < num_partitions:
            raise ValueError(f"Không đủ mẫu cho lớp {label} để phân phối cho tất cả partitions.")
        for i in range(num_partitions):
            partition_indices[i].append(label_indices[i])

        # Phân phối phần còn lại của lớp theo Dirichlet
        remaining_indices = label_indices[num_partitions:]
        if len(remaining_indices) > 0:
            proportions = np.random.dirichlet(np.repeat(beta, num_partitions))
            proportions = (proportions * len(remaining_indices)).astype(int)

            # Điều chỉnh proportions để tổng khớp
            while proportions.sum() < len(remaining_indices):
                proportions[np.argmax(proportions)] += 1
            while proportions.sum() > len(remaining_indices):
                proportions[np.argmax(proportions)] -= 1

            splits = np.split(remaining_indices, np.cumsum(proportions)[:-1])
            for i, split in enumerate(splits):
                partition_indices[i].extend(split.tolist())

    # Kiểm tra tính hợp lệ
    min_size = min([len(part) for part in partition_indices])
    if min_size < 1:
        raise ValueError("Một partition không đủ mẫu sau khi phân phối.")

    # Thêm nhiễu Gaussian và lưu dữ liệu
    noise_dir = f"{BASE_FOLDER_NOISE}/quantity_skew_dirichlet_with_noise_beta_{beta}_sigma_{sigma}"
    os.makedirs(noise_dir, exist_ok=True)

    trainsets = []
    for i, indices in enumerate(partition_indices):
        partition_dir = os.path.join(noise_dir, f'partition_{i}')
        os.makedirs(partition_dir, exist_ok=True)

        partition_std_dev = sigma * (i + 1) / num_partitions
        noisy_data = []

        for idx in indices:
            image, label = trainset.dataset[idx]
            noisy_image = apply_gaussian_noise(image, partition_std_dev)
            noisy_data.append((noisy_image, label))

            # Lưu ảnh nhiễu Gaussian
            class_dir = os.path.join(partition_dir, f'class_{label}')
            os.makedirs(class_dir, exist_ok=True)
            noisy_image = unnormalize_image(noisy_image, [0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
            noisy_image_pil = transforms.ToPILImage()(noisy_image.clamp(0, 1))
            image_filename = f'image_{idx}.png'
            noisy_image_pil.save(os.path.join(class_dir, image_filename))

        trainsets.append(noisy_data)

    trainloaders = [DataLoader(ts, batch_size=batch_size, shuffle=True, num_workers=6) for ts in trainsets]

    # Phân chia valset thành các partitions
    partition_len_val = [len(valset) // num_partitions] * num_partitions
    for i in range(len(valset) % num_partitions):
        partition_len_val[i] += 1
    valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

    valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=6) for vs in valsets]
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=6)

    # Phân tích và hiển thị phân bố lớp
    class_distributions = []
    for i, trainloader in enumerate(trainloaders):
        class_counts = Counter()
        for _, labels in trainloader:
            class_counts.update(labels.numpy())
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')

    # Vẽ biểu đồ phân bố
    partitions = range(num_partitions)
    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    bottom = np.zeros(num_partitions)
    colors = plt.cm.tab10.colors

    for cls in range(num_labels):
        counts = [class_distributions[i].get(cls, 0) for i in partitions]
        plt.bar(partitions, counts, bar_width, bottom=bottom, label=f'Class {cls}', color=colors[cls % len(colors)])
        bottom += counts

    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'data_partition_quantity_skew_with_noise.png'))
    plt.close()

    print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')

    return trainloaders, valloaders, testloader


In [28]:
prepare_quantity_skew_dirichlet_with_noise(4, 10, 0, 0.5, 0.5, 42)

Partition 0 class distribution: {1: 49, 0: 17}
Partition 1 class distribution: {1: 55, 0: 18}
Partition 2 class distribution: {1: 18, 0: 8}
Partition 3 class distribution: {1: 18, 0: 8}
Number of train samples: 191, val samples: 0, test samples: 129


([<torch.utils.data.dataloader.DataLoader at 0x7dde46eaf400>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde40aca2c0>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde40ac9480>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde40ac9210>],
 [<torch.utils.data.dataloader.DataLoader at 0x7dde40acba90>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde40ac9240>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47023c10>,
  <torch.utils.data.dataloader.DataLoader at 0x7dde47023d00>],
 <torch.utils.data.dataloader.DataLoader at 0x7dde46ead570>)

In [3]:
def prepare_quantity_skew_dirichlet_with_noise(num_partitions: int, batch_size: int, val_ratio: float = 0.1, 
                                               beta: float = 10, sigma: float = 0.05, seed: int = 42):
    """
    Phân phối dữ liệu theo quantity skew Dirichlet, thêm nhiễu Gaussian vào từng partition.
    Nếu dữ liệu đã được lưu, chỉ cần tải lại mà không tạo mới.
    """
    noise_dir = f"{BASE_FOLDER_NOISE}/quantity_skew_dirichlet_with_noise_beta_{beta}_sigma_{sigma}"

    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    noisy_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])

    # Nếu thư mục tồn tại, tải dữ liệu
    if os.path.exists(noise_dir):
        print(f"Loading partitioned and noisy dataset from {noise_dir}...")
        train_partitions = [ImageFolder(os.path.join(noise_dir, f'partition_{i}'), transform=noisy_transform) 
                            for i in range(num_partitions)]
        num_labels = len(train_partitions[0].classes)

        trainloaders = [DataLoader(part, batch_size=batch_size, shuffle=True, num_workers=6) 
                       for part in train_partitions]

        trainset, testset = get_custom_dataset()
        num_train = int((1 - val_ratio) * len(trainset))
        num_val = len(trainset) - num_train
        _, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))
        
        partition_len_val = [len(valset) // num_partitions] * num_partitions
        for i in range(len(valset) % num_partitions):
            partition_len_val[i] += 1
        valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

        valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=6) for vs in valsets]
        testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=6)

        print("Dữ liệu đã được tải thành công từ thư mục lưu trữ.")
        # return trainloaders, valloaders, testloader

    else:
        # Nếu không, tạo mới dữ liệu và thêm nhiễu
        print(f"Creating partitioned and noisy dataset and saving to {noise_dir}...")
        os.makedirs(noise_dir, exist_ok=True)

        trainset, testset = get_custom_dataset()
        num_train = int((1 - val_ratio) * len(trainset))
        num_val = len(trainset) - num_train
        trainset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))

        all_indices = np.array(trainset.indices)
        train_labels = np.array(trainset.dataset.targets)[all_indices]
        num_labels = len(np.unique(train_labels))

        # Đảm bảo mỗi partition có ít nhất một mẫu của mỗi lớp
        partition_indices = [[] for _ in range(num_partitions)]
        for label in range(num_labels):
            label_indices = np.where(train_labels == label)[0]
            np.random.shuffle(label_indices)

            # Gán ít nhất 1 mẫu của mỗi lớp cho từng partition
            if len(label_indices) < num_partitions:
                raise ValueError(f"Không đủ mẫu cho lớp {label} để phân phối cho tất cả partitions.")
            for i in range(num_partitions):
                partition_indices[i].append(label_indices[i])

            # Phân phối phần còn lại của lớp theo Dirichlet
            remaining_indices = label_indices[num_partitions:]
            if len(remaining_indices) > 0:
                proportions = np.random.dirichlet(np.repeat(beta, num_partitions))
                proportions = (proportions * len(remaining_indices)).astype(int)

                # Điều chỉnh proportions để tổng khớp
                while proportions.sum() < len(remaining_indices):
                    proportions[np.argmax(proportions)] += 1
                while proportions.sum() > len(remaining_indices):
                    proportions[np.argmax(proportions)] -= 1

                splits = np.split(remaining_indices, np.cumsum(proportions)[:-1])
                for i, split in enumerate(splits):
                    partition_indices[i].extend(split.tolist())

        # Thêm nhiễu Gaussian và lưu dữ liệu
        trainsets = []
        for i, indices in enumerate(partition_indices):
            partition_dir = os.path.join(noise_dir, f'partition_{i}')
            os.makedirs(partition_dir, exist_ok=True)

            partition_std_dev = sigma * (i + 1) / num_partitions
            noisy_data = []

            for idx in indices:
                image, label = trainset.dataset[idx]
                noisy_image = apply_gaussian_noise(image, partition_std_dev)
                noisy_data.append((noisy_image, label))

                # Lưu ảnh nhiễu Gaussian
                class_dir = os.path.join(partition_dir, f'class_{label}')
                os.makedirs(class_dir, exist_ok=True)
                noisy_image = unnormalize_image(noisy_image, mean, std)
                noisy_image_pil = transforms.ToPILImage()(noisy_image.clamp(0, 1))
                image_filename = f'image_{idx}.png'
                noisy_image_pil.save(os.path.join(class_dir, image_filename))

            trainsets.append(noisy_data)

        trainloaders = [DataLoader(ts, batch_size=batch_size, shuffle=True, num_workers=6) for ts in trainsets]

        # Phân chia valset thành các partitions
        partition_len_val = [len(valset) // num_partitions] * num_partitions
        for i in range(len(valset) % num_partitions):
            partition_len_val[i] += 1
        valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

        valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=6) for vs in valsets]
        testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=6)

        print(f"Dữ liệu đã được phân chia và lưu tại {noise_dir}")
    
    # Phân tích phân bố lớp
    class_distributions = []
    for i, trainloader in enumerate(trainloaders):
        class_counts = Counter()
        for _, labels in trainloader:
            class_counts.update(labels.numpy())
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')
    
    # Vẽ biểu đồ phân bố lớp
    partitions = range(num_partitions)
    class_counts_list = []
    for i in partitions:
        counts = {cls: class_distributions[i].get(cls, 0) for cls in range(num_labels)}
        class_counts_list.append(counts)
    
    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    bottom = np.zeros(num_partitions)
    colors = plt.cm.tab10.colors  # Sử dụng bảng màu có sẵn
    
    for cls in range(num_labels):
        counts = [class_counts_list[i].get(cls, 0) for i in partitions]
        plt.bar(partitions, counts, bar_width, bottom=bottom, label=f'Class {cls}', color=colors[cls % len(colors)])
        bottom += counts
    
    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'data_partition_combined.png'))
    plt.close()
    
    print(f'Number of train samples: {sum(len(loader.dataset) for loader in trainloaders)}, '
          f'val samples: {len(valset)}, test samples: {len(testloader.dataset)}')
    
    return trainloaders, valloaders, testloader


In [4]:
prepare_quantity_skew_dirichlet_with_noise(60, 10, 0, 0.5, 0.1, 42)

Creating partitioned and noisy dataset and saving to /media/namvq/Data/code_chinh_sua/fedavg/quantity_skew_dirichlet_with_noise_beta_0.5_sigma_0.1...
Dữ liệu đã được phân chia và lưu tại /media/namvq/Data/code_chinh_sua/fedavg/quantity_skew_dirichlet_with_noise_beta_0.5_sigma_0.1
Partition 0 class distribution: {0: 9, 1: 17}
Partition 1 class distribution: {0: 38, 1: 128}
Partition 2 class distribution: {1: 228, 0: 64}
Partition 3 class distribution: {0: 19, 1: 33}
Partition 4 class distribution: {1: 23, 0: 8}
Partition 5 class distribution: {1: 137, 0: 52}
Partition 6 class distribution: {1: 3, 0: 1}
Partition 7 class distribution: {1: 69, 0: 24}
Partition 8 class distribution: {1: 173, 0: 48}
Partition 9 class distribution: {1: 32, 0: 18}
Partition 10 class distribution: {0: 22, 1: 84}
Partition 11 class distribution: {0: 3, 1: 9}
Partition 12 class distribution: {1: 99, 0: 26}
Partition 13 class distribution: {1: 27, 0: 14}
Partition 14 class distribution: {0: 27, 1: 94}
Partition 1

([<torch.utils.data.dataloader.DataLoader at 0x7c94b4f05d20>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94c4147f10>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94b4f14e80>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94b4f11330>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94b4f10ac0>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94b4f11630>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94b4f4db70>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94b4f4cb50>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94b4f4c910>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94b4f4ca90>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94b4f4c7c0>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94b4f4c580>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94b4f4c4c0>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94b4f4c5b0>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94b4f4c430>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94b4f4f0a0>,
  <torch

In [5]:
prepare_quantity_skew_dirichlet_with_noise(60, 10, 0, 0.5, 0.3, 42)

Creating partitioned and noisy dataset and saving to /media/namvq/Data/code_chinh_sua/fedavg/quantity_skew_dirichlet_with_noise_beta_0.5_sigma_0.3...


KeyboardInterrupt: 

In [16]:
prepare_quantity_skew_dirichlet_with_noise(60, 10, 0, 0.5, 0.5, 42)

Creating partitioned and noisy dataset and saving to /media/namvq/Data/code_chinh_sua/fedavg/quantity_skew_dirichlet_with_noise_beta_0.5_sigma_0.5...
Dữ liệu đã được phân chia và lưu tại /media/namvq/Data/code_chinh_sua/fedavg/quantity_skew_dirichlet_with_noise_beta_0.5_sigma_0.5
Partition 0 class distribution: {0: 36, 1: 120}
Partition 1 class distribution: {0: 35, 1: 167}
Partition 2 class distribution: {1: 47, 0: 13}
Partition 3 class distribution: {1: 80, 0: 31}
Partition 4 class distribution: {1: 4, 0: 1}
Partition 5 class distribution: {0: 1, 1: 5}
Partition 6 class distribution: {1: 106, 0: 34}
Partition 7 class distribution: {0: 4, 1: 2}
Partition 8 class distribution: {1: 197, 0: 63}
Partition 9 class distribution: {1: 30, 0: 6}
Partition 10 class distribution: {0: 10, 1: 14}
Partition 11 class distribution: {1: 3, 0: 1}
Partition 12 class distribution: {1: 76, 0: 29}
Partition 13 class distribution: {1: 59, 0: 21}
Partition 14 class distribution: {1: 110, 0: 41}
Partition 15 

([<torch.utils.data.dataloader.DataLoader at 0x7fae21878970>,
  <torch.utils.data.dataloader.DataLoader at 0x7fae21d83700>,
  <torch.utils.data.dataloader.DataLoader at 0x7fad8c088730>,
  <torch.utils.data.dataloader.DataLoader at 0x7fae218a9ed0>,
  <torch.utils.data.dataloader.DataLoader at 0x7fae21aaaad0>,
  <torch.utils.data.dataloader.DataLoader at 0x7fad8c06ed40>,
  <torch.utils.data.dataloader.DataLoader at 0x7fad8c06f520>,
  <torch.utils.data.dataloader.DataLoader at 0x7fae3009aa40>,
  <torch.utils.data.dataloader.DataLoader at 0x7fae3009b490>,
  <torch.utils.data.dataloader.DataLoader at 0x7fae3009be50>,
  <torch.utils.data.dataloader.DataLoader at 0x7fae3009abc0>,
  <torch.utils.data.dataloader.DataLoader at 0x7fae30099b40>,
  <torch.utils.data.dataloader.DataLoader at 0x7fae30098e20>,
  <torch.utils.data.dataloader.DataLoader at 0x7fae30098520>,
  <torch.utils.data.dataloader.DataLoader at 0x7fae3009a5c0>,
  <torch.utils.data.dataloader.DataLoader at 0x7fae3009bee0>,
  <torch

In [6]:
trainset, testset = get_custom_dataset()

In [10]:
trainset.targets

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [13]:
def prepare_quantity_skew_dirichlet_with_noise(num_partitions: int, batch_size: int, val_ratio: float = 0.1, 
                                               beta: float = 10, sigma: float = 0.05, seed: int = 42):
    """
    Phân phối dữ liệu theo quantity skew Dirichlet, thêm nhiễu Gaussian vào từng partition.
    Nếu dữ liệu đã được lưu, chỉ cần tải lại mà không tạo mới.
    """
    noise_dir = f"{BASE_FOLDER_NOISE}/quantity_skew_dirichlet_with_noise_beta_{beta}_sigma_{sigma}"

    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    noisy_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])

    # Nếu thư mục tồn tại, tải dữ liệu
    if os.path.exists(noise_dir):
        print(f"Loading partitioned and noisy dataset from {noise_dir}...")
        train_partitions = [ImageFolder(os.path.join(noise_dir, f'partition_{i}'), transform=noisy_transform) 
                            for i in range(num_partitions)]
        num_labels = len(train_partitions[0].classes)

        trainloaders = [DataLoader(part, batch_size=batch_size, shuffle=True, num_workers=6) 
                       for part in train_partitions]

        trainset, testset = get_custom_dataset()
        num_train = int((1 - val_ratio) * len(trainset))
        num_val = len(trainset) - num_train
        _, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))
        
        partition_len_val = [len(valset) // num_partitions] * num_partitions
        for i in range(len(valset) % num_partitions):
            partition_len_val[i] += 1
        valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

        valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=6) for vs in valsets]
        testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=6)

        print("Dữ liệu đã được tải thành công từ thư mục lưu trữ.")
        # return trainloaders, valloaders, testloader

    else:
        print(f"Creating partitioned and noisy dataset and saving to {noise_dir}...")
        os.makedirs(noise_dir, exist_ok=True)

        trainset, testset = get_custom_dataset()
        num_train = int((1 - val_ratio) * len(trainset))
        num_val = len(trainset) - num_train
        trainset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))

        all_indices = np.array(trainset.indices)
        train_labels = np.array(trainset.dataset.targets)[all_indices]
        num_labels = len(np.unique(train_labels))
        
        # Verify we have enough samples per class
        for label in range(num_labels):
            label_count = np.sum(train_labels == label)
            if label_count < num_partitions:
                raise ValueError(f"Class {label} has only {label_count} samples, need at least {num_partitions}")

        # Initialize partitions with minimum samples
        partition_indices = [[] for _ in range(num_partitions)]
        
        # First pass: ensure minimum samples
        for label in range(num_labels):
            label_indices = np.where(train_labels == label)[0]
            np.random.shuffle(label_indices)
            
            # Assign one sample per partition
            for i in range(num_partitions):
                partition_indices[i].append(label_indices[i])
            
            # Distribute remaining samples using Dirichlet
            remaining_indices = label_indices[num_partitions:]
            if len(remaining_indices) > 0:
                proportions = np.random.dirichlet(np.repeat(beta, num_partitions))
                # Convert to actual counts
                counts = (proportions * len(remaining_indices)).astype(int)
                
                # Adjust for rounding errors
                remainder = len(remaining_indices) - counts.sum()
                if remainder > 0:
                    counts[np.argmax(proportions)] += remainder
                
                # Split and distribute remaining indices
                start_idx = 0
                for i in range(num_partitions):
                    end_idx = start_idx + counts[i]
                    if end_idx > start_idx:
                        partition_indices[i].extend(remaining_indices[start_idx:end_idx])
                    start_idx = end_idx

        # Verify distribution
        for i, indices in enumerate(partition_indices):
            labels = train_labels[indices]
            unique_labels = np.unique(labels)
            if len(unique_labels) != num_labels:
                raise ValueError(f"Partition {i} is missing classes: {set(range(num_labels)) - set(unique_labels)}")

        # Thêm nhiễu Gaussian và lưu dữ liệu
        trainsets = []
        for i, indices in enumerate(partition_indices):
            partition_dir = os.path.join(noise_dir, f'partition_{i}')
            os.makedirs(partition_dir, exist_ok=True)

            partition_std_dev = sigma * (i + 1) / num_partitions
            noisy_data = []

            for idx in indices:
                image, label = trainset.dataset[idx]
                noisy_image = apply_gaussian_noise(image, partition_std_dev)
                noisy_data.append((noisy_image, label))

                # Lưu ảnh nhiễu Gaussian
                class_dir = os.path.join(partition_dir, f'class_{label}')
                os.makedirs(class_dir, exist_ok=True)
                noisy_image = unnormalize_image(noisy_image, mean, std)
                noisy_image_pil = transforms.ToPILImage()(noisy_image.clamp(0, 1))
                image_filename = f'image_{idx}.png'
                noisy_image_pil.save(os.path.join(class_dir, image_filename))

            trainsets.append(noisy_data)

        trainloaders = [DataLoader(ts, batch_size=batch_size, shuffle=True, num_workers=6) for ts in trainsets]

        # Phân chia valset thành các partitions
        partition_len_val = [len(valset) // num_partitions] * num_partitions
        for i in range(len(valset) % num_partitions):
            partition_len_val[i] += 1
        valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

        valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=6) for vs in valsets]
        testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=6)

        print(f"Dữ liệu đã được phân chia và lưu tại {noise_dir}")
    
    # Phân tích phân bố lớp
    class_distributions = []
    for i, trainloader in enumerate(trainloaders):
        class_counts = Counter()
        for _, labels in trainloader:
            class_counts.update(labels.numpy())
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')
    
    # Vẽ biểu đồ phân bố lớp
    partitions = range(num_partitions)
    class_counts_list = []
    for i in partitions:
        counts = {cls: class_distributions[i].get(cls, 0) for cls in range(num_labels)}
        class_counts_list.append(counts)
    
    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    bottom = np.zeros(num_partitions)
    colors = plt.cm.tab10.colors  # Sử dụng bảng màu có sẵn
    
    for cls in range(num_labels):
        counts = [class_counts_list[i].get(cls, 0) for i in partitions]
        plt.bar(partitions, counts, bar_width, bottom=bottom, label=f'Class {cls}', color=colors[cls % len(colors)])
        bottom += counts
    
    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'data_partition_combined.png'))
    plt.close()
    
    print(f'Number of train samples: {sum(len(loader.dataset) for loader in trainloaders)}, '
          f'val samples: {len(valset)}, test samples: {len(testloader.dataset)}')
    
    return trainloaders, valloaders, testloader


In [15]:
prepare_quantity_skew_dirichlet_with_noise(60, 10, 0, 0.5, 0.5, 42)

Creating partitioned and noisy dataset and saving to /media/namvq/Data/code_chinh_sua/fedavg/quantity_skew_dirichlet_with_noise_beta_0.5_sigma_0.5...
Dữ liệu đã được phân chia và lưu tại /media/namvq/Data/code_chinh_sua/fedavg/quantity_skew_dirichlet_with_noise_beta_0.5_sigma_0.5
Partition 0 class distribution: {1: 15, 0: 5}
Partition 1 class distribution: {1: 287, 0: 77}
Partition 2 class distribution: {0: 42, 1: 117}
Partition 3 class distribution: {1: 4}
Partition 4 class distribution: {1: 22, 0: 14}
Partition 5 class distribution: {0: 11, 1: 12}
Partition 6 class distribution: {1: 69, 0: 24}
Partition 7 class distribution: {1: 17, 0: 6}
Partition 8 class distribution: {1: 12, 0: 4}
Partition 9 class distribution: {1: 48, 0: 17}
Partition 10 class distribution: {1: 10, 0: 10}
Partition 11 class distribution: {0: 4, 1: 5}
Partition 12 class distribution: {0: 38, 1: 86}
Partition 13 class distribution: {1: 34, 0: 14}
Partition 14 class distribution: {0: 61, 1: 198}
Partition 15 class 

([<torch.utils.data.dataloader.DataLoader at 0x7c94c4146a70>,
  <torch.utils.data.dataloader.DataLoader at 0x7c940cb038b0>,
  <torch.utils.data.dataloader.DataLoader at 0x7c9410910610>,
  <torch.utils.data.dataloader.DataLoader at 0x7c9410910340>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94109102b0>,
  <torch.utils.data.dataloader.DataLoader at 0x7c94109111b0>,
  <torch.utils.data.dataloader.DataLoader at 0x7c9410911ab0>,
  <torch.utils.data.dataloader.DataLoader at 0x7c941090faf0>,
  <torch.utils.data.dataloader.DataLoader at 0x7c941090f730>,
  <torch.utils.data.dataloader.DataLoader at 0x7c941090c760>,
  <torch.utils.data.dataloader.DataLoader at 0x7c941090d930>,
  <torch.utils.data.dataloader.DataLoader at 0x7c941090c100>,
  <torch.utils.data.dataloader.DataLoader at 0x7c941090ee90>,
  <torch.utils.data.dataloader.DataLoader at 0x7c941090d8d0>,
  <torch.utils.data.dataloader.DataLoader at 0x7c941090c4f0>,
  <torch.utils.data.dataloader.DataLoader at 0x7c940cb2ffa0>,
  <torch