# In [1]:
# Root folder of the chest X-ray dataset; the loaders in this file read its
# 'train', 'test' and 'val' sub-folders.
LINK_DATASET = "/media/namvq/Data/chest_xray"
# LINK_DATASET = "/kaggle/input/chest-xray-pneumonia/chest_xray"
# Value passed as ``num_workers`` to the DataLoaders created in this file.
NUM_WORKERS = 6
# BASE_FOLDER_NOISE = "/kaggle/input/chest-xray-noise-60-partitions"
# Parent folder under which the generated noisy partitions are stored and looked up.
BASE_FOLDER_NOISE = "/media/namvq/Data/code_chinh_sua"

# In [2]:
"""Partition the data and create the dataloaders."""

from typing import List, Optional, Tuple

import torch
from omegaconf import DictConfig

from torch.utils.data import DataLoader, random_split, Subset
from torchvision import transforms
from torchvision.datasets import MNIST
from torchvision.transforms import Compose, Normalize, ToTensor
import os
from torchvision.datasets import ImageFolder
from torchvision.transforms import Compose, Resize, Grayscale, ToTensor
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')  # Switch to a backend that does not need a GUI

print('BACKEND: ', matplotlib.get_backend())


#/kaggle/input/chest-xray-pneumonia/chest_xray
def get_custom_dataset(data_path: str = LINK_DATASET):
    """Create the training and test ``ImageFolder`` datasets.

    The training pipeline resizes to 256, applies random shear, horizontal
    flip, a random resized crop to 224 and a random horizontal translation,
    then converts to a tensor and normalises with the ImageNet mean/std.
    The test pipeline only resizes to 224x224, converts and normalises.
    (The original comments describe 224 as the input size for VGG.)

    Args:
        data_path: Folder containing the ``train`` and ``test`` sub-folders.

    Returns:
        A ``(trainset, testset)`` tuple of ``ImageFolder`` datasets.
    """
    # Standard ImageNet channel statistics
    imagenet_mean = [0.485, 0.456, 0.406]
    imagenet_std = [0.229, 0.224, 0.225]

    augmentation_steps = [
        transforms.Resize(256),
        transforms.RandomAffine(degrees=0, shear=10),
        transforms.RandomHorizontalFlip(),
        transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
        transforms.RandomAffine(degrees=0, translate=(0.2, 0)),
        transforms.ToTensor(),
        transforms.Normalize(imagenet_mean, imagenet_std),
    ]
    evaluation_steps = [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(imagenet_mean, imagenet_std),
    ]

    train_root = os.path.join(data_path, 'train')
    test_root = os.path.join(data_path, 'test')
    return (
        ImageFolder(train_root, transform=transforms.Compose(augmentation_steps)),
        ImageFolder(test_root, transform=transforms.Compose(evaluation_steps)),
    )
def get_val_dataloader(batch_size: int = 10):
    """Return a DataLoader over the dataset's original ``val`` folder.

    According to the original author's note, this folder holds only 16 images.
    Images are resized to 224x224, converted to tensors and normalised with
    the ImageNet mean/std; the loader does not shuffle.

    Args:
        batch_size: Number of images per batch.

    Returns:
        A ``DataLoader`` reading ``LINK_DATASET/val``.
    """
    preprocessing = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    val_root = os.path.join(LINK_DATASET, 'val')
    return DataLoader(
        ImageFolder(val_root, transform=preprocessing),
        batch_size=batch_size,
        shuffle=False,
        num_workers=NUM_WORKERS,
    )

def prepare_dataset_for_centralized_train(batch_size: int, val_ratio: float = 0.1, seed: int = 42):
    """Build train/val/test loaders for non-federated (centralized) training.

    The training ``ImageFolder`` is divided into a train part and a validation
    part with a ``random_split`` driven by a generator seeded with ``seed``.
    Because the validation part is carved out of the training dataset, it is
    read through the same (augmenting) transform as the training part.

    Args:
        batch_size: Batch size used for all three loaders.
        val_ratio: Fraction of the training data held out for validation.
        seed: Seed of the generator that drives the train/val split.

    Returns:
        ``(trainloader, valloader, testloader)``; only the train loader shuffles.
    """
    full_trainset, testset = get_custom_dataset()

    total = len(full_trainset)
    train_count = int((1 - val_ratio) * total)
    split_sizes = [train_count, total - train_count]
    split_generator = torch.Generator().manual_seed(seed)
    trainset, valset = random_split(full_trainset, split_sizes, split_generator)

    def build_loader(dataset, shuffle):
        # All loaders share the batch size and worker count
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=NUM_WORKERS)

    trainloader = build_loader(trainset, True)
    valloader = build_loader(valset, False)
    testloader = build_loader(testset, False)

    print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')
    return trainloader, valloader, testloader


def prepare_dataset(num_partitions: int, batch_size: int, val_ratio: float = 0.1, alpha: float = 100, seed: int = 42):
    """Load the dataset and create non-IID client partitions with a Dirichlet draw.

    The training ``ImageFolder`` is split into a train part and a validation
    part (seeded ``random_split``).  For every class, the train samples of that
    class are shuffled and distributed over the clients according to
    proportions sampled from ``Dirichlet(alpha, ..., alpha)``.  The validation
    part is split into ``num_partitions`` nearly equal pieces.

    Notes:
        * Only the torch splits are driven by ``seed``; the shuffles and the
          Dirichlet draws use NumPy's global random state.
        * The validation part is taken from the training ``ImageFolder`` and
          therefore goes through the training (augmenting) transform.

    Args:
        num_partitions: Number of clients.
        batch_size: Batch size for every loader.
        val_ratio: Fraction of the training data held out for validation.
        alpha: Dirichlet concentration parameter used for every client.
        seed: Seed for the generators used by ``random_split``.

    Returns:
        ``(trainloaders, valloaders, testloader)`` where the first two are lists
        with one ``DataLoader`` per client.
    """
    trainset, testset = get_custom_dataset()

    # Split trainset into trainset and valset
    num_train = int((1 - val_ratio) * len(trainset))
    num_val = len(trainset) - num_train
    trainset, valset = random_split(trainset, [num_train, num_val], torch.Generator().manual_seed(seed))

    targets = trainset.dataset.targets
    # Indices of the train split, expressed as indices of the underlying dataset.
    base_indices = np.asarray(trainset.indices, dtype=np.int64)
    train_labels = np.array([targets[i] for i in base_indices])

    # Distribute every class over the partitions with Dirichlet proportions.
    num_classes = len(np.unique(train_labels))
    partition_indices = [[] for _ in range(num_partitions)]
    for label in range(num_classes):
        # Boolean indexing on base_indices yields dataset indices (not positions
        # inside the split), which is what Subset(trainset.dataset, ...) expects.
        class_idx = base_indices[train_labels == label]
        np.random.shuffle(class_idx)
        proportions = np.random.dirichlet(np.repeat(alpha, num_partitions))
        cut_points = (np.cumsum(proportions) * len(class_idx)).astype(int)[:-1]
        for part, chunk in zip(partition_indices, np.split(class_idx, cut_points)):
            part.extend(chunk.tolist())

    # Create Subsets for each partition
    trainsets = [Subset(trainset.dataset, indices) for indices in partition_indices]

    # Split valset into partitions; the first (len % num_partitions) get one extra sample
    partition_len_val = [len(valset) // num_partitions] * num_partitions
    for i in range(len(valset) % num_partitions):
        partition_len_val[i] += 1

    valsets = random_split(valset, partition_len_val, torch.Generator().manual_seed(seed))

    # Create DataLoaders for each partition
    trainloaders = [DataLoader(ts, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS) for ts in trainsets]
    valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS) for vs in valsets]
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)

    # Class distribution per partition, read from the stored targets so that no
    # image has to be decoded just to count labels.
    class_distributions = []
    for i, indices in enumerate(partition_indices):
        class_counts = Counter(int(targets[idx]) for idx in indices)
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')

    # Plot class distribution as a stacked bar chart
    partitions = range(num_partitions)
    class_0_counts = [class_distributions[i][0] for i in partitions]
    class_1_counts = [class_distributions[i][1] for i in partitions]

    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    plt.bar(partitions, class_0_counts, bar_width, label='Class 0', color='blue')
    plt.bar(partitions, class_1_counts, bar_width, bottom=class_0_counts, label='Class 1', color='red')
    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close()

    print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')
    return trainloaders, valloaders, testloader

def prepare_partitioned_dataset(num_partitions: int, batch_size: int, val_ratio: float = 0.1, num_labels_each_party: int = 1, seed: int = 42):
    """Load the dataset and create partitions in which each party holds selected labels.

    Party ``i`` is assigned label ``i % num_labels``; when
    ``num_labels_each_party > 1`` it additionally receives label
    ``1 - i % num_labels``.  The samples of every label are shuffled and divided
    (``np.array_split``) between the parties that own that label.  The
    validation part is split into ``num_partitions`` nearly equal pieces, and a
    stacked bar chart of the class distribution is saved to
    ``running_outputs/data_partition.png``.

    NOTE(review): ``1 - i % num_labels`` only names "the other label" when there
    are exactly two labels; values of ``num_labels_each_party`` above 2 behave
    like 2.
    NOTE(review): ``seed`` is only passed to the torch generators; the shuffles
    use NumPy's global random state.

    Args:
        num_partitions: Number of parties.
        batch_size: Batch size for every loader.
        val_ratio: Fraction of the training data held out for validation.
        num_labels_each_party: 1 for a single label per party, >1 for two.
        seed: Seed for the generators used by ``random_split``.

    Returns:
        ``(trainloaders, valloaders, testloader)``.
    """
    trainset, testset = get_custom_dataset()  # Load datasets

    # Split the trainset into trainset and valset based on the validation ratio
    num_train = int((1 - val_ratio) * len(trainset))
    num_val = len(trainset) - num_train
    trainset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))

    # Get labels for the entire trainset
    train_labels = np.array([trainset.dataset.targets[i] for i in trainset.indices])

    # Define partitions: each party has k labels
    num_labels = len(np.unique(train_labels))  # Assuming labels are 0 and 1 for binary classification
    # times[l]: number of parties that own label l; contain[p]: labels owned by party p
    times = [0 for i in range(num_labels)]
    contain = []
    # Assign labels to the clients
    for i in range(num_partitions):
        current = [i%num_labels]
        times[i%num_labels] += 1
        if num_labels_each_party > 1:
            current.append(1-i%num_labels)
            times[1-i%num_labels] += 1
        contain.append(current)
    print(times)
    print(contain)
    # Create Subsets for each partition

    partition_indices = [[] for _ in range(num_partitions)]
    for i in range(num_labels):
        idx_i = np.where(train_labels == i)[0]  # Get indices of label i in train_labels
        idx_i = [trainset.indices[j] for j in idx_i]  # Convert them to indices of the underlying dataset
        # #print label of idx_i
        # print("Label of idx: ", i)
        # for j in range(len(idx_i)):
        #     idx_in_dataset = trainset.indices[idx_i[j]]
        #     print(trainset.dataset.targets[idx_in_dataset])
        np.random.shuffle(idx_i)
        # One chunk per party owning label i.
        # NOTE(review): times[i] is 0 when no party owns label i (e.g. fewer
        # parties than labels); np.array_split presumably rejects 0 sections - confirm.
        split = np.array_split(idx_i, times[i])
        ids = 0  # index of the next unassigned chunk of this label
        for j in range(num_partitions):
            if i in contain[j]:
                partition_indices[j].extend(split[ids])
                ids += 1

    trainsets = [Subset(trainset.dataset, indices) for indices in partition_indices]

    # #print label of client 0
    # print("Client 0")
    # for i in range(len(trainsets[0])):
    #     print(trainsets[0][i][1])

    # Split valset into partitions; the first (len % num_partitions) get one extra sample
    partition_len_val = [len(valset) // num_partitions] * num_partitions
    for i in range(len(valset) % num_partitions):
        partition_len_val[i] += 1

    valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

    # Create DataLoaders for each partition
    trainloaders = [DataLoader(ts, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS) for ts in trainsets]
    valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS) for vs in valsets]
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)

    # Calculate class distribution for each partition in trainloaders
    # (this iterates every train loader once, i.e. loads every training sample)
    class_distributions = []
    for i, trainloader in enumerate(trainloaders):
        class_counts = Counter()
        for _, labels in trainloader:
            class_counts.update(labels.numpy())
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')
    # Plot class distribution
    partitions = range(num_partitions)
    class_0_counts = [class_distributions[i][0] for i in partitions]
    class_1_counts = [class_distributions[i][1] for i in partitions]

    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    plt.bar(partitions, class_0_counts, bar_width, label='Class 0', color='blue')
    plt.bar(partitions, class_1_counts, bar_width, bottom=class_0_counts, label='Class 1', color='red')
    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    # plt.show()

    # Save the chart into the running_outputs folder as data_partition.png
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'data_partition.png'))
    plt.close()



    print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')
    return trainloaders, valloaders, testloader

def prepare_imbalance_label_dirichlet(num_partitions: int, batch_size: int, val_ratio: float = 0.1, beta: float = 0.5, seed: int = 42):
    """Load the dataset and create label-skewed partitions with a Dirichlet draw.

    For every label, proportions are sampled from ``Dirichlet(beta, ..., beta)``
    and the (shuffled) samples of that label are distributed over the parties
    accordingly.  A party that already holds at least ``N / num_partitions``
    samples gets proportion 0 for the remaining labels, which keeps the
    partition sizes from drifting too far apart.  The whole draw is repeated
    until every party holds at least two samples.

    The validation part is split into ``num_partitions`` nearly equal pieces,
    and a stacked bar chart of the class distribution is saved to
    ``running_outputs/data_partition.png``.

    Note:
        ``seed`` only drives the torch splits; the shuffles and Dirichlet draws
        use NumPy's global random state.

    Args:
        num_partitions: Number of parties.
        batch_size: Batch size for every loader.
        val_ratio: Fraction of the training data held out for validation.
        beta: Dirichlet concentration parameter (smaller means more skew).
        seed: Seed for the generators used by ``random_split``.

    Returns:
        ``(trainloaders, valloaders, testloader)``.

    Raises:
        ValueError: If the train split is too small to give every party the
            required minimum of two samples (the retry loop could never end).
    """
    trainset, testset = get_custom_dataset()  # Load datasets

    # Split the trainset into trainset and valset based on the validation ratio
    num_train = int((1 - val_ratio) * len(trainset))
    num_val = len(trainset) - num_train
    trainset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))

    targets = trainset.dataset.targets
    # Indices of the train split, expressed as indices of the underlying dataset.
    base_indices = np.asarray(trainset.indices, dtype=np.int64)
    train_labels = np.array([targets[i] for i in base_indices])

    num_labels = len(np.unique(train_labels))  # labels are assumed to be 0..num_labels-1
    min_size = 0
    min_require_size = 2

    N = len(trainset)
    if N < min_require_size * num_partitions:
        raise ValueError(
            f"Cannot give {num_partitions} partitions at least {min_require_size} "
            f"samples each from {N} training samples."
        )
    # Soft cap: once a party reaches this size it receives no further labels.
    size_cap = N / num_partitions

    while min_size < min_require_size:
        partition_indices = [[] for _ in range(num_partitions)]
        for label in range(num_labels):
            idx_label = base_indices[train_labels == label]
            np.random.shuffle(idx_label)

            proportions = np.random.dirichlet(np.repeat(beta, num_partitions))
            # Zero out parties that already reached the cap.  The comparison is
            # on the party's current size alone; the previous form multiplied
            # that size by the freshly drawn proportion before comparing.
            proportions = np.array([
                p if len(idx_j) < size_cap else 0.0
                for p, idx_j in zip(proportions, partition_indices)
            ])
            proportions = proportions / np.sum(proportions)
            cut_points = (np.cumsum(proportions) * len(idx_label)).astype(int)[:-1]

            partition_indices = [
                idx_j + chunk.tolist()
                for idx_j, chunk in zip(partition_indices, np.split(idx_label, cut_points))
            ]
        min_size = min(len(idx_j) for idx_j in partition_indices)

    trainsets = [Subset(trainset.dataset, indices) for indices in partition_indices]

    # Split valset into partitions; the first (len % num_partitions) get one extra sample
    partition_len_val = [len(valset) // num_partitions] * num_partitions
    for i in range(len(valset) % num_partitions):
        partition_len_val[i] += 1

    valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

    trainloaders = [DataLoader(ts, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS) for ts in trainsets]
    valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS) for vs in valsets]
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)

    # Class distribution per partition, read from the stored targets so that no
    # image has to be decoded just to count labels.
    class_distributions = []
    for i, indices in enumerate(partition_indices):
        class_counts = Counter(int(targets[idx]) for idx in indices)
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')
    # Plot class distribution
    partitions = range(num_partitions)
    class_0_counts = [class_distributions[i][0] for i in partitions]
    class_1_counts = [class_distributions[i][1] for i in partitions]

    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    plt.bar(partitions, class_0_counts, bar_width, label='Class 0', color='blue')
    plt.bar(partitions, class_1_counts, bar_width, bottom=class_0_counts, label='Class 1', color='red')
    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    # Save the chart into the running_outputs folder as data_partition.png
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'data_partition.png'))
    plt.close()

    print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')

    return trainloaders, valloaders, testloader



def apply_gaussian_noise(tensor, std_dev):
    """Return ``tensor`` plus zero-mean Gaussian noise scaled by ``std_dev``.

    A fresh sample with the same shape as the input is drawn with
    ``torch.randn_like`` on every call.  The input tensor is not modified; a
    new tensor is returned.
    """
    scaled_noise = torch.randn_like(tensor).mul(std_dev)
    return tensor.add(scaled_noise)

def unnormalize_image(image_tensor, mean, std):
    """Reverse a per-channel normalisation: ``channel * std + mean``.

    The tensor is iterated along its first dimension together with ``mean``
    and ``std``, and every slice is updated IN PLACE, so the caller's tensor is
    modified.  The same tensor object is returned.
    """
    for channel, channel_mean, channel_std in zip(image_tensor, mean, std):
        # In-place ops on the slice write through to image_tensor
        channel.mul_(channel_std)
        channel.add_(channel_mean)
    return image_tensor

def display_image(image_tensor, mean, std):
    """Convert a normalised image tensor into a displayable NumPy array.

    The normalisation is reversed with ``unnormalize_image`` (which modifies
    ``image_tensor`` in place), the axes are reordered from channel-first to
    channel-last, and the values are clipped to ``[0, 1]``.

    Returns:
        The image as a NumPy array.
    """
    restored = unnormalize_image(image_tensor, mean, std)
    # CHW -> HWC
    channel_last = restored.permute(1, 2, 0)
    # Clip to [0, 1] so the image displays correctly
    return channel_last.numpy().clip(0, 1)

# def prepare_noise_based_imbalance(num_partitions: int, batch_size: int, val_ratio: float = 0.1, sigma: float = 0.05, seed: int = 42):
#     """
#     Chia du lieu ngau nhien va deu cho cac ben, sau do them noise vao cac ben
#     moi ben i co noise khac nhau Gauss(0, sigma*i/N)
#     """
#     trainset, testset = get_custom_dataset()
#     num_train = int((1 - val_ratio) * len(trainset))
#     num_val = len(trainset) - num_train
#     trainset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))

#     indices = trainset.indices

#     np.random.shuffle(indices)

#     partition_indices = np.array_split(indices, num_partitions)

#     train_partitions = []

#     for i, part_indices in enumerate(partition_indices):
#         partition_std_dev = sigma * (i + 1) / num_partitions
#         partition_set = Subset(trainset.dataset, part_indices)
        
#         noisy_samples = [apply_gaussian_noise(sample[0], partition_std_dev) for sample in partition_set]
#         noisy_dataset = [(noisy_samples[j], trainset.dataset[part_indices[j]][1]) for j in range(len(part_indices))]
#         # train_partitions.append((noisy_samples, [sample[1] for sample in partition_set]))
#         train_partitions.append(noisy_dataset)
#     trainloaders = [DataLoader(train_partitions[i], batch_size=batch_size, shuffle=True, num_workers=4) for i in range(num_partitions)]
#     partition_len_val = [len(valset) // num_partitions] * num_partitions
#     for i in range(len(valset) % num_partitions):
#         partition_len_val[i] += 1
    
#     valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))
#     valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS) for vs in valsets]
#     testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)

# ####
#     class_distributions = []
#     for i, trainloader in enumerate(trainloaders):
#         class_counts = Counter()
#         for _, labels in trainloader:
#             class_counts.update(labels.numpy())
#         class_distributions.append(class_counts)
#         print(f'Partition {i} class distribution: {dict(class_counts)}')
    
#     partitions = range(num_partitions)
#     class_0_counts = [class_distributions[i][0] for i in partitions]
#     class_1_counts = [class_distributions[i][1] for i in partitions]

#     bar_width = 0.5
#     plt.figure(figsize=(12, 8))
#     plt.bar(partitions, class_0_counts, bar_width, label='Class 0', color='blue')
#     plt.bar(partitions, class_1_counts, bar_width, bottom=class_0_counts, label='Class 1', color='red')
#     plt.xlabel('Partition')
#     plt.ylabel('Number of Samples')
#     plt.title('Class Distribution in Each Partition')
#     plt.legend()
#     plt.grid(True)
#     # plt.show()
#     #  Lưu đồ thị vào thư mục running_outputs với tên data_partition
#     output_dir = 'running_outputs'
#     os.makedirs(output_dir, exist_ok=True)
#     plt.savefig(os.path.join(output_dir, 'data_partition.png'))
#     plt.close()

#     #Lưu ảnh nhiễu vào running_outputs
#     # Mean và std từ Normalize
#     mean = [0.485, 0.456, 0.406]
#     std = [0.229, 0.224, 0.225]

#     # Tạo thư mục lưu ảnh nếu chưa tồn tại
#     output_dir = "running_outputs"
#     os.makedirs(output_dir, exist_ok=True)

#     # Khởi tạo một lưới 10x6 để hiển thị ảnh
#     fig, axes = plt.subplots(10, 6, figsize=(15, 25))

#     # Duyệt qua 60 trainloaders và hiển thị ảnh đầu tiên
#     for i, trainloader in enumerate(trainloaders[:num_partitions]):
#         # Lấy ảnh đầu tiên từ trainloader
#         image_tensor = trainloader.dataset[0][0].clone()  # Clone để tránh thay đổi dữ liệu gốc
        
#         # Tìm vị trí hàng, cột trong lưới
#         row, col = divmod(i, 6)
#         plt.sca(axes[row, col])  # Đặt trục hiện tại là vị trí hàng, cột trong lưới
        
#         # Hiển thị ảnh
#         image_numpy = display_image(image_tensor, mean, std)
#         axes[row, col].imshow(image_numpy)
#         axes[row, col].axis('off')
#     plt.title(f"Noise image with sigma from {sigma * 1 / num_partitions} to {sigma}")
#     # Điều chỉnh layout để không bị chồng lấn
#     plt.tight_layout()

#     # Lưu ảnh thay vì hiển thị
#     output_path = os.path.join(output_dir, "image_noise.png")
#     plt.savefig(output_path, dpi=300)  # Lưu ảnh với chất lượng cao

#     plt.close()  # Đóng figure

#     print(f"Ảnh đã được lưu tại {output_path}")

#     print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')

# ###
#     return trainloaders, valloaders, testloader

def prepare_noise_based_imbalance(num_partitions: int, batch_size: int, val_ratio: float = 0.1, sigma: float = 0.05, seed: int = 42):
    """Build client loaders whose training images carry partition-specific noise.

    The train split is shuffled and divided evenly between ``num_partitions``
    clients.  Partition ``i`` (0-based) receives Gaussian noise with standard
    deviation ``sigma * (i + 1) / num_partitions``, added to the normalised
    image.  The noisy images are un-normalised, clamped to ``[0, 1]`` and
    written as PNG files under
    ``{BASE_FOLDER_NOISE}/chest_xray_noise_{sigma}/partition_{i}/class_{label}``,
    then read back with ``ImageFolder``.  If that folder already exists the
    generation step is skipped and the stored images are loaded.

    Two figures are written to ``running_outputs``: ``data_partition.png``
    (class distribution) and ``image_noise.png`` (first image of up to 60
    partitions on a 10x6 grid).

    NOTE(review): only the existence of the top-level folder is checked; a
    folder produced with a different ``num_partitions`` or by an interrupted
    run is not detected here.
    NOTE(review): ``ImageFolder`` derives class indices from the folder names
    it finds, so a partition lacking one class would presumably be relabelled -
    confirm if such partitions can occur.

    Args:
        num_partitions: Number of clients.
        batch_size: Batch size for every loader.
        val_ratio: Fraction of the training data held out for validation.
        sigma: Largest noise standard deviation (used by the last partition).
        seed: Seed for the generators used by ``random_split``.

    Returns:
        ``(trainloaders, valloaders, testloader)``.
    """
    noise_dir = f"{BASE_FOLDER_NOISE}/chest_xray_noise_{sigma}"
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    noisy_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])

    # The seeded train/val split is needed whether or not the noisy data exists.
    full_trainset, testset = get_custom_dataset()
    num_train = int((1 - val_ratio) * len(full_trainset))
    num_val = len(full_trainset) - num_train
    trainset, valset = random_split(full_trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))

    loaded_from_disk = os.path.exists(noise_dir)
    if loaded_from_disk:
        print(f"Loading noisy dataset from {noise_dir}...")
    else:
        print(f"Creating noisy dataset and saving to {noise_dir}...")
        os.makedirs(noise_dir, exist_ok=True)

        indices = trainset.indices
        np.random.shuffle(indices)
        partition_indices = np.array_split(indices, num_partitions)

        to_pil = transforms.ToPILImage()
        for i, part_indices in enumerate(partition_indices):
            partition_std_dev = sigma * (i + 1) / num_partitions
            partition_set = Subset(trainset.dataset, part_indices)

            partition_dir = os.path.join(noise_dir, f'partition_{i}')
            os.makedirs(partition_dir, exist_ok=True)

            # Class folders are created on first use, so each image is decoded
            # only once (no separate pass just to discover the labels).
            class_dirs = {}
            for j, (image, label) in enumerate(partition_set):
                if label not in class_dirs:
                    class_dirs[label] = os.path.join(partition_dir, f'class_{label}')
                    os.makedirs(class_dirs[label], exist_ok=True)

                noisy_image = apply_gaussian_noise(image, partition_std_dev)
                # Undo the normalisation so the image is stored in [0, 1] pixel space
                noisy_image = unnormalize_image(noisy_image, mean, std)
                noisy_image_pil = to_pil(noisy_image.clamp(0, 1))
                # j is unique within the partition, so file names never collide
                noisy_image_pil.save(os.path.join(class_dirs[label], f'image_{j}.png'))

    # Load the stored noisy partitions
    train_partitions = [ImageFolder(os.path.join(noise_dir, f'partition_{i}'), transform=noisy_transform) for i in range(num_partitions)]

    # Split valset into partitions; the first (len % num_partitions) get one extra sample
    partition_len_val = [len(valset) // num_partitions] * num_partitions
    for i in range(len(valset) % num_partitions):
        partition_len_val[i] += 1
    valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

    # Create DataLoaders (same worker count as the other loaders in this file)
    trainloaders = [DataLoader(part, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS) for part in train_partitions]
    valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS) for vs in valsets]
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)

    if loaded_from_disk:
        print("Dữ liệu đã được tải thành công từ thư mục lưu trữ.")

    # Class distribution per partition, read from the ImageFolder targets so
    # that no image has to be decoded just to count labels.
    class_distributions = []
    for i, part in enumerate(train_partitions):
        class_counts = Counter(part.targets)
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')

    partitions = range(num_partitions)
    class_0_counts = [class_distributions[i].get(0, 0) for i in partitions]
    class_1_counts = [class_distributions[i].get(1, 0) for i in partitions]

    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    plt.bar(partitions, class_0_counts, bar_width, label='Class 0', color='blue')
    plt.bar(partitions, class_1_counts, bar_width, bottom=class_0_counts, label='Class 1', color='red')
    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    # Save the chart into the running_outputs folder as data_partition.png
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'data_partition.png'))
    plt.close()

    # 10x6 grid showing the first image of each partition (at most 60)
    fig, axes = plt.subplots(10, 6, figsize=(15, 25))
    for i, trainloader in enumerate(trainloaders[:min(num_partitions, 60)]):
        if len(trainloader.dataset) == 0:
            continue
        image_tensor, _ = trainloader.dataset[0]

        row, col = divmod(i, 6)
        # Clone before un-normalising because unnormalize_image works in place
        image_numpy = unnormalize_image(image_tensor.clone(), mean, std).permute(1, 2, 0).numpy().clip(0, 1)
        axes[row, col].imshow(image_numpy)
        axes[row, col].axis('off')
    # Adjust the layout so the tiles do not overlap
    plt.tight_layout()

    # Save the figure instead of showing it
    output_path = os.path.join(output_dir, "image_noise.png")
    plt.savefig(output_path, dpi=300)
    plt.close()

    # Count the samples actually served to the clients; this is also correct
    # when the partitions were loaded from disk.
    num_train_samples = sum(len(part) for part in train_partitions)

    print(f"Ảnh minh họa đã được lưu tại {output_path}")
    print(f'Number of train samples: {num_train_samples}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')

    return trainloaders, valloaders, testloader
def prepare_quantity_skew_dirichlet(num_partitions: int, batch_size: int, val_ratio: float = 0.1, beta: float = 10, seed: int = 42):
    """Create partitions whose sizes follow a Dirichlet draw (quantity skew).

    The indices of the train split are cut into ``num_partitions`` consecutive
    chunks whose relative sizes are sampled from ``Dirichlet(beta, ..., beta)``;
    labels play no role in the cut.  The draw is repeated until no chunk is
    empty.  The validation part is split into nearly equal pieces, and a
    stacked bar chart of the class distribution is saved to
    ``running_outputs/data_partition.png``.

    Args:
        num_partitions: Number of clients.
        batch_size: Batch size for every loader.
        val_ratio: Fraction of the training data held out for validation.
        beta: Dirichlet concentration parameter.
        seed: Seed for the generators used by ``random_split``.

    Returns:
        ``(trainloaders, valloaders, testloader)``.
    """
    trainset, testset = get_custom_dataset()
    num_train = int((1 - val_ratio) * len(trainset))
    num_val = len(trainset) - num_train
    trainset, valset = random_split(trainset, [num_train, num_val], generator=torch.Generator().manual_seed(seed))

    pool = trainset.indices
    concentration = np.repeat(beta, num_partitions)

    # Redraw until every partition holds at least one sample
    while True:
        fractions = np.random.dirichlet(concentration)
        boundaries = (np.cumsum(fractions) * len(pool)).astype(int)[:-1]
        partition_indices = np.split(pool, boundaries)

        sizes = [len(chunk) for chunk in partition_indices]
        min_size = min(sizes)
        print('Partition sizes:', sizes)
        print('Min partition size:', min_size)
        if min_size >= 1:
            break

    trainsets = [Subset(trainset.dataset, chunk) for chunk in partition_indices]

    # Even validation split: the first `extra` partitions get one more sample
    base, extra = divmod(len(valset), num_partitions)
    partition_len_val = [base + 1 if k < extra else base for k in range(num_partitions)]
    valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

    trainloaders = []
    for ts in trainsets:
        trainloaders.append(DataLoader(ts, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS))
    valloaders = []
    for vs in valsets:
        valloaders.append(DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS))
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)

    # Count labels by iterating every train loader once
    class_distributions = []
    for i, loader in enumerate(trainloaders):
        class_counts = Counter()
        for batch in loader:
            class_counts.update(batch[1].numpy())
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')

    partitions = range(num_partitions)
    class_0_counts = [counts[0] for counts in class_distributions]
    class_1_counts = [counts[1] for counts in class_distributions]

    plt.figure(figsize=(12, 8))
    plt.bar(partitions, class_0_counts, 0.5, label='Class 0', color='blue')
    plt.bar(partitions, class_1_counts, 0.5, bottom=class_0_counts, label='Class 1', color='red')
    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)

    # Save the chart into the running_outputs folder as data_partition.png
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    figure_path = os.path.join(output_dir, 'data_partition.png')
    plt.savefig(figure_path)
    plt.close()

    print(f'Number of train samples: {len(trainset)}, val samples: {len(valset)}, test samples: {len(testloader.dataset)}')

    return trainloaders, valloaders, testloader

def _dirichlet_partition_with_floor(train_labels, dataset_indices, num_labels,
                                    num_partitions, beta, rng):
    """Assign dataset indices to partitions with a per-class Dirichlet skew.

    Every partition first receives one sample of every class; the remaining
    samples of each class are then split according to proportions drawn from
    ``Dirichlet(beta, ..., beta)``.

    Parameters
    ----------
    train_labels : np.ndarray
        Label of each training sample, aligned position-by-position with
        ``dataset_indices``.
    dataset_indices : sequence of int
        Index of each training sample in the underlying dataset.
    num_labels : int
        Number of classes; labels are taken to be ``0 .. num_labels - 1``.
    num_partitions : int
        Number of partitions to build.
    beta : float
        Dirichlet concentration parameter.
    rng : np.random.Generator
        Source of randomness for the shuffles and the Dirichlet draws.

    Returns
    -------
    list of list of int
        One list of dataset indices per partition.

    Raises
    ------
    ValueError
        If a class has fewer samples than there are partitions.
    """
    partition_indices = [[] for _ in range(num_partitions)]
    for label in range(num_labels):
        positions = np.where(train_labels == label)[0]
        # Translate positions inside the training subset into dataset indices.
        idx_label = [dataset_indices[j] for j in positions]
        rng.shuffle(idx_label)

        if len(idx_label) < num_partitions:
            raise ValueError(f"Không đủ mẫu lớp {label} để phân phối cho tất cả partitions.")

        # Guarantee one sample of this class in every partition.
        for bucket, first_idx in zip(partition_indices, idx_label):
            bucket.append(first_idx)

        remaining_idx = idx_label[num_partitions:]
        if not remaining_idx:
            continue

        proportions = rng.dirichlet(np.repeat(beta, num_partitions))
        counts = (proportions * len(remaining_idx)).astype(int)
        # Truncation loses a few samples; hand the difference to the largest
        # share so the counts add up to exactly len(remaining_idx).
        counts[np.argmax(counts)] += len(remaining_idx) - counts.sum()

        splits = np.split(np.asarray(remaining_idx), np.cumsum(counts)[:-1])
        for bucket, chunk in zip(partition_indices, splits):
            bucket.extend(chunk.tolist())
    return partition_indices


def _save_noisy_partitions(dataset, partition_indices, num_labels, noise_dir,
                           sigma, mean, std):
    """Write every partition to ``noise_dir`` as PNG files with added noise.

    Images are stored under ``partition_<i>/class_<label>/image_<j>.png``.
    The noise level passed to ``apply_gaussian_noise`` grows linearly with the
    partition number: ``sigma * (i + 1) / num_partitions``.
    """
    to_pil = transforms.ToPILImage()
    num_partitions = len(partition_indices)
    for i, indices in enumerate(partition_indices):
        partition_dir = os.path.join(noise_dir, f'partition_{i}')
        class_dirs = {}
        for label in range(num_labels):
            class_dirs[label] = os.path.join(partition_dir, f'class_{label}')
            os.makedirs(class_dirs[label], exist_ok=True)

        partition_std_dev = sigma * (i + 1) / num_partitions
        for j, idx in enumerate(indices):
            # Each sample is loaded (and transformed) exactly once.
            image, label = dataset[idx]
            noisy_image = apply_gaussian_noise(image, partition_std_dev)
            noisy_image = unnormalize_image(noisy_image, mean, std)
            noisy_image_pil = to_pil(noisy_image.clamp(0, 1))
            noisy_image_pil.save(os.path.join(class_dirs[label], f'image_{j}.png'))


def prepare_label_drl_and_noisy_data(num_partitions: int, batch_size: int, 
                                      val_ratio: float = 0.1, beta: float = 0.5, 
                                      sigma: float = 0.05, seed: int = 42):
    """Build label-skewed, noise-augmented client partitions and their loaders.

    The training data is split across ``num_partitions`` clients with a
    Dirichlet label skew and Gaussian noise is added to the images, with a
    noise level that increases with the partition number. The result is
    cached on disk: if the cache directory already exists the partitions are
    loaded from it, otherwise they are created and saved first.

    Parameters
    ----------
    num_partitions : int
        Number of client partitions.
    batch_size : int
        Batch size used by every returned DataLoader.
    val_ratio : float
        Fraction of the training set held out for validation.
    beta : float
        Dirichlet concentration parameter controlling the label skew.
    sigma : float
        Upper bound of the per-partition noise level.
    seed : int
        Seed for the train/validation split and for the partition sampling.

    Returns
    -------
    tuple
        ``(trainloaders, valloaders, testloader)`` where the first two are
        lists with one DataLoader per partition.

    Raises
    ------
    ValueError
        If no partition is requested, or a class has too few samples.

    Notes
    -----
    The cache directory name only encodes ``beta`` and ``sigma``; a cache
    written with a different ``num_partitions`` or ``seed`` is reused as-is.
    A directory left behind by an interrupted run is also treated as a
    valid cache.
    """
    noise_dir = f"{BASE_FOLDER_NOISE}/chest_xray_noise_drl_label{beta}_{sigma}"

    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    noisy_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std)
    ])

    use_cache = os.path.exists(noise_dir)
    if use_cache:
        print(f"Loading partitioned and noisy dataset from {noise_dir}...")
    else:
        print(f"Creating partitioned and noisy dataset and saving to {noise_dir}...")

    # The validation and test sets are always derived from the raw dataset.
    trainset, testset = get_custom_dataset()
    num_train = int((1 - val_ratio) * len(trainset))
    num_val = len(trainset) - num_train
    train_subset, valset = random_split(
        trainset, [num_train, num_val],
        generator=torch.Generator().manual_seed(seed))

    if not use_cache:
        source_dataset = train_subset.dataset
        # Labels are read from ``targets`` so no image has to be decoded.
        train_labels = np.array([source_dataset.targets[i] for i in train_subset.indices])
        num_classes_found = len(np.unique(train_labels))

        rng = np.random.default_rng(seed)
        partition_indices = _dirichlet_partition_with_floor(
            train_labels, train_subset.indices, num_classes_found,
            num_partitions, beta, rng)

        os.makedirs(noise_dir, exist_ok=True)
        _save_noisy_partitions(source_dataset, partition_indices,
                               num_classes_found, noise_dir, sigma, mean, std)

    # Both paths read the partitions back from disk through ImageFolder.
    train_partitions = [ImageFolder(os.path.join(noise_dir, f'partition_{i}'), transform=noisy_transform) 
                        for i in range(num_partitions)]
    if not train_partitions:
        raise ValueError("Không tìm thấy partition nào trong thư mục lưu trữ.")
    num_labels = len(train_partitions[0].classes)

    # Split the validation set as evenly as possible across the partitions.
    partition_len_val = [len(valset) // num_partitions] * num_partitions
    for i in range(len(valset) % num_partitions):
        partition_len_val[i] += 1
    valsets = random_split(valset, partition_len_val, generator=torch.Generator().manual_seed(seed))

    trainloaders = [DataLoader(part, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS) 
                    for part in train_partitions]
    valloaders = [DataLoader(vs, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS) 
                  for vs in valsets]
    testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, num_workers=NUM_WORKERS)

    if use_cache:
        print("Dữ liệu đã được tải thành công từ thư mục lưu trữ.")
    else:
        print(f"Dữ liệu đã được phân chia và lưu tại {noise_dir}")

    # Class distribution per partition, read from the ImageFolder label list
    # instead of iterating every image through the DataLoaders.
    class_distributions = []
    for i, part in enumerate(train_partitions):
        class_counts = Counter(part.targets)
        class_distributions.append(class_counts)
        print(f'Partition {i} class distribution: {dict(class_counts)}')

    # Stacked bar chart: one bar per partition, one segment per class.
    partitions = range(num_partitions)
    bar_width = 0.5
    plt.figure(figsize=(12, 8))
    bottom = np.zeros(num_partitions)
    colors = plt.cm.tab10.colors

    for cls in range(num_labels):
        counts = [class_distributions[i].get(cls, 0) for i in partitions]
        plt.bar(partitions, counts, bar_width, bottom=bottom, label=f'Class {cls}', color=colors[cls % len(colors)])
        bottom += counts

    plt.xlabel('Partition')
    plt.ylabel('Number of Samples')
    plt.title('Class Distribution in Each Partition')
    plt.legend()
    plt.grid(True)
    output_dir = 'running_outputs'
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(os.path.join(output_dir, 'data_partition_combined.png'))
    plt.close()

    print(f'Number of train samples: {sum(len(loader.dataset) for loader in trainloaders)}, '
          f'val samples: {len(valset)}, test samples: {len(testloader.dataset)}')

    return trainloaders, valloaders, testloader
    
def load_datasets(
    config: DictConfig,
    num_clients: int,
    val_ratio: float = 0.1,
    seed: Optional[int] = 42,
) -> Tuple[List[DataLoader], List[DataLoader], DataLoader]:
    """Create the dataloaders to be fed into the model.

    Parameters
    ----------
    config: DictConfig
        Parameterises the dataset partitioning process. Must contain either
        ``batch_size`` or ``batch_size_ratio`` and a ``partitioning`` scheme.
    num_clients : int
        The number of clients that hold a part of the data
    val_ratio : float, optional
        The ratio of training data that will be used for validation (between 0 and 1),
        by default 0.1
    seed : int, optional
        Used to set a fix seed to replicate experiments, by default 42.
        ``config.seed`` takes precedence when the config defines it.

    Returns
    -------
    Tuple[List[DataLoader], List[DataLoader], DataLoader]
        The per-client training loaders, the per-client validation loaders
        and the single test loader.

    Raises
    ------
    ValueError
        If the config defines neither ``batch_size`` nor ``batch_size_ratio``,
        or if the partitioning scheme is missing or not supported.
    """
    print(f"Dataset partitioning config: {config}")
    print('config:' , config)

    if "batch_size" in config:
        batch_size = config.batch_size
    elif "batch_size_ratio" in config:
        # NOTE(review): the ratio itself is not consumed anywhere in this
        # function; -1 is forwarded as the batch size, as before.
        batch_size = -1
    else:
        raise ValueError(
            "Dataset config must define 'batch_size' or 'batch_size_ratio'."
        )

    if "seed" in config:
        seed = config.seed

    partitioning = config.partitioning if "partitioning" in config else ""

    # partition the data
    if partitioning == "imbalance_label":
        return prepare_partitioned_dataset(num_clients, batch_size, val_ratio, config.labels_per_client, seed)

    if partitioning == "imbalance_label_dirichlet":
        return prepare_imbalance_label_dirichlet(num_clients, batch_size, val_ratio, config.alpha, seed)

    if partitioning == "noise_based_imbalance":
        return prepare_noise_based_imbalance(num_clients, batch_size, val_ratio, config.sigma, seed)

    if partitioning == "quantity_skew_dirichlet":
        return prepare_quantity_skew_dirichlet(num_clients, batch_size, val_ratio, config.alpha, seed)

    if partitioning == "label_drl_and_noisy":
        return prepare_label_drl_and_noisy_data(num_clients, batch_size,
                                                val_ratio, config.alpha,
                                                config.sigma, seed)

    # Falling through used to return None, which only surfaced later as an
    # unpacking error in the caller.
    raise ValueError(f"Unsupported partitioning scheme: {partitioning!r}")








  from .autonotebook import tqdm as notebook_tqdm


BACKEND:  Agg


In [3]:
import torch
import torch.nn as nn
import torchvision.models as models

class ResNet18(nn.Module):
    """torchvision ResNet-18 (built with ``pretrained=True``) with a new head.

    The original fully connected layer is swapped for a small two-layer
    classifier that maps the 512 backbone features to ``num_classes`` outputs.
    """

    def __init__(self, num_classes=2):
        super().__init__()
        backbone = models.resnet18(pretrained=True)
        head_layers = [
            nn.Flatten(),
            nn.Linear(512, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, num_classes),  # one output per class
        ]
        backbone.fc = nn.Sequential(*head_layers)
        self.model = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped network and return its output."""
        outputs = self.model(x)
        return outputs

class VGG11Model(nn.Module):
    """torchvision VGG-11 (built with ``pretrained=True``) for transfer learning.

    The average pooling stage is replaced by a 1x1 adaptive average pool and
    the classifier by a small two-layer head with ``num_classes`` outputs.
    The convolutional layers are not frozen.
    """

    def __init__(self, num_classes):
        super().__init__()
        backbone = models.vgg11(pretrained=True)

        # Pool every feature map down to a single value per channel.
        backbone.avgpool = nn.AdaptiveAvgPool2d((1, 1))

        # New classification head on top of the 512 pooled features.
        head_layers = [
            nn.Flatten(),
            nn.Linear(512, 128),
            nn.ReLU(inplace=True),
            nn.Linear(128, num_classes),  # one output per class
        ]
        backbone.classifier = nn.Sequential(*head_layers)
        self.model = backbone

    def forward(self, x):
        """Run ``x`` through the wrapped network and return its output."""
        outputs = self.model(x)
        return outputs

In [4]:
import torch
import torch.optim as optim 
import copy
import random 
import numpy as np
import time 
import matplotlib.pyplot as plt
# Device used for training and evaluation: CUDA when available, else the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
def reset_model_to_zero(model):
    """Overwrite every parameter of ``model`` with zeros, in place."""
    for weight in model.parameters():
        weight.data.zero_()

def federated_train(trainloaders, valloaders, testloader, config):
    """Run the federated training loop and plot accuracy per round.

    Each round a random subset of clients is trained locally through
    ``local_train_net_scaffold`` starting from the current global weights,
    and the global model becomes the average of the trained client models
    weighted by the size of each client's dataset. The global model is
    evaluated on ``testloader`` and on the loader returned by
    ``get_val_dataloader()`` before the first round and after every round.

    Parameters
    ----------
    trainloaders : list of DataLoader
        One training loader per client.
    valloaders : list of DataLoader
        Per-client validation loaders. Not used by this function.
    testloader : DataLoader
        Loader of the held-out test set.
    config
        Experiment configuration; ``num_rounds`` and ``clients_per_round``
        are read here, the rest is forwarded to the local training routine.

    Returns
    -------
    None
        The accuracy curves are printed and saved as
        ``running_outputs/accuracy_summary_<k>.png`` with the first unused k.
    """
    # model = ResNet18(num_classes=2)
    model = VGG11Model(num_classes=2)
    client_ids = range(len(trainloaders))
    nets = {net_i: copy.deepcopy(model) for net_i in client_ids}
    global_model = copy.deepcopy(model)
    # Per-client and global control variates, all starting at zero.
    c_nets = {net_i: copy.deepcopy(model) for net_i in client_ids}
    c_global = copy.deepcopy(model)
    for c_net in c_nets.values():
        reset_model_to_zero(c_net)
    reset_model_to_zero(c_global)

    valloader_goc = get_val_dataloader()
    num_rounds = config.num_rounds
    # Accuracy before any training is stored as round 0.
    accs_test = [evaluate(global_model, testloader)]
    accs_val = [evaluate(global_model, valloader_goc)]

    for round_num in range(num_rounds):
        print(f"Round {round_num + 1}/{num_rounds}")
        start = time.time()
        global_para = global_model.state_dict()

        # Pick the clients that take part in this round.
        selected_clients = select_clients(trainloaders, config.clients_per_round)

        # Every selected client starts from the current global weights.
        for client in selected_clients:
            nets[client].load_state_dict(global_para)
        local_train_net_scaffold(nets, selected_clients, global_model, c_nets, c_global, config, trainloaders, device=DEVICE)

        # Aggregation weights proportional to each client's dataset size.
        total_data_points = sum(len(trainloaders[client].dataset) for client in selected_clients)
        freqs = [len(trainloaders[client].dataset) / total_data_points for client in selected_clients]

        for idx, client in enumerate(selected_clients):
            net_para = nets[client].cpu().state_dict()
            for key in net_para:
                weighted = net_para[key] * freqs[idx]
                if idx == 0:
                    global_para[key] = weighted
                else:
                    global_para[key] += weighted
        global_model.load_state_dict(global_para)
        global_model.to('cpu')

        acc_test = evaluate(global_model, testloader)
        acc_val = evaluate(global_model, valloader_goc)
        accs_test.append(acc_test)
        accs_val.append(acc_val)
        print('Acc_val: ', acc_val)
        print('Acc_test: ', acc_test)

        end = time.time()
        print(f'Time for round {round_num + 1}: ', end-start)

    print('accuracies test: ', accs_test)
    print('accuracies val: ', accs_val)
    plt.plot(range(0, num_rounds + 1), accs_test, marker='o', label='Accuracy_test')
    plt.plot(range(0, num_rounds + 1), accs_val, marker='x', label='Accuracy_val')
    plt.xlabel('Round')
    plt.xticks(range(0, num_rounds + 1, 10))
    plt.ylabel('Accuracy')
    plt.title('WOW WOW WOW')
    plt.grid(True)
    plt.legend()

    # Make sure the output directory exists, as the partitioning functions do,
    # then pick the first summary file name that is not taken yet.
    os.makedirs('running_outputs', exist_ok=True)
    idx_file = 1
    while os.path.exists(f'running_outputs/accuracy_summary_{idx_file}.png'):
        idx_file += 1
    new_file_name = f'running_outputs/accuracy_summary_{idx_file}.png'

    plt.savefig(new_file_name)
    plt.close()


def local_train_net_scaffold(nets, selected_clients, global_model, c_nets, c_global, config, trainloaders, device='cpu'):
    """Train the selected clients and update the global control variate.

    Every selected client is trained with ``train_net_scaffold``. The control
    variate changes returned by the clients are summed, divided by
    ``config.num_clients`` (the total number of clients, not the number of
    selected ones) and added to ``c_global`` in place.

    Parameters
    ----------
    nets : dict
        Client id -> local model; the selected ones are trained in place.
    selected_clients : sequence
        Ids of the clients taking part in this round.
    global_model : torch.nn.Module
        Current global model, passed to the local training as reference.
    c_nets : dict
        Client id -> local control variate (same architecture as the model).
    c_global : torch.nn.Module
        Global control variate, updated in place.
    config
        Configuration; ``num_clients`` is read here and the object is
        forwarded to ``train_net_scaffold``.
    trainloaders : sequence
        Training loader of every client, indexed by client id.
    device : str
        Device on which the local training runs.

    Returns
    -------
    None
    """
    # Only the keys are needed; the sums start from a plain 0.0 per entry.
    total_delta = {key: 0.0 for key in global_model.state_dict()}
    c_global.to(device)
    global_model.to(device)

    for net_id in selected_clients:
        net = nets[net_id]
        net.to(device)
        c_nets[net_id].to(device)

        c_delta_para = train_net_scaffold(net, global_model, c_nets[net_id], c_global, trainloaders[net_id], config, device=device)
        # Keep only one client's control variate on the device at a time.
        c_nets[net_id].to('cpu')
        for key in total_delta:
            total_delta[key] += c_delta_para[key]

    c_global_para = c_global.state_dict()
    for key, control in c_global_para.items():
        delta = total_delta[key]
        if not torch.is_tensor(delta):
            # No client contributed, so there is nothing to add.
            continue
        delta = delta / config.num_clients
        # Match dtype and device of the stored tensor so integer entries
        # are handled on any device.
        control += delta.to(device=control.device, dtype=control.dtype)
    c_global.load_state_dict(c_global_para)

    # nets_list = list(nets.values())
    # return nets_list
def train_net_scaffold(net, global_model, c_local, c_global, trainloader, config, device):
    """Train one client with SGD plus a control-variate correction.

    After every optimizer step all entries of the model state are shifted by
    ``-learning_rate * (c_global - c_local)``. Once training is done the
    local control variate is replaced by
    ``c_local - c_global + (global_model - net) / (steps * learning_rate)``.

    Parameters
    ----------
    net : torch.nn.Module
        Client model, trained in place and moved to the CPU before returning.
    global_model : torch.nn.Module
        Global model the client started from.
    c_local : torch.nn.Module
        Local control variate, updated in place.
    c_global : torch.nn.Module
        Global control variate (read only).
    trainloader : iterable
        Yields ``(data, target)`` batches.
    config
        Provides ``learning_rate``, ``momentum`` and ``num_epochs``.
    device : str
        Device on which the batches are processed.

    Returns
    -------
    dict
        State-dict style mapping with the change of the local control variate
        (new minus old). All zeros when the loader produced no batch.
    """
    learning_rate = config.learning_rate
    optimizer = optim.SGD(
        filter(lambda p: p.requires_grad, net.parameters()),
        lr=learning_rate,
        momentum=config.momentum
    )
    criterion = torch.nn.CrossEntropyLoss().to(device)
    c_local.to(device)
    c_global.to(device)
    global_model.to(device)
    # Make sure layers such as dropout behave as in training even if the
    # model was switched to eval mode earlier.
    net.train()

    c_global_para = c_global.state_dict()
    c_local_para = c_local.state_dict()
    # The correction does not change during local training: compute it once.
    drift_step = {
        key: learning_rate * (c_global_para[key] - c_local_para[key])
        for key in c_global_para
    }

    steps = 0
    for _ in range(config.num_epochs):
        for data, target in trainloader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = net(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            net_para = net.state_dict()
            for key in net_para:
                net_para[key] = net_para[key] - drift_step[key]
            net.load_state_dict(net_para)
            steps += 1

    if steps == 0:
        # Without a single step the update below would divide by zero and
        # fill the control variate with NaN/Inf; leave it untouched instead.
        net.to('cpu')
        return {key: torch.zeros_like(value) for key, value in c_local_para.items()}

    c_new_para = c_local.state_dict()
    c_delta_para = {}
    global_model_para = global_model.state_dict()
    net_para = net.state_dict()
    scale = steps * learning_rate
    for key in net_para:
        c_new_para[key] = c_new_para[key] - c_global_para[key] + (global_model_para[key] - net_para[key]) / scale
        c_delta_para[key] = c_new_para[key] - c_local_para[key]
    c_local.load_state_dict(c_new_para)

    net.to('cpu')
    return c_delta_para

            
        
def plot_accuracy(accs):
    """Print the accuracy history and save it as a line plot.

    Parameters
    ----------
    accs : sequence of float
        Accuracy values, one per round, with the first entry plotted as
        round 0.

    Returns
    -------
    None
        The figure is written to ``running_outputs/accuracy_summary.png``.
    """
    print('accuracies: ', accs)
    num_rounds = len(accs)-1
    plt.plot(range(0, num_rounds + 1), accs, marker='o', label='Accuracy')
    plt.xlabel('Round')
    plt.xticks(range(0, num_rounds + 1))
    plt.ylabel('Accuracy')
    plt.title('Scaffold on ResNet18 over Rounds')
    plt.grid(True)
    plt.legend()
    # savefig does not create missing directories.
    os.makedirs('running_outputs', exist_ok=True)
    plt.savefig('running_outputs/accuracy_summary.png')
    plt.close()



def select_clients(trainloaders, clients_per_round):
    """Randomly pick the clients that take part in one training round.

    Returns a list of ``clients_per_round`` distinct positions into
    ``trainloaders``, sampled without replacement.
    """
    candidate_ids = range(len(trainloaders))
    return random.sample(candidate_ids, clients_per_round)


def evaluate(model, testloader):
    """Return the accuracy (in percent) of ``model`` on ``testloader``.

    The model is moved to ``DEVICE`` and switched to evaluation mode for the
    pass, then moved back to the CPU. It is left in evaluation mode.

    Parameters
    ----------
    model : torch.nn.Module
        Model to evaluate.
    testloader : iterable
        Yields ``(data, target)`` batches.

    Returns
    -------
    float
        ``100 * correct / total``; ``0.0`` when the loader yields no sample.
    """
    print('evaluate on', DEVICE)
    model.to(DEVICE)
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for data, target in testloader:
            data, target = data.to(DEVICE), target.to(DEVICE)
            output = model(data)
            _, predicted = torch.max(output, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    model.to('cpu')
    if total == 0:
        # An empty loader would otherwise raise ZeroDivisionError.
        return 0.0
    return 100 * correct / total


In [5]:
!mkdir running_outputs

mkdir: cannot create directory ‘running_outputs’: File exists


In [6]:
# Directory that holds the outputs of previous runs
directory = 'running_outputs/'

# Remove every regular file directly inside the directory;
# subdirectories are left untouched.
for file in os.listdir(directory):
    file_path = os.path.join(directory, file)
    if os.path.isfile(file_path):
        os.remove(file_path)


In [7]:
from omegaconf import OmegaConf  # Thêm OmegaConf

# def load_config(config_file):
#     """Load configuration using OmegaConf (DictConfig)"""
#     # Dùng OmegaConf để load file config.yaml
#     config = OmegaConf.load(config_file)
#     return config
def load_config():
    """Return the experiment configuration as an OmegaConf config object.

    The values are hard-coded here instead of being read from a file. The
    ``dataset`` section refers to the top-level keys through OmegaConf
    ``${...}`` interpolation strings.
    """
    dataset_section = {
        "name": "${dataset_name}",
        "partitioning": "${partitioning}",
        "batch_size": "${batch_size}",  # batch_size = batch_size_ratio * total_local_data_size
        "val_split": 0.0,
        "seed": "${dataset_seed}",
        "alpha": "${alpha}",
        "sigma": "${sigma}",
        "labels_per_client": "${labels_per_client}",
    }

    settings = {
        "num_clients": 60,
        "num_epochs": 1,
        "batch_size": 10,
        "clients_per_round": 2,
        "fraction_fit": 0.1,
        "learning_rate": 1e-3,
        "num_rounds": 1,
        "partitioning": "label_drl_and_noisy",
        "dataset_name": "chest_xray",
        "dataset_seed": 42,
        "alpha": 0.5,
        "sigma": 0.1,
        "labels_per_client": 1,  # only used when partitioning is label quantity
        "momentum": 0.9,
        "weight_decay": 0.00001,
        "dataset": dataset_section,
    }

    # Turn the plain dictionary into a DictConfig.
    return OmegaConf.create(settings)

def main():
    """Load the configuration, build the data loaders and run the training."""
    config = load_config()

    # Show the configuration that is going to be used.
    print("Loaded Config:")
    print(config)

    # Build the per-client loaders from the dataset section of the config.
    dataset_cfg = config.dataset
    loaders = load_datasets(
        config=dataset_cfg,
        num_clients=config.num_clients,
        val_ratio=dataset_cfg.val_split,
    )
    trainloaders, valloaders, testloader = loaders

    # Train the federated model.
    federated_train(trainloaders, valloaders, testloader, config)


if __name__ == '__main__':
    main()

Loaded Config:
{'num_clients': 60, 'num_epochs': 1, 'batch_size': 10, 'clients_per_round': 2, 'fraction_fit': 0.1, 'learning_rate': 0.001, 'num_rounds': 1, 'partitioning': 'label_drl_and_noisy', 'dataset_name': 'chest_xray', 'dataset_seed': 42, 'alpha': 0.5, 'sigma': 0.1, 'labels_per_client': 1, 'momentum': 0.9, 'weight_decay': 1e-05, 'dataset': {'name': '${dataset_name}', 'partitioning': '${partitioning}', 'batch_size': '${batch_size}', 'val_split': 0.0, 'seed': '${dataset_seed}', 'alpha': '${alpha}', 'sigma': '${sigma}', 'labels_per_client': '${labels_per_client}'}}
Dataset partitioning config: {'name': '${dataset_name}', 'partitioning': '${partitioning}', 'batch_size': '${batch_size}', 'val_split': 0.0, 'seed': '${dataset_seed}', 'alpha': '${alpha}', 'sigma': '${sigma}', 'labels_per_client': '${labels_per_client}'}
config: {'name': '${dataset_name}', 'partitioning': '${partitioning}', 'batch_size': '${batch_size}', 'val_split': 0.0, 'seed': '${dataset_seed}', 'alpha': '${alpha}', 's



evaluate on cuda
evaluate on cuda
Round 1/1
evaluate on cuda
evaluate on cuda
Acc_val:  87.5
Acc_test:  69.55128205128206
Time for round 1:  17.357775926589966
accuracies test:  [64.58333333333333, 69.55128205128206]
accuracies val:  [50.0, 87.5]


In [8]:
# from omegaconf import OmegaConf  # Thêm OmegaConf

# # def load_config(config_file):
# #     """Load configuration using OmegaConf (DictConfig)"""
# #     # Dùng OmegaConf để load file config.yaml
# #     config = OmegaConf.load(config_file)
# #     return config
# def load_config():
#     """Load configuration using OmegaConf from a dictionary."""
#     config_dict = {
#         "num_clients": 4,
#         "num_epochs": 1,
#         "batch_size": 10,
#         "clients_per_round": 2,
#         "fraction_fit": 0.1,
#         "learning_rate": 1e-3,
#         "num_rounds": 1,
#         "partitioning": "imbalance_label",
#         "dataset_name": "chest_xray",
#         "dataset_seed": 42,
#         "alpha": 0.5,
#         "sigma": 0.1,
#         "labels_per_client": 1,  # only used when partitioning is label quantity
#         "momentum": 0.9,
#         "weight_decay": 0.00001,
#         "dataset": {
#             "name": "${dataset_name}",
#             "partitioning": "${partitioning}",
#             "batch_size": "${batch_size}",  # batch_size = batch_size_ratio * total_local_data_size
#             "val_split": 0.0,
#             "seed": "${dataset_seed}",
#             "alpha": "${alpha}",
#             "sigma": "${sigma}",
#             "labels_per_client": "${labels_per_client}"
#         }
#     }

#     # Chuyển đổi dictionary thành DictConfig
#     config = OmegaConf.create(config_dict)

#     return config

# def main():
#     # Parse arguments
#     # args = parse_args()

#     # Load configuration file
#     config = load_config()  # Trả về DictConfig

#     # Kiểm tra các tham số được thay thế chính xác
#     print("Loaded Config:")
#     print(config)

#     # Load dataset
#     # trainloaders, valloaders, testloader = load_datasets(config.dataset.name, args.num_clients)
#     trainloaders, valloaders, testloader = load_datasets(
#         config=config.dataset,
#         num_clients=config.num_clients,
#         val_ratio=config.dataset.val_split,
#     )

#     # Train federated model
#     federated_train(trainloaders, valloaders, testloader, config)

# if __name__ == '__main__':
#     main()