<a href="https://colab.research.google.com/github/mohammad-rahbari/federated-learning_visual-classification/blob/Valentino/federated_model_visual_classification_v2_ProFedQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
# Mount Google Drive; later cells read/write CSV logs, models and client subsets under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing DINO and installing its dependencies

In [None]:
# @title Clone the DINO repo
!git clone https://github.com/facebookresearch/dino.git

Cloning into 'dino'...
remote: Enumerating objects: 175, done.[K
remote: Total 175 (delta 0), reused 0 (delta 0), pack-reused 175 (from 1)[K
Receiving objects: 100% (175/175), 24.47 MiB | 17.72 MiB/s, done.
Resolving deltas: 100% (100/100), done.


In [None]:
# @title Installing required dependencies regarding DINO
%cd dino
!pip install -r requirements.txt
!pip install timm

/content/dino
[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->timm)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->timm)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->timm)
  Downloading nvidia_cufft


# preprocessing the CIFAR-100 dataset

The image size in CIFAR-100 is 32x32, but DINO requires 224x224 images at its input layer.

In the first step we upscale the images and then add randomization (a random crop).

In the last step of the transformation we normalize the data using the mean and standard deviation of ImageNet.



In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import random_split,DataLoader

In [None]:
# Preprocessing pipeline: upscale to 256, take a random 224x224 crop,
# convert to a tensor and normalize with the ImageNet channel statistics.
transform = transforms.Compose(
    [
        transforms.Resize(256),
        transforms.RandomCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
from torch.utils.data import ConcatDataset
import numpy as np
from torchvision.datasets import CIFAR100
# Both CIFAR-100 splits share the same root folder, download flag and transform.
cifar_kwargs = dict(root='./data', download=True, transform=transform)

train_dataset = CIFAR100(train=True, **cifar_kwargs)
test_dataset = CIFAR100(train=False, **cifar_kwargs)

# `full_train` is an alias of the official training split (nothing is concatenated here).
full_train = train_dataset

# Verify the length of the new dataset
print(f"Length of combined dataset: {len(full_train)}")


100%|██████████| 169M/169M [00:05<00:00, 30.7MB/s]


Length of combined dataset: 50000


In [None]:

#@title Imports
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Subset
import numpy as np
import random
import torch
import pandas as pd
import os

# Set the hyperparameters for data splitting here!

In [None]:

#@title set the parameters here!!



# NOTE(review): not referenced in the visible cells; `n_clients` (set further down) is the value actually used.
number_of_clients = None
# Fraction of `full_train` kept for training by the data-splitting cell.
train_frac = 0.8 #@param
# NOTE(review): the visible splitting code takes the validation size as the remainder
# after `train_frac` and never reads `val_frac` - keep the two consistent.
val_frac = 0.2 #@param
batch_size = 32 #@param{type:"integer"}
# When True, `set_seed` seeds every RNG with `seed`; when False, `set_seed` does nothing.
is_seed_fixed = True #@param{type:"boolean"}
seed = 42 #@param{type:"integer"}

def set_seed(seed=42, is_seed_fixed=True):
  """Seed the torch (CPU and CUDA), NumPy and `random` generators.

  Args:
    seed: Integer seed handed to every generator.
    is_seed_fixed: When False the function returns without touching any state.

  Also switches cuDNN to deterministic mode and disables its benchmark mode.
  """
  if is_seed_fixed:
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seed_fn in seeders:
      seed_fn(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


# Seed all RNGs right after the parameters are chosen.
set_seed(seed,is_seed_fixed)



#@markdown </br> <h5>Indicate the number of clients that contribute in training:</h5>
n_clients = 80 #@param{type:"integer"}

#@markdown </br></br> <b>splitting hyperparameters</b>

# Selects between `iid_sharing` and `noniid_sharing` when the client subsets are created.
spliting_method = "non-i.i.d. sharing" #@param["i.i.d. sharing","non-i.i.d. sharing"]
# Name of the DINO entry point passed to torch.hub.load('facebookresearch/dino:main', backbone).
backbone = "dino_vits16" #@param["dino_resnet50", "dino_vits16", "dino_xcit_small_12_p16"]



In [None]:
#@title Set the parameters here only if the <b>non-i.i.d. sharing</b> method has been selected!!
#@markdown Nc is the number of classes that each subset can contain
if spliting_method == "non-i.i.d. sharing":
  Nc = 25 #@param{type:"integer"}

  # are_classes_overlaping = False #@param{type:"boolean"}

#@markdown <h3>If we consider the number of classes M and the number of clients K then:</h3>
#@markdown <ul>
#@markdown   <li>Nc should be:
#@markdown     <ul>
#@markdown       <li>
#@markdown         Greater than or equal to <b>\\(\frac{M}{K}\\)</b>
#@markdown       </li>
#@markdown       <li>
#@markdown         Less than or equal to <b>M</b>
#@markdown       </li>
#@markdown     </ul>
#@markdown   </li>
#@markdown   <li>
#@markdown   The maximum value of Nc (Nc = M) means that every class is present on every client
#@markdown   </li>

#@markdown </ul>


#@markdown </br></br><h3>The combinations of classes are selected randomly, which suits the definition of federated learning, especially cross-device federated learning</h3>





# Data splitting

In [None]:
# @title data splitting

# Re-seed the global RNGs and use a dedicated generator so the split is reproducible.
set_seed(seed,is_seed_fixed)
generator = torch.Generator().manual_seed(seed)

total_size = len(full_train)
train_size = int(train_frac * total_size)
val_size = total_size - train_size

train_set, val_set = random_split(full_train, [train_size, val_size], generator=generator)
train_indices, val_indices = torch.tensor(train_set.indices), torch.tensor(val_set.indices)

# Rebuild both splits as plain Subsets of the original dataset
# (training split indexed by a tensor, validation split by the original list).
train_set, val_set = Subset(train_set.dataset, train_indices), Subset(val_set.dataset, val_set.indices)

# NOTE(review): each loader yields its entire split as one single batch.
train_loader, val_loader = (
    DataLoader(split, batch_size=len(split), shuffle=False)
    for split in (train_set, val_set)
)

print(f"Train dataset size: {len(train_set)}")
print(f"Validation dataset size: {len(val_set)}")

# Even split of the training samples over the clients; the first clients absorb the remainder.
base_len, extra = divmod(train_size, n_clients)
lenghts = [base_len] * n_clients
for i in range(extra):
  lenghts[i] += 1
print("Size of subset: ", lenghts)





Train dataset size: 40000
Validation dataset size: 10000
Size of subset:  [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500]


In [None]:
# @title i.i.d sharing - split data based on number of clients and with respect to label proportionality
set_seed(seed,is_seed_fixed)
def iid_sharing(dataset, n_clients):
  """Split a `Subset` into `n_clients` label-stratified (i.i.d.) client subsets.

  Every class is shuffled and dealt out evenly: each client receives
  `class_size // n_clients` samples of the class, and the few leftover samples
  are handed out one at a time, continuing round-robin across classes so that
  no client systematically ends up larger than the others.

  Args:
    dataset: A `torch.utils.data.Subset` (needs `.indices` and `.dataset.targets`).
    n_clients: Number of client subsets to create (must be >= 1).

  Returns:
    dict mapping client id (0..n_clients-1) to a `Subset` of `dataset.dataset`
    whose indices are a shuffled, disjoint share of `dataset.indices`.

  Raises:
    ValueError: if `n_clients` is smaller than 1.
  """
  if n_clients < 1:
    raise ValueError("n_clients must be a positive integer")

  # `Subset.indices` may be a list or a tensor depending on how the subset was built.
  subset_indices = torch.as_tensor(dataset.indices, dtype=torch.long)
  all_targets = torch.as_tensor(np.asarray(dataset.dataset.targets), dtype=torch.long)
  subset_labels = all_targets[subset_indices]  # labels of the samples inside `dataset`

  # Start every client with an empty chunk so torch.cat also works for clients that receive nothing.
  client_chunks = {client: [torch.empty(0, dtype=torch.long)] for client in range(n_clients)}
  next_client = 0  # round-robin cursor for leftover samples, shared across classes

  for label in torch.unique(subset_labels):
    # Positions are relative to `dataset`; flatten() keeps a 1-D tensor even for a single match.
    positions = torch.nonzero(subset_labels == label).flatten()
    positions = positions[torch.randperm(positions.numel())]  # shuffle within the class

    # Translate positions inside the subset into indices of the underlying dataset,
    # because the returned Subsets wrap `dataset.dataset` directly.
    class_indices = subset_indices[positions]

    share = class_indices.numel() // n_clients
    for client in range(n_clients):
      client_chunks[client].append(class_indices[client * share:(client + 1) * share])

    leftovers = class_indices[share * n_clients:]
    for offset in range(leftovers.numel()):
      client_chunks[next_client].append(leftovers[offset:offset + 1])
      next_client = (next_client + 1) % n_clients

  # Build the per-client Subsets with the sample order shuffled.
  client_data = {}
  for client_id, chunks in client_chunks.items():
    indices = torch.cat(chunks)
    client_data[client_id] = Subset(dataset.dataset, indices[torch.randperm(len(indices))])
  return client_data

# indices_check = []
# client_data = iid_sharing(train_set, n_clients)
# for client_id in client_data.keys():
#   indices_check = indices_check + list(client_data[client_id].indices)
#   print(f"Client {client_id} has {len(client_data[client_id])} samples")



In [None]:
# @title Non i.i.d sharing


# Split the data across the clients so that each client only receives samples from Nc classes
set_seed(seed,is_seed_fixed)
def noniid_sharing(dataset,Nc , n_clients):
  """Split a `Subset` into non-i.i.d. client subsets that each hold `Nc` classes.

  The class combination of every client comes from `get_class_combinations`.
  The samples of a class are shuffled and shared among the clients that list
  that class: each such client takes an equal share of what is still unassigned,
  so every listed client receives samples while any are left.

  Args:
    dataset: A `torch.utils.data.Subset` (needs `.indices` and `.dataset.targets`).
    Nc: Number of classes per client.
    n_clients: Number of clients.

  Returns:
    (client_data, class_combs): `client_data` maps client id to a `Subset` of
    `dataset.dataset` (clients that received no samples are omitted);
    `class_combs` is the (n_clients, Nc) tensor returned by `get_class_combinations`.
  """
  # `Subset.indices` may be a list or a tensor depending on how the subset was built.
  subset_indices = torch.as_tensor(dataset.indices, dtype=torch.long)
  full_train_labels = torch.as_tensor(dataset.dataset.targets)[subset_indices]  # labels of samples in `dataset`
  unique_lables = torch.unique(full_train_labels)

  class_combs = get_class_combinations(unique_lables, Nc, n_clients)

  classes_indices = {}       # label -> still unassigned dataset indices of that class
  remaining_partitions = {}  # label -> number of clients that still have to be served
  for label in unique_lables.tolist():
    # Positions are relative to `dataset`; flatten() keeps a 1-D tensor even for a single match.
    positions = torch.nonzero(full_train_labels == label).flatten()
    positions = positions[torch.randperm(positions.numel())]  # shuffle within the class
    # Translate positions inside the subset into indices of the underlying dataset,
    # because the returned Subsets wrap `dataset.dataset` directly.
    classes_indices[label] = subset_indices[positions]
    remaining_partitions[label] = int(torch.sum(class_combs == label))

  # Start every client with an empty chunk so torch.cat also works for clients that receive nothing.
  client_chunks = {client: [torch.empty(0, dtype=torch.int64)] for client in range(n_clients)}

  for client in range(n_clients):
    for cls in class_combs[client].tolist():
      pool = classes_indices[cls]
      # Ceiling division of what is left by the number of clients still waiting for this class.
      portion = -(-pool.numel() // remaining_partitions[cls])
      remaining_partitions[cls] -= 1

      client_chunks[client].append(pool[:portion])
      classes_indices[cls] = pool[portion:]  # consumed samples can never be handed out twice

  client_data = {}
  for client_id, chunks in client_chunks.items():
    indices = torch.cat(chunks)
    if len(indices) > 0:
      client_data[client_id] = Subset(dataset.dataset, indices[torch.randperm(len(indices))])

  return client_data, class_combs





def get_class_combinations(classes, Nc, n_clients):
  """Choose which `Nc` classes each client receives.

  The first clients take consecutive slices of `classes` until every class has
  been used once (the last of those slices is shifted back so it stays inside
  `classes`). Every later client gets `Nc` distinct classes drawn at random.

  Args:
    classes: 1-D tensor with the available class labels.
    Nc: Requested number of classes per client. Raised to ceil(M / n_clients)
      when too small to cover all M classes and capped at M when too large;
      a message is printed in both cases.
    n_clients: Number of clients.

  Returns:
    int64 tensor of shape (n_clients, Nc) holding class labels.
  """
  n_classes = classes.size()[0]

  if Nc * n_clients < n_classes:
    Nc = -(-n_classes // n_clients)  # ceiling division

    print(f"Number of classes per clients is lower then minimum. Nc changed to {Nc} (the least possible value)")

  if Nc > n_classes:
    # A client cannot hold more distinct classes than exist.
    Nc = n_classes
    print(f"Number of classes per client is higher than the number of classes. Nc changed to {Nc}")

  combinations = torch.zeros((n_clients,Nc),dtype= torch.int64)
  ofset = 0
  flag = False  # becomes True once the sequential slices have covered every class

  for i in range(n_clients):
    if not flag:
      end_pointer = (i + 1) * Nc
      if end_pointer >= n_classes:
          ofset = (end_pointer - n_classes)
          flag = True

      combinations[i] = classes[i* Nc - ofset: end_pointer - ofset]

    else:
      # Index `classes` with the permutation so that real labels are returned
      # even when the labels are not exactly 0..M-1.
      combinations[i]  = classes[torch.randperm(n_classes)[:Nc]]

  return combinations

# Log System

In this section the required data will be stored.<br/><br/>
**Archiving this information will make it possible to:**
*   Handle Clients
*   Manage the models
*   Keep track of results of different Backbones
*   Compare measurement criteria
*   Handle the model merging process
*   Save path to the models

<br/><br/>
**These data will be saved in two separate csv files to:**

1.   Store the LOCAL Models  
2.   Store the GLOBAL Models resulted by each round

<br/><br/>
The csv files will be handled as pandas DataFrames, and each row in a csv file describes one model
<br/>

**Columns (COMMON):**<br/>
1. Backbone model name
2. Model name
3. Path
4. Time of log
5. Measurement criteria
 * loss
 * Accuracy
 * ...?
6. Size of dataset

**Columns (Local Models only):**<br/>
7. Client Id
8. Classes (Indicate which classes have been covered by each client)(format:"2,4,63,80,9" or "all" for all the classes)
9. Round number
10. Duration of training
11. Train Test ratio

**Columns (Global Models only):**<br/>
7. Number of clients
8. Number of rounds
9. Model Aggregation method







In [None]:
# @title Functions
import torch
from datetime import datetime
import time
from google.colab import drive
from uuid import uuid4
import os

def get_current_time():
  """Return the current local date and time formatted as 'YYYY-MM-DD HH:MM:SS'."""
  return f"{datetime.now():%Y-%m-%d %H:%M:%S}"



tic_start_time = None

def next_id(log_path):
  """Return a fresh UUID4 string that is not yet used as a model name.

  Args:
    log_path: Path to a CSV log with a "model_name" column. When the file does
      not exist, any new UUID is returned.
  """
  if not os.path.exists(log_path):
    return str(uuid4())

  known_names = pd.read_csv(log_path)["model_name"].values
  candidate = str(uuid4())
  # Draw again in the (extremely unlikely) case the id is already present in the log.
  while candidate in known_names:
    candidate = str(uuid4())
  return candidate



def tic():
    """Start (or restart) the module-level stopwatch that `toc()` reads."""
    global tic_start_time
    tic_start_time = time.perf_counter()

def toc():
    """Return the seconds elapsed since the last `tic()`.

    Prints an error message and returns None when `tic()` has not been called yet.
    """
    started_at = tic_start_time
    if started_at is not None:
        return time.perf_counter() - started_at

    print("Error: You must call tic() before toc()")
    return None




# Model and model configuration

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict

class DinoClassifier(nn.Module):
  """Frozen DINO backbone followed by a small trainable classifier.

  Layout: backbone -> hidden (Linear(feature_dim, 128) + ReLU) -> head
  (Linear(128, num_classes)). The backbone parameters never require gradients;
  `head_only()` and `full_train()` choose whether the hidden layer is trained
  together with the head.
  """

  def __init__(self, dino_model, num_classes:int=100, device=None):
    """Wrap `dino_model` and build the classifier layers.

    Args:
      dino_model: Backbone module; its output may be a tensor, a tuple or a dict.
      num_classes: Number of output logits.
      device: Device for the model; defaults to CUDA when available, else CPU.
    """
    super(DinoClassifier, self).__init__()
    self.backbone = dino_model

    # Freeze the backbone so that only the layers added below can be trained.
    for param in self.backbone.parameters():
      param.requires_grad = False

    # Determine the device
    if device is None:
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.backbone.to(device)

    # Detect the backbone's output feature size with a dummy forward pass.
    # The pass runs in eval mode so that random input cannot update BatchNorm
    # running statistics (or be affected by dropout); the mode is restored afterwards.
    was_training = self.backbone.training
    self.backbone.eval()
    try:
      with torch.no_grad():
        dummy_input = torch.randn(1,3,224,224).to(device)
        dummy_feature = self._extract_features(dummy_input)
    finally:
      self.backbone.train(was_training)

    feature_dim = dummy_feature.shape[1]
    print("Detected feature dimontion:", feature_dim)

    # Hidden layer
    self.hidden = nn.Sequential(
        nn.Linear(feature_dim, 128),
        nn.ReLU()
    ).to(device)

    # Classification head
    self.head = nn.Linear(128, num_classes).to(device)

    # Ensure the hidden layer and the head are trainable.
    for param in self.hidden.parameters():
      param.requires_grad = True
    for param in self.head.parameters():
      param.requires_grad = True

  def _extract_features(self, x):
    """Run the backbone and reduce its output to one feature vector per sample."""
    features = self.backbone(x)

    if isinstance(features, tuple):
      features = features[0]
    elif isinstance(features, dict):
      features = features.get("x_norm_clstoken", next(iter(features.values())))

    # If features are returned as (B, T, D), use the first token (assumed to be [CLS]).
    if features.dim() == 3:
      features = features[:,0]

    return features

  def forward(self,x):
    """Return the class logits for a batch of images `x`."""
    features = self._extract_features(x)
    hidden_out  = self.hidden(features)
    logits = self.head(hidden_out)
    return logits

  def linear_probe_eval(self, train_set, device, accuracy_threshold=30.0):
      """Train a linear probe on the frozen hidden features and report per-class accuracy.

      A fresh `nn.Linear` is fitted for 5 epochs with SGD (lr=1e-3) on the
      outputs of `hidden`, then evaluated on the same `train_set`.

      Args:
        train_set: Iterable of `(x, y)` batches (e.g. a DataLoader).
        device: Device on which the batches and the probe are placed.
        accuracy_threshold: Classes with accuracy (in percent) below this value are "weak".

      Returns:
        (class_accuracy, weak_classes): dict class -> accuracy in percent, and
        the list of classes below the threshold.

      Side effects: the model is left in eval mode with the backbone and hidden
      layer frozen.
      """
      self.eval()
      # Freeze the backbone and the hidden layer (not the head)
      for param in self.backbone.parameters():
        param.requires_grad = False
      for param in self.hidden.parameters():
        param.requires_grad = False

      # Linear probe with the same input/output sizes as the head
      embedding_dim = self.head.in_features
      num_classes = self.head.out_features
      linear_probe = nn.Linear(embedding_dim, num_classes).to(device)

      optimizer = optim.SGD(linear_probe.parameters(), lr=1e-3)
      criterion = nn.CrossEntropyLoss()

      # Probe training
      epochs = 5
      for epoch in range(epochs):
          linear_probe.train()
          for x, y in train_set:
              x, y = x.to(device), y.to(device)
              with torch.no_grad():
                hidden_out = self.hidden(self._extract_features(x))

              logits = linear_probe(hidden_out)
              loss = criterion(logits, y)

              optimizer.zero_grad()
              loss.backward()
              optimizer.step()

      # Per-class accuracy of the trained probe
      class_correct = defaultdict(int)
      class_total = defaultdict(int)
      linear_probe.eval()
      with torch.no_grad():
          for x, y in train_set:
              x, y = x.to(device), y.to(device)
              hidden_out = self.hidden(self._extract_features(x))
              logits = linear_probe(hidden_out)
              _, preds = torch.max(logits, 1)
              for true, pred in zip(y, preds):
                  class_total[int(true)] += 1
                  if int(true) == int(pred):
                      class_correct[int(true)] += 1

      class_accuracy = {
          c: 100 * class_correct[c] / class_total[c]
          for c in class_total
      }

      weak_classes = [c for c, acc in class_accuracy.items() if acc < accuracy_threshold]

      return class_accuracy, weak_classes

  def head_only(self):
      """Make only the head trainable (backbone and hidden layer frozen)."""
      for param in self.backbone.parameters():
        param.requires_grad = False
      for param in self.hidden.parameters():
        param.requires_grad = False
      for param in self.head.parameters():
        param.requires_grad = True

  def full_train(self):
      """Make the hidden layer and the head trainable (backbone stays frozen)."""
      for param in self.backbone.parameters():
        param.requires_grad = False
      for param in self.hidden.parameters():
        param.requires_grad = True
      for param in self.head.parameters():
        param.requires_grad = True



# Clients

In [None]:
from warnings import filters
#@title clients Classs
from torch.utils.data import random_split
from torch.utils.data import Subset
import pandas as pd
import torch.hub
import copy
from collections import defaultdict
from torch.utils.data import DataLoader # Import DataLoader

set_seed(seed,is_seed_fixed)

class Client:
  """One federated-learning participant: local data, local model, training and logging.

  The client splits its data into a local train/test DataLoader pair, trains the
  classifier layers of its model (optionally masking the head's weight
  gradients), evaluates on its local test split and produces one log row.

  NOTE(review): when `initial_model` is given, the very same object is stored;
  pass a copy if clients must not share weights.
  """

  def __init__(self, id, data, n_clients, spliting_method, grad_mask = True, masking_mode="topk", sparsity=0.4, batch_size = 32, classes="all", num_epochs= 10, backbone=None, path_to_model=None, initial_model=None, spliting_ratio={"train":0.8, "test":0.2}, path_to_subsets="", path_to_class_combs=""):
    """Store the configuration, load the model and split the local data.

    Args:
      id: Client identifier.
      data: The client's dataset (a `Subset`; `create_log` reads `data.dataset`).
      n_clients: Total number of clients (only logged).
      spliting_method: Name of the data splitting method (only logged).
      grad_mask: Whether gradients of the head weights are masked before each step.
      masking_mode: Mode passed to `gradient_mask` ("topk", "random" or "low").
      sparsity: Fraction of head weight gradients that the mask zeroes.
      batch_size: Batch size of the local DataLoaders.
      classes: Classes covered by this client (only logged).
      num_epochs: Number of local training epochs.
      backbone: DINO hub entry name, needed when `path_to_model` is used.
      path_to_model: Optional checkpoint path; when set, the model is built and loaded from it.
      initial_model: Optional ready model used when no `path_to_model` is given.
      spliting_ratio: Dict whose "train" entry is the local training fraction.
      path_to_subsets: Path of the stored client subsets (only logged).
      path_to_class_combs: Path of the stored class combinations (only logged).
    """
    self.id = id
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.data_set = data
    self.weak_set = None
    self.spliting_method = spliting_method
    self.classes = classes
    self.backbone = backbone
    self.sparsity = sparsity
    self.grad_mask = grad_mask
    self.masking_mode = masking_mode
    self.path_to_model = path_to_model
    self.n_clients = n_clients
    self.model = initial_model
    self.load_model()
    self.num_epochs = num_epochs
    self.num_edit_epochs = None
    # Copy so the shared default dict can never be modified through an instance.
    self.spliting_ratio = dict(spliting_ratio)
    self.batch_size = batch_size

    self.train_set , self.test_set = self.test_train_split()
    self.duration = 0.0
    self.train_loss = None
    self.accuracy = None
    self.loss = None
    self.path_to_subsets = path_to_subsets
    self.path_to_class_combs = path_to_class_combs

  def test_train_split(self):
    """Randomly split the client's data and return (train_loader, test_loader)."""
    train_size = int(self.spliting_ratio.get("train") * len(self.data_set))
    test_size =  len(self.data_set) - train_size

    train_set, test_set = random_split(self.data_set, [ train_size, test_size ])
    train_set = DataLoader(train_set, batch_size=self.batch_size, shuffle=True,  num_workers=2)
    test_set = DataLoader(test_set, batch_size=self.batch_size, shuffle=False,  num_workers=2)

    return train_set, test_set

  def weak_set_generator(self,weak_classes):
    """Build `self.weak_set`, a loader over the local training samples of `weak_classes`.

    `self.weak_set` is left unchanged when no local training sample matches.
    """
    # Use this client's own training split, not a notebook-level variable.
    local_train = self.train_set.dataset
    filtered_indices = [i for i,(_,label) in enumerate(local_train) if label in weak_classes]
    if not filtered_indices:
      return
    filtered_train_set = Subset(local_train, filtered_indices)
    self.weak_set = DataLoader(filtered_train_set, batch_size=self.batch_size, shuffle=True, num_workers=2)

  def gradient_mask(self, mode="topk"):
    """Zero a `self.sparsity` fraction of the head's weight gradients in place.

    Only parameters of `self.model.head` whose name contains 'weight', that
    require gradients and currently have a gradient are affected.

    Args:
      mode: "topk" zeroes the entries with the smallest squared gradient,
        "low" zeroes the entries with the largest squared gradient and
        "random" zeroes randomly chosen entries.

    Raises:
      ValueError: for an unknown `mode`.
    """
    masked_params = [
        param for name, param in self.model.head.named_parameters()
        if param.grad is not None and param.requires_grad and 'weight' in name
    ]
    if not masked_params:
      return  # nothing to mask (e.g. called before any backward pass)

    all_grads_flat = torch.cat([(param.grad ** 2).flatten() for param in masked_params])
    num_params = all_grads_flat.numel()
    k = min(int(num_params * self.sparsity), num_params)

    if k <= 0:
      return

    if mode == "topk":
      threshold = torch.topk(all_grads_flat, k, largest=False).values.max()
      global_mask_flat = (all_grads_flat > threshold).float()

    elif mode == "random":
      # Drop k random entries so that `sparsity` means "fraction zeroed" in every mode.
      global_mask_flat = torch.ones_like(all_grads_flat)
      rand_indices = torch.randperm(num_params)[:k].to(all_grads_flat.device)
      global_mask_flat[rand_indices] = 0.0

    elif mode == "low":
      threshold = torch.topk(all_grads_flat, k, largest=True).values.min()
      global_mask_flat = (all_grads_flat < threshold).float()
    else:
      raise ValueError(f"Unknown masking mode: {mode}")

    # Cut the flat mask back into per-parameter pieces and apply them.
    idx = 0
    for param in masked_params:
      numel = param.grad.numel()
      param.grad *= global_mask_flat[idx:idx + numel].view_as(param.grad)
      idx += numel

  def load_model(self):
    """Load the model from `path_to_model` (if set) and move it to the client's device.

    Accepts a full state dict as well as the `{'hidden': ..., 'head': ...}`
    format written by `confirm_save`.

    Raises:
      ValueError: when neither `path_to_model` nor `initial_model` was provided.
    """
    if self.path_to_model:
      dino_model = torch.hub.load('facebookresearch/dino:main', self.backbone)
      self.model = DinoClassifier(dino_model=dino_model, num_classes=100, device=self.device)
      # map_location lets checkpoints saved on a GPU load on a CPU-only runtime.
      state_dict = torch.load(self.path_to_model, map_location=self.device)
      if isinstance(state_dict, dict) and set(state_dict.keys()) == {"hidden", "head"}:
        self.model.hidden.load_state_dict(state_dict["hidden"])
        self.model.head.load_state_dict(state_dict["head"])
      else:
        self.model.load_state_dict(state_dict)

    if self.model is None:
      raise ValueError("Client needs either `path_to_model` or `initial_model`.")

    self.model.to(self.device)

  def train(self, model_edit:bool = False):
    """Train the local model and record `train_loss` and `duration`.

    Args:
      model_edit: False trains hidden layer + head on the local training split
        for `num_epochs`; True trains only the head on `weak_set` for
        `num_edit_epochs`.

    Raises:
      ValueError: if `model_edit` is True but `weak_set` or `num_edit_epochs` is not set.
    """
    criterion = nn.CrossEntropyLoss()

    if model_edit:
      if self.weak_set is None or self.num_edit_epochs is None:
        raise ValueError("model_edit=True requires `weak_set` and `num_edit_epochs` to be set first.")
      self.model.head_only()
      train_data = self.weak_set
      num_epochs = self.num_edit_epochs
    else:
      self.model.full_train()
      train_data = self.train_set
      num_epochs = self.num_epochs

    # Optimize every parameter that the selected mode made trainable; an optimizer
    # over the head alone would never update the hidden layer that full_train() unfreezes.
    trainable_params = [param for param in self.model.parameters() if param.requires_grad]
    optimizer = torch.optim.SGD(trainable_params, lr=1e-3, momentum=0.9)

    tic()

    for epoch in range(num_epochs):
      self.model.train()
      running_loss = 0.0

      for images, labels in train_data:
        images = images.to(self.device)
        labels = labels.to(self.device)

        optimizer.zero_grad()

        outputs = self.model(images)
        loss = criterion(outputs, labels)

        loss.backward()

        if self.grad_mask:
          self.gradient_mask(mode=self.masking_mode or "topk")

        optimizer.step()

        running_loss += loss.item()

      # Average over the batches of the loader that was actually iterated.
      epoch_loss = running_loss / max(len(train_data), 1)
      status = "Training" if not model_edit else "Model editing"
      print(f"client {self.id}, {status}- epoch {epoch} - epoch loss:{epoch_loss:.4f}" )
      self.duration = toc()
      self.train_loss = epoch_loss

  def evaluate(self):
    """Compute accuracy (percent) and mean loss on the local test split.

    Results are stored in `self.accuracy` and `self.loss`; both become None when
    the test split is empty.
    """
    criterion = nn.CrossEntropyLoss()
    self.model.eval()
    correct = 0
    total = 0
    test_loss = 0
    with torch.no_grad():
      # Evaluate on the held-out local split, not on the training data.
      for images, labels in self.test_set:
        images, labels = images.to(self.device), labels.to(self.device)
        outputs = self.model(images)

        _, prediction = torch.max(outputs.data,1)
        loss = criterion(outputs, labels)
        test_loss += loss.item() * labels.size(0)

        total += labels.size(0)
        correct += (prediction == labels).sum().item()

    if total == 0:
      self.accuracy = None
      self.loss = None
      return

    self.accuracy = 100 * correct / total
    self.loss = test_loss / total

  def confirm_save(self,path):
    """Save the state dicts of the hidden layer and the head (not the backbone) to `path`."""
    torch.save({
        'hidden': self.model.hidden.state_dict(),
        'head' : self.model.head.state_dict()
    },path)

  def _hidden_features(self, x):
    """Return the hidden-layer activations of the local model for a batch `x`."""
    features = self.model.backbone(x)
    if isinstance(features, tuple):
      features = features[0]
    elif isinstance(features, dict):
      features = features.get("x_norm_clstoken", next(iter(features.values())))
    # (B, T, D) outputs: use the first token
    if features.dim() == 3:
      features = features[:, 0]
    return self.model.hidden(features)

  def linear_probe_eval(self, accuracy_threshold=50.0):
    """Fit a linear probe on the frozen hidden features of the local training data.

    The probe is trained for 5 epochs with SGD (lr=1e-3) and evaluated on the
    same training loader.

    Args:
      accuracy_threshold: Classes with accuracy (in percent) below this value are "weak".

    Returns:
      (class_accuracy, weak_classes): dict class -> accuracy in percent, and
      the list of classes below the threshold.

    Side effects: the model is left in eval mode with the backbone and hidden
    layer frozen.
    """
    self.model.eval()
    for param in self.model.backbone.parameters():
      param.requires_grad = False
    for param in self.model.hidden.parameters():
      param.requires_grad = False

    embedding_dim = self.model.head.in_features
    num_classes = self.model.head.out_features
    linear_probe = nn.Linear(embedding_dim, num_classes).to(self.device)

    optimizer = torch.optim.SGD(linear_probe.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    epochs = 5
    for _ in range(epochs):
      linear_probe.train()
      for x, y in self.train_set:
        x, y = x.to(self.device), y.to(self.device)
        with torch.no_grad():
          hidden_out = self._hidden_features(x)

        logits = linear_probe(hidden_out)
        loss = criterion(logits, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    class_correct = defaultdict(int)
    class_total = defaultdict(int)

    linear_probe.eval()
    with torch.no_grad():
      for x, y in self.train_set:
        x, y = x.to(self.device), y.to(self.device)
        logits = linear_probe(self._hidden_features(x))
        _, preds = torch.max(logits, 1)
        for true, pred in zip(y, preds):
          class_total[int(true)] += 1
          if int(true) == int(pred):
            class_correct[int(true)] += 1

    class_accuracy = {
        c: 100 * class_correct[c] / class_total[c]
        for c in class_total
    }

    weak_classes = [c for c, acc in class_accuracy.items() if acc < accuracy_threshold]
    return class_accuracy, weak_classes

  def model_edit(self, weak_classes):
    """Fine-tune only the head for one pass over the local samples of `weak_classes`.

    Does nothing (apart from printing a message) when `weak_classes` is empty or
    no local training sample belongs to them. Stores the loader in
    `self.weak_set`, which `create_log` uses to report that editing took place.
    """
    if not weak_classes:
      print(f"Client {self.id}: no weak classes, skipping editing.")
      return

    weak_subset = [sample for sample in self.train_set.dataset if sample[1] in weak_classes]
    if len(weak_subset) == 0:
      print(f"Client {self.id}: weak classes present, but no samples found. Skipping.")
      return

    weak_loader = DataLoader(weak_subset, batch_size=self.batch_size, shuffle=True)
    self.weak_set = weak_loader

    # The model exposes `backbone` and `hidden` (there is no `encoder` attribute);
    # freeze both so that only the head is edited.
    for param in self.model.backbone.parameters():
      param.requires_grad = False
    for param in self.model.hidden.parameters():
      param.requires_grad = False
    for param in self.model.head.parameters():
      param.requires_grad = True

    self.model.train()
    optimizer = torch.optim.SGD(self.model.head.parameters(), lr=1e-3, momentum=0.9)
    criterion = nn.CrossEntropyLoss()

    for _ in range(1):
      for images, labels in weak_loader:
        images, labels = images.to(self.device), labels.to(self.device)
        optimizer.zero_grad()
        outputs = self.model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        if self.grad_mask and self.masking_mode:
          self.gradient_mask(mode=self.masking_mode)
        optimizer.step()

  def create_log(self, model_name, path, round_number):
    """Return a one-row DataFrame describing this client's model for the client log.

    Args:
      model_name: Name of the saved local model.
      path: Path where the local model is stored.
      round_number: Federated round this model belongs to.

    NOTE(review): `initial_model_name` is read from the notebook's global scope,
    so it must be defined before this method is called.
    """
    log_dict= {
        "client_id":[self.id],
        "backbone":[self.backbone],
        "model_name":[model_name],
        "initial_model_name":[initial_model_name],
        "path": [path],
        "num_of_clients":[self.n_clients],
        "Measurement_criteria":["accuracy,loss,train_loss"],
        "accuracy":[self.accuracy],
        "loss":[self.loss],
        "train_loss":[self.train_loss],
        "splitting_method":[self.spliting_method],
        "gradient_mask":[self.grad_mask],
        "sparsity":[self.sparsity],
        "size_of_dataset": [len(self.data_set.dataset)],
        "client_train_size":[len(self.train_set.dataset)],
        "client_test_size":[len(self.test_set.dataset)],
        "train_test_ratio":[self.spliting_ratio],
        "classes":[self.classes],
        "round_number":[round_number],
        "duration":[self.duration],
        "time": [get_current_time()],
        "path_to_subsets":[self.path_to_subsets],
        "path_to_class_combs":[self.path_to_class_combs],
        "has_model_edited":[not self.weak_set is None]
    }

    return pd.DataFrame(log_dict)

**<h1>❗ Important Notice ❗</h1>**

**Regarding `save_data`:**
Please be aware that checking the `save_data` option will generate a **new data subset** and a **new initial model** based on your specified parameters.

**⚠️ Crucial: Using Existing Models with New Data/Parameters ⚠️**
If you intend to use an *existing model* but wish to apply it to a *different data subset*, use a *different data splitting method*, or make *any other changes to the data or algorithm*, you **MUST** assign a **new and unique model name**.

**Why is this critical?**
Failing to use a unique model name will make it impossible to differentiate between models for each client when filtering. This will lead to inaccurate results from the client aggregation function on the server.

In [None]:

load_data = True #@param{"type":"boolean"}
initial_model_name = "0125ec00-f049-4fb2-9fe3-30120285ba50" #@param{"type":"string"}


if load_data:
  # Look up the selected global model once and reuse the row for every field.
  initial_model_log_df = pd.read_csv("/content/drive/MyDrive/MLDL_FederatedLearning/csv/global_log.csv")
  initial_model_row = initial_model_log_df[initial_model_log_df["model_name"] == initial_model_name]
  if initial_model_row.empty:
    raise ValueError(f"Model '{initial_model_name}' was not found in global_log.csv")

  # The clients that produced this global model were initialised from its predecessor.
  prev_global_model_name = initial_model_row["prev_global_model_name"].values[0]
  prev_clients = pd.read_csv("/content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv")
  prev_clients = prev_clients[prev_clients["initial_model_name"]== prev_global_model_name]

  # A model without logged predecessor clients (e.g. a round-0 initial model)
  # falls back to the paths stored in its own global-log row.
  subsets_source = prev_clients if not prev_clients.empty else initial_model_row
  path_to_subsets = subsets_source["path_to_subsets"].values[0]
  path_to_class_combs = subsets_source["path_to_class_combs"].values[0]
  print(path_to_subsets)
  print(path_to_class_combs)

  initial_model_path = initial_model_row["path"].values[0]
  initial_model_round_num = initial_model_row["round_number"].values[0]
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  dino_model = torch.hub.load('facebookresearch/dino:main', backbone)
  initial_model = DinoClassifier(dino_model=dino_model, num_classes=100, device=device)
  # map_location lets a checkpoint saved on a GPU load on a CPU-only runtime.
  initial_model.load_state_dict(torch.load(initial_model_path, map_location=device))

  # NOTE(review): weights_only=False unpickles arbitrary objects - only load files you created yourself.
  client_data = torch.load(path_to_subsets, weights_only=False)
  if spliting_method == "i.i.d. sharing":
    # Same placeholder that the save_data cell uses for i.i.d. splits.
    class_combs = "all"
  else:
    class_combs = torch.load(path_to_class_combs)


method = "iid" if spliting_method == "i.i.d. sharing" else "noniid"


/content/drive/MyDrive/MLDL_FederatedLearning/client_subsets/client_data_noniid_80clients_c86b63e6-72b3-45f7-98e4-438bb6b0bf18.pth
/content/drive/MyDrive/MLDL_FederatedLearning/client_subsets/class_combs_noniid80clients_9bb5531e-d778-478f-83dd-e4243199dbf5.pth


Downloading: "https://github.com/facebookresearch/dino/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dino/dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dino_deitsmall16_pretrain.pth
100%|██████████| 82.7M/82.7M [00:00<00:00, 158MB/s]


Detected feature dimontion: 384


In [None]:
 #@title Extraction of the model we want to use as initial model
# if load_data:
#   global_log = pd.read_csv("/content/drive/MyDrive/MLDL_FederatedLearning/csv/global_log.csv")


#   filter = (global_log["aggregation_method"] == "EMA") & (global_log["round_number"]==2)
#   filtered_models = global_log[filter]

#   print(filtered_models["model_name"].values)

#   filtered_models.head()


In [None]:
from uuid import uuid4
save_data = False #@param{"type":"boolean"}

# Loading an existing experiment takes precedence over creating a new one.
# `load_data` comes from an earlier cell; if that cell was never run the name
# simply does not exist, which is the only failure worth tolerating here.
try:
  if load_data:
    save_data = False
except NameError:
  pass

if save_data:
  global_log_csv = "/content/drive/MyDrive/MLDL_FederatedLearning/csv/global_log.csv"

  # --- 1. Split the training data between the clients -----------------------
  method = "iid" if spliting_method == "i.i.d. sharing" else "noniid"
  path_to_subsets = f"/content/drive/MyDrive/MLDL_FederatedLearning/client_subsets/client_data_{method}_{str(n_clients)}clients_{str(uuid4())}.pth"
  if spliting_method == "i.i.d. sharing":
    client_data = iid_sharing(train_set, n_clients)
    class_combs = "all"
    # No class-combination file is written for an i.i.d. split, but the log row
    # below still needs a value. Use the same placeholder as the training loop
    # so this branch no longer depends on a leftover kernel variable.
    path_to_class_combs = " "
    print(spliting_method)
  else:
    client_data, class_combs = noniid_sharing(train_set,Nc=Nc, n_clients=n_clients)
    path_to_class_combs = "/content/drive/MyDrive/MLDL_FederatedLearning/client_subsets/class_combs_"+method + str(n_clients)+"clients_"+str(uuid4())+".pth"
    print(spliting_method,path_to_class_combs)

  # --- 2. Build the round-0 global model ------------------------------------
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  dino_model = torch.hub.load('facebookresearch/dino:main', backbone)
  initial_model = DinoClassifier(dino_model=dino_model, num_classes=100, device=device)
  initial_model_name = next_id(global_log_csv)
  initial_model_path = "/content/drive/MyDrive/MLDL_FederatedLearning/models/global/" + initial_model_name + ".pth"
  initial_model_round_num = 0

  # --- 3. Persist the artefacts BEFORE logging them --------------------------
  # Writing the files first guarantees the log never references a model or a
  # subset file that failed to save.
  torch.save(initial_model.state_dict(), initial_model_path)
  torch.save(client_data, path_to_subsets)
  if method == "noniid":
    torch.save(class_combs, path_to_class_combs)

  # --- 4. Append one row describing the initial model to the global log -----
  initial_model_log = {
    "backbone": [backbone],
    "model_name": [initial_model_name],
    "num_of_clients": [n_clients],
    "path": [initial_model_path],
    "Measurement_criteria": [None],
    "prev_global_model_name":[None],
    "accuracy": [None],
    "loss": [None],
    "splitting_method": [spliting_method],
    # NOTE(review): the split above is computed from `train_set`, while the
    # size is taken from `train_dataset` - confirm both describe the same data.
    "size_of_dataset": [len(train_dataset)],
    "train_test_ratio": [None],
    "classes": [None],
    "round_number": [0],
    "time": [get_current_time()],
    "path_to_subsets": [path_to_subsets],
    "path_to_class_combs": [path_to_class_combs],
    "num_of_participants": [None],
    # Aggregation-related columns are not applicable to the round-0 model.
    "aggregation_method": [np.nan],
    "contributors": [np.nan],
    "momentum_vector_path": [np.nan],
  }

  # Column order must match the existing CSV because rows are appended
  # without a header.
  initial_model_log = pd.DataFrame(initial_model_log)
  initial_model_log = initial_model_log[['backbone',
                'num_of_clients',
                'splitting_method',
                'aggregation_method',
                'Measurement_criteria',
                'accuracy',
                'loss',
                'size_of_dataset',
                'train_test_ratio',
                'classes',
                'round_number',
                'num_of_participants',
                'model_name',
                'prev_global_model_name',
                "contributors",
                'path',
                "momentum_vector_path",
                'path_to_subsets',
                'path_to_class_combs',
                'time'
                ]]

  # Create the file with a header the first time, append header-less afterwards.
  log_already_exists = os.path.exists(global_log_csv)
  initial_model_log.to_csv(
    global_log_csv,
    mode='a' if log_already_exists else 'w',
    header=not log_already_exists,
    index=False,
  )





In [None]:
# Summarise which artefacts the rest of the notebook will work with.
# The class-combination path is only meaningful for a non-i.i.d. split.
summary_rows = [('path to subsets:', path_to_subsets)]
if method == "noniid":
  summary_rows.append(('path to class combs:', path_to_class_combs))
summary_rows.append(('initial model name:', initial_model_name))

for label, value in summary_rows:
  print(label, value)


path to subsets: /content/drive/MyDrive/MLDL_FederatedLearning/client_subsets/client_data_noniid_80clients_c86b63e6-72b3-45f7-98e4-438bb6b0bf18.pth
path to class combs: /content/drive/MyDrive/MLDL_FederatedLearning/client_subsets/class_combs_noniid80clients_9bb5531e-d778-478f-83dd-e4243199dbf5.pth
initial model name: 0125ec00-f049-4fb2-9fe3-30120285ba50


In [None]:
#@title Random clients selection

# Look up the round number recorded for the chosen initial model and use it
# as the seed, so client selection is tied to the round being trained.
global_log = pd.read_csv("/content/drive/MyDrive/MLDL_FederatedLearning/csv/global_log.csv")

# Boolean row mask; deliberately NOT called `filter`, which would shadow the
# Python builtin for every cell executed afterwards.
is_initial_model = (global_log["model_name"] == initial_model_name)
filtered_models = global_log[is_initial_model]

# Raises IndexError if `initial_model_name` is not present in the global log.
r_num = filtered_models["round_number"].values[0]

selection_percentage = 15 #@param {"type":"integer"}
set_seed(int(r_num),is_seed_fixed)
def get_random_clients(n_clients, initial_model_name, selection_percentage=10):
  """Pick the clients that take part in the current round.

  Clients already logged in client_log.csv for `initial_model_name` are kept
  first (so an interrupted round resumes with the same participants); the
  selection is then topped up with distinct random ids drawn from
  [0, n_clients) via `torch.randint` until it holds at least
  `selection_percentage` percent of `n_clients`.

  Args:
    n_clients: total number of clients; ids are drawn from [0, n_clients).
    initial_model_name: name of the global model the round starts from.
    selection_percentage: share of clients to select, in percent.

  Returns:
    A numpy array of client ids (logged ids first, then the random draws).
  """
  client_log_csv = "/content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv"
  if os.path.exists(client_log_csv):
    clients_df = pd.read_csv(client_log_csv)
    clients_df = clients_df[clients_df['initial_model_name']== initial_model_name]
    selected_clients = clients_df['client_id'].values
  else:
    selected_clients = np.array([], dtype=np.int16)

  # Only n_clients distinct ids exist, so never ask for more than that:
  # without the cap a percentage above 100 makes the loop below spin forever.
  target_size = min((selection_percentage / 100 ) * n_clients, n_clients)

  # Rejection sampling: draw one id at a time and keep it only if it is new.
  while len(selected_clients) < target_size:
    rand_int = torch.randint(0,n_clients,(1,))[0].item()
    if rand_int not in selected_clients:
      selected_clients = np.append(selected_clients,rand_int)

  return selected_clients

# Draw this round's participants and show their ids.
selected_clients = get_random_clients(
  n_clients=n_clients,
  initial_model_name=initial_model_name,
  selection_percentage=selection_percentage,
)
print(selected_clients)

[51 14 47 45  6 70 32 73  8  4 39 30]


In [None]:
# Colab form toggle. When True, the training loop calls linear_probe_eval and
# model_edit on each client before training it and adds editing-related
# columns to the client's log row.
is_model_editing_active = False #@param{"type":"boolean"}

In [None]:
#Main training loop
# Trains every selected client from a copy of the initial global model,
# evaluates it, saves it and appends one row per client to client_log.csv.
# Clients already logged for this initial model are skipped, so the cell can
# be re-run after an interruption.

set_seed(seed, is_seed_fixed)

# An i.i.d. split has no class-combination file; use a placeholder path.
if spliting_method == "i.i.d. sharing":
    path_to_class_combs = " "

log_file = "/content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv"

for counter, client_num in enumerate(selected_clients, start=1):
    print(counter, "/", len(selected_clients), "#" * 100)

    # Re-read the log on every iteration so rows appended by earlier
    # iterations (or by a previous run) are taken into account.
    if os.path.exists(log_file):
        all_clients_df = pd.read_csv(log_file)
        filtered_clients_df = all_clients_df[all_clients_df["initial_model_name"] == initial_model_name]
        if client_num in filtered_clients_df["client_id"].values:
            print(f"Client {client_num} is already trained")
            continue

    client = Client(
        id=client_num,
        data=client_data[client_num],
        spliting_method=spliting_method,
        classes="all",
        n_clients=n_clients,
        batch_size=batch_size,
        num_epochs=10,
        grad_mask=True,
        masking_mode="topk",
        # Deep copy so every client starts from the same untouched weights.
        initial_model=copy.deepcopy(initial_model),
        backbone=backbone,
        path_to_model=None,
        spliting_ratio={"train": 0.8, "test": 0.2},
        path_to_subsets=path_to_subsets,
        path_to_class_combs=path_to_class_combs
    )

    print("Data size:", len(client_data[client_num]))
    print("Backbone:", backbone)

    if is_model_editing_active:
        class_accuracy, weak_classes = client.linear_probe_eval(accuracy_threshold=50.0)
        client.model_edit(weak_classes)

    client.train()
    client.evaluate()
    print(f"Client {client_num} - accuracy: {client.accuracy}, loss: {client.loss}")

    model_name = next_id(log_file)
    model_path = f"/content/drive/MyDrive/MLDL_FederatedLearning/models/clients/{model_name}.pth"

    log = client.create_log(
        model_name=model_name,
        path=model_path,
        round_number=initial_model_round_num + 1
    )

    if is_model_editing_active:
        # NOTE(review): these columns are added only when editing is active,
        # yet rows are appended to an existing CSV without a header - confirm
        # the file already has matching columns, otherwise rows misalign.
        log["weak_classes"] = [weak_classes]
        log["masking_mode"] = client.masking_mode
        log["editing_applied"] = bool(weak_classes)

    # Called exactly once per client (it used to run twice whenever the log
    # file already existed). Presumably persists the model to the logged path
    # - confirm against Client.confirm_save.
    client.confirm_save(log["path"][0])

    # Create the CSV with a header on first use, append header-less afterwards.
    is_new_log_file = not os.path.exists(log_file)
    log.to_csv(
        log_file,
        mode='w' if is_new_log_file else 'a',
        header=is_new_log_file,
        index=False,
    )
    if is_new_log_file:
        print("new csv file")
    print(f"name: {log['model_name'][0]}")
    print(f"path: {log['path'][0]}")
    print(f"Logged client {client_num} to {log_file}")

    # Drop the reference to the client (and its model copy) before the next one is built.
    del client


1 / 12 ####################################################################################################
Data size:  523
Backbone:  dino_vits16
client 51, Training- epoch 0 - epoch loss:3.2430
client 51, Training- epoch 1 - epoch loss:3.1873
client 51, Training- epoch 2 - epoch loss:3.0169
client 51, Training- epoch 3 - epoch loss:3.0014
client 51, Training- epoch 4 - epoch loss:2.9462
client 51, Training- epoch 5 - epoch loss:2.8692
client 51, Training- epoch 6 - epoch loss:2.8175
client 51, Training- epoch 7 - epoch loss:2.7620
client 51, Training- epoch 8 - epoch loss:2.6644
client 51, Training- epoch 9 - epoch loss:2.6306
Client 51- accuracy: 47.12918660287081, loss: 2.5879819587086947
name: 5510e511-c401-45b8-8d39-588eb220724c 
path: /content/drive/MyDrive/MLDL_FederatedLearning/models/clients/f81d8777-c4cc-4285-84fc-e9c8dffb40df.pth 
Logged client 51 to /content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv
2 / 12 #####################################################

In [None]:
# Preview the log row(s) left in `log` by the last client the training loop
# actually trained; `log` is undefined if every selected client was skipped.
log.head()

Unnamed: 0,client_id,backbone,model_name,initial_model_name,path,num_of_clients,Measurement_criteria,accuracy,loss,train_loss,...,client_train_size,client_test_size,train_test_ratio,classes,round_number,duration,time,path_to_subsets,path_to_class_combs,has_model_edited
0,30,dino_vits16,71b80f2f-7871-4b16-884f-9e2bacaeab90,0125ec00-f049-4fb2-9fe3-30120285ba50,/content/drive/MyDrive/MLDL_FederatedLearning/...,80,"accuracy,loss,train_loss",50.626566,2.48283,2.527732,...,399,100,"{'train': 0.8, 'test': 0.2}",all,6,19.525019,2025-07-04 08:21:46,/content/drive/MyDrive/MLDL_FederatedLearning/...,/content/drive/MyDrive/MLDL_FederatedLearning/...,False


In [None]:
import pandas as pd
import os

# Remove client-log rows whose saved model file is missing, then rewrite the
# log in place.

# Load your DataFrame
df = pd.read_csv('/content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv')  # Change to your file path and format
n_rows_before = len(df)

# Drop rows where 'path' is NaN. `.copy()` gives an independent frame so the
# column assignment below does not write into a slice of the original
# (which triggers pandas' SettingWithCopyWarning).
df = df[df['path'].notna()].copy()
df['path'] = df['path'].astype(str).str.strip()  # Remove whitespace

# Check if each path exists
df_filtered = df[df['path'].apply(os.path.exists)]

# Count every removed row, including those dropped above for a missing path
# (previously only rows with a non-existent file were counted).
n_error = n_rows_before - len(df_filtered)
print(f" {n_error} clients' log are removed due to the not existing saved model file")
# Save the cleaned DataFrame
df_filtered.to_csv('/content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv', index=False)
# print(len(df),len(df_filtered))

 0 clients' log are removed due to the not existing saved model file


In [None]:
# --- ProFedQ Methods for Client class ---
