<a href="https://colab.research.google.com/github/mohammad-rahbari/Federated-Learning-MLDL/blob/master/notebooks/FederatedLearning_clients.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [29]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Importing DINO and installing its dependencies

In [2]:
# @title Clon the DINO ripo
!git clone https://github.com/facebookresearch/dino.git

Cloning into 'dino'...
remote: Enumerating objects: 175, done.[K
remote: Total 175 (delta 0), reused 0 (delta 0), pack-reused 175 (from 1)[K
Receiving objects: 100% (175/175), 24.47 MiB | 23.01 MiB/s, done.
Resolving deltas: 100% (100/100), done.


In [3]:
# @title Installing required dependencies regarding DINO
%cd dino
!pip install -r requirements.txt
!pip install timm

/content/dino
[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m


In [4]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import random_split,DataLoader


# preprocessing the CIFAR-100 dataset

feature size in CIFAR is 32x32 but DINO requires 224x224 in the input layer.

In first step we upscale the dataset and then we add randomization to it

In last step of transformation we normalize data usind mean value and standard division of ImageNet



In [5]:
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406),
                         std=(0.229, 0.224, 0.225))
])

In [6]:
from torch.utils.data import ConcatDataset
import numpy as np
from torchvision.datasets import CIFAR100
train_dataset = torchvision.datasets.CIFAR100(
    root='./data', train=True, download=True, transform=transform)

test_dataset = torchvision.datasets.CIFAR100(root='./data', train=False,
                                       download=True, transform=transform)
full_train = train_dataset

# Verify the length of the new dataset
print(f"Length of combined dataset: {len(full_train)}")


100%|██████████| 169M/169M [00:03<00:00, 42.4MB/s]


Length of combined dataset: 50000


In [7]:

#@title Imports
from sklearn.model_selection import StratifiedKFold
from torch.utils.data import Subset
import numpy as np
import random
import torch
import pandas as pd
import os

# Set Hyperparameters regarding the data spliting here!

In [8]:

#@title set the parameters here!!



number_of_clients = None
train_frac = 1 #@param
val_frac = 0 #@param
batch_size = 32 #@param{type:"integer"}
is_seed_fixed = True #@param{type:"boolean"}
seed = 42 #@param{type:"integer"}

def set_seed(seed=42, is_seed_fixed=True):
  if not is_seed_fixed:
    return
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  np.random.seed(seed)
  random.seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False


set_seed(seed,is_seed_fixed)



#@markdown </br> <h5>Indicate the number of clients that contribute in training:</h5>
n_clients = 100 #@param{type:"integer"}

#@markdown </br></br> <b>splitting hyperparameters</b>

spliting_method = "i.i.d. sharing" #@param["i.i.d. sharing","non-i.i.d. sharing"]
backbone = "dino_vits16" #@param["dino_resnet50", "dino_vits16", "dino_xcit_small_12_p16"]



In [9]:
#@title Set the parameters here only if <b>non-i.i.d. sharing</b> method had been selected!!
#@markdown Nc is the number of classes that each subset can contain
if spliting_method == "non-i.i.d. sharing":
  Nc = 25 #@param{type:"integer"}

  # are_classes_overlaping = False #@param{type:"boolean"}

#@markdown <h3>If we consider the Number of classes M and nummber of client K then:</h3>
#@markdown <ul>
#@markdown   <li>Nc should be:
#@markdown     <ul>
#@markdown       <li>
#@markdown         Greater than or equal to <b>\\(\frac{M}{K}\\)</b>
#@markdown       </li>
#@markdown       <li>
#@markdown         Less than or equal to K </b>
#@markdown       </li>
#@markdown     </ul>
#@markdown   </li>
#@markdown   <li>
#@markdown   Muximum number of clients means all classes contribute in every client
#@markdown   </li>

#@markdown </ul>


#@markdown </br></br><h3>Combination of classes are randomly selected which suits definition of federated learning especially Cross-device federated learning</h3>





# Data splitting

In [10]:
# @title data splitting

set_seed(seed,is_seed_fixed)
generator = torch.Generator().manual_seed(seed)

total_size = len(full_train)
train_size = int(train_frac * total_size)
val_size   = total_size - train_size

train_set, val_set = random_split(full_train, [train_size, val_size], generator=generator)
train_indices = torch.tensor(train_set.indices)
val_indices = torch.tensor(val_set.indices)

train_set = Subset(train_set.dataset, train_indices)
val_set = Subset(val_set.dataset, val_set.indices)

train_loader = DataLoader(train_set, batch_size=len(train_set), shuffle=False)
if val_size > 0:
  val_loader  =  DataLoader(val_set, batch_size=len(val_set), shuffle=False)
  print(f"Validation dataset size: {len(val_set)}")




print(f"Train dataset size: {len(train_set)}")

lenghts = [train_size//n_clients] * n_clients

for i in range(train_size % n_clients):
  lenghts[i] += 1
print("Size of subset: ", lenghts)





Train dataset size: 50000
Size of subset:  [500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500, 500]


In [11]:
# @title i.i.d sharing - split data dased on number of clients and with respect of label proportionality
set_seed(seed,is_seed_fixed)
def iid_sharing(dataset, n_clients):

  full_train_indices = dataset.indices
  full_train_labels = torch.from_numpy(np.array(dataset.dataset.targets)[full_train_indices]) #collects labels from all dataset
  unique_lables = torch.unique(full_train_labels) #Removes dupilication and generates a uniuqe list of labels (classes)
  classes_indices = {}


  for i in unique_lables:
    classes_indices[i] = (full_train_labels == i).nonzero(as_tuple=True)[0] #Collects and save Indices in an array based on classes

  for i in classes_indices.keys():
    classes_indices[i] = classes_indices[i][torch.randperm(classes_indices[i].shape[0])] #suffels the indices


  client_indices = {}



  for client in range(n_clients):
    if not client_indices.get(client):
      client_indices[client] = torch.empty(0, dtype=torch.long)


  for label in classes_indices.keys():
    class_sample_size = len(classes_indices[label]) // n_clients
    for k in range(n_clients):

      client_indices[k] = torch.cat((client_indices[k], classes_indices[label][class_sample_size * k : class_sample_size * (k + 1)]), dim=0)

  for label in classes_indices.keys():
    remainder  = len(classes_indices[label]) % n_clients

    for r in range(1,remainder+1):
      random_client = random.choice(list(client_indices.keys()))
      client_indices[random_client] = torch.cat((client_indices[random_client], classes_indices[label][-r].unsqueeze(0)), dim=0)

  full_train_indices_t = torch.as_tensor(full_train_indices, dtype=torch.long)
  client_data = {
      client_id: Subset(
          dataset.dataset,
          full_train_indices_t[indices][torch.randperm(len(indices))].tolist()
      )
      for client_id, indices in client_indices.items()
  }


  #split actual dataset to multiple subset for clients
  # client_data={
  #     client_id: Subset(dataset.dataset,indices[torch.randperm(len(indices))])
  #     for client_id, indices in client_indices.items()
  # }
  return client_data

# indices_check = []
# client_data = iid_sharing(train_set, n_clients)
# s = 0
# for client_id in client_data.keys():
#   indices_check = indices_check + list(client_data[client_id].indices)
#   s+= len(client_data[client_id])
#   print(f"Client {client_id} has {len(client_data[client_id])} samples")
# print(s, len(train_set))
# del indices_check,client_data

In [12]:
# @title Non i.i.d sharing


# @title i.i.d sharing - split data dased on number of clients and with respect of label proportionality
set_seed(seed,is_seed_fixed)
def noniid_sharing(dataset,Nc , n_clients):

  full_train_indices = dataset.indices
  full_train_labels = torch.tensor(dataset.dataset.targets)[full_train_indices] #collects labels from all dataset
  unique_lables = torch.unique(full_train_labels) #Removes dupilication and generates a uniuqe list of labels (classes)

  classes_indices = {}
  classes_size = torch.zeros(unique_lables.size()[0])

  class_combs = get_class_combinations(unique_lables, Nc, n_clients)

  classes_num_partition = torch.zeros(unique_lables.size()[0])

  for i in unique_lables:
    classes_num_partition[i] = torch.sum(class_combs == i)
    classes_indices[i.item()] = torch.nonzero(full_train_labels == i).squeeze() #Collects and save Indices in an array based on classe
    classes_size[i] = classes_indices[i.item()].size()[0] #Calculate the number of smaples belonging to each class

  for i in classes_indices.keys():
    classes_indices[i] = classes_indices[i][torch.randperm(classes_indices[i].shape[0])] #suffels the indices

  client_indices = {client: torch.tensor([],dtype=torch.int64) for client in range(n_clients) }
  assigned_indices = set()

  #For each client we generate a element in client_indices dict to keep track of indices we'll associated with each client

  for client in range(n_clients):
    for cls in class_combs[client]:
      cls = cls.item()

      portion  = classes_size[cls] /classes_num_partition[cls]
      portion = int(portion) if not portion % 1 else int(portion) + 1
      portion = min(portion, classes_indices[cls].size()[0])

      class_partition = classes_indices[cls][:portion]

      class_partition = [idx for idx in class_partition if idx not in assigned_indices]

      assigned_indices.update(class_partition)

      class_partition = torch.tensor(class_partition, dtype=torch.int64)

      client_indices[client] = torch.cat((client_indices[client], class_partition), dim=0)

      classes_indices[cls] = classes_indices[cls][portion:]

  client_data={
      client_id: Subset(dataset.dataset,indices[torch.randperm(len(indices))])
      for client_id, indices in client_indices.items()
      if len(indices) > 0
  }



  return client_data, class_combs





def get_class_combinations(classes, Nc, n_clients):

  if Nc * n_clients < len(classes):
    Nc = len(classes) / n_clients
    Nc = int(Nc) if not Nc % 1 else int(Nc) + 1

    print(f"Number of classes per clients is lower then minimum. Nc changed to {Nc} (the least possible value)")

  combinations = torch.zeros((n_clients,Nc),dtype= torch.int64)
  counter =0
  ofset = 0
  flag = False

  for i in range(n_clients):
    if not flag:
      end_pointer = (i + 1) * Nc
      if end_pointer >= classes.size()[0]:
          ofset = (end_pointer - classes.size()[0])
          flag = True

      combinations[i] = classes[i* Nc - ofset: end_pointer - ofset]

    else:

      combinations[i]  = torch.randperm(classes.size()[0])[:Nc]

  return combinations

# Log System

In this section Requerd Data will be stored.<br/><br/>
**Archaving this information will make it possible to:**
*   Handle Clients
*   Manage the models
*   Keep track of results of different Backbones
*   Compare measurement criteria
*   Handel model merging process
*   Save path to the models

<br/><br/>
**These data will be saved in two seperted csv file to :**

1.   Store the LOCAL Models  
2.   Store the GLOBAL Models resulted by each round

<br/><br/>
The csv files will be handeled as panda.dataframe and each row in the csv file addresses one of models
<br/>

**Columns (COMMON):**<br/>
1. Backbone model name
2. Model name
3. Path
4. Time of log
5. Measurement criteria
 * loss
 * Accuracy
 * ...?
6. Size of dataset

**Columns (Local Models only):**<br/>
7. Client Id
8. Classes (Indicate which classes have been covered by each client)(format:"2,4,63,80,9" or "all" for all the classes)
9. Round number
10. Duration of training
11. Train Test ratio

**Columns (Global Models only):**<br/>
7. Number of clients
7. Number of rounds
8. Model Aggregation method







In [13]:
# @title Functions
import torch
from datetime import datetime
import time
from google.colab import drive
from uuid import uuid4
import os

def get_current_time():
  now = datetime.now()

  formatted_date_time = now.strftime("%Y-%m-%d %H:%M:%S") # Format the date and time as a string

  return formatted_date_time



tic_start_time = None

def next_id(log_path):
  if os.path.exists(log_path) :
    df = pd.read_csv(log_path)
    while True:
      uuid = str(uuid4())
      if uuid not in df["model_name"].values:
        return uuid
  else:
    return str(uuid4())



def tic():
    global tic_start_time
    tic_start_time = time.perf_counter() # start the timer

def toc():
    if tic_start_time is None:
        print("Error: You must call tic() before toc()")
        return None
    elapsed_time = time.perf_counter() - tic_start_time
    return elapsed_time




# Model and model configuration

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from collections import defaultdict

class DinoClassifier(nn.Module):
  def __init__(self, dino_model, num_classes:int=100, device=None):
    super(DinoClassifier, self).__init__()
    self.backbone = dino_model

    #We need to freaze thhe parameters of bakbone first so we can train only on the head layer(output layer)
    for param in self.backbone.parameters():
      param.requires_grad = False

    #determine the Device
    if device is None:
      device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    self.backbone.to(device)

    #To detect the output feature dimontion of backbone we run  Dummy forward pass
    with torch.no_grad():

      dummy_input = torch.randn(1,3,224,224).to(device)
      dummy_out = self.backbone(dummy_input)


      if isinstance(dummy_out, tuple):
        dummy_out = dummy_out[0]
      elif isinstance(dummy_out, dict):
        dummy_out = dummy_out.get("x_norm_clstoken", next(iter(dummy_out.values())))

      #If the output is 3D (B, T, D), we assume first token is the [CLS] token.
      if dummy_out.dim() == 3:
        dummy_feature = dummy_out[:,0]
      else:
        dummy_feature = dummy_out
      feature_dim = dummy_feature.shape[1]
      print("Detected feature dimontion:", feature_dim)


      #Difineing the classification Head
      self.head = nn.Linear(feature_dim, num_classes)

      #Ensure the head is trainable.
      for param in self.head.parameters():
        param.requires_grad = True

  def forward(self,x):

    #pass the input through the backbone
    features = self.backbone(x)

    if isinstance(features, tuple):
      features = features[0]
    elif isinstance(features, dict):
      features = features.get("x_norm_clstoken", next(iter(features.values())))


    # If featers are retuened as (B, T, D), use the first token
    if features.dim() == 3:
      cls_token = features[:,0]
    else:
      cls_token = features
    logits = self.head(cls_token)

    return logits




# Clients

In [15]:
from torch.utils.data import random_split
from torch.utils.data import Subset
import pandas as pd
import torch.hub
import copy
from collections import defaultdict
from torch.utils.data import DataLoader # Import DataLoader

set_seed(seed,is_seed_fixed)

class Client:


  def __init__(self, id, data, n_clients, spliting_method,num_local_steps = 5,  sparsity=0.1, batch_size = 32, classes="all", num_epochs= 10, backbone=None, path_to_model=None, initial_model=None, spliting_ratio={"train":0.8, "test":0.2}, path_to_subsets="", path_to_class_combs=""):
    self.id = id
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Corrected cuda() to is_available()
    self.data_set = data
    self.spliting_method = spliting_method
    self.classes = classes

    self.backbone = backbone
    self.sparsity = sparsity
    self.grad_mask = None
    self.path_to_model = path_to_model
    self.n_clients = n_clients
    if initial_model:
      self.model = initial_model
    else:
      self.model = None
    self.load_model()
    self.num_epochs = num_epochs
    self.spliting_ratio = spliting_ratio
    self.batch_size = batch_size

    self.train_set , self.test_set = self.test_train_split()
    self.num_local_steps = num_local_steps
    self.selected_batches = []
    self.get_random_batches()
    self.duration = 0.0
    self.train_loss = None
    self.accuracy = None
    self.loss = None
    self.path_to_subsets = path_to_subsets
    self.path_to_class_combs = path_to_class_combs


  def test_train_split(self):
    train_size = int(self.spliting_ratio.get("train") * len(self.data_set))
    test_size =  len(self.data_set) - train_size

    train_set, test_set = random_split(self.data_set, [ train_size, test_size ])
    train_set = DataLoader(train_set, batch_size=self.batch_size, shuffle=True,  num_workers=2)
    test_set = DataLoader(test_set, batch_size=self.batch_size, shuffle=False,  num_workers=2)

    return train_set, test_set

  def get_random_batches(self):
    total_batches = len(self.train_set)

    selected_indices = torch.sort(torch.randperm(total_batches)[:self.num_local_steps])[0]

    selected_indices = list(set(selected_indices.tolist()))
    random.shuffle(selected_indices)
    self.selected_batches = []

    for i, batch in enumerate(self.train_set):
      if i in selected_indices:

        self.selected_batches.append(batch)
    print("Total number of batches: ",total_batches, " - Number of selected batches: ", len(self.selected_batches), "selected batches: ",selected_indices )



  # def calculate_fisher_mask(self, n=5):
  #   criterion = nn.CrossEntropyLoss()
  #   fisher_score = {}
  #   last_mask = {}

  #   self.model.eval()
  #   for param in self.model.head.parameters():
  #     param.requires_grad= False

  #   for param in self.model.parameters():
  #     if param.requires_grad:
  #         fisher_score[param] = torch.zeros_like(param.data,device=self.device)
  #         last_mask[param] = torch.ones_like(param.data, device=self.device)

  #   for i in range(n):
  #     for param in fisher_score:
  #       fisher_score[param].zero_()

  #     for images, labels in self.train_set:
  #       images = images.to(self.device)
  #       labels = labels.to(self.device)

  #       outputs = self.model(images)

  #       loss = criterion(outputs, labels)

  #       self.model.zero_grad()
  #       loss.backward()

  #       for param in self.model.parameters():
  #           if param.requires_grad and param.grad is not None:
  #             fisher_score[param] += (param.grad.data.pow(2) * last_mask[param])

  #     new_mask = {}
  #     all_scores = torch.cat([torch.flatten(v) for v in fisher_score.values()])
  #     non_zero_scores=all_scores[all_scores!=0]
  #     k = int(self.sparsity * non_zero_scores.numel())
  #     threshold, _ = torch.kthvalue(non_zero_scores, non_zero_scores.numel()-k)

  #     for param, score in fisher_score.items():

  #         masked_score = score * last_mask[param]
  #         current_mask = ((masked_score < threshold) * last_mask[param]).float()
  #         new_mask[param] = current_mask
  #         last_mask[param] = new_mask[param]
  #   self.grad_mask = new_mask


  def load_model(self):

    if self.path_to_model:
      dino_model = torch.hub.load('facebookresearch/dino:main', self.backbone)
      self.model = DinoClassifier(dino_model=dino_model, num_classes=100, device=self.device)
      state_dict = torch.load(self.path_to_model)
      self.model.load_state_dict(state_dict)

    self.model.to(self.device)

  def gradient_mask(self):


    for name, param in self.model.head.named_parameters():
        if name in self.grad_mask and param.grad is not None:

          param.grad *= self.grad_mask[name].to(param.grad.device)



  def calculate_fisher_mask(self, n=5):
    criterion = nn.CrossEntropyLoss()
    head_params = [p for p in self.model.head.parameters()]
    param_ids = [id(p) for p in head_params]


    fisher_scores = {id_p:torch.zeros_like(p, device=self.device) for id_p,p in  zip(param_ids, head_params)}
    last_mask = {id_p:torch.ones_like(p, device=self.device) for id_p,p in  zip(param_ids, head_params)}



    self.model.eval()

    for _ in range(n):

      for v in fisher_scores.values():
        v.zero_()

      for images, labels in self.train_set:
        images, labels = images.to(self.device), labels.to(self.device)
        outputs = self.model(images)
        loss = criterion(outputs, labels)

        grads = torch.autograd.grad(
            loss,
            head_params,
            create_graph=False,
            retain_graph=False
        )

        for p, g in zip(head_params,grads):
          pid = id(p)
          fisher_scores[pid] += g.detach().pow(2) * last_mask[pid]

      all_scores = torch.cat([
           fisher_scores[id(p)].mul(last_mask[id(p)]).reshape(-1)
          for p in  head_params
          ])

      non_zero  = all_scores[all_scores != 0]

      if non_zero.numel() == 0:
        new_mask = {id(p):torch.zeros_like(p,device=self.device) for p in head_params }
        last_mask = new_mask
        continue

      total_nz = non_zero.numel()
      keep = int( (1 - self.sparsity)* total_nz)
      keep = min(keep, total_nz)

      if keep == 0:
        threshold = non_zero.max() + 1
      elif keep == total_nz:
        threshold = non_zero.min() - 1

      else:
        kth_smallest = total_nz - keep + 1
        threshold, _ = torch.kthvalue(non_zero, k= kth_smallest)

      new_mask = {}

      for p in head_params:
        pid = id(p)
        masked_scores = fisher_scores[pid] * last_mask[pid]
        current_mask = (masked_scores >= threshold).float() * last_mask[pid]
        new_mask[pid]  = current_mask
        last_mask[pid] = current_mask

    self.grad_mask = new_mask







  @torch.no_grad()
  def SGDM(self, buffer,weight_decay=0.0,lr=1e-3,momentum=0.9,damping=0.0, nesterov=False, max_=False):
    for param in self.model.head.parameters():
      if param.grad is None:
        continue
      grad = param.grad

      if weight_decay != 0:
        grad = grad.add(param, alpha=weight_decay)

      pid = id(param)
      buf = buffer.get(pid)
      if buf is None:
        buf = torch.zeros_like(param)
        buffer[pid] = buf


      if momentum != 0 :
        buf.mul_(momentum).add_(grad, alpha=(1 - damping))
        update = grad.add(buf, alpha= momentum) if nesterov else buf

      else:

        update = grad

      gm = self.grad_mask
      mask = gm.get(pid)

      if mask is None:
        mask = torch.ones_like(update)
      else:
        mask = mask.to(device=update.device, dtype=update.dtype)
        if mask.numel() == 1:
          mask = mask.expand_as(update)
        elif mask.shape != update.shape:

          mask = mask.expand_as(update)

      update = update * mask

      param.add_(update, alpha=(lr if max_ else -lr))

      buffer[pid] = buf
    return buffer









  def SGDM_train(self):
    criterion = nn.CrossEntropyLoss()

    tic()
    self.calculate_fisher_mask()



    for p in self.model.backbone.parameters():
      p.requires_grad = False
    self.model.backbone.eval()

    for p in self.model.head.parameters():
        p.requires_grad = True

    self.model.train()

    buffer = {}
    running_loss = 0.0


    for step_num in range(len(self.selected_batches)):
      images, labels = self.selected_batches[step_num]
      images, labels = images.to(self.device), labels.to(self.device)

      self.model.zero_grad(set_to_none=True)
      outputs = self.model(images)
      loss = criterion(outputs, labels.long())

      loss.backward()

      buffer = self.SGDM(buffer=buffer, momentum=0.9)





      running_loss += loss.item()
      # loc_stp_loss = running_loss / len(self.train_set)
      avg_loss = running_loss / (step_num + 1)
      print(f"client: {self.id}- local step number: {step_num} - step loss: {avg_loss:.4f} " )
    self.duration = toc()
    self.train_loss = running_loss / len(self.selected_batches)
    # self.train_loss = running_loss / max(1, len(self.selected_batches))

  def train_default(self):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(self.model.head.parameters(), lr=1e-3, momentum=0.9)
    tic()
    self.calculate_fisher_mask()


    for p in self.model.backbone.parameters():
      p.requires_grad = False
    self.model.backbone.eval()

    for p in self.model.head.parameters():
        p.requires_grad = True

    self.model.train()

    running_loss = 0.0
    for step_num in range(len(self.selected_batches)):


      images, labels = self.selected_batches[step_num]
      images, labels = images.to(self.device), labels.to(self.device)

      optimizer.zero_grad(set_to_none=True)
      outputs = self.model(images)
      loss = criterion(outputs, labels.long())
      loss.backward()


      for p in self.model.head.parameters():
            if p.grad is None:
                continue
            m = self.grad_mask.get(id(p))
            if m is not None:
                p.grad.mul_(m.to(p.grad.device, dtype=p.grad.dtype))

      optimizer.step()

      running_loss += loss.item()

      # loc_stp_loss = running_loss / len(self.train_set)
      avg_loss = running_loss / (step_num + 1)
      print(f"client: {self.id}- local step number: {step_num} - step loss: {avg_loss:.4f} " )
    self.duration = toc()
    self.train_loss = running_loss / len(self.selected_batches)
    # self.train_loss = running_loss / max(1, len(self.selected_batches))

  def evaluate(self):
    criterion = nn.CrossEntropyLoss()
    self.model.eval()
    correct = 0
    total = 0
    test_loss = 0
    with torch.no_grad():
      for index, (images, labels) in enumerate(self.train_set):
        images, labels = images.to(self.device), labels.to(self.device)
        outputs = self.model(images)

        _, prediction = torch.max(outputs.data,1)
        loss = criterion(outputs, labels)
        test_loss += loss.item() * labels.size(0)


        total += labels.size(0)

        correct += (prediction == labels).sum().item()
    self.accuracy = 100 * correct / total
    self.loss = test_loss / total


  def confirm_save(self,path):
      torch.save(self.model.head.state_dict(),  path )

  def create_log(self, model_name, path, round_number):

    log_dict= {
        "client_id":[self.id],
        "backbone":[self.backbone],
        "model_name":[model_name],
        "initial_model_name":[initial_model_name],
        "path": [path],
        "num_of_clients":[self.n_clients],
        "Measurement_criteria":["accuracy,loss,train_loss"],
        "accuracy":[self.accuracy],
        "loss":[self.loss],
        "train_loss":[self.train_loss],
        "splitting_method":[self.spliting_method],
        "sparsity":[self.sparsity],
        "size_of_dataset": [len(self.data_set.dataset)],
        "client_train_size":[len(self.train_set.dataset)],
        "client_test_size":[len(self.test_set.dataset)],
        "train_test_ratio":[self.spliting_ratio],
        "classes":[self.classes],
        "round_number":[round_number],
        "duration":[self.duration],
        "time": [get_current_time()],
        "path_to_subsets":[self.path_to_subsets],
        "path_to_class_combs":[self.path_to_class_combs]
    }


    return pd.DataFrame(log_dict)

In [16]:
#@title Random clients selection



selection_percentage = 10 #@param {"type":"integer"}

def get_random_clients(n_clients, initial_model_name, selection_percentage=10):
  global_log = pd.read_csv("/content/drive/MyDrive/MLDL_FederatedLearning/csv/global_log.csv")
  filter = (global_log["model_name"] == initial_model_name)
  filtered_models = global_log[filter]
  r_num = filtered_models["round_number"].values[0]

  set_seed(int(r_num),is_seed_fixed)
  if os.path.exists("/content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv"):
    clients_df = pd.read_csv("/content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv")
    clients_df = clients_df[clients_df['initial_model_name']== initial_model_name]
    selected_clients = clients_df['client_id'].values
  else:
    selected_clients = np.array([], dtype=np.int16)
  while len(selected_clients) < (selection_percentage / 100 ) * n_clients:
    rand_int = torch.randint(0,n_clients,(1,))[0].item()
    if rand_int not in selected_clients:
      selected_clients = np.append(selected_clients,rand_int)

  return selected_clients

# selected_clients = get_random_clients(n_clients,initial_model_name,selection_percentage)
# print(selected_clients)

# Training loop

**<h1>❗ Important Notice ❗</h1>**

**Regarding `save_data`:**
Please be aware that checking the `save_data` option will generate a **new data subset** and a **new initial model** based on your specified parameters.

**⚠️ Crucial: Using Existing Models with New Data/Parameters ⚠️**
If you intend to use an *existing model* but wish to apply it to a *different data subset*, use a *different data splitting method*, or make *any other changes to the data or algorithm*, you **MUST** assign a **new and unique model name**.

**Why is this critical?**
Failing to use a unique model name will make it impossible to differentiate between models for each client when filtering. This will lead to inaccurate results from the client aggregation function on the server.

In [18]:

load_data = False #@param{"type":"boolean"}
initial_model_name = "fd2666ce-eda6-4873-b246-e1e6c1df708d" #@param{"type":"string"}
path_to_subsets = "/content/drive/MyDrive/MLDL_FederatedLearning/client_subsets/client_data_iid_100clients_3faef75c-38d6-4d9f-91af-9fb902f9d7eb.pth" # @param {"type":"string"}
path_to_class_combs = "" # @param {"type":"string"}





if load_data:

  initial_model_log_df = pd.read_csv("/content/drive/MyDrive/MLDL_FederatedLearning/csv/global_log.csv")
  initial_model_path = initial_model_log_df[initial_model_log_df["model_name"] == initial_model_name]["path"].values[0]
  initial_model_round_num = initial_model_log_df[initial_model_log_df["model_name"] == initial_model_name]["round_number"].values[0]
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  dino_model = torch.hub.load('facebookresearch/dino:main', backbone)
  initial_model = DinoClassifier(dino_model=dino_model, num_classes=100, device=device)
  initial_model.load_state_dict(torch.load(initial_model_path))

  client_data = torch.load(path_to_subsets, weights_only=False)
  if not spliting_method == "i.i.d. sharing":
    class_combs = torch.load(path_to_class_combs)





In [24]:
from uuid import uuid4
save_data = True #@param{"type":"boolean"}

if load_data:
  save_data = False

if save_data:
  method = "iid" if spliting_method == "i.i.d. sharing" else "noniid"
  if spliting_method == "i.i.d. sharing":
    path_to_subsets = f"/content/drive/MyDrive/MLDL_FederatedLearning/client_subsets/client_data_{method}_{str(n_clients)}clients_{str(uuid4())}.pth"
    client_data = iid_sharing(train_set, n_clients)
    class_combs = "all"
    print(spliting_method)
  else:
    client_data, class_combs = noniid_sharing(train_set,Nc=Nc, n_clients=n_clients)
    path_to_class_combs = f"/content/drive/MyDrive/MLDL_FederatedLearning/client_subsets/class_combs_{method}_{str(n_clients)}clients_{str(uuid4())}.pth"
    print(spliting_method)

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  dino_model = torch.hub.load('facebookresearch/dino:main', backbone)
  initial_model = DinoClassifier(dino_model=dino_model, num_classes=100, device=device)
  initial_model_name = next_id("/content/drive/MyDrive/MLDL_FederatedLearning/csv/global_log.csv")
  initial_model_path = "/content/drive/MyDrive/MLDL_FederatedLearning/models/global/" + initial_model_name + ".pth"
  initial_model_round_num = 0
  initial_model_log = {
    "backbone": [backbone],
    "model_name": [initial_model_name],
    "num_of_clients": [n_clients],
    "path": [initial_model_path],
    "Measurement_criteria": [None],
    "prev_global_model_name":[None],
    "accuracy": [None],
    "loss": [None],
    "splitting_method": [spliting_method],
    "size_of_dataset": [len(train_dataset)],
    "train_test_ratio": [None],
    "classes": [None],
    "round_number": [0],
    "time": [get_current_time()],
    "path_to_subsets": [path_to_subsets],
    "path_to_class_combs": [path_to_class_combs],
    "num_of_participants": [None]
}
  initial_model_log["aggregation_method"] =[ np.nan]
  initial_model_log["contributors"] =[ np.nan]
  initial_model_log["momentum_vector_path"] = [np.nan]


  initial_model_log = pd.DataFrame(initial_model_log)
  initial_model_log = initial_model_log[['backbone',
                'num_of_clients',
                'splitting_method',
                'aggregation_method',
                'Measurement_criteria',
                'accuracy',
                'loss',
                'size_of_dataset',
                'train_test_ratio',
                'classes',
                'round_number',
                'num_of_participants',
                'model_name',
                'prev_global_model_name',
                "contributors",
                'path',
                "momentum_vector_path",
                'path_to_subsets',
                'path_to_class_combs',
                'time'
                ]]
  if not os.path.exists("/content/drive/MyDrive/MLDL_FederatedLearning/csv/global_log.csv"):
    initial_model_log.to_csv("/content/drive/MyDrive/MLDL_FederatedLearning/csv/global_log.csv", index=False)
  else:
    initial_model_log.to_csv("/content/drive/MyDrive/MLDL_FederatedLearning/csv/global_log.csv", mode='a', header=False, index=False)

  torch.save(initial_model.state_dict(), initial_model_path)




  torch.save(client_data, path_to_subsets)
  if method== "noniid":
    torch.save(class_combs, path_to_class_combs)





i.i.d. sharing


Using cache found in /root/.cache/torch/hub/facebookresearch_dino_main


Detected feature dimontion: 384


In [25]:
set_seed(seed,is_seed_fixed)

selected_clients = get_random_clients(n_clients,initial_model_name,selection_percentage)
print(selected_clients)

if not spliting_method == "i.i.d. sharing":
  path_to_class_combs = " "

log_file = "/content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv"

print(initial_model_name)
for client_num in selected_clients:
  if os.path.exists("/content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv"):
    all_clients_df = pd.read_csv(log_file)
    filtered_clients_df = all_clients_df[all_clients_df["initial_model_name"] == initial_model_name]
    print( f"{np.where(selected_clients == client_num)[0] +1 }/{len(selected_clients)} ","#"*100)
    if client_num in filtered_clients_df["client_id"].values:
      print(f"Client {client_num} is already trained")
      continue
# try:
  client = Client(id=client_num,
                data=client_data[client_num] ,
                spliting_method=spliting_method,
                classes="all",
                n_clients=n_clients,
                batch_size = batch_size,
                num_epochs= 10,
                initial_model = copy.deepcopy(initial_model),
                backbone=backbone,
                path_to_model=None,
                spliting_ratio={"train":0.8, "test":0.2},
                path_to_subsets=path_to_subsets,
                path_to_class_combs=path_to_class_combs
                )
  print("Data size: ",len(client_data[client_num]))
  print("Backbone: ", backbone)


  client.SGDM_train()
  # client.train_default()
  client.evaluate()
  # Use the save_client method from the Client class
  log = client.create_log(
      model_name=next_id(log_file), # Generate a new model name
      path=f"/content/drive/MyDrive/MLDL_FederatedLearning/models/clients/{next_id(log_file)}.pth", # Generate a new path
      round_number=initial_model_round_num + 1
      )
  client.confirm_save(log['path'][0]) # Save the model

  if not os.path.exists(log_file):
    log.to_csv(log_file, index=False)

    print("new csv file ")
    print(f"name: {log['model_name'][0]} ")
    print(f"path: {log['path'][0]} ")
    print(f"Logged client {client_num} to {log_file}")

  else: # HERE
  # Create a new CSV file IF path doesn't exist
    # This check is no longer necessary as we generate a new id and path every time
    path_check = pd.read_csv(log_file)['model_name'].values # model_name

    # if log['model_name'][0] not in path_check:
    client.confirm_save(log['path'][0])
    log.to_csv(log_file, mode='a', header=False, index=False)
    print(f"name: {log['model_name'][0]} ")
    print(f"path: {log['path'][0]} ")
    print(f"Logged client {client_num} to {log_file}")
    # else :
    #   print("Existing Log")
  del client

[44 39 33 60 63 79 27  3 97 83]
e0dba491-1379-4419-86d3-8d10285ec32c
Total number of batches:  13  - Number of selected batches:  5 selected batches:  [6, 3, 2, 12, 9]
Data size:  500
Backbone:  dino_vits16
client: 44- local step number: 0 - step loss: 6.8544 
client: 44- local step number: 1 - step loss: 7.0364 
client: 44- local step number: 2 - step loss: 6.8248 
client: 44- local step number: 3 - step loss: 6.8290 
client: 44- local step number: 4 - step loss: 6.9401 
new csv file 
name: 143d81d1-904a-4b85-89ba-adc7def1783d 
path: /content/drive/MyDrive/MLDL_FederatedLearning/models/clients/8a496a88-8575-4722-babe-2b3228de2673.pth 
Logged client 44 to /content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv
[2]/10  ####################################################################################################
Total number of batches:  13  - Number of selected batches:  5 selected batches:  [0, 7, 5, 9, 10]
Data size:  500
Backbone:  dino_vits16
client: 39- local step n

In [17]:
import pandas as pd
import os

# Load your DataFrame
df = pd.read_csv('/content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv')  # Change to your file path and format

# Drop rows where 'path' is NaN or not a string
df = df[df['path'].notna()]
df['path'] = df['path'].astype(str).str.strip()  # Remove whitespace

# Check if each path exists
df_filtered = df[df['path'].apply(lambda x: os.path.exists(x))]
n_error = len(df["client_id"].values) -  len(df_filtered["client_id"].values)
print(f" {n_error} clients' log are removed due to the not existing saved model file")
# Save the cleaned DataFrame
df_filtered.to_csv('/content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv', index=False)
# print(len(df),len(df_filtered))

 0 clients' log are removed due to the not existing saved model file


# Automated training loop

In [None]:
del selected_clients

In [35]:
from google.colab import output
from google.colab import drive

set_seed(seed,is_seed_fixed)

global_log_df = None
# @title automate
import pandas as pd
import os
import time
initial_model_name = "e0dba491-1379-4419-86d3-8d10285ec32c" #@param
flag = False
counter = 0
while not flag:

  while True:
    del global_log_df
    global_log_df = pd.read_csv("/content/drive/MyDrive/MLDL_FederatedLearning/csv/global_log.csv")
    pre_models_names = global_log_df["prev_global_model_name"].values


    if initial_model_name in pre_models_names:
      log_buffer = global_log_df[global_log_df["prev_global_model_name"] == initial_model_name ]
      initial_model_name = log_buffer["model_name"].values[0]

    else:
      # log_buffer = global_log_df[global_log_df["model_name"] == initial_model_name]
      r = log_buffer["round_number"].values[0]
      path_to_subsets = log_buffer["path_to_subsets"].values[0]
      path_to_class_combs = log_buffer["path_to_class_combs"].values[0]
      break


  counter +=1
  if r > 200 or counter > 100:
    flag = True
    break
  clients_data = pd.read_csv("/content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv")
  filter =  clients_data['initial_model_name']== initial_model_name
  filtered_clients_data = clients_data[filter] # Using filter to collect clients with specified initial model
  if len(filtered_clients_data) >= n_clients / selection_percentage:
    print("Waiting for Aggregation" )
    print("last model was: ",initial_model_name )
    time.sleep(20)

    continue
  output.clear()
  try:
    initial_model_log_df = pd.read_csv("/content/drive/MyDrive/MLDL_FederatedLearning/csv/global_log.csv")
    initial_model_path = initial_model_log_df[initial_model_log_df["model_name"] == initial_model_name]["path"].values[0]
    initial_model_round_num = initial_model_log_df[initial_model_log_df["model_name"] == initial_model_name]["round_number"].values[0]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dino_model = torch.hub.load('facebookresearch/dino:main', backbone)
    initial_model = DinoClassifier(dino_model=dino_model, num_classes=100, device=device)
    initial_model.load_state_dict(torch.load(initial_model_path))

    client_data = torch.load(path_to_subsets, weights_only=False)
    if not spliting_method == "i.i.d. sharing":
      class_combs = torch.load(path_to_class_combs)


    print("="*200,"\n","="*200)
    print("round number:",r, " - initial model:",initial_model_name)

    selected_clients = get_random_clients(n_clients,initial_model_name,selection_percentage)
    print("selected clients",selected_clients)


    if not spliting_method == "i.i.d. sharing":
      path_to_class_combs = " "

    log_file = "/content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv"
  except:
    print("error in loading file")
    drive.flush_and_unmount()
    time.sleep(10)
    drive.mount('/content/drive')
    continue

  for client_num in selected_clients:
    if os.path.exists("/content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv"):
      all_clients_df = pd.read_csv(log_file)
      filtered_clients_df = all_clients_df[all_clients_df["initial_model_name"] == initial_model_name]
      print( f"{np.where(selected_clients == client_num)[0] +1 }/{len(selected_clients)} ","#"*100)
      if client_num in filtered_clients_df["client_id"].values:
        print(f"Client {client_num} is already trained")
        # continue
  # try:
    client = Client(id=client_num,
                  data=client_data[client_num] ,
                  spliting_method=spliting_method,
                  classes="all",
                  n_clients=n_clients,
                  batch_size = batch_size,
                  num_epochs= 10,
                  initial_model = copy.deepcopy(initial_model),
                  backbone=backbone,
                  path_to_model=None,
                  spliting_ratio={"train":0.8, "test":0.2},
                  path_to_subsets=path_to_subsets,
                  path_to_class_combs=path_to_class_combs
                  )
    print("Data size: ",len(client_data[client_num]))
    print("Backbone: ", backbone)


    client.SGDM_train()
    # client.train_default()
    client.evaluate()
    # Use the save_client method from the Client class
    log = client.create_log(
        model_name=next_id(log_file), # Generate a new model name
        path=f"/content/drive/MyDrive/MLDL_FederatedLearning/models/clients/{next_id(log_file)}.pth", # Generate a new path
        round_number=initial_model_round_num + 1
        )
    client.confirm_save(log['path'][0]) # Save the model

    if not os.path.exists(log_file):
      log.to_csv(log_file, index=False)

      print("new csv file ")
      print(f"name: {log['model_name'][0]} ")
      print(f"path: {log['path'][0]} ")
      print(f"Logged client {client_num} to {log_file}")

    else: # HERE
    # Create a new CSV file IF path doesn't exist
      # This check is no longer necessary as we generate a new id and path every time
      path_check = pd.read_csv(log_file)['model_name'].values # model_name

      # if log['model_name'][0] not in path_check:
      client.confirm_save(log['path'][0])
      log.to_csv(log_file, mode='a', header=False, index=False)
      print(f"name: {log['model_name'][0]} ")
      print(f"path: {log['path'][0]} ")
      print(f"Logged client {client_num} to {log_file}")
      # else :
      #   print("Existing Log")
    del client

  del selected_clients

  time.sleep(50)



Using cache found in /root/.cache/torch/hub/facebookresearch_dino_main


Detected feature dimontion: 384
round number: 50  - initial model: 62227862-ecd5-4a61-a2c3-9b7623dc0c08
selected clients [ 4  8 83  1 29 50 10  5 94 18]
[1]/10  ####################################################################################################
Total number of batches:  13  - Number of selected batches:  5 selected batches:  [12, 0, 7, 10, 11]
Data size:  500
Backbone:  dino_vits16
client: 4- local step number: 0 - step loss: 3.2694 
client: 4- local step number: 1 - step loss: 2.9468 
client: 4- local step number: 2 - step loss: 2.8909 
client: 4- local step number: 3 - step loss: 2.8052 
client: 4- local step number: 4 - step loss: 2.8423 
name: adecde8a-28cd-455e-92f0-1df84ae0fec3 
path: /content/drive/MyDrive/MLDL_FederatedLearning/models/clients/41704cc0-0009-474e-b41f-4b82501f352c.pth 
Logged client 4 to /content/drive/MyDrive/MLDL_FederatedLearning/csv/client_log.csv
[2]/10  ########################################################################################