## Importing Libraries

In [1]:
# Standard library imports
import os
import sys
import json
import random
import shutil
from collections import OrderedDict

# Third-party library imports
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import Dataset, DataLoader, Subset
import torchvision
import torchvision.transforms as transforms
from torchvision import datasets, models
from PIL import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Google Colab specific imports
from google.colab import drive

# Set the working directory
DIR_DATA = '/content/'
os.chdir(DIR_DATA)


## Checkpoints

In [7]:
import os
import torch

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Create a Folder in the root directory
!mkdir -p "/content/drive/My Drive/My Folder/checkpoints_shakespeare"

DIR_DATA = "./data"
CHECKPOINT_DIR = '/content/drive/My Drive/My Folder/checkpoints_shakespeare'

os.makedirs(CHECKPOINT_DIR, exist_ok=True)

def save_checkpoint(model, optimizer, epoch, hyperparameters, subfolder="", data_to_save=None):
    """Salva il checkpoint del modello e rimuove quello precedente."""
    subfolder_path = os.path.join(CHECKPOINT_DIR, subfolder)
    os.makedirs(subfolder_path, exist_ok=True)

    # File corrente e precedente
    filename = f"model_epoch_{epoch}_params_{hyperparameters}.pth"
    filepath = os.path.join(subfolder_path, filename)
    filename_json = f"model_epoch_{epoch}_params_{hyperparameters}.json"
    filepath_json = os.path.join(subfolder_path, filename_json)


    previous_filename = f"model_epoch_{epoch -1}_params_{hyperparameters}.pth"
    previous_filepath = os.path.join(subfolder_path, previous_filename)
    previous_filename_json = f"model_epoch_{epoch -1}_params_{hyperparameters}.json"
    previous_filepath_json = os.path.join(subfolder_path, previous_filename_json)

    # Rimuove il checkpoint precedente
    if epoch > 1 and os.path.exists(previous_filepath) and os.path.exists(previous_filepath_json):
        os.remove(previous_filepath)
        os.remove(previous_filepath_json)

    # Salva il nuovo checkpoint
    if optimizer is not None:
        torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),  # Salvataggio dello stato dell'ottimizzatore
            'epoch': epoch
        }, filepath)
    else:
        torch.save({
            'model_state_dict': model.state_dict(),
            'epoch': epoch
        }, filepath)
    print(f"Checkpoint salvato: {filepath}")

    with open(filepath_json, 'w') as json_file:
      json.dump(data_to_save, json_file, indent=4)


def load_checkpoint(model, optimizer, hyperparameters, subfolder=""):
    """Carica l'ultimo checkpoint disponibile basato sugli iperparametri."""
    subfolder_path = os.path.join(CHECKPOINT_DIR, subfolder)
    if not os.path.exists(subfolder_path):
        print("No checkpoint found, Starting now...")
        return 1, None  # Le epoche iniziano da 1

    # Cerca i file con gli iperparametri specificati
    files = [f for f in os.listdir(subfolder_path) if f"params_{hyperparameters}" in f and f.endswith('.pth')]
    if files:
        # Trova il file con l'epoca più alta
        latest_file = max(files, key=lambda x: int(x.split('_')[2]))
        filepath = os.path.join(subfolder_path, latest_file)
        checkpoint = torch.load(filepath)

        model.load_state_dict(checkpoint['model_state_dict'])
        if optimizer is not None:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

        # Trova e carica il file JSON associato
        json_filename = latest_file.replace('.pth', '.json')
        json_filepath = os.path.join(subfolder_path, json_filename)
        json_data = None
        if os.path.exists(json_filepath):
            with open(json_filepath, 'r') as json_file:
                json_data = json.load(json_file)
            print(f"JSON data loaded: {json_filepath}")
        else:
            print(f"No JSON file found for: {latest_file}")

        print(f"Checkpoint found: Resume epoch {checkpoint['epoch'] + 1}")
        return checkpoint['epoch'] + 1, json_data

    print("No checkpoint found, Starting now...")
    return 1, None  # Le epoche iniziano da 1



ValueError: mount failed

## Shakespeare Dataset

In [None]:
def text_transform(text, max_length=100):
    # Tokenizzazione semplice: converti ogni carattere in un valore numerico (es. ASCII)
    tokenized = [ord(char) for char in text]

    # Padding o Troncamento per lunghezza fissa
    if len(tokenized) < max_length:
        tokenized += [0] * (max_length - len(tokenized))  # Pad con zeri
    else:
        tokenized = tokenized[:max_length]  # Troncamento

    # Converte in tensore
    return torch.tensor(tokenized, dtype=torch.float)


class ShakespeareDataset(Dataset):
    def __init__(self, root, split, preprocess_params=None, transform=None):
        """
        Args:
            root (str): Path to the dataset directory.
            split (str): Dataset split, either 'train' or 'test'.
            preprocess_params (dict, optional): Parameters for running preprocess.sh script. Keys include:
                - sharding (str): 'iid' or 'niid' for data partitioning.
                - iu (float): Fraction of users if i.i.d. sampling.
                - sf (float): Fraction of data to sample.
                - k (int): Minimum number of samples per user.
                - t (str): 'user' or 'sample' for train-test partition.
                - tf (float): Fraction of data in training set.
                - raw (bool): Include raw text data.
                - smplseed (int): Seed for sampling.
                - spltseed (int): Seed for splitting.
        """
        self.root = root
        self.split = split
        self.preprocess_params = preprocess_params or {}

        if transform is None:
            self.transform = text_transform

        # Ensure the working directory is set to the dataset folder
        os.chdir(self.root)

        # Run preprocessing script if needed
        self._preprocess_data()

        # Load the dataset
        self.data = self._load_data()

    def _preprocess_data(self):
        """Runs preprocess.sh with the given parameters."""
        cmd = "bash preprocess.sh"

        if 'sharding' in self.preprocess_params:
            cmd += f" -s {self.preprocess_params['sharding']}"
        if 'iu' in self.preprocess_params:
            cmd += f" --iu {self.preprocess_params['iu']}"
        if 'sf' in self.preprocess_params:
            cmd += f" --sf {self.preprocess_params['sf']}"
        if 'k' in self.preprocess_params:
            cmd += f" -k {self.preprocess_params['k']}"
        if 't' in self.preprocess_params:
            cmd += f" -t {self.preprocess_params['t']}"
        if 'tf' in self.preprocess_params:
            cmd += f" --tf {self.preprocess_params['tf']}"
        if 'raw' in self.preprocess_params and self.preprocess_params['raw']:
            cmd += f" --raw"
        if 'smplseed' in self.preprocess_params:
            cmd += f" --smplseed {self.preprocess_params['smplseed']}"
        if 'spltseed' in self.preprocess_params:
            cmd += f" --spltseed {self.preprocess_params['spltseed']}"

        print(f"Running command: {cmd}")
        os.system(cmd)
        os.chdir(DIR_DATA)


    def _load_data(self):
      """Loads data from the JSON file in the train or test folder, assuming only one file per folder."""
      # Identifica il file JSON nella directory specificata
      folder_path = os.path.join(self.root,'data', self.split)
      json_files = [f for f in os.listdir(folder_path) if f.endswith(".json")]

      if len(json_files) != 1:
          raise ValueError(f"Expected exactly one JSON file in {folder_path}, but found {len(json_files)} files.")

      file_path = os.path.join(folder_path, json_files[0])

      # Carica i dati dal file JSON
      with open(file_path, 'r') as f:
          data = json.load(f)

      # Converti la struttura JSON in un DataFrame di pandas
      records = []
      for user, user_data in data['user_data'].items():
          for x, y in zip(user_data['x'], user_data['y']):
              records.append({
                  'user': user,
                  'input': x,
                  'target': y
              })

      return pd.DataFrame(records)


    def get_dataframe(self):
        """Returns the dataset as a pandas DataFrame."""
        return self.data

    def __len__(self):
        """Returns the number of samples in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        sample = {
            'input': self.data.iloc[idx]['input'],
            'target': self.data.iloc[idx]['target']
        }

        # Applica la trasformazione agli input (e.g., tokenizzazione e padding)
        if self.transform:
            sample['input'] = self.transform(sample['input'])

        # Converte i target in tensori
        sample['target'] = torch.tensor(sample['target'], dtype=torch.long)
        return sample


## Shakespeare Model Architecture

In [4]:
class ShakespeareRNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_size, num_layers):
        super(ShakespeareRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_size, num_layers, batch_first=True, dropout=0.2)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)  # Embed input
        out, hidden = self.lstm(x, hidden)  # Pass through LSTM layers
        out = self.fc(out)  # Fully connected layer for output
        return out, hidden

## Centralized training functions

In [5]:
def train_model(model, train_loader, test_loader, optimizer, scheduler, criterion, epochs, hyperparameters):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Carica checkpoint se esiste
    start_epoch = load_checkpoint(model, optimizer, hyperparameters,"Centralized/")

    train_losses, test_losses, test_accuracies = [], [], []

    for epoch in range(start_epoch, epochs):
        model.train()
        epoch_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        # Step the scheduler
        scheduler.step()

        # Salva checkpoint
        save_checkpoint(model, optimizer, epoch, hyperparameters,"Centralized/")

        # Evaluate on test set
        test_loss, test_accuracy = evaluate_model(model, test_loader, criterion, device)
        train_losses.append(epoch_loss / len(train_loader))
        test_losses.append(test_loss)
        test_accuracies.append(test_accuracy)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {epoch_loss:.4f}, "
              f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

    return train_losses, test_losses, test_accuracies

def evaluate_model(model, test_loader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            _, predicted = outputs.max(1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)

    return total_loss / len(test_loader), correct / total


## Centralized training

In [9]:
# Always run before creating new datasets

if os.path.exists("/content/leaf/"):
  # Use shutil.rmtree to remove the folder and its contents
  shutil.rmtree("/content/leaf")
  print(f"Successfully deleted folder leaf")

os.chdir("/content/")
!git clone https://github.com/maxfra01/leaf.git

# -----------------------------------------

preprocess_params = {
        'sharding': 'iid',
        'sf': 1.0,
        't': 'sample',
        'tf': 0.8,
    } # Get the full-size dataset

train_dataset_big = ShakespeareDataset(root="leaf/data/shakespeare/", split="train", preprocess_params=preprocess_params)
test_dataset = ShakespeareDataset(root="leaf/data/shakespeare", split="test", preprocess_params=preprocess_params)


Successfully deleted folder leaf
Cloning into 'leaf'...
remote: Enumerating objects: 772, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (6/6), done.[K
remote: Total 772 (delta 0), reused 0 (delta 0), pack-reused 766 (from 1)[K
Receiving objects: 100% (772/772), 6.78 MiB | 13.87 MiB/s, done.
Resolving deltas: 100% (363/363), done.
Running command: bash preprocess.sh -s iid --sf 1.0 -t sample --tf 0.8
Running command: bash preprocess.sh -s iid --sf 1.0 -t sample --tf 0.8


In [None]:
# Hyperparameters
BATCH_SIZE = 64
LEARNING_RATE = 0.01
MOMENTUM = 0.8
WEIGHT_DECAY=1e-4
EPOCHS = 50

hyperparameters = f"BS{BATCH_SIZE}_LR{LEARNING_RATE}_WD{WEIGHT_DECAY}_M{MOMENTUM}"


# Create the validation split
indexes = range(0, len(train_dataset_big))
splitting = train_test_split(indexes, train_size = 0.8, random_state = 42, shuffle = True)
train_indexes = splitting[0]
val_indexes = splitting[1]

train_dataset = Subset(train_dataset_big, train_indexes)
val_dataset = Subset(train_dataset_big, val_indexes)

# Create Dataloaders
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# Set device and model parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_dim = len(train_dataset[0]['input'])
output_dim = len(set(train_dataset_big.data['target']))  # Numero di classi

model_shakespeare = ShakespeareRNN(input_dim=input_dim, output_dim=output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model_shakespeare.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
scheduler = CosineAnnealingLR(optimizer, T_max=200)

# Train the model
train_losses = []
val_losses = []
val_accuracies = []

train_losses, val_losses, val_accuracies = train_model(
    model=model_shakespeare,
    train_loader=train_dataloader,
    test_loader=val_dataloader,
    optimizer=optimizer,
    scheduler=scheduler,
    criterion=criterion,
    epochs=EPOCHS,
    hyperparameters=hyperparameters
)

# Evaluation on test split

test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=2, shuffle=False)

test_loss, test_accuracy = evaluate_model(model_shakespeare, test_dataloader, criterion, device)

print("\nFinal Model Evaluation on Test Set:")
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}")

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(val_losses, label='Shakespeare Val Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(val_accuracies, label='Shakespare Val Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.show()


TypeError: new(): invalid data type 'str'

## Federate Learning classes

In [8]:
def generate_skewed_probabilities(num_clients, gamma):
    """It generates skewed probabilities for clients using a Dirichlet distribution."""
    probabilities = np.random.dirichlet([gamma] * num_clients)
    return probabilities


def plot_selected_clients_distribution(selected_clients_per_round, num_clients, hyperparameters):
    """Plotta la distribuzione dei client selezionati alla fine del processo."""
    counts = np.zeros(num_clients)

    # Conta quante volte ogni client è stato selezionato in tutti i round
    for selected_clients in selected_clients_per_round:
        for client in selected_clients:
            counts[client] += 1

    plt.figure(figsize=(10, 6))
    plt.bar(range(num_clients), counts, color='skyblue', edgecolor='black')
    plt.title("Distribuzione dei Client Selezionati Durante il Federated Averaging")
    plt.xlabel("Client ID")
    plt.ylabel("Frequenza di Selezione")
    plt.grid(axis='y')
    plt.savefig(f"CIFAR100_Client_distribution_{hyperparameters}.png")
    plt.show()



class Client:

  def __init__(self, model, client_id, data, optimizer_params):
    self.client_id = client_id
    self.data = data
    self.model = model
    self.optimizer_params = optimizer_params

  def train(self, global_weights, local_steps, batch_size):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.model.to(device)
    self.model.load_state_dict(global_weights)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(
        self.model.parameters(),
        lr=self.optimizer_params['lr'],
        momentum=self.optimizer_params['momentum'],
        weight_decay=self.optimizer_params['weight_decay']
        )
    trainloader = DataLoader(self.data, batch_size=batch_size, pin_memory=True, shuffle=True)
    steps = 0  # Track the number of steps
    while steps < local_steps:
      for inputs, targets in trainloader:
          if steps >= local_steps:  # Stop after completing the required steps
              break
          inputs, targets = inputs.to(device), targets.to(device)
          optimizer.zero_grad()
          outputs = self.model(inputs)
          loss = criterion(outputs, targets)
          loss.backward()
          optimizer.step()
          steps += 1
    return self.model.state_dict()


class Server:

  def __init__(self, model, clients, test_data):
    self.model = model
    self.clients = clients
    self.test_data = test_data
    self.round_losses = []
    self.round_accuracies = []
    self.selected_clients_per_round = [] #clint selezionati per skewness

  def federated_averaging(self, local_steps, batch_size, num_rounds, fraction_fit, skewness = None, hyperparameters = None):

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.model.to(device)
     # Carica il checkpoint se esiste
    data_to_load = None
    if skewness is  None:
      start_epoch, data_to_load = load_checkpoint(self.model,optimizer=None,hyperparameters=hyperparameters, subfolder="Federated_Uniform/")
    else:
      start_epoch, data_to_load = load_checkpoint(self.model,optimizer=None,hyperparameters=hyperparameters, subfolder="Federated_Skewed/")

    if data_to_load is not None:
      self.round_losses = data_to_load['round_losses']
      self.round_accuracies = data_to_load['round_accuracies']
      self.selected_clients_per_round = data_to_load['selected_clients_per_round']


    for round in range(start_epoch, num_rounds+1):

      if skewness is not None:
        probabilities = generate_skewed_probabilities(len(self.clients), skewness)
        selected_clients = np.random.choice(self.clients, size=max(1, int(fraction_fit*len(self.clients))), replace=False, p=probabilities)

      else:
        selected_clients = np.random.choice(self.clients, size=max(1, int(fraction_fit*len(self.clients))), replace=False)

      self.selected_clients_per_round.append([client.client_id for client in selected_clients])


      global_weights = self.model.state_dict()

      # Simulating parallel clients training
      client_weights = {}
      for client in selected_clients:
        client_weights[client.client_id] = client.train(global_weights, local_steps, batch_size)

      new_global_weights = {key: torch.zeros_like(value).type(torch.float32) for key, value in global_weights.items()}

      total_data_size = sum([len(client.data) for client in selected_clients])
      for client in selected_clients:
        scaling_factor = len(client.data) / total_data_size
        for key in new_global_weights.keys():
          new_global_weights[key] += scaling_factor * client_weights[client.client_id][key]

      # Update global model weights
      self.model.load_state_dict(new_global_weights)

      # Evaluate global model each 10 rounds
      if round % 10 == 0:
        loss, accuracy = evaluate_model(self.model, DataLoader(self.test_data, batch_size=batch_size, shuffle=True, pin_memory=True), nn.CrossEntropyLoss(), device)
        self.round_losses.append(loss)
        self.round_accuracies.append(accuracy)
        print(f"Round {round}/{num_rounds} - Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

        data_to_save = {
          'round_losses': self.round_losses,
          'round_accuracies': self.round_accuracies,
          'selected_clients_per_round': [[client for client in round_clients] for round_clients in self.selected_clients_per_round]  # Serializziamo solo i client_id
      }

        if skewness is  None:
          save_checkpoint(self.model, None, round , hyperparameters, "Federated_Uniform/", data_to_save)
        else:
          save_checkpoint(self.model, None, round , hyperparameters, "Federated_Skewed/", data_to_save)




    plt.figure(figsize=(12,5))
    plt.subplot(1, 2, 1)
    plt.plot(self.round_losses, label='CIFAR-100 Test Loss')
    plt.xlabel('Round (x10)')
    plt.ylabel('Loss')
    plt.legend()

    plt.subplot(1, 2, 2)
    plt.plot(self.round_accuracies, label='CIFAR-100 Test Accuracy')
    plt.xlabel('Round (x10)')
    plt.ylabel('Accuracy')
    plt.legend()
    if skewness is  None:
      plt.savefig(f"CIFAR100_fedavg_uniform{hyperparameters}.jpg")
    else:
      plt.savefig(f"CIFAR100_fedavg_skew{hyperparameters}.jpg")

    plt.show()

    plot_selected_clients_distribution(self.selected_clients_per_round, len(self.clients), hyperparameters)


