# TP introduction to Federated Learning
Diane Lingrand (diane.lingrand@univ-cotedazur.fr)

Polytech SI5 / M2 - Advanced Deep Learning - 2022-23

In this lab, we will consider a very simple dataset, MNIST, and simulate centralised federated learning using a server and a few workers. The original MNIST dataset will be split into different subsets, with the same number of samples for each worker and the same distribution of classes.

## necessary imports

In [None]:
import matplotlib.pyplot as plt
import random
import numpy as np
from sklearn import ensemble
from sklearn.metrics import ConfusionMatrixDisplay, f1_score
from sklearn.utils import shuffle
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Conv2D, Flatten, MaxPooling2D
from tensorflow.keras.callbacks import EarlyStopping

import tensorflow.keras.utils
import pandas
from pandas import DataFrame
import copy

In [None]:
# Read the MNIST dataset: load_data() returns the train pair and the test pair,
# each made of the images (x_*) and their digit labels (y_*).
from tensorflow.keras.datasets import mnist

(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [None]:
# number of target classes; reused by the model builders and by to_categorical
nbClasses=10
# shapes of the raw arrays as loaded (the images are flattened in a later cell)
print("shape of x_train:", x_train.shape)
print("shape of y_train:", y_train.shape)

In [None]:
# Show one randomly drawn image of the train dataset
# (re-run the cell to draw another one).
import matplotlib.pyplot as plt
i = random.randrange(len(x_train))
plt.imshow(x_train[i], cmap='gray', aspect="auto")
plt.show()

In [None]:
# Flatten every image into a 1-D vector. The number of samples is read from the
# array and -1 lets NumPy infer the vector length (784 for the 28x28 MNIST images),
# so no dataset size is hard-coded.
x_train = x_train.reshape(len(x_train), -1)
x_test = x_test.reshape(len(x_test), -1)
# ... and normalize the data to [0, 1] (grey levels are integers from 0 to 255)
xTrain = x_train.astype('float32')/255
xTest = x_test.astype('float32')/255

# The original labels are the digits themselves. We transform them into categorical (one-hot) labels.
yTrain = tensorflow.keras.utils.to_categorical(y_train, nbClasses)
yTest = tensorflow.keras.utils.to_categorical(y_test, nbClasses)

print('shape of xTrain :', xTrain.shape)
print('shape of yTrain :', yTrain.shape)


# 1. Reference

This section is not exactly about federated learning but allows you to build a benchmark. 
In this section you will choose a neural network topology, learn its weights using the training set and evaluate it using the test set. 

In [None]:
# NEURAL NETWORKS TOPOLOGY PROPOSITIONS

def buildModel1(input_shape=(784,)):
    """Build a small convolutional classifier.

    Layers: Reshape to (28, 28, 1) -> Conv2D(32, 3x3, relu) -> MaxPooling2D(2x2)
    -> Flatten -> Dense(100, relu) -> Dense(nbClasses, softmax).

    Args:
        input_shape: shape of one input sample. The default matches the
            flattened 784-value vectors this notebook trains on; any shape
            holding 28*28 values is accepted, e.g. (28, 28, 1).

    Returns:
        The Sequential model, not compiled.
    """
    # local import: Reshape is not part of the notebook's import cell
    from tensorflow.keras.layers import Reshape

    model = Sequential()
    # Conv2D needs (height, width, channels) samples, whereas the training data
    # is flattened: restore the image layout inside the model.
    model.add(Reshape((28, 28, 1), input_shape=input_shape))
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(nbClasses, activation='softmax'))
    return model

def buildModel2():
    """Build a fully connected classifier for 784-value inputs.

    Hidden layers: 100, 75 and 50 relu units, then 50 sigmoid units.
    The output layer has nbClasses softmax units.

    Returns:
        The Sequential model, not compiled.
    """
    stack = [
        Dense(100, input_dim=784, activation='relu'),
        Dense(75, activation='relu'),
        Dense(50, activation='relu'),
        Dense(50, activation='sigmoid'),
        Dense(nbClasses, activation='softmax'),
    ]
    return Sequential(stack)

def buildModel3():
    """Build the smallest fully connected classifier for 784-value inputs.

    Hidden layers: two layers of 50 relu units, then 50 sigmoid units.
    The output layer has nbClasses softmax units.

    Returns:
        The Sequential model, not compiled.
    """
    network = Sequential()
    # only the first layer declares the input size
    network.add(Dense(50, input_dim=784, activation='relu'))
    for units, act in ((50, 'relu'), (50, 'sigmoid'), (nbClasses, 'softmax')):
        network.add(Dense(units, activation=act))
    return network
  

Using the method "summary()", compare the number of weights to be learned for these architectures. 

In [None]:
# build the first architecture and print its layer-by-layer summary
m1 = buildModel1()
m1.summary()
# to be continued: do the same for the other architectures

At this point, choose a model. 

In [None]:
model = buildModel1() # put your choice here

In [None]:
# We need to define the loss function for the training (categorical cross-entropy, since the
# labels are categorical), the optimisation method (RMSprop) and the accuracy as a metric.

model.compile(optimizer='rmsprop',loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
def plot_history(history):
  """Plot the train and validation accuracy curves of a training run.

  Args:
      history: object whose `history` attribute is a dict holding the
          'accuracy' and 'val_accuracy' series (as returned by fit()).
  """
  curves = history.history
  for key in ('accuracy', 'val_accuracy'):
    plt.plot(curves[key])
  plt.title('model accuracy')
  plt.xlabel('epoch')
  plt.ylabel('accuracy')
  # legend entries follow the plotting order above
  plt.legend(['train', 'val'], loc='upper left')
  plt.show()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# We define a callback that monitors the accuracy on the validation set (a part of the train set)
# and stops the training when it has not improved by more than 1e-4 (min_delta) for 20 epochs (patience).
# Since restore_best_weights=True, when early stopping triggers the weights of the best epoch are
# restored, even if the last accuracy value is not the best one.
# In this example, 20% of the train set (validation_split=0.2) is held out as the validation set used to monitor the convergence.

ourCallback = EarlyStopping(monitor='val_accuracy', min_delta=0.0001, patience=20, verbose=0, mode='auto', baseline=None, restore_best_weights=True)

# let's train the network!
# We do not know when the training will stop, but it will not run for more than 2000 epochs.
h = model.fit(xTrain, yTrain, epochs=2000, batch_size=128, validation_split=0.2, callbacks=[ourCallback])



In [None]:
# train / validation accuracy curves of the training run stored in h
plot_history(h)

In [None]:
# plot every series recorded in the training history on a single figure
pandas.DataFrame(h.history).plot()

What is the value of the F1 score (both train set and test set)?

In [None]:
# print metrics
score = model.evaluate(xTest,yTest)
print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))

# plot_confusion_matrix is deliberately not imported: it was removed in scikit-learn 1.2
# (importing it raises ImportError) and this notebook never calls it.
from sklearn.metrics import confusion_matrix, f1_score
# class indices: argmax over the network outputs and over the one-hot labels
pred_test = np.argmax(model.predict(xTest),axis=1)
true_test = np.argmax(yTest,axis=1)
print(pred_test.shape, yTest.shape)
# f1_score expects the true labels first, then the predictions
print("F1 score: ", f1_score(true_test, pred_test, average=None))
print("F1 score micro: ", f1_score(true_test, pred_test, average='micro'))
print("F1 score macro: ", f1_score(true_test, pred_test, average='macro'))

In [None]:
# Display the confusion matrix (true classes first, predicted classes second),
# with one label per digit.
cm = confusion_matrix(np.argmax(yTest, axis=1), pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(range(10)))
disp.plot()

In [None]:
# this will be useful for the next sections
def printMetrics(m):
    """Evaluate model m on the test set and report its metrics.

    Prints the second value returned by m.evaluate() under its metric name,
    then the per-class, micro and macro F1 scores, and draws the confusion
    matrix. Reads the globals xTest and yTest.

    Args:
        m: a compiled Keras model accepting xTest as input.
    """
    # imported here so that the function does not depend on an import made in another cell
    from sklearn.metrics import confusion_matrix

    score = m.evaluate(xTest,yTest)
    print("%s: %.2f%%" % (m.metrics_names[1], score[1]*100))

    # class indices: argmax over the one-hot labels and over the network outputs
    true_test = np.argmax(yTest,axis=1)
    pred_test = np.argmax(m.predict(xTest),axis=1)

    # f1_score expects the true labels first, then the predictions
    print("F1 score: ", f1_score(true_test, pred_test, average=None))
    print("F1 score micro: ", f1_score(true_test, pred_test, average='micro'))
    print("F1 score macro: ", f1_score(true_test, pred_test, average='macro'))
    cm = confusion_matrix(true_test, pred_test)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0,1,2,3,4,5,6,7,8,9])
    disp.plot()

# 2. Federated Learning

## 2.1 Data and clients

A first model will be learned using 20% of the data (all classes). Workers will load this first model and own another 20% of the data.

In [None]:
# Server subset: the first fifth of every class (about 20% of the train set, all classes).
# The per-class slices are concatenated, so the result always has the same per-sample
# shape as xTrain (the images were flattened earlier; a (tmp, 28, 28) buffer would not match).
parts0 = []
yTrain0 = []
for cl in range(nbClasses):
    cl0 = xTrain[y_train==cl]   # all the samples of class cl
    n0 = len(cl0)//5            # size of one fifth of this class
    parts0.append(cl0[:n0])
    yTrain0 += [cl]*n0
xTrain0 = np.concatenate(parts0).astype(float)
tmp = len(xTrain0)  # total number of samples in the server subset
print(xTrain0.shape)
print(len(yTrain0))
yTrain0 = tensorflow.keras.utils.to_categorical(yTrain0, nbClasses)
print(yTrain0.shape)

In [None]:
# This is **VERY** important for the splitting into validation and train in the fit method:
# xTrain0 was filled class by class, and fit() takes the validation split from the last samples,
# so without shuffling the validation set would only contain the last classes.
# random_state=0 makes the permutation reproducible.
xTrain0, yTrain0 = shuffle(xTrain0, yTrain0, random_state=0)

## 2.2 Pre-learned model

In [None]:
# Exercise: write the code for the server that will define and learn a neural network using xTrain0 and yTrain0
# (the line below is intentionally incomplete and must be filled in)
model0 = #to be continued

In [None]:
# evaluate this model0

In [None]:
# We make a copy of this pre-learned model: later sections restart from modelBase
# in order to try different merging methods from the same starting point.
modelBase = copy.deepcopy(model0)

## 2.3 Workers

Let's start with 2 workers. They first receive a copy of the server model. We assign the next 20% of xTrain and yTrain to the first worker and the following 20% to the second worker.

In [None]:
#Worker 1
model1 = copy.deepcopy(model0)
yTrain1 = []

#Worker 2
model2 = copy.deepcopy(model0)
yTrain2 = []

# Data split: assign a new 20% of the train data to each worker. Within every class, the second
# fifth goes to worker 1 and the third fifth to worker 2 (the first fifth belongs to the server).
# The slices are concatenated, so the arrays keep the per-sample shape of xTrain
# (flattened images; a (tmp, 28, 28) buffer would not match).
parts1 = []
parts2 = []
for cl in range(nbClasses):
    cl0 = xTrain[y_train==cl]
    n0 = len(cl0)//5
    parts1.append(cl0[n0:2*n0])
    parts2.append(cl0[2*n0:3*n0])
    yTrain1 += [cl]*n0
    yTrain2 += [cl]*n0
xTrain1 = np.concatenate(parts1).astype(float)
xTrain2 = np.concatenate(parts2).astype(float)
yTrain1 = tensorflow.keras.utils.to_categorical(yTrain1, nbClasses)
yTrain2 = tensorflow.keras.utils.to_categorical(yTrain2, nbClasses)

# the arrays are ordered class by class: shuffle samples and labels together
xTrain1, yTrain1 = shuffle(xTrain1, yTrain1)
xTrain2, yTrain2 = shuffle(xTrain2, yTrain2)


### 2.3.1 one epoch on each worker

In [None]:
# perform one epoch for each worker

In [None]:
# test-set metrics of the second worker's model
printMetrics(model2)

### 2.3.2 merging to the server using weight averaging

In [None]:
# first method: weight averaging
## get the weights of all the models, compute their average and use the result as the weights of the server (model0)


In [None]:
# evaluate

In [None]:
# perform another epoch and print again the metrics.

In [None]:
## add 2 other workers and do the fed learning again

### 2.3.3 merging to the server using weights averaging on the last layer

In [None]:
# second method: weights averaging on the last layer
# The last layer is taken by position: an auto-generated name such as 'dense_42' depends on how
# many layers were created earlier in the session, so a lookup by name is not reproducible.
wlast0 = model0.layers[-1].get_weights() # 2 elements in the list
nlast0 = len(wlast0)
# similar processing

### 2.3.4 merging to the server using weights averaging on the last 2 layers

In [None]:
# second method (bis): weights averaging only on dense layers (the last 2 layers)

## 2.4 Gradient averaging

### 2.4.1 Introduction to GradientTape

[GradientTape](https://www.tensorflow.org/api_docs/python/tf/GradientTape) allows you to compute and record (tape) gradients of functions using automatic differentiation. This [introduction](https://medium.com/analytics-vidhya/tf-gradienttape-explained-for-keras-users-cc3f06276f22) is also worth reading. 

Here is a simple example:

In [None]:
import tensorflow as tf
import math

In [None]:
# let's initialize a variable:
x = tf.Variable(4.0)

# the gradient tape records the operations that define the function y
with tf.GradientTape() as tape:
    y = x**3

print(y) # value of y computed with x equal to 4.0, i.e. 4.0**3 = 64.0
print(tape.gradient(y,x)) # gradient of y with respect to variable x, i.e. 3*x**2 = 48.0


We can therefore apply this to a neural network considered as a function, taking x as input, y as output and the w values as trainable variables.


In [None]:
# Record the forward pass of the reference model on the whole training set.
with tensorflow.GradientTape() as tape:
    # Make prediction
    pred_y = model(xTrain)
    # Calculate loss (categorical cross-entropy between the labels and the predictions)
    model_loss = tensorflow.keras.losses.categorical_crossentropy(yTrain, pred_y)
    
# Calculate gradients of the loss with respect to each trainable variable of the model
model_gradients = tape.gradient(model_loss, model.trainable_variables)

### 2.4.2 Using GradientTape on the whole model

In [None]:
# restart from the saved pre-learned model: the server and both workers get their own copy
model0 = copy.deepcopy(modelBase)
model1 = copy.deepcopy(model0)
model2 = copy.deepcopy(model0)

In [None]:
# third method: gradient averaging
# collect the gradients (GradientTape)
# one iteration
def step(m, data, labels):
    """Run one training iteration of model m on a batch and return its gradients.

    Args:
        m: model with an optimizer attached (m.optimizer applies the update).
        data: batch of inputs fed to m.
        labels: targets compared to the predictions with categorical cross-entropy.

    Returns:
        The gradients of the loss with respect to m.trainable_variables,
        in the same order as those variables.
    """
    with tensorflow.GradientTape() as tape:
        # forward pass and loss, both recorded by the tape
        batch_loss = tensorflow.keras.losses.categorical_crossentropy(labels, m(data))

    # the loss is passed to the tape as returned by categorical_crossentropy (no explicit reduction)
    grads = tape.gradient(batch_loss, m.trainable_variables)
    # update the weights of m with its own optimizer
    m.optimizer.apply_gradients(zip(grads, m.trainable_variables))
    return grads
 


#### 2.4.2.1 Only one iteration

In [None]:
# using bs as the batch size
bs = 128
# compute the first iteration
n = 0 # index of the iteration (0 is the first one)
# and memorize
## grad1: the gradient for model1
## grad2: the gradient for model2
## compute the average of grad1 and grad2
## use this averaged gradient for modifying the weights of model0



In [None]:
# Send the gradients to the server and use them to modify the weights in the server:
# grad1 (from model1) and grad2 (from model2) are averaged variable by variable, and the
# average, not grad1 alone, is applied to model0.
avgGrad = [(g1 + g2) / 2 for g1, g2 in zip(grad1, grad2)]
model0.optimizer.apply_gradients(zip(avgGrad, model0.trainable_variables))

In [None]:
# print metrics

#### 2.4.2.2 Only one epoch

In [None]:
#the same for 1 epoch


In [None]:
# test-set metrics of the server model
printMetrics(model0)

#### 2.4.2.3 Many epochs

In [None]:
# display evolution of metrics