In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, cohen_kappa_score, confusion_matrix
from typing import Dict, List, Tuple
import pandas as pd
import tensorflow as tf

import flwr as fl
from flwr.common import Metrics

import matplotlib.pyplot as plt

In [2]:
# Set a seed for reproducibility
seed_value = 42
import random
random.seed(seed_value)
import numpy as np
np.random.seed(seed_value)
tf.random.set_seed(seed_value)
tf.keras.utils.set_random_seed(seed_value)

## Data Loading

In [3]:
df_bank_a = pd.read_csv('../data/BankA.csv')
df_bank_b = pd.read_csv('../data/BankB.csv')
df_bank_c = pd.read_csv('../data/BankC.csv')

In [4]:
df_all = pd.concat([df_bank_a, df_bank_b, df_bank_c])

## Data Preprocessing

In [5]:
# Strip all string values from the dataset
df = df_all.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

### Feature Columns

In [6]:
# Combine Never-worked and Without-pay into one category
df['workclass'] = df['workclass'].replace(['Never-worked', 'Without-pay'], 'Not-working')
df['workclass'] = df['workclass'].replace(['?', '*'], 'unknown')
df['workclass'].value_counts()

workclass
Private             546342
Self-emp-not-inc     66145
Local-gov            51137
unknown              47431
State-gov            34717
Self-emp-inc         27715
Federal-gov          25879
Not-working            633
Name: count, dtype: int64

In [7]:
# Combine Married-civ-spouse and Married-AF-spouse into one category
df['marital-status'] = df['marital-status'].replace(['Married-civ-spouse', 'Married-AF-spouse'], 'Married')
df['marital-status'].value_counts()

marital-status
Married                  368820
Never-married            250510
Divorced                 110459
Widowed                   34203
Separated                 25566
Married-spouse-absent     10441
Name: count, dtype: int64

In [8]:
# Replace occupation by 4 categories (low, medium, high, unknown)
df['occupation'] = df['occupation'].replace(['Exec-managerial', 'Prof-specialty'], 'high')
df['occupation'] = df['occupation'].replace(['Armed-Forces', 'Protective-serv', 'Tech-support', 'Sales', 'Craft-repair', 'Transport-moving'], 'medium')
df['occupation'] = df['occupation'].replace(['Adm-clerical', 'Machine-op-inspct', 'Farming-fishing', 'Handlers-cleaners', 'Other-service', 'Priv-house-serv'], 'low')
df['occupation'] = df['occupation'].replace(['?', '*'], 'unknown')
df['occupation'].value_counts()

occupation
low        281223
medium     259651
high       211330
unknown     47795
Name: count, dtype: int64

In [9]:
# Combine Husband and Wife into one category
df['relationship'] = df['relationship'].replace(['Husband', 'Wife'], 'Parent')
df['relationship'].value_counts()

relationship
Parent            362767
Not-in-family     212898
Own-child         119123
Unmarried          80532
Other-relative     24679
Name: count, dtype: int64

In [10]:
# Map native-country to continents
df['native-country'] = df['native-country'].str.strip()
df['native-country'] = df['native-country'].replace(['United-States', 'Puerto-Rico', 'Canada', 'Outlying-US(Guam-USVI-etc)', 'Cuba', 'Jamaica', 'Mexico', 'Dominican-Republic', 'El-Salvador', 'Guatemala', 'Haiti', 'Honduras', 'Nicaragua', 'Trinadad&Tobago', 'Peru', 'Ecuador', 'Columbia', 'Honduras', 'Haiti', 'Guatemala', 'El-Salvador', 'Dominican-Republic', 'Columbia', 'Ecuador', 'Peru', 'Jamaica', 'Mexico', 'Puerto-Rico', 'Cuba', 'Outlying-US(Guam-USVI-etc)', 'Canada', 'United-States'], 'North-America')
df['native-country'] = df['native-country'].replace(['Germany', 'England', 'Italy', 'Poland', 'Portugal', 'Ireland', 'France', 'Yugoslavia', 'Scotland', 'Greece', 'Hungary', 'Holand-Netherlands'], 'Europe')
df['native-country'] = df['native-country'].replace(['Philippines', 'India', 'China', 'Japan', 'Vietnam', 'Taiwan', 'Iran', 'Thailand', 'Hong', 'Cambodia', 'Laos'], 'Asia')
df['native-country'] = df['native-country'].replace(['South', 'Columbia', 'Ecuador', 'Peru'], 'South-America')
df['native-country'] = df['native-country'].replace(['Trinadad&Tobago', 'Honduras', 'Haiti', 'Guatemala', 'El-Salvador', 'Dominican-Republic', 'Columbia', 'Ecuador', 'Peru'], 'Central-America')
df['native-country'] = df['native-country'].replace(['?', '*'], 'Unknown')
df['native-country'].value_counts()

native-country
North-America    753935
Asia              14694
Unknown           14390
Europe            14273
South-America      2707
Name: count, dtype: int64

In [11]:
df['education'] = df['education'].replace(['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th'], 'school')
df['education'] = df['education'].replace(['Assoc-voc', 'Assoc-acdm', 'Prof-school', 'Some-college'], 'higher')
df['education'].value_counts()

education
HS-grad      258661
higher       241834
Bachelors    133796
school       110209
Masters       45697
Doctorate      9802
Name: count, dtype: int64

In [12]:
df['race'] = df['race'].replace(['Black', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other'], 'Other')
df['race'].value_counts()

race
White    686196
Other    113803
Name: count, dtype: int64

In [13]:
df['age'] = pd.cut(df['age'], bins=[0, 20, 30, 40, 50, 60, 70, 80, 90, 100], labels=['0-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90+'])
df['age'].value_counts()

age
20-30    199748
30-40    187599
40-50    168168
50-60    107704
0-20      64609
60-70     51995
70-80     15668
80-90      4508
90+           0
Name: count, dtype: int64

In [14]:
# Drop the fnlwgt column
df.drop(['fnlwgt'], axis=1, inplace=True)

# Drop the education column
# df.drop(['education'], axis=1, inplace=True)

# Drop the capital-gain column
# df.drop(['capital-gain'], axis=1, inplace=True)

# Drop the capital-loss column
# df.drop(['capital-loss'], axis=1, inplace=True)

# Drop the gender column
df.drop(['gender'], axis=1, inplace=True)

# Drop the age column
df.drop(['age'], axis=1, inplace=True)

# Drop the race column
df.drop('race', axis=1, inplace=True)

### Target Column

In [15]:
# replace income by 0 and 1
df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})
df['income'].value_counts()

income
0    601449
1    198550
Name: count, dtype: int64

In [16]:
categorical_columns = df.select_dtypes(include=['object']).columns
 
for column in df[categorical_columns].columns:
    print(f"Column: {column}")
    print(df[column].value_counts())
    print("\n" + "="*30 + "\n")

# Print also the numberical columns, categorize them into bins of 6
numerical_columns = df.select_dtypes(include=['int64']).columns

for column in df[numerical_columns].columns:
    print(f"Column: {column}")
    print(df[column].value_counts(bins=6))
    print("\n" + "="*30 + "\n")

Column: institute
institute
Bank B    403240
Bank A    226164
Bank C    170595
Name: count, dtype: int64


Column: workclass
workclass
Private             546342
Self-emp-not-inc     66145
Local-gov            51137
unknown              47431
State-gov            34717
Self-emp-inc         27715
Federal-gov          25879
Not-working            633
Name: count, dtype: int64


Column: education
education
HS-grad      258661
higher       241834
Bachelors    133796
school       110209
Masters       45697
Doctorate      9802
Name: count, dtype: int64


Column: marital-status
marital-status
Married                  368820
Never-married            250510
Divorced                 110459
Widowed                   34203
Separated                 25566
Married-spouse-absent     10441
Name: count, dtype: int64


Column: occupation
occupation
low        281223
medium     259651
high       211330
unknown     47795
Name: count, dtype: int64


Column: relationship
relationship
Parent            36276

In [17]:
# One-hot encode the categorical columns
df = pd.get_dummies(df, columns=categorical_columns)

## Model Training

### Train-Test Split

In [18]:
# Drop institutes columns
df_all = df.drop(['institute_Bank A', 'institute_Bank B', 'institute_Bank C'], axis=1)
df_bank_a = df[df['institute_Bank A'] == 1].drop(['institute_Bank A', 'institute_Bank B', 'institute_Bank C'], axis=1)
df_bank_b = df[df['institute_Bank B'] == 1].drop(['institute_Bank A', 'institute_Bank B', 'institute_Bank C'], axis=1)
df_bank_c = df[df['institute_Bank C'] == 1].drop(['institute_Bank A', 'institute_Bank B', 'institute_Bank C'], axis=1)

In [19]:
# number of rows in each dataset
print(f"Number of rows in df_all:    {len(df_all)}")
print(f"Number of rows in df_bank_a: {len(df_bank_a)}")
print(f"Number of rows in df_bank_b: {len(df_bank_b)}")
print(f"Number of rows in df_bank_c: {len(df_bank_c)}")

Number of rows in df_all:    799999
Number of rows in df_bank_a: 226164
Number of rows in df_bank_b: 403240
Number of rows in df_bank_c: 170595


In [20]:
# Create a test set that contains 20% of the data from each bank
df_bank_a_test = df_bank_a.sample(frac=0.2, random_state=42)
df_bank_b_test = df_bank_b.sample(frac=0.2, random_state=42)
df_bank_c_test = df_bank_c.sample(frac=0.2, random_state=42)

# Create a training set that contains the remaining 80% of the data from each bank
df_bank_a_train = df_bank_a.drop(df_bank_a_test.index)
df_bank_b_train = df_bank_b.drop(df_bank_b_test.index)
df_bank_c_train = df_bank_c.drop(df_bank_c_test.index)

# Create a validation set that contains 20% of the data from each bank
df_bank_a_val = df_bank_a_train.sample(frac=0.2, random_state=42)
df_bank_b_val = df_bank_b_train.sample(frac=0.2, random_state=42)
df_bank_c_val = df_bank_c_train.sample(frac=0.2, random_state=42)

# Create a training set that contains the remaining 80% of the data from each bank
df_bank_a_train = df_bank_a_train.drop(df_bank_a_val.index)
df_bank_b_train = df_bank_b_train.drop(df_bank_b_val.index)
df_bank_c_train = df_bank_c_train.drop(df_bank_c_val.index)

# Combine the training sets into one training set
df_train = pd.concat([df_bank_a_train, df_bank_b_train, df_bank_c_train])

# Combine the test sets into one test set
df_test = pd.concat([df_bank_a_test, df_bank_b_test, df_bank_c_test])

# Combine the validation sets into one validation set
df_val = pd.concat([df_bank_a_val, df_bank_b_val, df_bank_c_val])

### Model Training

In [21]:
# Split data into X and y
X_train = df_train.drop('income', axis=1)
y_train = df_train['income']

X_test = df_test.drop('income', axis=1)
y_test = df_test['income']

X_val = df_val.drop('income', axis=1)
y_val = df_val['income']

In [22]:
X_train = np.asarray(X_train).astype(np.float32)
y_train = np.asarray(y_train).astype(np.float32)

X_test = np.asarray(X_test).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

X_val = np.asarray(X_val).astype(np.float32)
y_val = np.asarray(y_val).astype(np.float32)

In [23]:
from tensorflow import keras

model = keras.models.Sequential([
    keras.layers.Dense(128, activation="relu", input_shape=X_train.shape[1:]),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(16, activation="relu"),
    keras.layers.Dropout(0.1),
    keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="sgd", metrics=["accuracy"])

In [24]:
history = model.fit(X_train, y_train, epochs=50, validation_data=(X_val, y_val), batch_size=128)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [25]:
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5)

cm = confusion_matrix(y_test, y_pred)
print(cm)

accuracy_score(y_test, y_pred)



[[104068  16451]
 [ 14119  25362]]


0.8089375

### FML Simulation

In [26]:
import tensorflow as tf
VERBOSE = 0

In [27]:
NUM_CLIENTS = 3 # Bank A, B, C
NUM_FML_ROUNDS = 5

In [28]:
def get_model():
    """Constructs a simple model architecture suitable for the Dataset."""
    model = keras.models.Sequential([
        keras.layers.Dense(128, activation="relu", input_shape=X_train.shape[1:]),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(16, activation="relu"),
        keras.layers.Dropout(0.1),
        keras.layers.Dense(1, activation="sigmoid")
    ])
    tf.random.set_seed(seed_value)
    model.compile("adam", "binary_crossentropy", metrics=["accuracy"])
    return model

In [29]:
class FinancialClient(fl.client.NumPyClient):
    def __init__(self, trainset, valset) -> None:
        # Create model
        self.model = get_model()
        self.trainset = trainset
        self.valset = valset

    def get_parameters(self, config):
        return self.model.get_weights()

    def fit(self, parameters, config):
        self.model.set_weights(parameters)
        tf.random.set_seed(seed_value)
        self.model.fit(self.trainset[0], self.trainset[1], epochs=5, verbose=VERBOSE)
        return self.model.get_weights(), len(self.trainset), {}

    def evaluate(self, parameters, config):
        self.model.set_weights(parameters)
        loss, acc = self.model.evaluate(self.valset[0], self.valset[1], verbose=VERBOSE)
        return loss, len(self.valset), {"accuracy": acc}

In [30]:
def get_client_fn(global_train_datasets_list):
    """Return a function to construct a client.

    The VirtualClientEngine will execute this function whenever a client is sampled by
    the strategy to participate.
    """

    def client_fn(cid: str) -> fl.client.Client:
        """Construct a DiabetesClient with its own dataset partition."""

        # Extract partition for client with id = cid
        X, y = global_train_datasets_list[int(cid)]

        # Now let's split it into train (90%) and validation (10%)
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=seed_value)

        trainset = (X_train, y_train)
        valset = (X_val, y_val)

        # Create and return client
        return FinancialClient(trainset, valset)

    return client_fn


def weighted_average(metrics: List[Tuple[int, Metrics]]) -> Metrics:
    """Aggregation function for (federated) evaluation metrics, i.e. those returned by
    the client's evaluate() method."""
    # Multiply accuracy of each client by number of examples used
    accuracies = [num_examples * m["accuracy"] for num_examples, m in metrics]
    examples = [num_examples for num_examples, _ in metrics]

    # Aggregate and return custom metric (weighted average)
    return {"accuracy": sum(accuracies) / sum(examples)}


def get_evaluate_fn(testset):
    """Return an evaluation function for server-side (i.e. centralised) evaluation."""

    # The `evaluate` function will be called after every round by the strategy
    def evaluate(
        server_round: int,
        parameters: fl.common.NDArrays,
        config: Dict[str, fl.common.Scalar],
    ):
        model = get_model()  # Construct the model
        model.set_weights(parameters)  # Update model with the latest parameters
        loss, accuracy = model.evaluate(testset[0], testset[1], verbose=VERBOSE)
        return loss, {"accuracy": accuracy}

    return evaluate

In [31]:
# Comine train and validation sets for each bank
df_bank_a_train_val = pd.concat([df_bank_a_train, df_bank_a_val])
df_bank_b_train_val = pd.concat([df_bank_b_train, df_bank_b_val])
df_bank_c_train_val = pd.concat([df_bank_c_train, df_bank_c_val])

# Split data into X and y
X_train_val_bank_a = df_bank_a_train_val.drop('income', axis=1)
y_train_val_bank_a = df_bank_a_train_val['income']

X_train_val_bank_b = df_bank_b_train_val.drop('income', axis=1)
y_train_val_bank_b = df_bank_b_train_val['income']

X_train_val_bank_c = df_bank_c_train_val.drop('income', axis=1)
y_train_val_bank_c = df_bank_c_train_val['income']

X_train_val_bank_a = np.asarray(X_train_val_bank_a).astype(np.float32)
y_train_val_bank_a = np.asarray(y_train_val_bank_a).astype(np.float32)

X_train_val_bank_b = np.asarray(X_train_val_bank_b).astype(np.float32)
y_train_val_bank_b = np.asarray(y_train_val_bank_b).astype(np.float32)

X_train_val_bank_c = np.asarray(X_train_val_bank_c).astype(np.float32)
y_train_val_bank_c = np.asarray(y_train_val_bank_c).astype(np.float32)

# Create a list of datasets for each bank
global_train_datasets_list = [
    (X_train_val_bank_a, y_train_val_bank_a),
    (X_train_val_bank_b, y_train_val_bank_b),
    (X_train_val_bank_c, y_train_val_bank_c),
]

# Create a test set
df_test = pd.concat([df_bank_a_test, df_bank_b_test, df_bank_c_test])

# Split data into X and y
X_test = df_test.drop('income', axis=1)
y_test = df_test['income']

X_test = np.asarray(X_test).astype(np.float32)
y_test = np.asarray(y_test).astype(np.float32)

global_test_dataset = (X_test, y_test)

In [None]:
# Create FedAvg strategy, considering all clients for training and evaluation
strategy = fl.server.strategy.FedAvg(
    fraction_fit=1,
    fraction_evaluate=1,
    min_fit_clients=NUM_CLIENTS,
    min_evaluate_clients=NUM_CLIENTS,  
    min_available_clients=NUM_CLIENTS, 
    evaluate_metrics_aggregation_fn=weighted_average,
    evaluate_fn=get_evaluate_fn(global_test_dataset),
)

# Start simulation
history_nn = fl.simulation.start_simulation(
    client_fn=get_client_fn(global_train_datasets_list),
    num_clients=NUM_CLIENTS,
    config=fl.server.ServerConfig(num_rounds=NUM_FML_ROUNDS),
    strategy=strategy,
)

In [33]:
accuracy_value_nn = pd.DataFrame(history_nn.metrics_centralized['accuracy']).rename(columns={0: 'round', 1: 'accuracy'})
accuracy_value_nn

Unnamed: 0,round,accuracy
0,0,0.246756
1,1,0.810069
2,2,0.810387
3,3,0.809656
4,4,0.810094
5,5,0.809869
