# Data Processing for Multi-Party Decentralised Federated Learning Example

We will be using the Bank Marketing data that has been used throughout all of the notebooks in this repository. However, in order to demonstrate the power of federated learning, we need to separate the original dataset in two steps: first into a training set and a testing set, and then the training set into multiple training sets, one for each separate 'Bank'.

For this we will assign the testing set to the central aggregator, i.e. me/you. It is only initialised on the central aggregator's side and is accessible only to me/you. The training set is then divided into the following three datasets:
- 'Portuguese Bank'
- 'American Bank'
- 'Australian Bank'

The idea behind this is that each of these banks will hold data under different regulations and requirements, which makes sharing, training on, and using this data difficult. Federated learning may offer a solution: no party has direct access to another's data, yet the central aggregator is still able to train models on the banks' data and test them on its own data.

# Setup

In [1]:
import pandas as pd
import numpy as np

# Import Data

In [2]:
# The features and the label live in two separate CSV files; read both.
# (A comma is read_csv's default delimiter, so it is not spelled out.)
df_benchmark = pd.read_csv('datasets/bank-additional-full-kaggle-dataset-final.csv')
df_y = pd.read_csv('datasets/bank-additional-full-kaggle-dataset-y.csv')
df_benchmark.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,campaign,pdays,previous,poutcome
0,3,3,1,0,0,0,0,1,6,1,3,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1
1,3,7,1,3,1,0,0,1,6,1,2,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1
2,2,7,1,3,0,2,0,1,6,1,3,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1
3,2,0,1,1,0,0,0,1,6,1,2,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1
4,3,7,1,3,0,0,2,1,6,1,3,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1


In [3]:
# Preview the separately loaded label frame before it is merged with the features.
df_y.head()

Unnamed: 0,['y']_yes
0,0
1,0
2,0
3,0
4,0


In [4]:
# Attach the label column to the right of the feature columns.
# A column-wise concat aligns rows on the index and pads any mismatch with NaN
# instead of failing, so verify the alignment explicitly before merging.
assert df_benchmark.index.equals(df_y.index), "feature and label frames are not row-aligned"
df_benchmark = pd.concat([df_benchmark, df_y], axis=1)

In [5]:
# Check that the label column was appended to the right of the feature columns.
df_benchmark.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,campaign,pdays,previous,poutcome,['y']_yes
0,3,3,1,0,0,0,0,1,6,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0
1,3,7,1,3,1,0,0,1,6,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0
2,2,7,1,3,0,2,0,1,6,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0
3,2,0,1,1,0,0,0,1,6,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0
4,3,7,1,3,0,0,2,1,6,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0


In [6]:
# Rename the raw label header to the plain name 'y'.
df_benchmark = df_benchmark.rename(columns={"['y']_yes": 'y'})

In [7]:
# Check that the label column now appears under the name 'y'.
df_benchmark.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,campaign,pdays,previous,poutcome,y
0,3,3,1,0,0,0,0,1,6,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0
1,3,7,1,3,1,0,0,1,6,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0
2,2,7,1,3,0,2,0,1,6,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0
3,2,0,1,1,0,0,0,1,6,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0
4,3,7,1,3,0,0,2,1,6,1,...,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0


In [8]:
# Remove the 'duration' feature before any splitting is done.
# NOTE(review): presumably excluded because the call duration is only known once
# the outcome is known (target leakage) — confirm against the dataset notes.
df_benchmark = df_benchmark.drop(columns=['duration'])

In [9]:
# Check that 'duration' no longer appears among the columns.
df_benchmark.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,campaign,pdays,previous,poutcome,y
0,3,3,1,0,0,0,0,1,6,1,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0
1,3,7,1,3,1,0,0,1,6,1,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0
2,2,7,1,3,0,2,0,1,6,1,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0
3,2,0,1,1,0,0,0,1,6,1,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0
4,3,7,1,3,0,0,2,1,6,1,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1,0


In [10]:
# Class balance of the target: np.bincount counts the occurrences of each
# non-negative integer value, so for 0/1 labels it yields [#zeros, #ones].
# The two-name unpacking therefore requires 'y' to contain exactly the values 0 and 1.
neg, pos = np.bincount(df_benchmark['y'])
total = neg + pos
summary_template = 'Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'
positive_share_pct = 100 * pos / total
print(summary_template.format(total, pos, positive_share_pct))

Examples:
    Total: 41188
    Positive: 4640 (11.27% of total)



## Data processing

In [11]:
# Discard every row that contains at least one missing value
# (axis and how are dropna's defaults, written out for clarity).
df_benchmark = df_benchmark.dropna(axis='index', how='any')

In [12]:
# (rows, columns) after dropna; compare the row count with the total printed by
# the class-balance cell to see whether any rows were removed.
df_benchmark.shape

(41188, 20)

In [13]:
# extract labels
df_y = df_benchmark[["y"]]
df_final = df_benchmark.drop("y", 'columns')

In [14]:
# Feature frame: it should no longer contain the 'y' column.
df_final.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,campaign,pdays,previous,poutcome
0,3,3,1,0,0,0,0,1,6,1,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1
1,3,7,1,3,1,0,0,1,6,1,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1
2,2,7,1,3,0,2,0,1,6,1,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1
3,2,0,1,1,0,0,0,1,6,1,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1
4,3,7,1,3,0,0,2,1,6,1,1.1,93.994,-36.4,4.857,5191.0,1,999,0,1


In [15]:
# Label frame: a one-column DataFrame holding 'y'.
df_y.head()

Unnamed: 0,y
0,0
1,0
2,0
3,0
4,0


In [16]:
from sklearn.model_selection import train_test_split

# Hold out a fixed number of examples as the central aggregator's test set.
# An integer test_size is an absolute row count, which states the intent directly
# instead of the hand-derived fraction 0.1942313295 (= 8000 / 41188). For the
# 41188-row dataset both forms yield the same 33188 / 8000 partition sizes.
# NOTE(review): the split is not stratified although the classes are imbalanced;
# consider passing stratify=df_y if the positive rate should match across splits.
N_TEST_EXAMPLES = 8000
x_train, x_test, y_train, y_test = train_test_split(
    df_final,
    df_y,
    test_size=N_TEST_EXAMPLES,
    random_state=101,
)

In [17]:
# Report the shape of each train/test partition in a fixed order.
split_frames = {
    "x_train": x_train,
    "y_train": y_train,
    "x_test": x_test,
    "y_test": y_test,
}
print("############# Data summary #############")
for split_name, split_frame in split_frames.items():
    print(f"{split_name} has shape: {split_frame.shape}")
print("#######################################")

############# Data summary #############
x_train has shape: (33188, 19)
y_train has shape: (33188, 1)
x_test has shape: (8000, 19)
y_test has shape: (8000, 1)
#######################################


In [18]:
# Carve the training set into three disjoint, roughly equal shares, one per bank.
# Step 1: keep about one third for the Portuguese bank and set the rest aside.
portuguese_bank_data, other_data, portuguese_bank_target, other_target = train_test_split(
    x_train,
    y_train,
    test_size=0.666666,
    random_state=101,
)
# Step 2: halve the remainder between the American and the Australian bank.
american_bank_data, australian_bank_data, american_bank_target, australian_bank_target = train_test_split(
    other_data,
    other_target,
    test_size=0.5,
    random_state=101,
)

In [19]:
# Report the shape of every bank share and of the held-out test set; the row
# counts of the three bank shares should add up to the size of the training set.
partition_frames = {
    "portuguese_bank_data": portuguese_bank_data,
    "portuguese_bank_target": portuguese_bank_target,
    "american_bank_data": american_bank_data,
    "american_bank_target": american_bank_target,
    "australian_bank_data": australian_bank_data,
    "australian_bank_target": australian_bank_target,
    "x_test": x_test,
    "y_test": y_test,
}
print("############# Data summary #############")
for partition_name, partition_frame in partition_frames.items():
    print(f"{partition_name} has shape: {partition_frame.shape}")
print("#######################################")

############# Data summary #############
portuguese_bank_data has shape: (11062, 19)
portuguese_bank_target has shape: (11062, 1)
american_bank_data has shape: (11063, 19)
american_bank_target has shape: (11063, 1)
australian_bank_data has shape: (11063, 19)
australian_bank_target has shape: (11063, 1)
x_test has shape: (8000, 19)
y_test has shape: (8000, 1)
#######################################


In [20]:
# Persist every partition as its own CSV file. index=False leaves the row index
# out of the files, so only the data columns are written.
# The mapping preserves insertion order, so files are written in the listed order.
csv_outputs = {
    'datasets/portuguese-bank-data.csv': portuguese_bank_data,
    'datasets/portuguese-bank-target.csv': portuguese_bank_target,
    'datasets/american-bank-data.csv': american_bank_data,
    'datasets/american-bank-target.csv': american_bank_target,
    'datasets/australian-bank-data.csv': australian_bank_data,
    'datasets/australian-bank-target.csv': australian_bank_target,
    'datasets/test-data.csv': x_test,
    'datasets/test-target.csv': y_test,
}
for csv_path, csv_frame in csv_outputs.items():
    csv_frame.to_csv(csv_path, index=False)