# Federated Fraud Demo with logistic regression


# Clone Sherpa framework
Go to https://github.com/sherpaai/Sherpa.ai-Federated-Learning-Framework/blob/master/install.md
Follow insructions to install Sherpa framework

Then copy this notebook into the notebook folder


In [1]:
# Libraries
import pandas as pd
import numpy as np

import shfl
from shfl.data_base.data_base import LabeledDatabase
from sklearn.datasets import make_classification

from shfl.private.reproducibility import Reproducibility

from sklearn.linear_model import LogisticRegression
from shfl.model.linear_classifier_model import LinearClassifierModel

import random

In [3]:
# Load target dataset
data_tmp = pd.read_csv('./creditcard.csv').values

# How to separate features from labels
print('features: \n', data_tmp[:, :-1])
print('\n labels: \n', data_tmp[:, -1])

FileNotFoundError: [Errno 2] File ./creditcard.csv does not exist: './creditcard.csv'

In [3]:
def prep_data(dataset):
    ''' '''

    data = dataset[:, :-1]
    labels = dataset[:, -1]

    database = LabeledDatabase(data, labels)

    train_data, train_labels, test_data, test_labels = database.load_data()

    n_features = train_data.shape[1]
    n_classes = len(np.unique(train_labels))
    n_samples = train_data.shape[0] + test_data.shape[0]

    # check results
    if False == True:
        print("Shape of training and test data: " + str(train_data.shape) + str(test_data.shape))
        print("Shape of training and test labels: " + str(train_labels.shape) + str(test_labels.shape))
        print(train_data[0,:])

    return n_features, n_classes, n_samples, train_data, train_labels, test_data, test_labels, database

# Define model builder
def model_builder():
    ''' '''

    sk_model = LogisticRegression(warm_start=True, solver='lbfgs', multi_class='auto')
    model = LinearClassifierModel(n_features=n_features, classes=classes, model=sk_model)

    return model

# Train model on centralize data for comparison purpose
def do_centralized_train(n_features, classes, train_data, train_labels, test_data, test_labels):
    ''' '''

    model_centralized = LinearClassifierModel(n_features=n_features, classes=classes)
    model_centralized.train(train_data, train_labels)
    y_score = model_centralized.predict(test_data)

    t1 = np.array(y_score) == np.array(test_labels)
    t3 = np.array(y_score) != np.array(test_labels)
    t2 = np.array(y_score) == 1

    num_fraud = len(test_labels[test_labels == 1])
    num_detec = len(y_score[t1 & t2])
    num_fake = len(y_score[t3 & t2])

    print(len(test_labels), num_fraud, num_detec, num_fake)
    # print('\n\nCohen kappa: ' + str(model_centralized.evaluate(test_data, test_labels)[1]))
    # print('\nCentralized test overrall accuracy: ' + str(len(test_score[np.array(test_score)==np.array(test_labels)]) / len(test_score)) )
    # print('\nBalanced Accuracy: ' + str(model_centralized.performance(test_data, test_labels)) + '\n\n')

    return num_fraud, num_detec, num_fake

# Train model on decentralized data
def do_decentralized_train(database, model_builder, test_data, test_labels, n_rounds = 3, n_clients = 4):
    ''' '''

    iid_distribution = shfl.data_distribution.IidDataDistribution(database)
    federated_data, test_data, test_labels = iid_distribution.get_federated_data(num_nodes=n_clients, percent=100)
    aggregator = shfl.federated_aggregator.FedAvgAggregator()


    # Run the federated experiment:
    federated_government = shfl.federated_government.FederatedGovernment(model_builder, federated_data, aggregator)
    federated_government.run_rounds(n=n_rounds, test_data=test_data, test_label=test_labels)

    return 

In [4]:
dset = data_tmp

# flag to reduce dataset
decrease_dataset = False
num_reduced = 40000

if decrease_dataset:
    dset = data_tmp[0:num_reduced]
    n_sample_global = dset.shape[0]

In [5]:
# Prep data
n_features, n_classes, n_samples, train_data, train_labels, test_data, test_labels, database = prep_data(dset)


In [6]:
# Train model with ALL data on CENTRALIZED fashion
do_centralized_train(n_features, np.unique(train_labels), train_data, train_labels, test_data, test_labels)


56961 103 77 31


(103, 77, 31)

In [7]:
# Train model with ALL data on DECENTRALIZED fashion 
classes = np.unique(train_labels)
do_decentralized_train(database, model_builder, test_data, test_labels, n_rounds = 3, n_clients = 4)


Accuracy round 0
Test performance client <shfl.private.federated_operation.FederatedDataNode object at 0x7fe8088c9c50>: (0.8054559015527355, 0.6050481902221312)
Test performance client <shfl.private.federated_operation.FederatedDataNode object at 0x7fe8088c9d50>: (0.8589155166661145, 0.6975508239335391)
Test performance client <shfl.private.federated_operation.FederatedDataNode object at 0x7fe8088c92d0>: (0.8977944373088194, 0.76234296390171)
Test performance client <shfl.private.federated_operation.FederatedDataNode object at 0x7fe8088c9f50>: (0.8444227605682287, 0.7059491697110494)
Global model test performance : (0.8784121369297795, 0.7531747092464312)



Accuracy round 1
Test performance client <shfl.private.federated_operation.FederatedDataNode object at 0x7fe8088c9c50>: (0.820238854280823, 0.709208133982148)
Test performance client <shfl.private.federated_operation.FederatedDataNode object at 0x7fe8088c9d50>: (0.7523040707441158, 0.6148406298967847)
Test performance client <shfl.

In [8]:
# Train N (n_splits) data split unilateral models

n_splits = 4
n_rounds = 3
n_sample_global = data_tmp.shape[0]

dset = data_tmp

# flag to reduce dataset
decrease_dataset = False
num_reduced = 40000

if decrease_dataset:
    dset = data_tmp[0:num_reduced]
    n_sample_global = dset.shape[0]

x = random.sample(range(n_sample_global), n_sample_global)

ll = round(len(x) / n_splits)
perf = []

for j in range(n_rounds):
    print('\nROUND #', j, '\n')

    for i in range(n_splits):
        init = ll * i
        endit = ll + init
        tmp = dset[init:endit]
        
        print(str('\n\nEntity #') + str(i) + str(' with ') + str(len(tmp)) + str(' transactions from a total of ') + str(n_sample_global) + str('\n\n'))

        # Prep data
        n_features, n_classes, n_samples, train_data, train_labels, test_data, test_labels, database = prep_data(tmp)

        # Balance check
        num_train_positive = len(train_labels[train_labels == 1])
        num_train_negative = len(train_labels[train_labels == 0])

        num_test_positive = len(test_labels[test_labels == 1])
        num_test_negative = len(test_labels[test_labels == 0])

        print(100*num_train_positive/num_train_negative, 100*num_test_positive/num_test_negative)

        # Train model on centralized data for comparison:
        
        eva_tmp = do_centralized_train(n_features, np.unique(train_labels), train_data, train_labels, test_data, test_labels)

        # Store results
        perf.append([j, eva_tmp, round(100 * num_train_positive/num_train_negative, 2), round(100 * num_test_positive/num_test_negative, 2)])



ROUND # 0 



Entity #0 with 71202 transactions from a total of 284807


0.235799253888928 0.3028808903289427
14240 43 20 9


Entity #1 with 71202 transactions from a total of 284807


0.12480005624791267 0.14768971095013714
14240 21 17 1


Entity #2 with 71202 transactions from a total of 284807


0.17762614094018747 0.19701660568533633
14240 28 18 4


Entity #3 with 71201 transactions from a total of 284807


0.13184263263368842 0.13360523169959918
14240 19 0 0

ROUND # 1 



Entity #0 with 71202 transactions from a total of 284807


0.24461925628706685 0.2675679481763132
14240 38 19 13


Entity #1 with 71202 transactions from a total of 284807


0.11952051182901537 0.16882386043894204
14240 24 0 0


Entity #2 with 71202 transactions from a total of 284807


0.1811498619391829 0.18291824961305755
14240 26 20 2


Entity #3 with 71201 transactions from a total of 284807


0.14240506329113925 0.0913755535249877
14240 13 9 2

ROUND # 2 



Entity #0 with 71202 transactions from a total 

In [9]:
# Summary for unilateral training
perf

[[0, (43, 20, 9), 0.24, 0.3],
 [0, (21, 17, 1), 0.12, 0.15],
 [0, (28, 18, 4), 0.18, 0.2],
 [0, (19, 0, 0), 0.13, 0.13],
 [1, (38, 19, 13), 0.24, 0.27],
 [1, (24, 0, 0), 0.12, 0.17],
 [1, (26, 20, 2), 0.18, 0.18],
 [1, (13, 9, 2), 0.14, 0.09],
 [2, (34, 15, 9), 0.25, 0.24],
 [2, (16, 0, 0), 0.13, 0.11],
 [2, (31, 24, 5), 0.17, 0.22],
 [2, (18, 0, 0), 0.13, 0.13]]