# Federated Fraud Demo with logistic regression


# Clone Sherpa framework
Go to https://github.com/sherpaai/Sherpa.ai-Federated-Learning-Framework/blob/master/install.md
and follow the instructions there to install the Sherpa.ai framework.

Then copy this notebook into the notebook folder


In [None]:
# Libraries
import pandas as pd
import numpy as np

import shfl
from shfl.data_base.data_base import LabeledDatabase
from sklearn.datasets import make_classification

from shfl.private.reproducibility import Reproducibility

from sklearn.linear_model import LogisticRegression
from shfl.model.linear_classifier_model import LinearClassifierModel

import random
import pathlib
import os

# Root directory of the input assets, taken from the environment
# (None when NEVERMINED_INPUTS_PATH is not set).
INPUTS_PATH = os.getenv("NEVERMINED_INPUTS_PATH")

In [None]:
# get input file path
# The commented-out line below would look the CSV up recursively under
# INPUTS_PATH; the active line uses a hard-coded relative path instead.
# input_file_path = next(pathlib.Path(INPUTS_PATH).rglob("*/creditcard.csv")).as_posix()
input_file_path = "../resources/data/creditcard.csv"

# Load target dataset and keep only the underlying array of values
data_tmp = pd.read_csv(input_file_path).values

# How to separate features from labels:
# every column except the last is treated as a feature, the last column as the label
print('features: \n', data_tmp[:, :-1])
print('\n labels: \n', data_tmp[:, -1])

In [None]:
def prep_data(dataset, *, verbose=False):
    """Split an array into features/labels and load it into a LabeledDatabase.

    The last column of ``dataset`` is used as the label and all preceding
    columns as features. Both are wrapped in a ``LabeledDatabase`` whose
    ``load_data()`` result provides the train/test partition.

    Args:
        dataset: 2-D array supporting ``[:, :-1]`` and ``[:, -1]`` indexing.
        verbose: Keyword-only debug switch. When true, print the shapes of
            the train/test arrays and the first training row. Defaults to
            False, which matches the previous behaviour (the debug branch
            used to be permanently disabled).

    Returns:
        An 8-tuple ``(n_features, n_classes, n_samples, train_data,
        train_labels, test_data, test_labels, database)`` where

        * ``n_features`` is the number of columns of ``train_data``,
        * ``n_classes`` is the number of distinct values in ``train_labels``,
        * ``n_samples`` is the number of train rows plus test rows,
        * ``database`` is the ``LabeledDatabase`` built from ``dataset``.
    """
    data = dataset[:, :-1]
    labels = dataset[:, -1]

    database = LabeledDatabase(data, labels)

    train_data, train_labels, test_data, test_labels = database.load_data()

    n_features = train_data.shape[1]
    # Counted on the training labels only: a class that occurs solely in the
    # test partition is not included.
    n_classes = len(np.unique(train_labels))
    n_samples = train_data.shape[0] + test_data.shape[0]

    # Optional sanity check of the partition
    if verbose:
        print("Shape of training and test data: " + str(train_data.shape) + str(test_data.shape))
        print("Shape of training and test labels: " + str(train_labels.shape) + str(test_labels.shape))
        print(train_data[0, :])

    return n_features, n_classes, n_samples, train_data, train_labels, test_data, test_labels, database

# Define model builder
def model_builder():
    """Return a fresh LinearClassifierModel backed by logistic regression.

    Takes no arguments: the feature count and the class list are read from
    the module-level names ``n_features`` and ``classes``, which must be
    bound before this function is called.
    """
    estimator = LogisticRegression(multi_class='auto', solver='lbfgs', warm_start=True)
    return LinearClassifierModel(model=estimator, classes=classes, n_features=n_features)

# Train model on decentralized data
def do_decentralized_train(database, model_builder, test_data, test_labels, n_rounds=3, n_clients=4):
    """Run a federated-averaging experiment on an IID split of ``database``.

    Args:
        database: Database handed to ``shfl.data_distribution.IidDataDistribution``.
        model_builder: Zero-argument callable passed to ``FederatedGovernment``.
        test_data: See the note below.
        test_labels: See the note below.
        n_rounds: Number of federated rounds to run.
        n_clients: Number of nodes the data is distributed over.

    Returns:
        None.

    NOTE(review): the ``test_data`` / ``test_labels`` arguments are rebound to
    the values returned by ``get_federated_data`` before they are used, so the
    values supplied by the caller are ignored. Presumably both come from the
    same database and are identical -- confirm before relying on it.
    """
    distribution = shfl.data_distribution.IidDataDistribution(database)
    federated_data, test_data, test_labels = distribution.get_federated_data(
        num_nodes=n_clients,
        percent=100,
    )

    # Run the federated experiment with a FedAvg aggregator
    government = shfl.federated_government.FederatedGovernment(
        model_builder,
        federated_data,
        shfl.federated_aggregator.FedAvgAggregator(),
    )
    government.run_rounds(n=n_rounds, test_data=test_data, test_label=test_labels)

In [None]:
# Flag to reduce the dataset: when enabled, only the first `num_reduced`
# rows of `data_tmp` are used.
decrease_dataset = False
num_reduced = 40000

if not decrease_dataset:
    dset = data_tmp
else:
    dset = data_tmp[:num_reduced]
    n_sample_global = len(dset)

In [None]:
# Prep data: build the labeled database from `dset` and unpack its
# train/test partition together with the derived counts.
(
    n_features,
    n_classes,
    n_samples,
    train_data,
    train_labels,
    test_data,
    test_labels,
    database,
) = prep_data(dset)


In [None]:
# Train model with ALL data in a DECENTRALIZED fashion.
# `classes` has to be bound at module level first: model_builder reads it
# as a global when it constructs the model.
classes = np.unique(train_labels)
do_decentralized_train(
    database,
    model_builder,
    test_data,
    test_labels,
    n_clients=4,
    n_rounds=3,
)


In [None]:
# Train N (n_splits) data split unilateral models:
# the dataset is cut into `n_splits` random, disjoint subsets ("entities") and,
# in each of `n_rounds` rounds, a model is trained on every subset on its own.

n_splits = 4
n_rounds = 3

# flag to reduce dataset (use only the first `num_reduced` rows)
decrease_dataset = False
num_reduced = 40000

dset = data_tmp[0:num_reduced] if decrease_dataset else data_tmp
n_sample_global = dset.shape[0]

# Random permutation of the row indices of `dset`
x = random.sample(range(n_sample_global), n_sample_global)

# Rows per entity. Because of the rounding the last entity can end up one or
# more rows short, or a few trailing rows can stay unassigned.
ll = round(len(x) / n_splits)
perf = []


def _percent_ratio(num_positive, num_negative):
    """Return 100 * positives / negatives, or NaN when there are no negatives."""
    if num_negative == 0:
        return float('nan')
    return 100 * num_positive / num_negative


for j in range(n_rounds):
    print('\nROUND #', j, '\n')

    for i in range(n_splits):
        init = ll * i
        endit = ll + init
        # Select this entity's rows through the shuffled indices. Previously
        # the permutation `x` was computed but never used, so every entity
        # received a contiguous slice of `dset` instead of a random subset.
        tmp = dset[x[init:endit]]

        print('\n\nEntity #{} with {} transactions from a total of {}\n\n'.format(i, len(tmp), n_sample_global))

        # Prep data
        n_features, n_classes, n_samples, train_data, train_labels, test_data, test_labels, database = prep_data(tmp)

        # Balance check: positives (label 1) as a percentage of negatives (label 0)
        train_balance = _percent_ratio(
            len(train_labels[train_labels == 1]),
            len(train_labels[train_labels == 0]),
        )
        test_balance = _percent_ratio(
            len(test_labels[test_labels == 1]),
            len(test_labels[test_labels == 0]),
        )

        print(train_balance, test_balance)

        # Train model on centralized data for comparison.
        # NOTE(review): do_centralized_train is not defined in the visible part
        # of this notebook; it has to be defined before this cell is run.
        eva_tmp = do_centralized_train(n_features, np.unique(train_labels), train_data, train_labels, test_data, test_labels)

        # Store results: [round, evaluation, train balance %, test balance %]
        perf.append([j, eva_tmp, round(train_balance, 2), round(test_balance, 2)])


In [None]:
# Summary for unilateral training: one entry per (round, entity) of the form
# [round index, result of do_centralized_train,
#  train positive/negative %, test positive/negative %]
perf