# Federated Fraud Demo with logistic regression


# Clone Sherpa framework
Go to https://github.com/sherpaai/Sherpa.ai-Federated-Learning-Framework/blob/master/install.md
Follow the instructions there to install the Sherpa framework.

Then copy this notebook into the notebook folder


In [None]:
# Libraries
import pandas as pd
import numpy as np

import shfl
from shfl.data_base.data_base import LabeledDatabase
from sklearn.datasets import make_classification

from shfl.private.reproducibility import Reproducibility

from sklearn.linear_model import LogisticRegression
from shfl.model.linear_classifier_model import LinearClassifierModel

from xain_sdk import ParticipantABC, run_participant, configure_logging


import random
import pathlib
import os
from io import BytesIO

# Runtime configuration: the coordinator address is fixed, while the directory
# that is searched for the input dataset comes from the environment (None when
# the variable is not set).
COORDINATOR_URL = "http://172.17.0.2:8081"
INPUTS_PATH = os.getenv("NEVERMINED_INPUTS_PATH")

In [None]:
# get input file path
# Fail early with an explicit message: without this guard an unset variable
# surfaces as an opaque TypeError from pathlib.Path(None).
if INPUTS_PATH is None:
    raise RuntimeError(
        "NEVERMINED_INPUTS_PATH is not set; cannot locate the input dataset"
    )

# Take the first CSV whose name starts with "creditcard" found in any
# sub-directory of the inputs folder; None when nothing matches (instead of a
# bare StopIteration from next()).
input_file_path = next(
    (
        candidate.as_posix()
        for candidate in pathlib.Path(INPUTS_PATH).rglob("*/creditcard*.csv")
    ),
    None,
)
if input_file_path is None:
    raise FileNotFoundError(
        "No file matching '*/creditcard*.csv' found under " + str(INPUTS_PATH)
    )
# input_file_path = "../resources/data/creditcard.csv"

# Load target dataset
# .values drops the DataFrame wrapper so the data can be sliced by column
# position below.
data_tmp = pd.read_csv(input_file_path).values

# How to separate features from labels
# (every column but the last is a feature, the last column is the label)
print('features: \n', data_tmp[:, :-1])
print('\n labels: \n', data_tmp[:, -1])

In [None]:
def prep_data(dataset):
    """Split a 2-D array into features/labels and a train/test partition.

    The last column of ``dataset`` is used as the label and all preceding
    columns as features. The train/test split itself is delegated to
    ``LabeledDatabase.load_data()``.

    Returns:
        A 7-tuple ``(n_features, n_classes, n_samples, train_data,
        train_labels, test_data, test_labels)`` where ``n_features`` is the
        number of feature columns, ``n_classes`` the number of distinct values
        in the training labels, and ``n_samples`` the combined row count of
        the training and test data.
    """
    features, targets = dataset[:, :-1], dataset[:, -1]
    train_x, train_y, test_x, test_y = LabeledDatabase(features, targets).load_data()

    feature_count = train_x.shape[1]
    class_count = len(np.unique(train_y))
    sample_count = train_x.shape[0] + test_x.shape[0]

    # check results
    print("Shape of training and test data: " + f"{train_x.shape}{test_x.shape}")
    print("Shape of training and test labels: " + f"{train_y.shape}{test_y.shape}")
    print(train_x[0, :])

    return (
        feature_count,
        class_count,
        sample_count,
        train_x,
        train_y,
        test_x,
        test_y,
    )

# Define model builder
def model_builder():
    """Return a LinearClassifierModel backed by a warm-start logistic regression.

    ``n_features`` and ``classes`` are not parameters: they are looked up in
    the enclosing (module) scope when the function is called.
    NOTE(review): neither name is assigned at module level in the visible part
    of this notebook -- confirm they are defined before this is called.
    """
    return LinearClassifierModel(
        model=LogisticRegression(warm_start=True, solver='lbfgs', multi_class='auto'),
        n_features=n_features,
        classes=classes,
    )

def train(n_features, classes, train_data, train_labels, test_data, test_labels):
    """Fit a fresh LinearClassifierModel and report fraud-detection counts.

    Prints the positive-to-negative label ratio (in percent) for the training
    and test labels, trains a new model on the training split, predicts on the
    test split and prints a summary line.

    Returns:
        ``(num_fraud, num_detec, num_fake)``: the number of test labels equal
        to 1, the number of predictions equal to 1 that match the label, and
        the number of predictions equal to 1 that do not match the label.
    """

    def count_selected(values, mask):
        # Number of entries of `values` picked out by the boolean mask.
        return len(values[mask])

    # Balance check
    train_pos = count_selected(train_labels, train_labels == 1)
    train_neg = count_selected(train_labels, train_labels == 0)
    test_pos = count_selected(test_labels, test_labels == 1)
    test_neg = count_selected(test_labels, test_labels == 0)
    print(100*train_pos/train_neg, 100*test_pos/test_neg)

    classifier = LinearClassifierModel(n_features=n_features, classes=classes)
    classifier.train(train_data, train_labels)
    y_score = classifier.predict(test_data)

    predicted = np.array(y_score)
    actual = np.array(test_labels)
    is_correct = predicted == actual
    is_wrong = predicted != actual
    is_flagged = predicted == 1

    num_fraud = count_selected(test_labels, test_labels == 1)
    num_detec = count_selected(y_score, is_correct & is_flagged)
    num_fake = count_selected(y_score, is_wrong & is_flagged)

    print(len(test_labels), num_fraud, num_detec, num_fake)

    return num_fraud, num_detec, num_fake

In [None]:

class Participant(ParticipantABC):
    """xain_sdk participant that trains a LinearClassifierModel on local data.

    The dataset passed to the constructor is split by ``prep_data``; each
    training round optionally loads the flat weight vector received from the
    coordinator, trains the local model, prints detection statistics on the
    local test split and returns the updated weights.
    """

    def __init__(self, data):
        """Split ``data`` with ``prep_data`` and build the local model.

        Args:
            data: 2-D array whose last column is the label (see ``prep_data``).
        """
        super().__init__()
        (
            self.n_features,
            self.n_classes,
            self.n_samples,
            self.train_data,
            self.train_labels,
            self.test_data,
            self.test_labels,
        ) = prep_data(data)
        self.model = LinearClassifierModel(
            n_features=self.n_features, classes=np.unique(self.train_labels)
        )

    @staticmethod
    def _percentage(part, whole):
        """Return ``100 * part / whole``, or NaN when ``whole`` is zero."""
        return 100 * part / whole if whole else float("nan")

    def get_shapes(self):
        """Return the shape of every array reported by the model's get_weights().

        NOTE(review): the original class called ``self.get_shapes()`` without
        defining it here; it is derived from the model so that ``set_weights``
        is self-contained -- confirm the base class does not expect a
        different definition.
        """
        return [np.array(weight).shape for weight in self.model.get_weights()]

    def get_weights(self):
        """Return all model weights flattened into a single 1-D array."""
        return np.concatenate(self.model.get_weights(), axis=None)

    def set_weights(self, weights):
        """Load a flat weight vector (as produced by ``get_weights``) into the model.

        Args:
            weights: 1-D array holding the concatenated, flattened weights.
        """
        shapes = self.get_shapes()
        # Split points are the running totals of each array's element count.
        # The final split point leaves one trailing chunk, which zip() drops.
        indices = np.cumsum([int(np.prod(shape)) for shape in shapes])
        flat_chunks = np.split(weights, indices_or_sections=indices)
        shaped_weights = [
            np.reshape(chunk, newshape=shape)
            for chunk, shape in zip(flat_chunks, shapes)
        ]

        # apply the reshaped weights to the local model
        self.model.set_weights(shaped_weights)

    def train_round(self, training_input):
        """Run one local training round.

        Args:
            training_input: flat weight vector from the coordinator, or None
                when no weights were received (see
                ``deserialize_training_input``).

        Returns:
            ``(weights, number_of_samples)`` -- the flattened weights of the
            local model after training and the number of training rows, in the
            form unpacked by ``serialize_training_result``.
        """
        # `is not None` rather than truthiness: the truth value of a
        # multi-element ndarray is ambiguous and raises ValueError.
        if training_input is not None:
            # Incoming weights are flat; set_weights restores their shapes.
            self.set_weights(training_input)

        # Balance check
        num_train_positive = len(self.train_labels[self.train_labels == 1])
        num_train_negative = len(self.train_labels[self.train_labels == 0])

        num_test_positive = len(self.test_labels[self.test_labels == 1])
        num_test_negative = len(self.test_labels[self.test_labels == 0])

        print(
            self._percentage(num_train_positive, num_train_negative),
            self._percentage(num_test_positive, num_test_negative),
        )

        # Train the participant's own model (not a throwaway one) so that the
        # weights returned below reflect this round's training.
        self.model.train(self.train_data, self.train_labels)
        y_score = self.model.predict(self.test_data)

        predicted = np.array(y_score)
        actual = np.array(self.test_labels)
        is_correct = predicted == actual
        is_wrong = predicted != actual
        is_flagged = predicted == 1

        num_fraud = num_test_positive
        num_detec = len(predicted[is_correct & is_flagged])
        num_fake = len(predicted[is_wrong & is_flagged])

        print(len(self.test_labels), num_fraud, num_detec, num_fake)

        return self.get_weights(), int(self.train_data.shape[0])

    def serialize_training_result(self, training_result):
        """Encode ``(weights, number_of_samples)`` as a binary buffer.

        Layout: the sample count as a 4-byte big-endian integer, followed by
        the weights written with ``np.save`` (pickling disabled).
        """
        (weights, number_of_samples) = training_result

        writer = BytesIO()
        # int(...) so that a numpy integer count is accepted as well.
        writer.write(int(number_of_samples).to_bytes(4, byteorder="big"))
        np.save(writer, weights, allow_pickle=False)
        return writer.getbuffer()[:]

    def deserialize_training_input(self, data):
        """Decode weights received from the coordinator.

        Returns None for empty input, otherwise the array read with
        ``np.load`` (pickling disabled).
        """
        if not data:
            return None

        reader = BytesIO(data)
        return np.load(reader, allow_pickle=False)

In [None]:
# Set up SDK logging first, with HTTP request logging switched on.
configure_logging(log_http_requests=True)

# Build the participant from the dataset loaded above and hand it to the SDK
# runner together with the coordinator address.
participant = Participant(data_tmp)
run_participant(participant, COORDINATOR_URL)
