## Importando dependências

In [1]:
# pip install tensorflow sklearn

In [65]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.metrics import Precision
from tensorflow.keras.metrics import Recall
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.losses import Loss
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import RandomOverSampler

## Importando dataset e splitando em teste e treino

In [3]:
def split_time_windows_dataset(dataset, window_size = 20, step_size = 1, is_label = False):
    # split com janelas rolante
    splitted_dataset = list()
    split_size = len(dataset) - (window_size - step_size)
    offset = 0
    
    for _ in range(split_size):
        if is_label:
            splitted_dataset.append(dataset[offset:(window_size + offset)])
        else:
            splitted_dataset.append(dataset[offset:(window_size + offset), :])
        offset += step_size
        
    final_dataset = np.array(splitted_dataset)                            
    return final_dataset

def balance_dataset_labels(dataset):
    balanced_dataset = dataset.loc[dataset.iloc[:, -1].isin([251, 12, 3, 24, 7, 8, 6])]
    standart_label_dataset = dataset.loc[dataset.iloc[:, -1] == 0]
    slice_dataset = standart_label_dataset.iloc[0:2200]
    balanced_dataset = balanced_dataset.append(slice_dataset)
    balanced_dataset = shuffle(balanced_dataset)
    balanced_dataset.reset_index(inplace=True, drop=True)
    return balanced_dataset


def normalize_dataset_labels(dataset):
    y = dataset.iloc[:, -1].values
    unique_labels = sorted(list(set(y)))
    for i in range(len(y)):
        y[i] = unique_labels.index(y[i])

    dataset.iloc[:, -1] = y

HEADER = ["timestamp",'srcip', 'srcport', 'dstip', 'dstport', 'proto', 'total_fpackets', 'total_fvolume',
          'total_bpackets', 'total_bvolume', 'min_fpktl', 'mean_fpktl', 'max_fpktl', 'std_fpktl', 'min_bpktl',
          'mean_bpktl', 'max_bpktl', 'std_bpktl', 'min_fiat', 'mean_fiat', 'max_fiat', 'std_fiat', 'min_biat',
          'mean_biat', 'max_biat', 'std_biat', 'duration', 'min_active', 'mean_active', 'max_active', 'std_active',
          'min_idle', 'mean_idle', 'max_idle', 'std_idle', 'sflow_fpackets', 'sflow_fbytes', 'sflow_bpackets',
          'sflow_bbytes', 'fpsh_cnt', 'bpsh_cnt', 'furg_cnt', 'burg_cnt', 'total_fhlen', 'total_bhlen', "dscp", 'class']
initial_dataset = pd.read_csv('dataset-oi-2017-02-24.csv', names=HEADER)

# initial_dataset = balance_dataset_labels(initial_dataset)

normalize_dataset_labels(initial_dataset)

## Criando modelo do cliente

In [69]:
class Client:
    def __init__(self, client_id = 123, dataset = [1.0, 2.0]):
        self.client_id = client_id
        self.local_dataset = pd.DataFrame(dataset)
        self.local_tree = DecisionTreeClassifier(random_state=0)
        self.metrics_history = list()
        self.local_forest = list()
        
        
    def preprocess(self):
        X = self.local_dataset.iloc[:, 5:-1].values
        y = self.local_dataset.iloc[:, -1].values
        
        X = np.array(X, dtype=np.float64)
        y = np.array(y, dtype=np.float64)
        
        sc = StandardScaler()
        X = sc.fit_transform(X)
        
        oversample = RandomOverSampler()
        X, y = oversample.fit_resample(X, y)
        
#         X = split_time_windows_dataset(X)
#         y = split_time_windows_dataset(y, is_label = True)
        
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
        
    def train_tree(self):
        params = {'max_depth': [6,8,10,12],
                 'min_samples_split': [2,3,4]}

        gcv = GridSearchCV(estimator=self.local_tree,param_grid=params)
        gcv.fit(self.X_train, self.y_train)
        
        self.local_tree = gcv.best_estimator_
        self.local_tree.fit(self.X_train, self.y_train)
        
    def test_tree(self):
        y_pred = self.local_tree.predict(self.X_test)
        
#         precision = Precision()
#         precision.update_state(self.y_test, y_pred)
#         prec = precision.result().numpy()
        
#         recall = Recall()
#         recall.update_state(self.y_test, y_pred)
#         rec = recall.result().numpy()
        
        acc = self.local_tree.score(self.X_test, self.y_test)
        print(f"[INFO] Accuracy of client {self.client_id} = {acc}")
        
        self.metrics_history.append((acc, 0.0, 0.0))
#         cm = confusion_matrix(self.y_test, y_pred)
#         print(cm)  
        
    def plot_result(self, rounds):
        acc_history, prec_history, rec_history = zip(*self.metrics_history)
        epochs = range(1, rounds + 1)
        plt.plot(epochs, prec_history, 'g', label='Precision')
        plt.plot(epochs, acc_history, 'b', label='Accuracy')
        plt.plot(epochs, rec_history, 'r', label='Recall')
        plt.title(f'Accuracy, Precision and Recall history of client {self.client_id}')
        plt.xlabel('Epochs')
        plt.ylabel('Metrics')
        plt.legend()
        plt.show()

## Splitando dataset para os clientes da rede

In [70]:
num_clients = 20
max_num_trees = 10
clients = [Client(client_id = i) for i in range(num_clients)]

unique_ips = initial_dataset['srcip'].value_counts().head(num_clients + 10).keys()
client_index, i = (0,0)
while client_index < num_clients:
    local_dataset = initial_dataset.loc[initial_dataset['srcip'] == unique_ips[i]]
    num_classes = len(local_dataset['class'].value_counts())
    if num_classes > 1:
        clients[client_index].local_dataset = local_dataset
        client_index += 1
    i += 1
        

## Treinando e testando modelos DT locais

In [None]:
for client in clients:
    client.preprocess()
    client.train_tree()
    client.test_tree()

print("[INFO] All local trains are finished!")

## Compartilhando DTs e atualizando RF

In [46]:
for current in range(num_clients):
    appended_trees, replaced_trees = (0,0)
    for neighbour in range(num_clients):
        if current != neighbour:
            neighbour_tree = clients[neighbour].local_tree
            acc = neighbour_tree.score(clients[current].X_test, clients[current].y_test)
            
            if len(clients[current].local_forest) < max_num_trees:
                clients[current].local_forest.append(neighbour_tree)
                appended_trees += 1
            else:
                min_acc, index, min_index = (0,0,0)
                for tree in clients[current].local_forest: # busca árvore de menor acurácia
                    tree_acc = tree.score(clients[current].X_test, clients[current].y_test)
                    if min_acc > tree_acc:
                        min_acc = tree_acc
                        min_index = index
                    index += 1
                if min_acc < tree_acc:
                    clients[current].local_forest[min_index] = neighbour_tree # substitui a árvore pela nova
                    replaced_trees += 1
                            
    print(f"Client {current} appended {appended_trees} and replaced {replaced_trees} trees to forest")
                

Accuracy for tree of client 1 = 0.3230518598968232
Accuracy for tree of client 2 = 0.876513711648113
Accuracy for tree of client 3 = 0.44186804235677435
Accuracy for tree of client 4 = 0.334890035297312
Accuracy for tree of client 5 = 0.8303013847407005
Accuracy for tree of client 6 = 0.6204181373879989
Accuracy for tree of client 7 = 0.869671463480858
Accuracy for tree of client 8 = 0.7472169427097475
Accuracy for tree of client 9 = 0.8686939994569645
Accuracy for tree of client 10 = 0.8760249796361662
Accuracy for tree of client 11 = 0.8766766223187619
Accuracy for tree of client 12 = 0.8210154765137117
Accuracy for tree of client 13 = 0.8734727124626663
Accuracy for tree of client 14 = 0.8766766223187619
Accuracy for tree of client 15 = 0.8761335867499321
Accuracy for tree of client 16 = 0.8766766223187619
Accuracy for tree of client 17 = 0.8761878903068151
Accuracy for tree of client 18 = 0.869671463480858
Accuracy for tree of client 19 = 0.8766766223187619
Client 0 appended 0 and 

## Testando a RF dos clientes

In [47]:
import statistics
import multiprocessing
from statistics import mode

def forest_predict(forest, X):
    forest_prediction = list()
    for row in X:
        tree_predictions = list()
        for tree in forest:
            tree_predictions.append(tree.predict([row])[0])
        forest_prediction.append(mode(tree_predictions))
    return forest_prediction

accuracy = Accuracy()
for client in clients:
    print(f"Lenght of client {client.client_id} dataset = {len(client.X_test)}")
    forest_pred = forest_predict(client.local_forest, client.X_test)
    accuracy.update_state(client.y_test, forest_pred)
    acc = accuracy.result().numpy()
    print(f"Accuracy for client {client.client_id} forest = {acc}")

Lenght of client 0 dataset = 18415
Accuracy for client 0 forest = 0.876676619052887
Lenght of client 1 dataset = 14581
Accuracy for client 1 forest = 0.7063280344009399
Lenght of client 2 dataset = 13271
Accuracy for client 2 forest = 0.7903689742088318
Lenght of client 3 dataset = 10556
Accuracy for client 3 forest = 0.7170512080192566
Lenght of client 4 dataset = 10515
Accuracy for client 4 forest = 0.7170542478561401
Lenght of client 5 dataset = 7451
Accuracy for client 5 forest = 0.6797657608985901
Lenght of client 6 dataset = 7302
Accuracy for client 6 forest = 0.6933403015136719
Lenght of client 7 dataset = 6690
Accuracy for client 7 forest = 0.7157950401306152
Lenght of client 8 dataset = 6420
Accuracy for client 8 forest = 0.7138265371322632
Lenght of client 9 dataset = 5825
Accuracy for client 9 forest = 0.7300992012023926
Lenght of client 10 dataset = 5549
Accuracy for client 10 forest = 0.7441144585609436
Lenght of client 11 dataset = 5466
Accuracy for client 11 forest = 0.7

## Plotando resultados dos treinos locais

In [None]:
final_metrics = [[],[],[]]
for client in clients:
    client.plot_result(train_rounds)
    for i in range(3):
        final_metrics[i].append(client.metrics_history[-1][i])

plt.boxplot(final_metrics)
plt.title('Final metrics result')
plt.xticks([1,2,3], ['accuracy', 'precision', 'recall'])
plt.show()