In [None]:
import os
import math

import pickle

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, StratifiedKFold

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow import keras

import matplotlib.pyplot as plt

import re

In [None]:
def create_model(n_features):
    '''
        Build and compile the MLP used for binary classification.

            Input:
                n_features: number of input features per sample

            Return:
                Compiled Keras Sequential model (128 -> 32 -> 16 ReLU units,
                followed by a single sigmoid output unit)
    '''
    # Widths of the hidden layers that follow the first one.
    # No Dropout layers are used between the Dense layers.
    remaining_hidden_units = (32, 16)

    model = Sequential()

    # The first hidden layer also declares the input shape.
    model.add(Dense(units = 128, input_shape = (n_features,), activation = 'relu'))

    for n_units in remaining_hidden_units:
        model.add(Dense(units = n_units, activation = 'relu'))

    # One sigmoid unit: the output is a single value in (0, 1).
    model.add(Dense(units = 1, activation = 'sigmoid'))

    # Binary cross-entropy loss, Adam optimizer, accuracy tracked during training.
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

    return model

def create_train_test_sets(train_df, test_df):
    '''
        Turn the train and test dataframes into feature and label arrays.

            Input:
                train_df, test_df: dataframes containing the "Domain Name",
                "Family" and "Label" columns; every other column is used as a feature

            Return:
                X_train, y_train, X_test, y_test as numpy arrays, where the
                feature arrays are MinMax-normalized with a scaler fitted on
                the train features
    '''
    # Columns that are not model inputs.
    non_feature_columns = ["Domain Name", "Family", "Label"]

    def split_features_and_labels(df):
        # Features: everything except the non-feature columns. Labels: the "Label" column.
        features = np.array(df.drop(non_feature_columns, axis = 1))
        labels = np.array(df["Label"])
        return features, labels

    X_train, y_train = split_features_and_labels(train_df)
    X_test, y_test = split_features_and_labels(test_df)

    # Normalize both feature arrays using statistics of the train features.
    X_train, X_test = MinMax_normalization(X_train, X_test)

    return X_train, y_train, X_test, y_test

def MinMax_normalization(X_train, X_test):
    '''
        Normalize data using MinMax Normalization.

            Input:
                X_train, X_test: train and test feature arrays

            Return:
                Scaled train and test set, in that order
    '''
    # Learn the per-feature minimum and maximum from the train set only.
    train_scaler = MinMaxScaler().fit(X_train)

    # Apply the train-based scaling to both sets, so the test set is
    # transformed with the train statistics rather than its own.
    return train_scaler.transform(X_train), train_scaler.transform(X_test)

def k_fold_training_results_line_plot(train_results_dict):
    '''
        Plot the training and validation loss of every fold, one subplot per
        fold, save the figure as 'non_federated_mlp_loss_plot_kfold.png' and
        show it.

            Input:
                train_results_dict: dict keyed by the fold index as a string
                ('0', '1', ...); each value is a two-element sequence
                [training loss per epoch, validation loss per epoch]

            Raise:
                ValueError if train_results_dict is empty
    '''
    n_folds = len(train_results_dict)
    if n_folds == 0:
        raise ValueError('train_results_dict is empty: there are no folds to plot')

    # Two subplots per row; the number of rows follows the number of folds
    # instead of being fixed to 3 (which only fits exactly 6 folds).
    n_cols = 2
    n_rows = math.ceil(n_folds / n_cols)

    # Keep the original proportions: 3 rows <-> a figure height of 10.
    # squeeze = False guarantees a 2D array of axes even for a single row.
    fig, axs = plt.subplots(n_rows, n_cols, figsize = (25, 10 * n_rows / 3), squeeze = False)

    # Flatten the axs array for easier iteration
    axs = axs.flatten()

    # Create subplots with two lines in each subplot
    for i, ax in enumerate(axs):
        if i >= n_folds:
            # Odd number of folds: hide the unused last subplot.
            ax.set_visible(False)
            continue

        fold_results = train_results_dict[str(i)]
        ax.plot(fold_results[0], label = 'training loss')
        ax.plot(fold_results[1], label = 'Validation loss')
        ax.set_title(f'Fold {i + 1}')
        ax.legend()

    # Adjust the layout and spacing
    plt.tight_layout()

    plt.savefig('non_federated_mlp_loss_plot_kfold.png')

    # Show the plots
    plt.show()

# def plot_training_results(train_results_dict):
#     '''
#         Create a loss plot after mlp training.
#     '''
    
#     # Create a figure with a 3x3 grid of subplots
#     plt.subplots(figsize = (15, 10))  # You can adjust the figsize as needed


#     # Create subplots with two lines in each subplot
#     plt.plot(train_results_dict['0'][0], label = 'training loss')
#     plt.plot(train_results_dict['0'][1], label = 'Validation loss')
#     plt.legend()

#     # Adjust the layout and spacing
#     plt.tight_layout()

#     plt.savefig('new_mlp_training_loss_plot_100_epochs_32_batch.png')

#     # Show the plots
#     plt.show()

# def create_train_validation_test_sets(train_df, test_df):
#     '''
#     '''
    
#     # Drop domain name, family and Label column to create X_train data.
#     X_train = np.array(train_df.drop(["Domain Name", "Family", "Label"], axis = 1))

#     # Create a y np array with the labels.
#     y_train = np.array(train_df["Label"])
    
#     # Split into train and validation set. 
#     X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

#     # Drop domain name, family and Label column to create X_test data.
#     X_test = np.array(test_df.drop(["Domain Name", "Family", "Label"], axis = 1))

#     # Create a y np array with the labels.
#     y_test = np.array(test_df["Label"])
    
#     # Normalize datasets.
#     X_train, X_validation, X_test = MinMax_normalization(X_train, X_validation, X_test)
    
#     return X_train, y_train, X_validation, y_validation, X_test, y_test

In [None]:
# Stratified k-fold training of the MLP.
# For every fold: train a fresh model on the fold's train part, record the
# training/validation loss curves, and evaluate the trained model on the
# held-out test set.

# Train on cpu only
# os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Kaggle input paths of the prepared train and test dataframes.
final_train_path = "/kaggle/input/non-federated-train-test-df/final_train_df.csv"
final_test_path = "/kaggle/input/non-federated-train-test-df/final_test_df.csv"

# Load final train and test dataframes.
train_df = pd.read_csv(final_train_path, header = 0)
test_df = pd.read_csv(final_test_path, header = 0)

# Create X and y arrays, for train and test sets.
# NOTE(review): the features are normalized here with a scaler fitted on the
# whole train set, i.e. before the folds are split, so each validation fold
# has contributed to the scaling statistics - confirm this is acceptable.
X_train, y_train, X_test, y_test = create_train_test_sets(train_df, test_df)

# Training hyperparameters shared by all folds.
no_folds = 6
n_epochs = 100
batch_size = 32

skf_obj = StratifiedKFold(n_splits = no_folds, shuffle = True, random_state = 42)

# Every fold uses the same feature columns, so this is loop-invariant.
n_features = X_train.shape[1]

train_results_dict = {}
evaluation_results_dict = {}
for i, (train_index, test_index) in enumerate(skf_obj.split(X_train, y_train)):

    print('Fold: {}'.format(i))

    # Release the Keras global state left behind by the previous fold's
    # model; otherwise memory use grows with every model created in the loop.
    keras.backend.clear_session()

    # Create current train and validation sets.
    # (test_index holds the indices of this fold's validation part.)
    current_X_train = X_train[train_index]
    current_y_train = y_train[train_index]

    X_validation = X_train[test_index]
    y_validation = y_train[test_index]

    # Create a fresh MLP model for this fold.
    mlp = create_model(n_features)

    # Train MLP on the dataset
    history = mlp.fit(current_X_train, current_y_train, epochs = n_epochs, batch_size = batch_size, validation_data = (X_validation, y_validation), verbose = 1)

    # Save training results: training and validation loss.
    train_results_dict[str(i)] = [history.history['loss'], history.history["val_loss"]]

    # Get the predictions of the model on the test set and round the sigmoid
    # outputs to integer class labels.
    y_pred = mlp.predict(X_test)
    y_pred = y_pred.flatten()
    y_pred = np.round(y_pred)
    y_pred = y_pred.astype(int)

    # Save evaluation results: Accuracy, Precision, Recall, F1-score.
    evaluation_results_dict[str(i)] = [
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred),
        recall_score(y_test, y_pred),
        f1_score(y_test, y_pred),
    ]
print('Done')

In [None]:
# Plot the per-fold training and validation loss curves collected in the k-fold loop.
k_fold_training_results_line_plot(train_results_dict)

In [None]:
# Build a table with one row per fold and one column per metric.
# The dict maps fold -> [accuracy, precision, recall, f1], so the frame is
# transposed to get folds as rows before the positional columns are named.
metric_names = {0: 'Accuracy', 1: 'Precision', 2: 'Recall', 3: 'F1-score'}
evaluation_results_df = pd.DataFrame(evaluation_results_dict).T.rename(columns = metric_names)
evaluation_results_df

In [None]:
# Summary statistics of each metric column across the folds.
evaluation_results_df.describe()

# Explainable AI


In [None]:
import shap

# Reload the prepared dataframes so the explainability section starts from
# the same inputs as the k-fold section.

# Train dataframe (Kaggle input path).
final_train_path = "/kaggle/input/non-federated-train-test-df/final_train_df.csv"
train_df = pd.read_csv(final_train_path, header = 0)

# Test dataframe (Kaggle input path).
final_test_path = "/kaggle/input/non-federated-train-test-df/final_test_df.csv"
test_df = pd.read_csv(final_test_path, header = 0)

# Normalized feature arrays and label arrays for both sets.
X_train, y_train, X_test, y_test = create_train_test_sets(train_df = train_df, test_df = test_df)

In [None]:
# One input per column of the normalized train matrix.
n_features = X_train.shape[1]

# Fresh, untrained MLP for the explainability section.
mlp = create_model(n_features = n_features)

# Fit on the whole train set (no validation data is passed here).
history = mlp.fit(x = X_train, y = y_train, batch_size = 32, epochs = 100, verbose = 1)


In [None]:
# Predict on the test set and round the flattened sigmoid outputs to
# integer class labels.
y_pred = np.round(mlp.predict(X_test).flatten()).astype(int)

# Accuracy, precision, recall and F1-score on the test set, printed in that order.
test_metrics = [metric(y_test, y_pred) for metric in (accuracy_score, precision_score, recall_score, f1_score)]
print(*test_metrics)

In [None]:
# Build a SHAP DeepExplainer for the trained MLP, using the whole normalized
# train set as the background data.
# NOTE(review): SHAP's documentation recommends a background sample of roughly
# 100-1000 rows rather than the full train set; consider subsampling if this
# cell is slow or memory-hungry - confirm against the dataset size.
explainer = shap.DeepExplainer(model = mlp, data = X_train)

In [None]:
# Compute Shapley values for every row of the normalized train set.
shap_values = explainer.shap_values(X_train)

In [None]:
# Names of the feature columns, i.e. everything except the identifier,
# family and label columns.
train_df.drop(columns = ["Domain Name", "Family", "Label"]).columns

In [None]:
# Summary (dot) plot of the SHAP values computed on the train set.
# The original cell referenced `X`, `df` and an 'Action' column, none of which
# exist in this notebook; the plot now uses the arrays and dataframe the SHAP
# values were actually computed from.

class_names = ['benign', 'malicious']

# Feature names, in the same column order as X_train.
feature_names = train_df.drop(["Domain Name", "Family", "Label"], axis = 1).columns.tolist()

# The MLP has a single sigmoid output, so there is no separate entry for
# "class 1" to index with shap_values[1].
# NOTE(review): depending on the SHAP version the single-output result is a
# one-element list of (n_samples, n_features) arrays or an array with a
# trailing output axis of size 1; both hold exactly one value per
# (sample, feature) and are reshaped to X_train's shape - verify against the
# installed SHAP version.
# presumably Label == 1 means 'malicious', so positive SHAP values push the
# prediction towards 'malicious' - TODO confirm the label encoding.
output_shap_values = np.asarray(shap_values).reshape(X_train.shape)

shap.summary_plot(output_shap_values, X_train, plot_type = "dot", class_names = class_names, feature_names = feature_names)