**1. Initialization**

In [None]:
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import os
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
import time
import pickle
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, auc, RocCurveDisplay, roc_curve, average_precision_score, precision_recall_curve
import json
from sklearn.preprocessing import LabelBinarizer
from itertools import cycle
import seaborn as sns




# Directory holding the prepared training CSVs and directory where fitted
# models / evaluation results are persisted.
folder = os.path.join("/", "Data", "CREMEv2_Result", "20230310", "logs_working", "toTrain")
model_folder = os.path.join("/", "Data", "CREMEv2_Result", "20230310", "logs_working", "model")

# One CSV per data source for each labelling target (technique / lifecycle).
# The second "_"-separated token of every file name is the data source.
train_technique = [
    "label_accounting_train_technique.csv",
    "relabel_syslog_train_technique.csv",
    "label_traffic_train_technique.csv",
]
train_lifecycle = [
    "label_accounting_train_lifecycle.csv",
    "relabel_syslog_train_lifecycle.csv",
    "label_traffic_train_lifecycle.csv",
]
datas = ["accounting", "syslog", "traffic"]

# Report, file by file, whether each expected training CSV is present
# (technique files first, then lifecycle files).
for data in train_technique + train_lifecycle:
    found = os.path.exists(os.path.join(folder, data))
    print("Path is exists: " if found else "Path is not exists: ", data)

**2. Model Definition, Parameter Settings, and Evaluation Definition**

In [None]:
# Shared settings: random seed for splitting/resampling and the n_jobs value
# handed to estimators that accept it.
r_state = 42
core = -1

# Candidate classifiers, keyed by the name used in result tables and in the
# pickled model file names.
models = {
    ### Linear-based
    'Logistic_Regresion': LogisticRegression(max_iter=1500, n_jobs=core, verbose=True),
    ### Tree-based
    'Decision_Tree': DecisionTreeClassifier(),
    # ### SVM-based
    # 'SVM': SVC(kernel='linear', gamma='auto', verbose=True),
    ### Naive bayes
    'Naive_Bayes': GaussianNB(),
    ### KNN-based
    'KNN': KNeighborsClassifier(n_jobs=core),
    ### ensemble-based
    'XGBoost': XGBClassifier(objective='multi:softprob', eval_metric='merror', n_jobs=core, verbosity=2),
}

# Result containers, all indexed as container[data_type][model_name].
evaluation_technique, evaluation_lifecycle = {}, {}
evaluation_roc_technique, evaluation_roc_lifecycle = {}, {}
evaluation_prauc_technique, evaluation_prauc_lifecycle = {}, {}

for data_type in datas:
    evaluation_technique[data_type] = {}
    evaluation_lifecycle[data_type] = {}
    evaluation_roc_technique[data_type] = {}
    evaluation_roc_lifecycle[data_type] = {}
    evaluation_prauc_technique[data_type] = {}
    evaluation_prauc_lifecycle[data_type] = {}

    for name in models:
        ## accuracy, precision, recall, and F1-score (one fresh list per metric)
        evaluation_technique[data_type][name] = {
            metric: [] for metric in ('accuracy', 'precision', 'recall', 'f1_score')
        }
        evaluation_lifecycle[data_type][name] = {
            metric: [] for metric in ('accuracy', 'precision', 'recall', 'f1_score')
        }

        # ROC-AUC
        evaluation_roc_technique[data_type][name] = {'roc': []}
        evaluation_roc_lifecycle[data_type][name] = {'roc': []}

        ## PR-AUC
        evaluation_prauc_technique[data_type][name] = {
            curve: {} for curve in ('precision', 'recall', 'average_precision')
        }
        evaluation_prauc_lifecycle[data_type][name] = {
            curve: {} for curve in ('precision', 'recall', 'average_precision')
        }

# Show the empty containers, separated by rules (a double rule between groups).
print(
    evaluation_technique,
    "=========================================================================================",
    evaluation_lifecycle,
    "=========================================================================================",
    "=========================================================================================",
    evaluation_roc_technique,
    "=========================================================================================",
    evaluation_roc_lifecycle,
    "=========================================================================================",
    "=========================================================================================",
    evaluation_prauc_technique,
    "=========================================================================================",
    evaluation_prauc_lifecycle,
    sep="\n",
)

**3. Training and evaluation**

**3.1. Training and evaluating (Technique)**


* Training
* Evaluation
  * Precision
  * Recall
  * F1-score
  * ROC-AUC
  * PR-AUC
* Data Store


In [None]:
# Train (or load from cache) every model on every technique dataset, then
# record accuracy / weighted precision / recall / F1 and the raw material for
# ROC curves (true labels, one-hot labels, predicted probabilities).
i = 1
for data in train_technique:
    # The second "_"-separated token of the file name is matched against the
    # data source names in `datas`; the last token is reused in model file names.
    remove_extension = data.split('.')
    name_without_ext = remove_extension[0].split('_')

    for data_type in datas:
        if data_type != name_without_ext[1]:
            continue

        print(f"Processing dataset {i}: {data}")

        df = pd.read_csv(os.path.join(folder, data))

        # Encode the original labels as consecutive integers 0..n_classes-1 and
        # keep both directions of the mapping.
        label_origin = sorted([int(label) for label in df['Label'].unique()])
        le = preprocessing.LabelEncoder()
        le.fit(df['Label'])
        le_origin_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        origin_le_mapping = dict(zip(le.transform(le.classes_), le.classes_))

        X = df.drop(columns=['Label'])
        X = X.to_numpy()
        y = df['Label']
        y = y.to_numpy()
        y = y.reshape(-1)
        y = le.transform(y)

        class_label = list(label_origin)

        print(f"Train Test Split {data}")
        X_train_technique, X_test_technique, y_train_technique, y_test_technique = train_test_split(X, y, test_size = 0.2, random_state=r_state)

        # Oversample only the training part so the test set keeps its original
        # class distribution.
        print(f"Data balancing of {data}")
        X_train_technique, y_train_technique = SMOTE(n_jobs=-1, random_state=r_state).fit_resample(X_train_technique, y_train_technique)

        # The one-hot encoding of the test labels does not depend on the model,
        # so it is computed once per dataset instead of once per model.
        label_binarizer_technique = LabelBinarizer().fit(y_train_technique)
        y_onehot_test_technique = label_binarizer_technique.transform(y_test_technique)

        j = 1
        for name, model in models.items():
            model_filename = '{}{}_model_{}_{}_{}.pkl'.format(i, j, name, data_type, name_without_ext[-1])
            model_path = os.path.join(model_folder, model_filename)
            print(f"{i}{j}. Model {name} --> filename = {model_filename}")
            print("===================================================")

            start_time = time.time()
            if os.path.exists(model_path):
                print("Load ", model_filename)
                # NOTE: unpickling executes arbitrary code; only load model
                # files that this notebook produced itself.
                with open(model_path, 'rb') as model_file:
                    model = pickle.load(model_file)
            else:
                print(f"training model {name} on {data}")
                model.fit(X_train_technique, y_train_technique)
                print("Dump ", model_filename)
                # Context manager guarantees the file is flushed and closed.
                with open(model_path, 'wb') as model_file:
                    pickle.dump(model, model_file)

            y_pred_technique = model.predict(X_test_technique)
            y_score_technique = model.predict_proba(X_test_technique)

            evaluation_technique[data_type][name]['accuracy'].append(accuracy_score(y_test_technique, y_pred_technique))
            evaluation_technique[data_type][name]['precision'].append(precision_score(y_test_technique, y_pred_technique, average='weighted',zero_division=0))
            evaluation_technique[data_type][name]['recall'].append(recall_score(y_test_technique, y_pred_technique, average='weighted', zero_division=0))
            evaluation_technique[data_type][name]['f1_score'].append(f1_score(y_test_technique, y_pred_technique, average='weighted', zero_division=0))
            evaluation_roc_technique[data_type][name]['roc'] = list((y_test_technique, y_onehot_test_technique, y_score_technique))
            end_time = time.time()

            print("Execution Time: {:.2f}\n".format(end_time - start_time))
            j += 1
    i += 1

In [None]:
# Per-class precision/recall curves and average precision for every model on
# every technique dataset, using a one-vs-rest wrapper around each classifier.
# These three names are rebound to fresh dicts for each model below so that
# results of different models never share (and overwrite) the same object.
precision = dict()
recall = dict()
average_precision = dict()

i = 1
for data in train_technique:
    # The second "_"-separated token of the file name is matched against the
    # data source names in `datas`; the last token is reused in model file names.
    remove_extension = data.split('.')
    name_without_ext = remove_extension[0].split('_')

    for data_type in datas:
        if data_type != name_without_ext[1]:
            continue

        print(f"Processing dataset {i}: {data}")

        df = pd.read_csv(os.path.join(folder, data))

        # Encode the original labels as consecutive integers 0..n_classes-1.
        label_origin = sorted([int(label) for label in df['Label'].unique()])
        le = preprocessing.LabelEncoder()
        le.fit(df['Label'])
        le_origin_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        origin_le_mapping = dict(zip(le.transform(le.classes_), le.classes_))

        X = df.drop(columns=['Label'])
        X = X.to_numpy()
        y = df['Label']
        y = y.to_numpy()
        y = y.reshape(-1)
        y = le.transform(y)

        class_label = list(label_origin)

        print(f"Train Test Split {data}")
        X_train_technique_prauc, X_test_technique_prauc, y_train_technique_prauc, y_test_technique_prauc = train_test_split(X, y, test_size = 0.2, random_state=r_state)

        print(f"Data balancing of {data}")
        X_train_technique_prauc, y_train_technique_prauc = SMOTE(n_jobs=-1, random_state=r_state).fit_resample(X_train_technique_prauc, y_train_technique_prauc)

        # One indicator column per class; model independent, so built once.
        label_binarizer_technique_prauc = LabelBinarizer().fit(y_train_technique_prauc)
        y_onehot_test_technique_prauc = label_binarizer_technique_prauc.transform(y_test_technique_prauc)
        if y_onehot_test_technique_prauc.shape[1] == 1:
            # LabelBinarizer emits a single column for two classes; expand it
            # so that column k always refers to class k.
            y_onehot_test_technique_prauc = np.hstack((1 - y_onehot_test_technique_prauc, y_onehot_test_technique_prauc))

        j = 1
        for name, model in models.items():
            model_filename = '{}{}_model_prauc_{}_{}_{}.pkl'.format(i, j, name, data_type, name_without_ext[-1])
            model_path = os.path.join(model_folder, model_filename)
            print(f"{i}{j}. Model {name} --> filename = {model_filename}")
            print("===================================================")

            start_time = time.time()
            if os.path.exists(model_path):
                print("Load ", model_filename)
                # NOTE: unpickling executes arbitrary code; only load model
                # files that this notebook produced itself.
                with open(model_path, 'rb') as model_file:
                    model = pickle.load(model_file)
            else:
                print(f"training model {name} on {data}")
                # NOTE(review): the XGBoost entry is configured with
                # objective='multi:softprob'; verify it trains on the binary
                # sub-problems that OneVsRestClassifier creates.
                model = OneVsRestClassifier(model).fit(X_train_technique_prauc, y_train_technique_prauc)
                print("Dump ", model_filename)
                with open(model_path, 'wb') as model_file:
                    pickle.dump(model, model_file)

            # Only some base estimators expose decision_function; fall back to
            # class probabilities otherwise. Either is a valid ranking score
            # for a precision-recall curve.
            if hasattr(model, "decision_function"):
                y_score_technique_prauc = model.decision_function(X_test_technique_prauc)
            else:
                y_score_technique_prauc = model.predict_proba(X_test_technique_prauc)
            if y_score_technique_prauc.ndim == 1:
                # Binary decision scores come back 1-D; mirror them so there is
                # one score column per class.
                y_score_technique_prauc = np.column_stack((-y_score_technique_prauc, y_score_technique_prauc))

            precision = dict()
            recall = dict()
            average_precision = dict()
            for k in range(y_onehot_test_technique_prauc.shape[1]):
                # precision_recall_curve returns (precision, recall, thresholds).
                precision_k, recall_k, _ = precision_recall_curve(y_onehot_test_technique_prauc[:, k], y_score_technique_prauc[:, k])
                # Stored as plain lists/floats so the results can be dumped to JSON.
                precision[k] = precision_k.tolist()
                recall[k] = recall_k.tolist()
                average_precision[k] = float(average_precision_score(y_onehot_test_technique_prauc[:, k], y_score_technique_prauc[:, k]))

            # Each (data_type, model) pair is evaluated once, so the per-class
            # dicts are stored directly under their metric name.
            evaluation_prauc_technique[data_type][name] = {
                'precision': precision,
                'recall': recall,
                'average_precision': average_precision,
            }

            end_time = time.time()

            print("Execution Time: {:.2f}\n".format(end_time - start_time))
            j += 1
    i += 1

In [None]:
# Show the three technique result containers, one per line, separated by rules.
print(
    evaluation_technique,
    "=======================================================================================================",
    evaluation_roc_technique,
    "=======================================================================================================",
    evaluation_prauc_technique,
    sep="\n",
)

In [None]:
# Persist accuracy / precision / recall / F1 of the technique models as JSON
# next to the pickled models.
json_filename = "evaluation_result_technique.json"
with open(os.path.join(model_folder, json_filename), mode='w') as json_file:
    json.dump(obj=evaluation_technique, fp=json_file)

In [None]:
def _numpy_to_builtin(value):
    """Fallback for json.dump: convert numpy arrays/scalars to plain Python objects.

    Raises TypeError for any other type, which is the contract json expects
    from a ``default`` callable.
    """
    if isinstance(value, (np.ndarray, np.generic)):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Persist the ROC inputs of the technique models as JSON. The 'roc' entries
# hold numpy arrays (true labels, one-hot labels, scores) which the default
# encoder rejects, so they are written as nested lists.
json_filename = "evaluation_result_roc_technique.json"
with open(os.path.join(model_folder, json_filename), 'w') as json_file:
    json.dump(evaluation_roc_technique, json_file, default=_numpy_to_builtin)

In [None]:
# Persist the precision-recall results of the technique models as JSON next to
# the pickled models.
json_filename = "evaluation_result_prauc_technique.json"
with open(os.path.join(model_folder, json_filename), mode='w') as json_file:
    json.dump(obj=evaluation_prauc_technique, fp=json_file)

**3.2. Training and evaluating (Lifecycle)**
* Training
* Evaluation
  * Precision
  * Recall
  * F1-score
  * ROC-AUC
  * PR-AUC
* Data Store

In [None]:
# Train (or load from cache) every model on every lifecycle dataset, then
# record accuracy / weighted precision / recall / F1 and the raw material for
# ROC curves (true labels, one-hot labels, predicted probabilities).
i = 1
for data in train_lifecycle:
    # The second "_"-separated token of the file name is matched against the
    # data source names in `datas`; the last token is reused in model file names.
    remove_extension = data.split('.')
    name_without_ext = remove_extension[0].split('_')

    for data_type in datas:
        if data_type != name_without_ext[1]:
            continue

        print(f"Processing dataset {i}: {data}")

        df = pd.read_csv(os.path.join(folder, data))

        # Encode the original labels as consecutive integers 0..n_classes-1 and
        # keep both directions of the mapping.
        label_origin = sorted([int(label) for label in df['Label_lifecycle'].unique()])
        le = preprocessing.LabelEncoder()
        le.fit(df['Label_lifecycle'])
        le_origin_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        origin_le_mapping = dict(zip(le.transform(le.classes_), le.classes_))

        X = df.drop(columns=['Label_lifecycle'])
        X = X.to_numpy()
        y = df['Label_lifecycle']
        y = y.to_numpy()
        y = y.reshape(-1)
        y = le.transform(y)

        class_label = list(label_origin)

        print(f"Train Test Split {data}")
        X_train_lifecycle, X_test_lifecycle, y_train_lifecycle, y_test_lifecycle = train_test_split(X, y, test_size = 0.2, random_state=r_state)

        # Oversample only the training part so the test set keeps its original
        # class distribution.
        print(f"Data balancing of {data}")
        X_train_lifecycle, y_train_lifecycle = SMOTE(n_jobs=-1, random_state=r_state).fit_resample(X_train_lifecycle, y_train_lifecycle)

        # The one-hot encoding of the test labels does not depend on the model,
        # so it is computed once per dataset instead of once per model.
        label_binarizer_lifecycle = LabelBinarizer().fit(y_train_lifecycle)
        y_onehot_test_lifecycle = label_binarizer_lifecycle.transform(y_test_lifecycle)

        j = 1
        for name, model in models.items():
            model_filename = '{}{}_model_{}_{}_{}.pkl'.format(i, j, name, data_type, name_without_ext[-1])
            model_path = os.path.join(model_folder, model_filename)
            print(f"{i}{j}. Model {name} --> filename = {model_filename}")
            print("===================================================")

            start_time = time.time()
            if os.path.exists(model_path):
                print("Load ", model_filename)
                # NOTE: unpickling executes arbitrary code; only load model
                # files that this notebook produced itself.
                with open(model_path, 'rb') as model_file:
                    model = pickle.load(model_file)
            else:
                print(f"training model {name} on {data}")
                model.fit(X_train_lifecycle, y_train_lifecycle)
                print("Dump ", model_filename)
                # Context manager guarantees the file is flushed and closed.
                with open(model_path, 'wb') as model_file:
                    pickle.dump(model, model_file)

            y_pred_lifecycle = model.predict(X_test_lifecycle)
            y_score_lifecycle = model.predict_proba(X_test_lifecycle)

            evaluation_lifecycle[data_type][name]['accuracy'].append(accuracy_score(y_test_lifecycle, y_pred_lifecycle))
            evaluation_lifecycle[data_type][name]['precision'].append(precision_score(y_test_lifecycle, y_pred_lifecycle, average='weighted',zero_division=0))
            evaluation_lifecycle[data_type][name]['recall'].append(recall_score(y_test_lifecycle, y_pred_lifecycle, average='weighted', zero_division=0))
            evaluation_lifecycle[data_type][name]['f1_score'].append(f1_score(y_test_lifecycle, y_pred_lifecycle, average='weighted', zero_division=0))
            evaluation_roc_lifecycle[data_type][name]['roc'] = list((y_test_lifecycle, y_onehot_test_lifecycle, y_score_lifecycle))
            end_time = time.time()

            print("Execution Time: {:.2f}\n".format(end_time - start_time))
            j += 1
    i += 1

In [None]:
# Per-class precision/recall curves and average precision for every model on
# every lifecycle dataset, using a one-vs-rest wrapper around each classifier.
# These three names are rebound to fresh dicts for each model below so that
# results of different models never share (and overwrite) the same object.
precision = dict()
recall = dict()
average_precision = dict()

i = 1
for data in train_lifecycle:
    # The second "_"-separated token of the file name is matched against the
    # data source names in `datas`; the last token is reused in model file names.
    remove_extension = data.split('.')
    name_without_ext = remove_extension[0].split('_')

    for data_type in datas:
        if data_type != name_without_ext[1]:
            continue

        print(f"Processing dataset {i}: {data}")

        df = pd.read_csv(os.path.join(folder, data))

        # Encode the original labels as consecutive integers 0..n_classes-1.
        label_origin = sorted([int(label) for label in df['Label_lifecycle'].unique()])
        le = preprocessing.LabelEncoder()
        le.fit(df['Label_lifecycle'])
        le_origin_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
        origin_le_mapping = dict(zip(le.transform(le.classes_), le.classes_))

        X = df.drop(columns=['Label_lifecycle'])
        X = X.to_numpy()
        y = df['Label_lifecycle']
        y = y.to_numpy()
        y = y.reshape(-1)
        y = le.transform(y)

        class_label = list(label_origin)

        print(f"Train Test Split {data}")
        X_train_lifecycle_prauc, X_test_lifecycle_prauc, y_train_lifecycle_prauc, y_test_lifecycle_prauc = train_test_split(X, y, test_size = 0.2, random_state=r_state)

        print(f"Data balancing of {data}")
        X_train_lifecycle_prauc, y_train_lifecycle_prauc = SMOTE(n_jobs=-1, random_state=r_state).fit_resample(X_train_lifecycle_prauc, y_train_lifecycle_prauc)

        # One indicator column per class; model independent, so built once.
        label_binarizer_lifecycle_prauc = LabelBinarizer().fit(y_train_lifecycle_prauc)
        y_onehot_test_lifecycle_prauc = label_binarizer_lifecycle_prauc.transform(y_test_lifecycle_prauc)
        if y_onehot_test_lifecycle_prauc.shape[1] == 1:
            # LabelBinarizer emits a single column for two classes; expand it
            # so that column k always refers to class k.
            y_onehot_test_lifecycle_prauc = np.hstack((1 - y_onehot_test_lifecycle_prauc, y_onehot_test_lifecycle_prauc))

        j = 1
        for name, model in models.items():
            model_filename = '{}{}_model_prauc_{}_{}_{}.pkl'.format(i, j, name, data_type, name_without_ext[-1])
            model_path = os.path.join(model_folder, model_filename)
            print(f"{i}{j}. Model {name} --> filename = {model_filename}")
            print("===================================================")

            start_time = time.time()
            if os.path.exists(model_path):
                print("Load ", model_filename)
                # NOTE: unpickling executes arbitrary code; only load model
                # files that this notebook produced itself.
                with open(model_path, 'rb') as model_file:
                    model = pickle.load(model_file)
            else:
                print(f"training model {name} on {data}")
                # NOTE(review): the XGBoost entry is configured with
                # objective='multi:softprob'; verify it trains on the binary
                # sub-problems that OneVsRestClassifier creates.
                # (The target was previously misspelled with a capital "Y",
                # which raised NameError.)
                model = OneVsRestClassifier(model).fit(X_train_lifecycle_prauc, y_train_lifecycle_prauc)
                print("Dump ", model_filename)
                with open(model_path, 'wb') as model_file:
                    pickle.dump(model, model_file)

            # Only some base estimators expose decision_function; fall back to
            # class probabilities otherwise. Either is a valid ranking score
            # for a precision-recall curve.
            if hasattr(model, "decision_function"):
                y_score_lifecycle_prauc = model.decision_function(X_test_lifecycle_prauc)
            else:
                y_score_lifecycle_prauc = model.predict_proba(X_test_lifecycle_prauc)
            if y_score_lifecycle_prauc.ndim == 1:
                # Binary decision scores come back 1-D; mirror them so there is
                # one score column per class.
                y_score_lifecycle_prauc = np.column_stack((-y_score_lifecycle_prauc, y_score_lifecycle_prauc))

            precision = dict()
            recall = dict()
            average_precision = dict()
            for k in range(y_onehot_test_lifecycle_prauc.shape[1]):
                # precision_recall_curve returns (precision, recall, thresholds).
                precision_k, recall_k, _ = precision_recall_curve(y_onehot_test_lifecycle_prauc[:, k], y_score_lifecycle_prauc[:, k])
                # Stored as plain lists/floats so the results can be dumped to JSON.
                precision[k] = precision_k.tolist()
                recall[k] = recall_k.tolist()
                average_precision[k] = float(average_precision_score(y_onehot_test_lifecycle_prauc[:, k], y_score_lifecycle_prauc[:, k]))

            # Each (data_type, model) pair is evaluated once, so the per-class
            # dicts are stored directly under their metric name.
            evaluation_prauc_lifecycle[data_type][name] = {
                'precision': precision,
                'recall': recall,
                'average_precision': average_precision,
            }

            end_time = time.time()

            print("Execution Time: {:.2f}\n".format(end_time - start_time))
            j += 1
    i += 1

In [None]:
# Persist accuracy / precision / recall / F1 of the lifecycle models as JSON
# next to the pickled models.
json_filename = "evaluation_result_lifecycle.json"
with open(os.path.join(model_folder, json_filename), mode='w') as json_file:
    json.dump(obj=evaluation_lifecycle, fp=json_file)

In [None]:
def _numpy_to_builtin(value):
    """Fallback for json.dump: convert numpy arrays/scalars to plain Python objects.

    Raises TypeError for any other type, which is the contract json expects
    from a ``default`` callable.
    """
    if isinstance(value, (np.ndarray, np.generic)):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Persist the ROC inputs of the lifecycle models as JSON. A dedicated file
# name is used so this dump does not overwrite the accuracy/precision/recall/F1
# results written to "evaluation_result_lifecycle.json". The 'roc' entries hold
# numpy arrays, which the default encoder rejects, so they are written as
# nested lists.
json_filename = "evaluation_result_roc_lifecycle.json"
with open(os.path.join(model_folder, json_filename), 'w') as json_file:
    json.dump(evaluation_roc_lifecycle, json_file, default=_numpy_to_builtin)

In [None]:
# Persist the precision-recall results of the lifecycle models as JSON. A
# dedicated file name is used so this dump does not overwrite the other
# lifecycle result files written to the same folder.
json_filename = "evaluation_result_prauc_lifecycle.json"
with open(os.path.join(model_folder, json_filename), 'w') as json_file:
    json.dump(evaluation_prauc_lifecycle, json_file)