In [1]:
import os
#import mlflow
import pandas as pd
import seaborn as sns
#import scikitplot as skplt
from sklearn import svm
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype
#from minio import Minio

import matplotlib.pyplot as plt

In [2]:
# settings
DATA_FOLDER = "data/5GNIDD"
FILE_NAME = "Combined.csv"
FULL_PATH = f"{DATA_FOLDER}/{FILE_NAME}"
# Columns that DataLoader.load__ excludes when reading the CSV.
ABORTED_COLUMNS = [
    "sTos", "Unnamed: 0", "Seq", "dDSb",
    "Mean", "Sum", "Min", "Max", "dTos",
    "sDSb", "dTtl", "dHops", "SrcGap",
    "DstGap", "SrcWin", "DstWin", "sVid",
    "dVid", "SrcTCPBase", "DstTCPBase",
    "RunTime"
]

# The endpoint is not a secret, so a cluster-local default is acceptable.
MINIO_HOST = os.environ.get('MINIO_HOST', 'http://minio.minio-operator.svc.cluster.local')
# Credentials are taken from the environment only. Never commit key material
# as a fallback default: notebooks get shared and versioned. Any key pair that
# was previously embedded in this cell should be treated as leaked and rotated.
MINIO_ACCESS_KEY = os.environ.get('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.environ.get('MINIO_SECRET_KEY')

os.environ['MLFLOW_TRACKING_URI'] = "http://mlflow.default.svc.cluster.local:5000"
os.environ['MLFLOW_S3_ENDPOINT_URL'] = MINIO_HOST
# os.environ only accepts str values, so export a credential only when it was
# actually supplied; otherwise leave any existing AWS_* setting untouched.
if MINIO_ACCESS_KEY is not None:
    os.environ['AWS_ACCESS_KEY_ID'] = MINIO_ACCESS_KEY
if MINIO_SECRET_KEY is not None:
    os.environ['AWS_SECRET_ACCESS_KEY'] = MINIO_SECRET_KEY

In [3]:
class DataLoader(object):
    """Reads the dataset CSV and offers column-wise preprocessing helpers.

    Attributes:
        path: Location of the CSV file to read.
        aborted_cols: Column names that are excluded when the CSV is read.
    """

    def __init__(
            self,
            path=FULL_PATH,
            aborted_cols=ABORTED_COLUMNS
    ):
        self.path = path
        self.aborted_cols = aborted_cols

    def load__(self):
        """Read ``self.path`` into a DataFrame, leaving out ``self.aborted_cols``.

        Missing values in ``sTtl`` and ``sHops`` are replaced with ``0.0``.

        Returns:
            pandas.DataFrame with every CSV column not listed in
            ``self.aborted_cols``.

        Raises:
            KeyError: if ``sTtl`` or ``sHops`` is not among the loaded columns.
        """
        excluded = set(self.aborted_cols)
        # A callable ``usecols`` filters on the header while parsing, so the
        # file is opened once instead of once for the header and once for data.
        df = pd.read_csv(
            self.path,
            usecols=lambda col: col not in excluded
        )
        df['sTtl'] = df['sTtl'].fillna(0.0)
        df['sHops'] = df['sHops'].fillna(0.0)
        return df

    @staticmethod
    def transform_numerics(
            df
    ):
        """Standardise every numeric column with its own ``StandardScaler``.

        The input frame is not modified; a transformed copy is returned.

        NOTE(review): each scaler is fitted on all rows of ``df``. If ``df``
        still contains rows that will later be used for testing, the scaler
        statistics include them; fit on the training rows only if that matters.

        Args:
            df: DataFrame to transform.

        Returns:
            Tuple ``(transformed_df, scalers)`` where ``scalers`` maps each
            numeric column name to the fitted ``StandardScaler``.
        """
        # Copy first: assigning columns below would otherwise overwrite the
        # caller's data and make re-running the cell scale already-scaled values.
        df = df.copy()
        scalers = {}
        for col in df.columns:
            if is_numeric_dtype(df[col]):
                scalers[col] = StandardScaler()
                # Double brackets keep a 2-D, single-column frame as the
                # scaler expects.
                df[col] = scalers[col].fit_transform(df[[col]])
        return df, scalers

    @staticmethod
    def transform_nominals(
            df
    ):
        """Integer-encode every string-typed column with its own ``LabelEncoder``.

        The input frame is not modified; a transformed copy is returned.

        Args:
            df: DataFrame to transform.

        Returns:
            Tuple ``(transformed_df, encoders)`` where ``encoders`` maps each
            encoded column name to the fitted ``LabelEncoder``.
        """
        # Same reasoning as transform_numerics: never mutate the caller's frame.
        df = df.copy()
        encoders = {}
        for col in df.columns:
            if is_string_dtype(df[col]):
                encoders[col] = LabelEncoder()
                df[col] = encoders[col].fit_transform(df[col])
        return df, encoders

In [4]:
class ModelsTrainer(DataLoader):
    """Trains and evaluates a fixed set of scikit-learn classifiers.

    Attributes:
        models: Mapping of short model name to estimator instance. ``train__``
            fits these objects in place.
        confusion_matrices: Mapping of model name to the confusion matrix
            computed by the most recent ``evaluate`` call (empty before that).
    """

    def __init__(self):
        super().__init__()
        self.models = {
            'dt': DecisionTreeClassifier(random_state=0),
            'rf': RandomForestClassifier(max_depth=2, random_state=0),
            'gnb': GaussianNB(),
            'ada': AdaBoostClassifier(random_state=42),
            'sgd': SGDClassifier(max_iter=1000, tol=1e-3),
            # 'svm': svm.SVC(decision_function_shape='ovo')
        }
        self.confusion_matrices = {}

    @staticmethod
    def set_train_test(
            df,
            test_size=0.3,
            multi_class=False,
            random_state=None
    ):
        """Split ``df`` into train/test features and targets.

        Both ``Label`` and ``Attack Type`` are removed from the features
        regardless of which one is the target.

        NOTE(review): any other label-derived column still present in ``df``
        would be used as a feature; confirm against the CSV schema.

        Args:
            df: Preprocessed DataFrame containing ``Label`` and ``Attack Type``.
            test_size: Passed through to ``train_test_split``.
            multi_class: Use ``Attack Type`` as the target when True,
                otherwise ``Label``.
            random_state: Passed through to ``train_test_split``; set an int
                for a reproducible split. ``None`` keeps the previous
                behaviour of a different split on every call.

        Returns:
            Tuple ``(x_train, y_train, x_test, y_test)``.
        """
        target = 'Attack Type' if multi_class else 'Label'
        feature_cols = [col for col in df.columns if col not in ('Label', 'Attack Type')]
        train, test = train_test_split(
            df,
            test_size=test_size,
            random_state=random_state
        )
        return train[feature_cols], train[target], test[feature_cols], test[target]

    def train__(
            self,
            x_train,
            y_train
    ):
        """Fit every estimator in ``self.models`` on the given training data.

        Args:
            x_train: Feature DataFrame.
            y_train: Target Series.
        """
        # Convert once; the arrays are the same for every model.
        features = x_train.to_numpy()
        targets = y_train.to_numpy()
        for model in self.models.values():
            model.fit(features, targets)

    @staticmethod
    def metrics__(
            model,
            x_test,
            y_test
    ):
        """Score one fitted model on the test data.

        All scikit-learn metric functions take ``(y_true, y_pred)`` in that
        order. Passing them reversed swaps precision with recall and
        transposes the confusion matrix, so the order matters here.

        Args:
            model: Fitted estimator exposing ``predict``.
            x_test: Test features (DataFrame or array).
            y_test: True test labels.

        Returns:
            Tuple ``(accuracy_score, classification_report, confusion_matrix,
            y_pred, f1_score, precision_score, recall_score)``. The confusion
            matrix has true labels on rows and predictions on columns.
        """
        # train__ fits on bare arrays, so predict on the same representation
        # to avoid scikit-learn's feature-name mismatch warning.
        features = x_test.to_numpy() if hasattr(x_test, 'to_numpy') else x_test
        y_pred = model.predict(features)

        # The default average='binary' raises for more than two classes, which
        # made the multi_class split unusable. Keep 'binary' for the two-class
        # case and fall back to support-weighted averaging otherwise.
        observed = set(pd.Series(y_test).unique()) | set(pd.Series(y_pred).unique())
        average = 'binary' if len(observed) <= 2 else 'weighted'

        accuracy_score = metrics.accuracy_score(y_test, y_pred)
        f1_score = metrics.f1_score(y_test, y_pred, average=average)
        precision_score = metrics.precision_score(y_test, y_pred, average=average)
        recall_score = metrics.recall_score(y_test, y_pred, average=average)
        classification_report = metrics.classification_report(y_test, y_pred, output_dict=True)
        confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
        return accuracy_score, classification_report, confusion_matrix, y_pred, f1_score, precision_score, recall_score

    def evaluate(
            self,
            x_test,
            y_test
    ):
        """Score every model in ``self.models`` on the test data.

        Args:
            x_test: Test features.
            y_test: True test labels.

        Returns:
            Tuple ``(metrics_dict, confusion_matrix, predictions_dict,
            classification_reports)``. The three dicts are keyed by model
            name. ``confusion_matrix`` is kept for backward compatibility and
            is the matrix of the last model evaluated (``None`` when there are
            no models); the matrices of all models are available in
            ``self.confusion_matrices``.
        """
        metrics_dict = {}
        predictions_dict = {}
        classification_reports = {}
        confusion_matrices = {}
        # Pre-bind so an empty self.models returns None instead of raising
        # UnboundLocalError at the return statement.
        confusion_matrix = None
        for name, model in self.models.items():
            (accuracy_score, classification_report, confusion_matrix, y_pred,
             f1_score, precision_score, recall_score) = self.metrics__(
                model,
                x_test,
                y_test
            )
            metrics_dict[name] = {
                'accuracy_score': accuracy_score,
                'f1_score': f1_score,
                'precision_score': precision_score,
                'recall_score': recall_score
            }
            classification_reports[name] = classification_report
            predictions_dict[name] = y_pred
            confusion_matrices[name] = confusion_matrix
        self.confusion_matrices = confusion_matrices
        return metrics_dict, confusion_matrix, predictions_dict, classification_reports

In [5]:
class ArtifactStorage(object):
    """Renders evaluation figures and logs models, figures and metrics to MLflow.

    Figures are also written as PNG files under a ``figures`` directory
    relative to the current working directory.

    Attributes:
        experiment_name: MLflow experiment name; also used as the prefix of
            each registered model name.
    """

    def __init__(self,
                 experiment_name,
                ):
        self.experiment_name = experiment_name

    @staticmethod
    def print_class_report(
            name,
            model_classification_report
    ):
        """Draw a classification report as a heatmap and save it as a PNG.

        Args:
            name: Model name, used in the output file name.
            model_classification_report: Report dict as produced by
                ``classification_report(..., output_dict=True)``.

        Returns:
            The (already closed) matplotlib Figure.
        """
        df_report = pd.DataFrame(
            model_classification_report
        ).transpose()
        fig, ax = plt.subplots(figsize=(10, 7))
        # The last row and last column of the transposed report are left out
        # of the heatmap.
        sns.heatmap(df_report.iloc[:-1, :-1], annot=True, ax=ax)
        ax.set_title("classification_report")
        fig.tight_layout()
        # savefig does not create missing directories.
        os.makedirs('figures', exist_ok=True)
        fig.savefig(
            f'figures/classification-report-{name}.png'
        )
        plt.close(fig)
        return fig

    @staticmethod
    def confusion_matrix(
            name,
            y_test,
            y_pred
    ):
        """Draw a confusion-matrix heatmap and save it as a PNG.

        Built from ``sklearn.metrics`` and seaborn, both already imported by
        this notebook, rather than scikit-plot whose import is disabled.

        Args:
            name: Model name, used in the title and the output file name.
            y_test: True labels.
            y_pred: Predicted labels.

        Returns:
            The (already closed) matplotlib Figure.
        """
        labels = sorted(
            set(pd.Series(y_test).unique()) | set(pd.Series(y_pred).unique())
        )
        matrix = metrics.confusion_matrix(y_test, y_pred, labels=labels)
        fig, ax = plt.subplots()
        sns.heatmap(
            matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=labels, yticklabels=labels, ax=ax
        )
        ax.set_title(f'Confusion Matrix for {name}')
        ax.set_xlabel('Predicted label')
        ax.set_ylabel('True label')
        fig.tight_layout()
        # savefig does not create missing directories.
        os.makedirs('figures', exist_ok=True)
        fig.savefig(
            f'figures/{name}.png'
        )
        plt.close(fig)
        return fig

    def _log_to_mlflow(self,
                       model,
                       model_name,
                       metrics_dict,
                       y_test,
                       predictions,
                       classification_report,
                       scalers_path_dict=None
                      ):
        """Log one model with its figures, artifacts and scores in an MLflow run.

        Args:
            model: Fitted scikit-learn estimator.
            model_name: Artifact path and registered model name.
            metrics_dict: Mapping of metric name to numeric score.
            y_test: True labels, used for the confusion-matrix figure.
            predictions: Predicted labels for ``y_test``.
            classification_report: Report dict for the report figure.
            scalers_path_dict: Optional iterable of artifact paths to upload.
                NOTE(review): iterating a dict yields its keys; if this is
                meant to be a ``{name: path}`` mapping the values should be
                logged instead. Confirm with the caller.

        Raises:
            ImportError: if the ``mlflow`` package is not installed.
        """
        # Imported here because the notebook-level import is disabled; this
        # keeps the rest of the notebook usable without mlflow installed and
        # fails with a clear ImportError rather than a NameError.
        import mlflow
        import mlflow.sklearn

        conf_matr_fig = self.confusion_matrix(
            name=model_name,
            y_test=y_test,
            y_pred=predictions
        )
        class_report_fig = self.print_class_report(
            name=model_name,
            model_classification_report=classification_report
        )
        figs_list_dict = [
            {'path': f'figures/{model_name}.png', 'fig': conf_matr_fig},
            {'path': f'figures/classification-report-{model_name}.png', 'fig': class_report_fig}
        ]
        mlflow.set_experiment(self.experiment_name)
        with mlflow.start_run():
            mlflow.sklearn.log_model(
                model,
                model_name,
                registered_model_name=model_name
            )
            if scalers_path_dict:
                for scaler_path in scalers_path_dict:
                    mlflow.log_artifact(scaler_path)
            for fig_dict in figs_list_dict:
                mlflow.log_figure(fig_dict['fig'], fig_dict['path'])
            # Scores are run outputs, so they are logged as metrics; params
            # are meant for run inputs and are stored as strings.
            for metric_name, metric_value in metrics_dict.items():
                mlflow.log_metric(metric_name, float(metric_value))

    def persist_experiment(
        self,
        trained_models,
        evaluation_metrics,
        y_test,
        predictions,
        classification_reports
    ):
        """Log every trained model to MLflow, one run per model.

        Args:
            trained_models: Mapping of model key to fitted estimator.
            evaluation_metrics: Mapping of model key to its metrics dict.
            y_test: True labels shared by all models.
            predictions: Mapping of model key to predicted labels.
            classification_reports: Mapping of model key to report dict.
        """
        for model_key, model in trained_models.items():
            self._log_to_mlflow(
                model=model,
                model_name=f'{self.experiment_name}-{model_key}',
                metrics_dict=evaluation_metrics[model_key],
                y_test=y_test,
                predictions=predictions[model_key],
                classification_report=classification_reports[model_key]
            )

In [6]:
# Build the trainer. Its constructor takes no arguments, so the CSV path and
# the excluded-column list are the DataLoader defaults.
obj = ModelsTrainer()

In [7]:
# Read the CSV without the aborted columns; NaNs in sTtl/sHops become 0.0.
df = obj.load__()

In [8]:
# Preprocess a copy so the raw frame in `df` stays as loaded and this cell can
# be re-run safely. The copy (not `df`) must be what goes into the first
# transform; otherwise the copy is discarded and the raw frame is what gets scaled.
df_transformed = df.copy()
df_transformed, scalers = obj.transform_numerics(df_transformed)
df_transformed, label_encoders = obj.transform_nominals(df_transformed)

In [9]:
# Split with the defaults: 30% of rows held out, 'Label' as the target
# (multi_class=False).
x_train, y_train, x_test, y_test = obj.set_train_test(df=df_transformed)

In [10]:
# Fit every estimator in obj.models on the training split (in place).
obj.train__(x_train, y_train)



In [11]:
# Alias, not a copy: this is the same dict object as obj.models.
trained_models = obj.models

In [12]:
# Display the estimators held in the dict.
trained_models

{'dt': DecisionTreeClassifier(random_state=0),
 'rf': RandomForestClassifier(max_depth=2, random_state=0),
 'gnb': GaussianNB(),
 'ada': AdaBoostClassifier(random_state=42),
 'sgd': SGDClassifier()}

In [13]:
# metrics_dict, predictions_dict and classification_reports are keyed by model
# name. `confusion_matrix` is a single matrix, the one computed for the last
# model in obj.models, not a per-model mapping.
metrics_dict, confusion_matrix, predictions_dict, classification_reports = obj.evaluate(x_test, y_test)

