In [1]:
#Imports
import io
import os
import gc
import copy
import math
import ctypes
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support, accuracy_score, balanced_accuracy_score, matthews_corrcoef
from sklearn import metrics

import tensorflow as tf
from tensorflow.keras import Input, Model, layers, losses, optimizers, callbacks
import tensorflow_decision_forests as tfdf

#Notebook settings
# pandas display limits applied notebook-wide: a max_columns value of None
# removes the column cap, while at most 10 rows are rendered per frame.
default_max_columns = None
default_max_rows = 10
pd.set_option('display.max_columns', default_max_columns)
pd.set_option('display.max_rows', default_max_rows)

# Experiment constants read by the cells below.
# model_name: written into the MODEL column of the results table and used as
#   the prefix of the results CSV file name.
# label_column: name of the target column handed to
#   tfdf.keras.pd_dataframe_to_tf_dataset; also the prefix split_xy matches on.
# batch_size: batch size used when converting DataFrames to TF datasets.
# num_datasets: loop bound for both the training and the evaluation loop; it
#   indexes into df_names in the training cell, so it must not exceed the
#   number of names listed there.
# cwd: working directory at notebook start; the datasets/ and results/ paths
#   are built relative to it.
model_name = "RANDOM_FOREST"
label_column = "Label"
batch_size = 512
num_datasets = 6
cwd = os.getcwd()

2025-02-05 12:15:27.757477: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-05 12:15:27.830280: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-05 12:15:27.889118: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-02-05 12:15:27.947413: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-02-05 12:15:27.964095: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-05 12:15:28.063169: I tensorflow/core/platform/cpu_feature_gu

In [2]:
#Utility functions
def split_xy(df, colname):
    """Separate a DataFrame into feature columns and target columns.

    Args:
        df: DataFrame whose column names are strings.
        colname: Prefix identifying target columns; every column whose name
            starts with this prefix is treated as a target.

    Returns:
        A tuple ``(X, Y)``. ``Y`` holds the prefix-matched columns in their
        original order. ``X`` holds all remaining columns; because they are
        selected through ``np.setdiff1d`` they come back de-duplicated and
        sorted by name, not in their original order.
    """
    target_cols = list(filter(lambda name: name.startswith(colname), df.columns))
    # setdiff1d returns the sorted, unique set difference of the column names.
    feature_cols = np.setdiff1d(df.columns, target_cols)
    return df[feature_cols], df[target_cols]

In [3]:
%%time
#Training and evaluating loop designed to work with memory constraints 

df_names = ['NF-UNSW-NB15', 'NF-CSE-CIC-IDS2018', 'NF-BoT-IoT', 'NF-ToN-IoT', 'X-IIoTiD', 'WUSTL-IIOT']

# Long-format results table: one row per (train set, eval set, metric, label).
# NOTE(review): "VAlUE" looks like a typo for "VALUE"; the spelling is kept
# because it becomes the header of the results CSV - confirm before renaming.
result_columns = ["TRAIN_DATASET", "EVAL_DATASET", "METRIC", "VAlUE", "LABEL", "MODEL"]
# Rows are collected as plain dicts and turned into a DataFrame in one go,
# instead of growing the frame with one pd.concat call per metric.
result_rows = []
result_df = pd.DataFrame(columns=result_columns)

for i in range(num_datasets):
    train_name = df_names[i]

    #Model definition
    #Random Forest classifier with 50 trees and a maximum depth of 10
    random_forest_model = tfdf.keras.RandomForestModel(
        task=tfdf.keras.Task.CLASSIFICATION,
        missing_value_policy = "GLOBAL_IMPUTATION",
        num_trees = 50,
        max_depth = 10,
        #growing_strategy="BEST_FIRST_GLOBAL",
        #max_num_nodes = 5,
        verbose = 0,
        #compute_oob_variable_importances=True, #Uses too much RAM
        sorting_strategy="IN_NODE",
    )

    #Load training and validation data
    print("Loading "+train_name+" trainingset")
    df_train = pd.read_csv(cwd+"/datasets/"+train_name+"_Trainset.csv", dtype=float)
    print(df_train.shape)
    df_val = pd.read_csv(cwd+"/datasets/"+train_name+"_Valset.csv", dtype=float)
    # Each DataFrame is dropped as soon as its TF dataset exists to keep peak
    # memory low.
    trainset = tfdf.keras.pd_dataframe_to_tf_dataset(df_train, label=label_column, batch_size = batch_size)
    del df_train
    valset = tfdf.keras.pd_dataframe_to_tf_dataset(df_val, label=label_column, batch_size = batch_size)
    del df_val
    gc.collect()
    print("Loading complete")

    #Train
    print("Training on "+train_name+" trainingset")
    random_forest_model.fit(trainset,validation_data = valset)
    del trainset, valset
    # gc.collect must be *called*; the bare attribute reference is a no-op.
    gc.collect()
    print("Training complete")

    #Evaluation: score the freshly trained model on every dataset's test split
    for j in range(num_datasets):
        eval_name = df_names[j]

        print("Loading "+eval_name+" testset")
        #Load test data
        df_test = pd.read_csv(cwd+"/datasets/"+eval_name+"_Testset.csv", dtype=float)
        print(df_test.shape)
        testset = tfdf.keras.pd_dataframe_to_tf_dataset(df_test, label=label_column, batch_size = batch_size)
        # Only the label part is needed for scoring; features are consumed
        # through the TF dataset built above.
        xtest, ytest = split_xy(df_test, label_column)
        del df_test,xtest
        print("Loading complete")

        #Predict
        print("Evaluating model trained on "+train_name+" trainset on "+eval_name+" testset")
        # Threshold the model output at 0.5 to obtain hard 0/1 predictions.
        ypred = ((random_forest_model.predict(testset) > 0.5).astype("int32"))
        del testset
        print("Evaluation complete")

        #Results
        print("Results for:...")
        print("Model trained on "+train_name+" and evaluated on "+eval_name)
        print(confusion_matrix(ytest, ypred))
        print(classification_report(ytest, ypred))
        precision, recall, f1_score, support = precision_recall_fscore_support(ytest,ypred, zero_division = 0.0)
        accuracy = accuracy_score(ytest, ypred)
        balanced_accuracy = balanced_accuracy_score(ytest, ypred)
        mcc = matthews_corrcoef(ytest, ypred)
        print("---")

        del ypred,ytest
        gc.collect()

        # (metric, value, label) triples in the same order the rows were
        # previously appended: global metrics first, then per-class metrics
        # for label 0 and label 1.
        metric_rows = [
            ("ACCURACY", accuracy, "BOTH"),
            ("BALANCED_ACCURACY", balanced_accuracy, "BOTH"),
            ("MCC", mcc, "BOTH"),
        ]
        for metric_name, per_class in (
            ("PRECISION", precision),
            ("RECALL", recall),
            ("F1_SCORE", f1_score),
            ("SUPPORT", support),
        ):
            metric_rows.append((metric_name, per_class[0], "0"))
            metric_rows.append((metric_name, per_class[1], "1"))

        result_rows.extend(
            {
                "TRAIN_DATASET": train_name,
                "EVAL_DATASET": eval_name,
                "METRIC": metric_name,
                "VAlUE": metric_value,
                "LABEL": metric_label,
                "MODEL": model_name,
            }
            for metric_name, metric_value, metric_label in metric_rows
        )

    # Rebuild the table after every trained model so partial results remain
    # available in result_df if the run is interrupted.
    result_df = pd.DataFrame(result_rows, columns=result_columns)

    del random_forest_model
    gc.collect()
    # Ask glibc to hand freed heap pages back to the OS (Linux/glibc only).
    libc = ctypes.CDLL("libc.so.6") # clearing cache 
    libc.malloc_trim(0)

    print("Cleanup complete")
    print("---+++---")
        

Loading NF-UNSW-NB15 trainingset
(1298494, 33)
Loading complete
Training on NF-UNSW-NB15 trainingset
Num validation examples: tf.Tensor(162312, shape=(), dtype=int32)


I0000 00:00:1738754156.287662    5704 kernel.cc:774] Start Yggdrasil model training
I0000 00:00:1738754156.297354    5704 kernel.cc:775] Collect training examples
I0000 00:00:1738754156.297407    5704 kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1738754156.298467    5704 kernel.cc:394] Number of batches: 2537
I0000 00:00:1738754156.298491    5704 kernel.cc:395] Number of examples: 1298494
I0000 00:00:1738754156.568764    5704 kernel.cc:794] Training dataset:
Number of records: 1298494
Number of columns: 33

Number of columns by type:
	NUMERICAL: 32 (96.9697%)
	CATEGORICAL: 1 (3.0303%)

Columns:

NUMERICAL: 32 (96.9697%)
	0: "FLOW_DURATION

Training complete
Loading NF-UNSW-NB15 testset
(324624, 33)
Loading complete
Evaluating model trained on NF-UNSW-NB15 trainset on NF-UNSW-NB15 testset
Evaluation complete
Results for:...
Model trained on NF-UNSW-NB15 and evaluated on NF-UNSW-NB15
[[309234   1020]
 [  2381  11989]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99    310254
         1.0       0.92      0.83      0.88     14370

    accuracy                           0.99    324624
   macro avg       0.96      0.92      0.94    324624
weighted avg       0.99      0.99      0.99    324624

---
Loading NF-CSE-CIC-IDS2018 testset




(1678480, 33)
Loading complete
Evaluating model trained on NF-UNSW-NB15 trainset on NF-CSE-CIC-IDS2018 testset
Evaluation complete
Results for:...
Model trained on NF-UNSW-NB15 and evaluated on NF-CSE-CIC-IDS2018
[[1253838  221383]
 [ 202079    1180]]
              precision    recall  f1-score   support

         0.0       0.86      0.85      0.86   1475221
         1.0       0.01      0.01      0.01    203259

    accuracy                           0.75   1678480
   macro avg       0.43      0.43      0.43   1678480
weighted avg       0.76      0.75      0.75   1678480

---
Loading NF-BoT-IoT testset
(120020, 33)
Loading complete
Evaluating model trained on NF-UNSW-NB15 trainset on NF-BoT-IoT testset
Evaluation complete
Results for:...
Model trained on NF-UNSW-NB15 and evaluated on NF-BoT-IoT
[[  2798      2]
 [117219      1]]
              precision    recall  f1-score   support

         0.0       0.02      1.00      0.05      2800
         1.0       0.33      0.00      0.00    117

I0000 00:00:1738754355.641130    5704 kernel.cc:774] Start Yggdrasil model training
I0000 00:00:1738754355.641171    5704 kernel.cc:775] Collect training examples
I0000 00:00:1738754355.641187    5704 kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1738754355.641459    5704 kernel.cc:394] Number of batches: 13114
I0000 00:00:1738754355.641477    5704 kernel.cc:395] Number of examples: 6713919
I0000 00:00:1738754356.866453    5704 kernel.cc:794] Training dataset:
Number of records: 6713919
Number of columns: 33

Number of columns by type:
	NUMERICAL: 32 (96.9697%)
	CATEGORICAL: 1 (3.0303%)

Columns:

NUMERICAL: 32 (96.9697%)
	0: "FLOW_DURATIO

Training complete
Loading NF-UNSW-NB15 testset
(324624, 33)
Loading complete
Evaluating model trained on NF-CSE-CIC-IDS2018 trainset on NF-UNSW-NB15 testset
Evaluation complete
Results for:...
Model trained on NF-CSE-CIC-IDS2018 and evaluated on NF-UNSW-NB15
[[310254      0]
 [ 14370      0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98    310254
         1.0       0.00      0.00      0.00     14370

    accuracy                           0.96    324624
   macro avg       0.48      0.50      0.49    324624
weighted avg       0.91      0.96      0.93    324624

---
Loading NF-CSE-CIC-IDS2018 testset
(1678480, 33)
Loading complete
Evaluating model trained on NF-CSE-CIC-IDS2018 trainset on NF-CSE-CIC-IDS2018 testset
Evaluation complete
Results for:...
Model trained on NF-CSE-CIC-IDS2018 and evaluated on NF-CSE-CIC-IDS2018
[[1475216       5]
 [  10955  192304]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00   1475221
         1.0       1.00      0.95      0.97    203259

    accuracy                           0.99   1678480
   macro avg       1.00      0.97      0.98   1678480
weighted avg       0.99      0.99      0.99   1678480

---
Loading NF-BoT-IoT testset
(120020, 33)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


---
Loading NF-ToN-IoT testset
(275855, 33)
Loading complete
Evaluating model trained on NF-CSE-CIC-IDS2018 trainset on NF-ToN-IoT testset
Evaluation complete
Results for:...
Model trained on NF-CSE-CIC-IDS2018 and evaluated on NF-ToN-IoT
[[ 53924      0]
 [221931      0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

         0.0       0.20      1.00      0.33     53924
         1.0       0.00      0.00      0.00    221931

    accuracy                           0.20    275855
   macro avg       0.10      0.50      0.16    275855
weighted avg       0.04      0.20      0.06    275855

---
Loading X-IIoTiD testset
(164018, 33)
Loading complete
Evaluating model trained on NF-CSE-CIC-IDS2018 trainset on X-IIoTiD testset
Evaluation complete
Results for:...
Model trained on NF-CSE-CIC-IDS2018 and evaluated on X-IIoTiD
[[84070     0]
 [79948     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

         0.0       0.51      1.00      0.68     84070
         1.0       0.00      0.00      0.00     79948

    accuracy                           0.51    164018
   macro avg       0.26      0.50      0.34    164018
weighted avg       0.26      0.51      0.35    164018

---
Loading WUSTL-IIOT testset
(238165, 33)
Loading complete
Evaluating model trained on NF-CSE-CIC-IDS2018 trainset on WUSTL-IIOT testset
Evaluation complete
Results for:...
Model trained on NF-CSE-CIC-IDS2018 and evaluated on WUSTL-IIOT
[[220811      0]
 [ 17354      0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96    220811
         1.0       0.00      0.00      0.00     17354

    accuracy                           0.93    238165
   macro avg       0.46      0.50      0.48    238165
weighted avg       0.86      0.93      0.89    238165

---
Cleanup complete
---+++---
Loading NF-BoT-IoT trainingset
(480080, 33)
Loading complete
Training on NF-BoT-IoT trainingset
Num validation examples: tf.Tensor(60010, shape=(), dtype=int32)


I0000 00:00:1738754717.593387    5704 kernel.cc:774] Start Yggdrasil model training
I0000 00:00:1738754717.593415    5704 kernel.cc:775] Collect training examples
I0000 00:00:1738754717.593427    5704 kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1738754717.594344    5704 kernel.cc:394] Number of batches: 938
I0000 00:00:1738754717.594365    5704 kernel.cc:395] Number of examples: 480080
I0000 00:00:1738754717.708767    5704 kernel.cc:794] Training dataset:
Number of records: 480080
Number of columns: 33

Number of columns by type:
	NUMERICAL: 32 (96.9697%)
	CATEGORICAL: 1 (3.0303%)

Columns:

NUMERICAL: 32 (96.9697%)
	0: "FLOW_DURATION_MI

Training complete
Loading NF-UNSW-NB15 testset
(324624, 33)
Loading complete
Evaluating model trained on NF-BoT-IoT trainset on NF-UNSW-NB15 testset
Evaluation complete
Results for:...
Model trained on NF-BoT-IoT and evaluated on NF-UNSW-NB15
[[309653    601]
 [ 14230    140]]
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98    310254
         1.0       0.19      0.01      0.02     14370

    accuracy                           0.95    324624
   macro avg       0.57      0.50      0.50    324624
weighted avg       0.92      0.95      0.93    324624

---
Loading NF-CSE-CIC-IDS2018 testset
(1678480, 33)
Loading complete
Evaluating model trained on NF-BoT-IoT trainset on NF-CSE-CIC-IDS2018 testset
Evaluation complete
Results for:...
Model trained on NF-BoT-IoT and evaluated on NF-CSE-CIC-IDS2018
[[1038024  437197]
 [  93797  109462]]
              precision    recall  f1-score   support

         0.0       0.92      0.70      0.80   1475221

I0000 00:00:1738754813.745387    5704 kernel.cc:774] Start Yggdrasil model training
I0000 00:00:1738754813.745431    5704 kernel.cc:775] Collect training examples
I0000 00:00:1738754813.745450    5704 kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1738754813.745691    5704 kernel.cc:394] Number of batches: 2156
I0000 00:00:1738754813.745707    5704 kernel.cc:395] Number of examples: 1103419
I0000 00:00:1738754813.979871    5704 kernel.cc:794] Training dataset:
Number of records: 1103419
Number of columns: 33

Number of columns by type:
	NUMERICAL: 32 (96.9697%)
	CATEGORICAL: 1 (3.0303%)

Columns:

NUMERICAL: 32 (96.9697%)
	0: "FLOW_DURATION

Training complete
Loading NF-UNSW-NB15 testset
(324624, 33)
Loading complete
Evaluating model trained on NF-ToN-IoT trainset on NF-UNSW-NB15 testset
Evaluation complete
Results for:...
Model trained on NF-ToN-IoT and evaluated on NF-UNSW-NB15
[[298612  11642]
 [ 13492    878]]
              precision    recall  f1-score   support

         0.0       0.96      0.96      0.96    310254
         1.0       0.07      0.06      0.07     14370

    accuracy                           0.92    324624
   macro avg       0.51      0.51      0.51    324624
weighted avg       0.92      0.92      0.92    324624

---
Loading NF-CSE-CIC-IDS2018 testset
(1678480, 33)
Loading complete
Evaluating model trained on NF-ToN-IoT trainset on NF-CSE-CIC-IDS2018 testset
Evaluation complete
Results for:...
Model trained on NF-ToN-IoT and evaluated on NF-CSE-CIC-IDS2018
[[1185270  289951]
 [ 141471   61788]]
              precision    recall  f1-score   support

         0.0       0.89      0.80      0.85   1475221







Num validation examples: tf.Tensor(82009, shape=(), dtype=int32)


I0000 00:00:1738754930.159080    5704 kernel.cc:774] Start Yggdrasil model training
I0000 00:00:1738754930.159124    5704 kernel.cc:775] Collect training examples
I0000 00:00:1738754930.159142    5704 kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1738754930.159418    5704 kernel.cc:394] Number of batches: 1282
I0000 00:00:1738754930.159439    5704 kernel.cc:395] Number of examples: 656070
I0000 00:00:1738754930.307783    5704 kernel.cc:794] Training dataset:
Number of records: 656070
Number of columns: 33

Number of columns by type:
	NUMERICAL: 32 (96.9697%)
	CATEGORICAL: 1 (3.0303%)

Columns:

NUMERICAL: 32 (96.9697%)
	0: "FLOW_DURATION_M

Training complete
Loading NF-UNSW-NB15 testset
(324624, 33)
Loading complete
Evaluating model trained on X-IIoTiD trainset on NF-UNSW-NB15 testset
Evaluation complete
Results for:...
Model trained on X-IIoTiD and evaluated on NF-UNSW-NB15
[[310167     87]
 [ 14370      0]]
              precision    recall  f1-score   support

         0.0       0.96      1.00      0.98    310254
         1.0       0.00      0.00      0.00     14370

    accuracy                           0.96    324624
   macro avg       0.48      0.50      0.49    324624
weighted avg       0.91      0.96      0.93    324624

---
Loading NF-CSE-CIC-IDS2018 testset
(1678480, 33)
Loading complete
Evaluating model trained on X-IIoTiD trainset on NF-CSE-CIC-IDS2018 testset
Evaluation complete
Results for:...
Model trained on X-IIoTiD and evaluated on NF-CSE-CIC-IDS2018
[[1446274   28947]
 [ 203162      97]]
              precision    recall  f1-score   support

         0.0       0.88      0.98      0.93   1475221
       







Num validation examples: tf.Tensor(119083, shape=(), dtype=int32)


I0000 00:00:1738755024.064419    5704 kernel.cc:774] Start Yggdrasil model training
I0000 00:00:1738755024.064463    5704 kernel.cc:775] Collect training examples
I0000 00:00:1738755024.064482    5704 kernel.cc:787] Dataspec guide:
column_guides {
  column_name_pattern: "^__LABEL$"
  type: CATEGORICAL
  categorial {
    min_vocab_frequency: 0
    max_vocab_count: -1
  }
}
default_column_guide {
  categorial {
    max_vocab_count: 2000
  }
  discretized_numerical {
    maximum_num_bins: 255
  }
}
ignore_columns_without_guides: false
detect_numerical_as_discretized_numerical: false

I0000 00:00:1738755024.064759    5704 kernel.cc:394] Number of batches: 1861
I0000 00:00:1738755024.064778    5704 kernel.cc:395] Number of examples: 952656
I0000 00:00:1738755024.265976    5704 kernel.cc:794] Training dataset:
Number of records: 952656
Number of columns: 33

Number of columns by type:
	NUMERICAL: 32 (96.9697%)
	CATEGORICAL: 1 (3.0303%)

Columns:

NUMERICAL: 32 (96.9697%)
	0: "FLOW_DURATION_M

Training complete
Loading NF-UNSW-NB15 testset
(324624, 33)
Loading complete
Evaluating model trained on WUSTL-IIOT trainset on NF-UNSW-NB15 testset
Evaluation complete
Results for:...
Model trained on WUSTL-IIOT and evaluated on NF-UNSW-NB15
[[ 34042 276212]
 [  2103  12267]]
              precision    recall  f1-score   support

         0.0       0.94      0.11      0.20    310254
         1.0       0.04      0.85      0.08     14370

    accuracy                           0.14    324624
   macro avg       0.49      0.48      0.14    324624
weighted avg       0.90      0.14      0.19    324624

---
Loading NF-CSE-CIC-IDS2018 testset
(1678480, 33)
Loading complete
Evaluating model trained on WUSTL-IIOT trainset on NF-CSE-CIC-IDS2018 testset
Evaluation complete
Results for:...
Model trained on WUSTL-IIOT and evaluated on NF-CSE-CIC-IDS2018
[[994344 480877]
 [102195 101064]]
              precision    recall  f1-score   support

         0.0       0.91      0.67      0.77   1475221
   

In [4]:
# Persist the collected metrics as CSV. The results directory is created first
# so that a missing folder cannot discard the outcome of the long training run.
results_dir = cwd+"/results/"
os.makedirs(results_dir, exist_ok=True)
result_df.to_csv(results_dir+model_name+"_results.csv",index=False)