<a href="https://colab.research.google.com/github/onlysubgroup/SNN_IDS/blob/main/NF_UQ_NIDS_ExtraTree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Coresponding to the paper "NetFlow Datasets for Machine Learning-based Network Intrusion Detection Systems" (https://arxiv.org/abs/2011.09144), this colab aims at reproducing the results for the NF-UQ-NIDS dataset summarized in table 8 of the paper. 

In [1]:
import pandas as pd
#import the dataset with pandas
url = "https://cloudstor.aarnet.edu.au/plus/s/N0JTc8JFNtZtUE4/download?path=%2F&files=NF-UQ-NIDS.csv"
df = pd.read_csv(url)


In [2]:
# just like in the paper IP Adreses and Ports are droped to avoid overfitting
# Attack type label is also droped as this is only a binary classification algorithm
df = df.drop(['IPV4_SRC_ADDR', 'IPV4_DST_ADDR', 'L4_SRC_PORT', 'L4_DST_PORT',  'Attack', 'Dataset', 'PROTOCOL', 'L7_PROTO', 'TCP_FLAGS',], axis=1)
# drop PROTOCOL TYPES and TCP FLAGS aswell, because why not


In [3]:
# Now Sumarize IN/OUT BYTES/PKTS by calculating the absolute difference
df['ABS_BYTES'] = abs(df['OUT_BYTES'] - df['IN_BYTES']) 
df['ABS_PKTS'] = abs(df['OUT_PKTS'] - df['IN_PKTS'])
# Drop IN/OPUT BYTES/PKTS as no longer needed
df.drop(['IN_BYTES', 'OUT_BYTES', 'IN_PKTS', 'OUT_PKTS'], axis=1)

Unnamed: 0,FLOW_DURATION_MILLISECONDS,Label,ABS_BYTES,ABS_PKTS
0,15,0,9256,3
1,0,0,1672,4
2,1111,0,606,4
3,124,0,8296,2
4,1459,0,554,2
...,...,...,...,...
11994888,4263037,0,2330065,2523
11994889,4263062,0,1054423,1513
11994890,4263062,0,62422,1357
11994891,4264935,0,9636,0


In [4]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
# as a proof of concept just picking random samples for training and test
#TODO: Split dataset and loop over training set while picking random_samples.
train_sample = df.sample(n=1111)
train_sample.to_hdf('gdrive/MyDrive/NF_UQ_NIDS_train.h5', key='train', mode='w') 
test_sample = df.sample(n=111)
test_sample.to_hdf('gdrive/MyDrive/NF_UQ_NIDS_test.h5', key='test', mode='w') 

In [6]:
!pip install --quiet tonic

[K     |████████████████████████████████| 88 kB 4.5 MB/s 
[K     |████████████████████████████████| 112 kB 19.1 MB/s 
[?25h  Building wheel for importRosbag (setup.py) ... [?25l[?25hdone


In [7]:
# from https://tonic.readthedocs.io/en/latest/tutorials/wrapping_own_data.html
from tonic import Dataset, transforms
import pandas as pd
class NF_UQ_NIDS(Dataset):
    def __init__(
        self,
        train=True,
        transform=None,
        target_transform=None,
        targets=None,
        data=None,
    ):
        super(NF_UQ_NIDS, self).__init__(
            transform=transform, target_transform=target_transform
        )
        self.train = train

        # url = "https://cloudstor.aarnet.edu.au/plus/s/N0JTc8JFNtZtUE4/download?path=%2F&files=NF-UQ-NIDS.csv"
        if train:
            self.filenames = 'gdrive/MyDrive/NF_UQ_NIDS_train.h5'
        else :
            self.filenames = 'gdrive/MyDrive/NF_UQ_NIDS_test.h5'
        data = pd.read_hdf(self.filenames)

        label = 'Label'
        data_without_label = data.loc[:, data.columns != label]
        self.events = torch.Tensor(data_without_label.values)
        self.targets = torch.Tensor(data[label].values)
    def __getitem__(self, index):
        if self.transform is not None:
            self.events = self.transform(self.events)
            self.tragets = self.transform(self.targets)
        event = self.events[index]
        target = self.targets[index]
        return event, target

    def __len__(self):
          return len(self.events)


In [8]:
import torchvision
import torch

BATCH_SIZE = 4

transform = torchvision.transforms.Compose(
    [
        torchvision.transforms.ToTensor(),
        #torchvision.transforms.Normalize((0.1307,), (0.3081,)),
    ]
)

train_data = NF_UQ_NIDS(train=True)

train_loader = torch.utils.data.DataLoader(
    train_data,
    batch_size=BATCH_SIZE,
    shuffle=True
)

test_loader = torch.utils.data.DataLoader(
    NF_UQ_NIDS(train=False),
    batch_size=BATCH_SIZE
)

Further Information on ExtraTrees: 
https://scikit-learn.org/stable/modules/ensemble.html#forest

Tutorial used: https://machinelearningmastery.com/extra-trees-ensemble-with-python/

In [9]:
# check scikit-learn version
import sklearn
print(sklearn.__version__)

1.0.2


In [10]:
# evaluate extra trees algorithm for classification
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
# define dataset
X, y = train_data[:]
# define the model
model = ExtraTreesClassifier()
# evaluate the model
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

Accuracy: 0.941 (0.021)
