In [1]:
!pip install pytorch_tabnet



In [2]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as torch_optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

In [3]:
path = './drive/MyDrive/Materiale_Pellegrino_personal/CIDDS_Meta/CIDDS_Meta.csv'
dataset = pd.read_csv(path)

### ***PRE-ELABORAZIONE DATI***

In [4]:
dataset

Unnamed: 0,Duration,Proto,Packets,Bytes,Flows,Flags,Tos,multilabel
0,0.245,TCP,2,670,1,.AP...,0,normal
1,0.000,TCP,1,66,1,.A....,0,normal
2,0.000,TCP,1,58,1,....S.,0,portScan
3,0.000,TCP,1,58,1,....S.,0,portScan
4,0.047,TCP,11,1027,1,.AP...,0,normal
...,...,...,...,...,...,...,...,...
399995,0.034,TCP,2,598,1,.AP...,0,normal
399996,0.000,TCP,1,95,1,.AP...,32,normal
399997,0.005,TCP,5,479,1,.AP.SF,0,dos
399998,0.000,TCP,1,66,1,.A...F,32,normal


In [5]:
print(Counter(dataset['Flows']))
print(Counter(dataset['multilabel']))

dataset = dataset.drop('Flows', axis=1)

Counter({1: 400000})
Counter({'normal': 243363, 'dos': 117904, 'portScan': 37723, 'pingScan': 646, 'bruteForce': 364})


In [6]:
dep_var = 'multilabel'
cat_names = ["Proto", "Flags", "Bytes"]
cont_names = [col for col in dataset.columns if col not in cat_names and col != dep_var]

print(cont_names, 'len: ', len(cont_names))

['Duration', 'Packets', 'Tos'] len:  3


In [7]:
# LabelEncoding della variabile target 
target_index = dataset.columns.get_loc(dep_var)
dataset.iloc[:, target_index] = LabelEncoder().fit_transform(dataset[dep_var])

#LabelEncoding delle variabili categoriali
for col in cat_names:
  target_index = dataset.columns.get_loc(col)
  dataset.iloc[:, target_index] = LabelEncoder().fit_transform(dataset[col])

In [9]:
# Indice e dim delle variabili categoriali per l'incorporamento

features = [col for col in dataset.columns if col != dep_var]

cat_idxs = [i for i, f in enumerate(features) if f in cat_names]

cat_dims = []
for col in cat_names:
  cat_dims.append(len(Counter(dataset[col])))

In [10]:
print(features)
print(cat_idxs)
print(cat_dims)

['Duration', 'Proto', 'Packets', 'Bytes', 'Flags', 'Tos']
[1, 3, 4]
[4, 20, 13414]


In [11]:
from sklearn.model_selection import train_test_split

# train 50% e test 50%
train, test = train_test_split(dataset, test_size=0.50)

In [12]:
y_train = train[dep_var]
train = train.drop(dep_var, axis=1)
y_test = test[dep_var]
test = test.drop(dep_var, axis=1)

# validation di 2500 righe da train
train, validation, y_train, y_val = train_test_split(train, y_train, test_size=(2500/len(train)), random_state=0)

In [13]:
train = np.array(train)
validation = np.array(validation)
test = np.array(test)

y_train = np.array(y_train)
y_val = np.array(y_val)

In [14]:
import pytorch_tabnet
from pytorch_tabnet.tab_model import TabNetClassifier
import torch

In [15]:
model = TabNetClassifier(n_d=64, n_a=64, n_steps=5,optimizer_fn=torch.optim.Adam, cat_idxs=cat_idxs, cat_dims=cat_dims,
                       optimizer_params=dict(lr=2e-2),
                       scheduler_params={"step_size":10, # how to use learning rate scheduler
                                         "gamma":0.9},
                       scheduler_fn=torch.optim.lr_scheduler.StepLR,
                       mask_type='entmax' # "sparsemax"
                      )

Device used : cpu


In [16]:
# fit the model 
model.fit(
    train,y_train,
    eval_set=[(train, y_train), (validation, y_val)],
    eval_name=['train', 'validation'],
    eval_metric=['accuracy'],
    max_epochs=500 , patience=50,
    batch_size=512, virtual_batch_size=128,
    num_workers=0,
    weights=1,
    drop_last=False
)  

IndexError: ignored

In [None]:
y_pred = model.predict(test)

In [None]:
# Matrice di confusione, accuracy, classification_report
from sklearn.metrics import *

# y_test è la variabile che contiene i valori effettivi
# y_pred contiene i valori predetti dal modello
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

acc = accuracy_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
# non presente nella libreria, calcolo mediante formula
f2 = (1+2**2)*((precision*recall)/((2**2*precision)+recall))

In [None]:
mcm = multilabel_confusion_matrix(y_test, y_pred)
print(mcm)

In [None]:
FP = cm.sum (axis = 0) - np.diag (cm) 
FN = cm.sum (axis = 1) - np.diag (cm) 
TP = np.diag (cm) 
TN = cm.sum () - (FP + FN + TP)

print('True positive: ', TP)
print('True negative: ', TN)
print('False positive: ', FP)
print('False negative: ', FN)

FP = FP.astype(float)
FN = FN.astype(float)
TP = TP.astype(float)
TN = TN.astype(float)

# Sensitivity, hit rate, recall, or true positive rate
TPR = TP/(TP+FN)
# Specificity or true negative rate
TNR = TN/(TN+FP) 
# Fall out or false positive rate
FPR = FP/(FP+TN)
# False negative rate
FNR = FN/(TP+FN)

print('True positive rate: ', TPR)
print('True negative rate: ', TNR)
print('False positive rate: ', FPR)
print('False negative rate: ', FNR)

In [None]:
print(cm)

In [None]:
print(report)

In [None]:
print('Accuracy: ', acc)
print('Precision_weighted: ', precision)
print('Recall_weighted: ', recall)
print('mcc: ', mcc)
print('f2: ', f2)