In [1]:
from src.data import CICIDS2017, balanced_sample, TabPFNDataGenerator
from src.models import TabNetModel, TabPFNModel
from src.models import PreConfigured_LogisticRegression, PreConfigured_RandomForest, PreConfigured_LinearSVC, PreConfigured_DecisionTree, PreConfigured_KNeighbors
from src.pipelines import TTPipeline, plot_accuracies

import logging
# Configure the root logger: DEBUG threshold, "timestamp - level - message" line format.
# NOTE(review): a DEBUG threshold on the root logger also lets through debug records
# emitted by imported libraries — confirm this verbosity is wanted in the notebook output.
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

### Data

In [2]:
# Build the CIC-IDS2017 dataset object with the pca and classes_mapping options
# switched off, load it, and split it with test_size=0.3.
dataset = CICIDS2017(pca=False, classes_mapping=False)
dataset.load()
# dataset.balance_(n=2000, category_col="Label")
train, test = dataset.train_test_split(test_size=0.3)

# For each split, separate the feature matrix (every column except "Label")
# from the target vector (the "Label" column), both as plain arrays.
(X_train, y_train), (X_test, y_test) = (
    (split.drop(columns=["Label"]).values, split["Label"].values)
    for split in (train, test)
)

In [3]:
# Row count per "Label" value in the training split (shows how imbalanced the classes are).
train["Label"].value_counts()

Label
BENIGN                       1467210
DoS Hulk                      121248
DDoS                           89614
PortScan                       63577
DoS GoldenEye                   7272
FTP-Patator                     4195
DoS slowloris                   3769
DoS Slowhttptest                3631
SSH-Patator                     2302
Bot                             1361
Web Attack  Brute Force          979
Web Attack  XSS                  447
Infiltration                      23
Web Attack  Sql Injection         18
Heartbleed                         7
Name: count, dtype: int64

In [4]:
# Row count per "Label" value in the test split, for comparison with the training split.
test["Label"].value_counts()

Label
BENIGN                       629274
DoS Hulk                      51601
DDoS                          38402
PortScan                      27242
DoS GoldenEye                  3014
FTP-Patator                    1738
DoS slowloris                  1616
DoS Slowhttptest               1597
SSH-Patator                     917
Bot                             592
Web Attack  Brute Force         491
Web Attack  XSS                 205
Infiltration                     13
Heartbleed                        4
Web Attack  Sql Injection         3
Name: count, dtype: int64

### Models

#### Preparing

In [5]:
# Create one model object per algorithm compared in this notebook.
# Each object is wrapped in a TTPipeline in its own section below.
logreg = PreConfigured_LogisticRegression()
svc = PreConfigured_LinearSVC()
randomforest = PreConfigured_RandomForest()
kneighbors = PreConfigured_KNeighbors()
decision_tree = PreConfigured_DecisionTree()
# NOTE(review): pretrain=True presumably enables a pre-training stage inside
# TabNetModel — confirm in src.models.
tabnet = TabNetModel(pretrain=True)
tabpfn = TabPFNModel()



#### Training

##### Logistic Regression

In [None]:
# Wrap the logistic-regression model in a TTPipeline.
logreg_pl = TTPipeline(logreg)
# Fit on the training split; cv=5 presumably requests 5-fold cross-validation —
# confirm in TTPipeline.train.
logreg_pl.train(X_train, y_train, cv=5)
# Score on the test split; the result is indexed by metric name in the next cell.
logreg_results = logreg_pl.evaluate(X_test, y_test)
# save() is called on the model object itself, not on the pipeline.
logreg.save()

In [None]:
# Print the logistic-regression test metrics, one "<label> <value>" line each.
for metric_label, metric_key in (
    ("Precision:", "precision"),
    ("Recall:", "recall"),
    ("F1-Score:", "f1_score"),
):
    print(metric_label, logreg_results[metric_key])

##### Support Vector Machine

In [None]:
# Wrap the LinearSVC model in a TTPipeline.
svc_pl = TTPipeline(svc)
# Fit on the training split; cv=5 presumably requests 5-fold cross-validation —
# confirm in TTPipeline.train.
svc_pl.train(X_train, y_train, cv=5)
# Score on the test split; the result is indexed by metric name in the next cell.
svc_results = svc_pl.evaluate(X_test, y_test)
# save() is called on the model object itself, not on the pipeline.
svc.save()

In [None]:
# Print the LinearSVC test metrics, one "<label> <value>" line each.
for metric_label, metric_key in (
    ("Precision:", "precision"),
    ("Recall:", "recall"),
    ("F1-Score:", "f1_score"),
):
    print(metric_label, svc_results[metric_key])

##### Random Forest

In [None]:
# Wrap the random-forest model in a TTPipeline.
randomforest_pl = TTPipeline(randomforest)
# Fit on the training split; cv=5 presumably requests 5-fold cross-validation —
# confirm in TTPipeline.train.
randomforest_pl.train(X_train, y_train, cv=5)
# Score on the test split; the result is indexed by metric name in the next cell.
randomforest_results = randomforest_pl.evaluate(X_test, y_test)
# save() is called on the model object itself, not on the pipeline.
randomforest.save()

In [None]:
# Print the random-forest test metrics, one "<label> <value>" line each.
for metric_label, metric_key in (
    ("Precision:", "precision"),
    ("Recall:", "recall"),
    ("F1-Score:", "f1_score"),
):
    print(metric_label, randomforest_results[metric_key])

##### K-Neighbors

In [None]:
# Wrap the k-nearest-neighbours model in a TTPipeline.
kneighbors_pl = TTPipeline(kneighbors)
# Fit on the training split; cv=5 presumably requests 5-fold cross-validation —
# confirm in TTPipeline.train.
kneighbors_pl.train(X_train, y_train, cv=5)
# Score on the test split; the result is indexed by metric name in the next cell.
kneighbors_results = kneighbors_pl.evaluate(X_test, y_test)
# save() is called on the model object itself, not on the pipeline.
kneighbors.save()

In [None]:
# Print the k-nearest-neighbours test metrics, one "<label> <value>" line each.
for metric_label, metric_key in (
    ("Precision:", "precision"),
    ("Recall:", "recall"),
    ("F1-Score:", "f1_score"),
):
    print(metric_label, kneighbors_results[metric_key])

##### Decision Tree

In [None]:
# Wrap the decision-tree model in a TTPipeline.
decision_tree_pl = TTPipeline(decision_tree)
# Fit on the training split; cv=5 presumably requests 5-fold cross-validation —
# confirm in TTPipeline.train.
decision_tree_pl.train(X_train, y_train, cv=5)
# Score on the test split; the result is indexed by metric name in the next cell.
decision_tree_results = decision_tree_pl.evaluate(X_test, y_test)
# save() is called on the model object itself, not on the pipeline.
decision_tree.save()

In [None]:
# Print the decision-tree test metrics, one "<label> <value>" line each.
for metric_label, metric_key in (
    ("Precision:", "precision"),
    ("Recall:", "recall"),
    ("F1-Score:", "f1_score"),
):
    print(metric_label, decision_tree_results[metric_key])

##### TabNet

In [None]:
# Wrap the TabNet model in a TTPipeline.
tabnet_pl = TTPipeline(tabnet)
# Unlike the classical models, train() here also receives the test split and an
# augmentation flag instead of cv.
# NOTE(review): if X_test/y_test are used inside train() as a validation or
# early-stopping set, the metrics from evaluate() on the same split below are
# optimistic — confirm in TTPipeline.train / TabNetModel.
tabnet_pl.train(X_train, y_train, X_test, y_test, augmentation=True)
# Score on the test split; the result is indexed by metric name in the next cell.
tabnet_results = tabnet_pl.evaluate(X_test, y_test)
# save() is called on the model object itself, not on the pipeline.
tabnet.save()

In [None]:
# Print the TabNet test metrics, one "<label> <value>" line each.
for metric_label, metric_key in (
    ("Accuracy:", "accuracy"),
    ("Precision:", "precision"),
    ("Recall:", "recall"),
    ("F1-Score:", "f1_score"),
):
    print(metric_label, tabnet_results[metric_key])

In [None]:
# Render the plots provided by the TabNet wrapper (presumably training-history
# curves — see TabNetModel.plot_metrics).
tabnet.plot_metrics()

##### TabPFN

In [6]:
# Draw a reduced training frame with balanced_sample(frame, label column, 2000).
# NOTE(review): 2000 is presumably the per-class sample cap — confirm in
# src.data.balanced_sample.
smaller_train = balanced_sample(train, "Label", 2000)

# Features (every column except "Label") and targets of the reduced frame.
new_x, new_y = (
    smaller_train.drop(columns=["Label"]).values,
    smaller_train["Label"].values,
)

In [None]:
# Augment samples
# The meaning of the positional 100 and of threshold=0.25 is defined by
# TabPFNDataGenerator.generate.
# NOTE(review): in the cells visible here, TabPFN is trained on new_x/new_y, so
# augmented_new_x/augmented_new_y are never consumed — confirm whether the
# training cell was meant to use the augmented arrays.
generator = TabPFNDataGenerator()
augmented_new_x, augmented_new_y = generator.generate(100, new_x, new_y, threshold=0.25)

In [None]:
# Row count per "Label" value in the reduced training frame, to check the balancing.
smaller_train["Label"].value_counts()

In [8]:
# Wrap TabPFN in a TTPipeline and fit it on the reduced, balanced subset
# (new_x/new_y) rather than on the full training split.
tabpfn_pl = TTPipeline(tabpfn)
tabpfn_pl.train(new_x, new_y)



In [9]:
# Evaluate TabPFN on the full test split, then save the model.
# save() sits in `finally` so that it still runs when evaluation is interrupted
# or raises: a long evaluation stopped with KeyboardInterrupt would otherwise
# skip the save entirely. On success the order is unchanged (evaluate, then save),
# and any exception from evaluate() still propagates after the save.
try:
    tabpfn_results = tabpfn_pl.evaluate(X_test, y_test)
finally:
    tabpfn.save()

KeyboardInterrupt: 

In [None]:
# Print the TabPFN test metrics, one "<label> <value>" line each.
for metric_label, metric_key in (
    ("Accuracy:", "accuracy"),
    ("Precision:", "precision"),
    ("Recall:", "recall"),
    ("F1-Score:", "f1_score"),
):
    print(metric_label, tabpfn_results[metric_key])

#### From loaded

In [None]:
# Evaluate a previously saved logistic-regression model instead of retraining it.
# NOTE(review): "..." looks like a placeholder to be replaced with the saved
# model's name before running; presumably load() uses `name` to locate the
# artifact — confirm in src.models.
logreg.name = "..."
logreg.load()
logreg_pl = TTPipeline(logreg)
logreg_results = logreg_pl.evaluate(X_test, y_test)

In [None]:
# Evaluate a previously saved LinearSVC model instead of retraining it.
# No `name` is assigned here before load(), unlike the logistic-regression cell.
svc.load()
svc_pl = TTPipeline(svc)
svc_results = svc_pl.evaluate(X_test, y_test)
# Cell output: the accuracy entry of the evaluation results.
svc_results["accuracy"]

In [None]:
# Evaluate a previously saved random-forest model instead of retraining it.
# No `name` is assigned here before load(), unlike the logistic-regression cell.
randomforest.load()
randomforest_pl = TTPipeline(randomforest)
randomforest_results = randomforest_pl.evaluate(X_test, y_test)
# Cell output: the accuracy entry of the evaluation results.
randomforest_results["accuracy"]

In [None]:
# Evaluate a previously saved k-nearest-neighbours model instead of retraining it.
# NOTE(review): "..." looks like a placeholder to be replaced with the saved
# model's name before running; presumably load() uses `name` to locate the
# artifact — confirm in src.models.
kneighbors.name = "..."
kneighbors.load()
kneighbors_pl = TTPipeline(kneighbors)
kneighbors_results = kneighbors_pl.evaluate(X_test, y_test)

In [None]:
# Evaluate a previously saved decision-tree model instead of retraining it.
# NOTE(review): "..." looks like a placeholder to be replaced with the saved
# model's name before running; presumably load() uses `name` to locate the
# artifact — confirm in src.models.
decision_tree.name = "..."
decision_tree.load()
decision_tree_pl = TTPipeline(decision_tree)
decision_tree_results = decision_tree_pl.evaluate(X_test, y_test)

In [None]:
# Evaluate a previously saved TabNet model instead of retraining it.
# NOTE(review): "..." looks like a placeholder to be replaced with the saved
# model's name before running; presumably load() uses `name` to locate the
# artifact — confirm in src.models.
tabnet.name = "..."
tabnet.load()
tabnet_pl = TTPipeline(tabnet)
tabnet_results = tabnet_pl.evaluate(X_test, y_test)

In [None]:
# Evaluate a previously saved TabPFN model instead of re-fitting it.
# No `name` is assigned here before load(), unlike the logistic-regression cell.
tabpfn.load()
tabpfn_pl = TTPipeline(tabpfn)
tabpfn_results = tabpfn_pl.evaluate(X_test, y_test)
# Cell output: TabPFN's own accuracy. The results just computed in this cell are
# displayed, not TabNet's results from the cell above.
tabpfn_results["accuracy"]

#### Performance

In [None]:
# Display name of each model -> name of the notebook variable that holds the
# results of that model's evaluate() call. Keeping both in one mapping means the
# plotted names and values cannot drift out of step with each other.
RESULT_VARIABLES = {
    "Logistic Regression": "logreg_results",
    "SVM": "svc_results",
    "Random Forest": "randomforest_results",
    "KNeighbors": "kneighbors_results",
    "Decision Tree": "decision_tree_results",
    "TabNet": "tabnet_results",
    "TabPFN": "tabpfn_results",
}


def collect_accuracies(namespace, result_variables=None):
    """Gather the accuracy of every model that has results in ``namespace``.

    Args:
        namespace: Mapping of variable name -> value, e.g. ``globals()``.
        result_variables: Mapping of display name -> results variable name.
            Defaults to ``RESULT_VARIABLES``.

    Returns:
        A ``(accuracies, models_names)`` pair of equally long lists, ordered as
        in ``result_variables``. Models whose results variable is absent from
        ``namespace`` (not trained or loaded in this session) are skipped.

    Raises:
        KeyError: If a results object that is present has no ``"accuracy"`` entry.
    """
    if result_variables is None:
        result_variables = RESULT_VARIABLES

    names = [
        display_name
        for display_name, variable in result_variables.items()
        if variable in namespace
    ]
    values = [namespace[result_variables[display_name]]["accuracy"] for display_name in names]
    return values, names


# Only models evaluated in this session are plotted, so the cell works after a
# partial run instead of requiring entries to be commented in and out by hand.
accuracies, models_names = collect_accuracies(globals())

In [None]:
# Plot the accuracy comparison from the two parallel lists built in the previous
# cell (values first, model names second); the return value is kept in accuracy_plot.
accuracy_plot = plot_accuracies(accuracies, models_names)