In [1]:
import os
# Lower TensorFlow's native (C++) log verbosity. The variable is set before
# `import tensorflow` below so it is already in the environment when the
# library loads. NOTE(review): level '1' is expected to hide INFO messages only.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import pickle
from tqdm.notebook import tqdm 
import pyarrow as pa
import pyarrow.parquet as pq
import torch
from sklearn.metrics import confusion_matrix
import seaborn as sns
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
# NOTE(review): train_test_split is already imported above; this repeat is harmless.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

import sys
# Add the parent directory to the module search path so that the project-local
# `utils` package imported on the next line can be resolved
# (presumably it lives one level above this notebook — confirm).
sys.path.append('..')
from utils.preprocess import NDF



# Modes of NDF
There are currently three modes in which NDF can be used:

## Classical NDF (if `dga` is not specified, the DGA mode is set to False)

In [2]:
# Classical mode: one benign and one malign parquet file, no `dga` argument.
input_data = dict(
    benign='../feature-extraction/floor/benign_2312.parquet',
    malign='../feature-extraction/floor/misp_2402.parquet',
)

# NOTE(review): the meaning of the two positional arguments ("xgboost", True)
# is defined by utils.preprocess.NDF and is not visible here — confirm there.
dataset = NDF(
    "xgboost",
    True,
    input_data=input_data,
    one_line_processing=False,
)

2024-05-25 22:03:10,031 - utils.preprocess - INFO - Benign dataset path: ../feature-extraction/floor/benign_2312.parquet
2024-05-25 22:03:10,036 - utils.preprocess - INFO - Malign dataset path: ../feature-extraction/floor/misp_2402.parquet
2024-05-25 22:03:18,146 - utils.preprocess - INFO - Number of records in combined dataset: 572503
2024-05-25 22:03:19,228 - utils.preprocess - INFO - Decision tree model saved to models/decision_tree_model.joblib
2024-05-25 22:03:19,281 - utils.preprocess - INFO - New feature 'dtree_prob' created from decision tree predictions.
2024-05-25 22:03:19,335 - utils.preprocess - INFO - Decision Tree Train Accuracy: 0.96
2024-05-25 22:03:19,339 - utils.preprocess - INFO - Decision Tree Test Accuracy: 0.92
2024-05-25 22:03:19,986 - utils.preprocess - INFO - Decision Tree Cross-Validation Scores: [0.91147197 0.90828092 0.9129979 ]
2024-05-25 22:03:19,999 - utils.preprocess - INFO - Generated class map: {'benign_2312:unknown': 0, 'misp_2310:phishing': 1, 'phish


Dataset Subset:
Name: dataset_../feature-extraction/floor/misp2402_2024-05-25.parquet
Features:
   Feature_0  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  \
0        0.0      0.125        0.2      0.000   0.000000   0.000000   
1        0.0      0.000        0.0      0.000   0.000000   0.000000   
2        0.0      0.250        0.0      0.000   0.000000   0.000000   
3        0.0      0.125        0.0      0.000   0.166667   0.000000   
4        0.0      0.125        0.0      0.375   0.166667   0.076923   
5        0.0      0.125        0.0      0.625   0.250000   0.076923   
6        0.0      0.250        0.0      0.000   0.000000   0.000000   
7        0.0      0.125        0.0      0.125   0.166667   0.153846   
8        0.0      0.250        0.4      0.000   0.166667   0.000000   
9        0.0      0.125        0.0      0.000   0.000000   0.000000   

   Feature_6  Feature_7  Feature_8  Feature_9  ...  Feature_161  Feature_162  \
0        0.0        0.0        0.0   0.00

## Binary NDF: 2 classes — benign domains and DGA domains

In [3]:
# Binary DGA mode: benign domains versus DGA domains.
input_data = dict(
    benign='../feature-extraction/floor/benign_2312.parquet',
    malign='../feature-extraction/floor/dga_2310.parquet',
)

# NOTE(review): the meaning of the two positional arguments ("xgboost", True)
# is defined by utils.preprocess.NDF and is not visible here — confirm there.
dataset = NDF(
    "xgboost",
    True,
    input_data=input_data,
    one_line_processing=False,
    dga="binary",
)

# Keep the class map returned with the dataset
# (presumably the label -> class-id mapping reported in the log — confirm).
class_map = dataset['class_map']

2024-05-25 22:03:22,175 - utils.preprocess - INFO - Benign dataset path: ../feature-extraction/floor/benign_2312.parquet
2024-05-25 22:03:22,184 - utils.preprocess - INFO - Malign dataset path: ../feature-extraction/floor/dga_2310.parquet
2024-05-25 22:03:23,728 - utils.preprocess - INFO - Number of records in combined dataset: 692262
2024-05-25 22:03:23,782 - utils.preprocess - INFO - Generated class map: {'benign_2312:unknown': 0, 'dga:pushdo:dga': 1, 'dga:virut:dga': 1, 'dga:necurs:dga': 1, 'dga:murofet:dga': 1, 'dga:dnschanger:dga': 1, 'dga:gameover:dga': 1, 'dga:dyre:dga': 1, 'dga:conficker:dga': 1, 'dga:murofetweekly:dga': 1, 'dga:cryptolocker:dga': 1, 'dga:mydoom:dga': 1, 'dga:qakbot:dga': 1, 'dga:qadars:dga': 1, 'dga:proslikefan:dga': 1, 'dga:suppobox:dga': 1, 'dga:gozi:dga': 1, 'dga:pykspa:dga': 1, 'dga:banjori:dga': 1, 'dga:corebot:dga': 1, 'dga:monerominer:dga': 1, 'dga:sphinx:dga': 1, 'dga:qsnatch:dga': 1, 'dga:ranbyus:dga': 1, 'dga:emotet:dga': 1, 'dga:chinad:dga': 1, 'dga


Dataset Subset:
Name: dataset_../feature-extraction/floor/dga2310_2024-05-25.parquet
Features:
   Feature_0  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  \
0   0.109375        0.0        0.0        0.0       0.00       0.10   
1   0.406250        1.0        0.0        0.0       0.00       0.15   
2   0.187500        0.0        0.0        0.0       0.00       0.30   
3   0.375000        1.0        0.0        0.0       0.25       0.45   
4   0.078125        0.0        0.0        0.0       0.00       0.15   
5   0.109375        0.0        0.0        0.0       0.00       0.10   
6   0.250000        0.0        0.0        0.0       0.00       0.15   
7   0.093750        0.0        0.0        0.0       0.00       0.15   
8   0.312500        0.0        0.0        0.0       0.25       0.30   
9   0.171875        0.0        0.0        0.0       0.25       0.30   

   Feature_6  Feature_7  Feature_8  Feature_9  ...  Feature_52  Feature_53  \
0   0.000000   0.990445   0.048780   1.00000

## Multiclass NDF: takes only one dataset; there are multiple classes, based on the number of DGA families

In [4]:
# Multiclass DGA mode: only the malign (DGA) parquet file is supplied.
input_data = dict(
    malign='../feature-extraction/floor/dga_2310.parquet',
)

# NOTE(review): the meaning of the two positional arguments ("xgboost", True)
# is defined by utils.preprocess.NDF and is not visible here — confirm there.
dataset = NDF(
    "xgboost",
    True,
    input_data=input_data,
    one_line_processing=False,
    dga="multiclass",
)

# Keep the class map returned with the dataset
# (presumably one entry per DGA family label — confirm).
class_map = dataset['class_map']

2024-05-25 22:03:24,515 - utils.preprocess - INFO - Malign dataset path: ../feature-extraction/floor/dga_2310.parquet
2024-05-25 22:03:25,037 - utils.preprocess - INFO - Number of records in combined dataset: 230070
2024-05-25 22:03:25,071 - utils.preprocess - INFO - Class counts: label
dga:murofet:dga          556
dga:virut:dga            484
dga:gameover:dga         406
dga:necurs:dga           245
dga:qakbot:dga            76
dga:cryptolocker:dga      50
dga:qsnatch:dga           45
dga:conficker:dga         45
dga:dyre:dga              37
dga:locky:dga             33
dga:ranbyus:dga           33
dga:pykspa:dga            32
dga:dnschanger:dga        27
dga:suppobox:dga          24
dga:monerominer:dga       22
dga:nymaim:dga            20
dga:corebot:dga           19
dga:chinad:dga            14
dga:gameoverp2p:dga       13
dga:pitou:dga             13
dga:qadars:dga            13
dga:pushdo:dga            11
dga:emotet:dga            10
dga:banjori:dga            9
dga:oderoor:dga 


Dataset Subset:
Name: dataset_../feature-extraction/floor/dga2310_2024-05-25.parquet
Features:
   Feature_0  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  \
0   0.342105        0.0        0.0        0.0   0.000000   0.166667   
1   0.210526        0.0        0.0        0.0   0.666667   0.166667   
2   0.657895        1.0        0.0        0.0   0.000000   0.166667   
3   0.315789        0.0        0.0        0.0   0.000000   0.000000   
4   0.368421        0.0        0.0        0.0   0.000000   0.166667   
5   0.421053        0.0        0.0        0.0   0.333333   0.166667   
6   0.105263        0.0        0.0        0.0   0.000000   0.166667   
7   0.394737        0.0        0.5        0.0   0.000000   0.333333   
8   0.657895        1.0        0.0        0.0   0.000000   0.166667   
9   0.342105        0.0        0.5        0.0   0.333333   0.333333   

   Feature_6  Feature_7  Feature_8  Feature_9  ...  Feature_52  Feature_53  \
0   0.000000   0.449602   0.333333   0.24674