In [1]:

from utils import load_config
from loadDataset import LoadDataset
from trainModule import TrainModule, TestModule
from fcgVectorize import FCGVectorize
import warnings
warnings.filterwarnings("ignore")
               

In [2]:
# Single configuration dict for the whole experiment; it is passed unchanged to
# LoadDataset, FCGVectorize and TrainModule in the next cell.
options = {
    # --- Raw dataset selection ------------------------------------------------
    "dataset": {
        "pack_filter": "diec",
        "cpu_arch": "x86_64",
        "reverse_tool": "ghidra",
        "raw": "malware_diec_ghidra_x86_64_fcg_dataset.csv",
        "split_by_cpu": False,
        # NOTE(review): presumably the families reserved for pre-training — confirm.
        "pretrain_family": ["gafgyt", "ngioweb", "mirai", "tsunami"],
    },
    # --- Pre-training stage ---------------------------------------------------
    "pretrain": {
        "name": "x86_pretrained",
        "use": True,
        "raw_dataset": "malware_diec_ghidra_x86_64_fcg_pretrain_dataset.csv",
        "batch_size": 128,
    },
    # --- Few-shot experiment settings -----------------------------------------
    "settings": {
        "name": "10way_5shot_ProtoNet_with_pretrain",
        "model": {
            "model_name": "GraphSAGE",
            "input_size": 128,
            "hidden_size": 128,
            "output_size": 128,
            "num_layers": 2,
            "projection": True,
            # NOTE(review): looks like a run folder inside
            # paths.model.pretrained_folder — confirm.
            "load_weights": "x86_pretrained_20241121_1653",
        },
        "train": {
            "training": True,
            "validation": True,
            "num_epochs": 500,
            # Hard-coded GPU index 1; adjust on hosts with a different device layout.
            "device": "cuda:1",
            "parallel": False,
            "parallel_device": [],
            "iterations": 100,
            "lr": 0.0005,
            "projection_lr": 0.001,
            # NOTE(review): step_size/gamma look like StepLR-style knobs while
            # patience/factor match ReduceLROnPlateau — confirm which are read.
            "lr_scheduler": {
                "use": True,
                "method": "ReduceLROnPlateau",
                "step_size": 20,
                "gamma": 0.5,
                "patience": 10,
                "factor": 0.5,
            },
            "early_stopping": {"use": True, "patience": 30},
            "loss": "CrossEntropyLoss",
            "distance": "euclidean",
            "optimizer": "AdamW",
            "save_model": True,
        },
        # Episode layout; train and test use the same values
        # (10 classes per iteration, 5 support shots, 15 query shots).
        "few_shot": {
            "method": "ProtoNet",
            "train": {"support_shots": 5, "query_shots": 15, "class_per_iter": 10},
            "test": {"support_shots": 5, "query_shots": 15, "class_per_iter": 10},
        },
        "vectorize": {
            "node_embedding_method": "word2vec",
            "node_embedding_size": 128,
            "num_workers": 4,
        },
        "seed": 10,
    },
    # --- Filesystem layout (all paths relative to the notebook's cwd) ---------
    "paths": {
        "data": {
            "fcg_dataset": "./dataset/data_ghidra_fcg",
            "csv_folder": "./dataset/raw_csv",
            "split_folder": "./dataset/split",
            "embedding_folder": "./embeddings",
            "pretrain_dataset": "./dataset/data_ghidra_fcg_pretrain",
        },
        "model": {
            "model_folder": "./checkpoints",
            "pretrained_folder": "./pretrained",
        },
    },
}



In [3]:
# Build the pipeline objects in dependency order. On a first run this cell is
# slow (the recorded progress bars total several minutes).

# Load the dataset; per the recorded log it also creates the train/test/val
# split when none exists yet.
dataset = LoadDataset(options)

# Per the recorded log: collects opcode sets/sentences, trains word2vec and
# writes node embeddings under the embedding folder.
vectorize = FCGVectorize(options, dataset)
vectorize.node_embedding(dataset.rawDataset)

# Per the recorded log: loads train/val data, loads pretrained weights and
# prints the model, loss function and optimizer.
train = TrainModule(options, dataset)


Loading all datasets...
Split dataset for train does not exist, creating split dataset...
train dataset shape: (1180, 16)
train dataset family number: 59
test dataset shape: (200, 16)
test dataset family number: 10
val dataset shape: (200, 16)
val dataset family number: 10
Training opcodeSet & Sentence not exist, start to get opcodeSet & Sentence...
Start to get OpcodeSet & Sentence...


100%|██████████| 1180/1180 [00:55<00:00, 21.41it/s]


Save opcodeSet to ./embeddings/x86_64_withVal_withPretrain_ghidra_10/word2vec/opcodeSet.pkl
Save sentence to ./embeddings/x86_64_withVal_withPretrain_ghidra_10/word2vec/opcodeSentences.pkl
Finish getting opcodeSet & Sentence
Number of opcodeSet:  573
Training word2vec model...
Finish training word2vec model, save word2vec model to ./embeddings/x86_64_withVal_withPretrain_ghidra_10/word2vec
Start to get node embedding...


100%|██████████| 1580/1580 [05:36<00:00,  4.70it/s]


Finish getting node embedding
Setting up the training module...
Loading data from ./embeddings/x86_64_withVal_withPretrain_ghidra_10/word2vec...
Loading training data...


100%|██████████| 1180/1180 [02:43<00:00,  7.24it/s]


Saving data to ./embeddings/x86_64_withVal_withPretrain_ghidra_10/word2vec/trainData.pkl
Loading validation data...


100%|██████████| 200/200 [00:20<00:00,  9.82it/s]


Saving data to ./embeddings/x86_64_withVal_withPretrain_ghidra_10/word2vec/valData.pkl
Model loaded from ./pretrained/x86_pretrained_20241121_1653/epoch_2060_best_backbone.pth
Device: cuda:1
Model: GraphSAGE(
  (sage_convs): ModuleList(
    (0-1): 2 x SAGEConv(128, 128, aggr=mean)
  )
  (norms): ModuleList(
    (0-1): 2 x BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (output_proj): Sequential(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
)
Loss function: <loss.ProtoLoss object at 0x7ffa29feebd0>
Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01

Parameter Group 1
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    different