In [2]:
from utils import load_config
from loadDataset import LoadDataset
from trainModule import TrainModule, TestModule
from fcgVectorize import FCGVectorize
import warnings
warnings.filterwarnings("ignore")
               

In [3]:
# Experiment configuration for the "10way_5shot_ProtoNet_with_pretrain" run
# (GraphSAGE backbone initialised from the "x86_pretrained_20241121_1653" weights).
options = dict(
    # --- dataset selection ---
    dataset=dict(
        pack_filter="diec",
        cpu_arch="x86_64",
        reverse_tool="ghidra",
        raw="malware_diec_ghidra_x86_64_fcg_dataset.csv",
        split_by_cpu=False,
        pretrain_family=["gafgyt", "ngioweb", "mirai", "tsunami"],
    ),
    # --- pretraining stage ---
    pretrain=dict(
        name="x86_pretrained",
        use=True,
        raw_dataset="malware_diec_ghidra_x86_64_fcg_pretrain_dataset.csv",
        batch_size=128,
    ),
    # --- few-shot experiment settings ---
    settings=dict(
        name="10way_5shot_ProtoNet_with_pretrain",
        model=dict(
            model_name="GraphSAGE",
            input_size=128,
            hidden_size=128,
            output_size=128,
            num_layers=2,
            projection=True,
            load_weights="x86_pretrained_20241121_1653",
        ),
        train=dict(
            training=True,
            validation=True,
            num_epochs=500,
            device="cuda:0",
            parallel=False,
            parallel_device=[],
            iterations=100,
            lr=0.0005,
            projection_lr=0.001,
            # Holds keys for more than one scheduler flavour (step_size/gamma and
            # patience/factor); presumably only those matching `method` are read
            # -- TODO confirm in trainModule.
            lr_scheduler=dict(
                use=True,
                method="ReduceLROnPlateau",
                step_size=20,
                gamma=0.5,
                patience=10,
                factor=0.5,
            ),
            early_stopping=dict(use=True, patience=30),
            loss="CrossEntropyLoss",
            distance="euclidean",
            optimizer="AdamW",
            save_model=True,
        ),
        # Episode layout: 10 classes per iteration, 5 support / 15 query shots,
        # identical for training and testing.
        few_shot=dict(
            method="ProtoNet",
            train=dict(support_shots=5, query_shots=15, class_per_iter=10),
            test=dict(support_shots=5, query_shots=15, class_per_iter=10),
        ),
        vectorize=dict(
            node_embedding_method="word2vec",
            node_embedding_size=128,
            num_workers=4,
        ),
        seed=7,
    ),
    # --- filesystem locations ---
    paths=dict(
        data=dict(
            fcg_dataset="./dataset/data_ghidra_fcg",
            csv_folder="./dataset/raw_csv",
            split_folder="./dataset/split",
            # NOTE(review): absolute, machine-specific path, unlike the relative
            # paths around it -- consider making it configurable.
            embedding_folder="/mnt/ssd2t/mandy/Projects/few_shot_fcg/embeddings/",
            pretrain_dataset="./dataset/data_ghidra_fcg_pretrain",
        ),
        model=dict(
            model_folder="./checkpoints",
            pretrained_folder="./pretrained",
        ),
    ),
)

In [4]:
# Build the experiment objects from `options`. Later cells read `train`
# (e.g. `train.valGraph`), so these notebook-level names must stay as they are.
# Per the captured output of this cell: LoadDataset reports the train/test/val
# split shapes and family counts.
dataset = LoadDataset(options)
# The vectorizer is built from the same options and the loaded dataset.
vectorize = FCGVectorize(options, dataset)
# Per the captured output: an existing word2vec model is loaded and node
# embeddings are produced for the samples in `dataset.rawDataset`.
# NOTE(review): presumably this has to run before TrainModule, which loads the
# pickled embedding data from the embedding folder -- confirm before reordering.
vectorize.node_embedding(dataset.rawDataset)
# Per the captured output: loads trainData.pkl / valData.pkl, restores the
# pretrained backbone checkpoint, and prints the device, model, loss and optimizer.
train = TrainModule(options, dataset)


Loading all datasets...
train dataset shape: (1180, 16)
train dataset family number: 59
test dataset shape: (200, 16)
test dataset family number: 10
val dataset shape: (200, 16)
val dataset family number: 10
Word2vec model exist, load word2vec model...
Start to get node embedding...


100%|██████████| 1580/1580 [00:00<00:00, 20311.96it/s]

Finish getting node embedding
Setting up the training module...
Loading data from /mnt/ssd2t/mandy/Projects/few_shot_fcg/embeddings/x86_64_withVal_withPretrain_ghidra_7/word2vec...
Loading training data...
Loading data from /mnt/ssd2t/mandy/Projects/few_shot_fcg/embeddings/x86_64_withVal_withPretrain_ghidra_7/word2vec/trainData.pkl...





Loading validation data...
Loading data from /mnt/ssd2t/mandy/Projects/few_shot_fcg/embeddings/x86_64_withVal_withPretrain_ghidra_7/word2vec/valData.pkl...
Model loaded from ./pretrained/x86_pretrained_20241121_1653/epoch_2060_best_backbone.pth
Device: cuda:0
Model: GraphSAGE(
  (sage_convs): ModuleList(
    (0-1): 2 x SAGEConv(128, 128, aggr=mean)
  )
  (norms): ModuleList(
    (0-1): 2 x BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (output_proj): Sequential(
    (0): Linear(in_features=128, out_features=128, bias=True)
    (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
)
Loss function: <loss.ProtoLoss object at 0x738fdf173f50>
Optimizer: AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.001
    maximize: False
    weight_decay: 0.01

Parameter Group 1
    amsgr

In [8]:
import torch

# Sanity check: report every validation graph whose node-feature matrix `data.x`
# is not float32. For each offending graph this prints one sample row, the dtype,
# the graph object and its index in `train.valGraph`.
#
# A tensor has a single dtype shared by all of its rows, so the dtype is checked
# once per graph instead of once per row; row 0 is shown as the sample. Graphs
# with no rows are skipped because there is nothing to show.
for i, data in enumerate(train.valGraph):
    features = data.x
    if features.dtype == torch.float32 or len(features) == 0:
        continue
    print(features[0])
    print(features.dtype)
    print(data)
    print(i)



# for i,data in enumerate(train.valLoader):
#     data = data.to(train.device)
#     with torch.no_grad():
#         model_output = train.model(data)
#         loss, acc = train.loss_fn(model_output, data.y)
#     print(data.y)

    

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])
torch.int64
Data(x=[13, 128], edge_index=[2, 0], label=[13], name='code', y=9)
194
