## Загрузка датасета

In [360]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [361]:
%cd /content/drive/My Drive/Colab Notebooks/Diplom2024/Code/dataset

/content/drive/My Drive/Colab Notebooks/Diplom2024/Code/dataset


In [362]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
import pandas as pd


In [363]:
dataset = pd.read_csv("syscalls_dataset.csv", delimiter=";")

In [364]:
dataset

Unnamed: 0,last syscalls,syscall
0,"execvesyscall(""/usr/bin/uname"", [""uname""], 0x7...","openatsyscall(AT_FDCWD, ""/etc/ld.so.cache"", O_..."
1,brksyscall(NULL)syscall0x55f6bffa6000last_n_sy...,"newfstatatsyscall(3, """", {st_mode=S_IFREG|0644..."
2,"arch_prctlsyscall(0x3001 /* ARCH_??? */, 0x7ff...","mmapsyscall(NULL, 108635, PROT_READ, MAP_PRIVA..."
3,"mmapsyscall(NULL, 8192, PROT_READ|PROT_WRITE, ...",closesyscall(3)syscall0
4,"accesssyscall(""/etc/ld.so.preload"", R_OK)sysca...","openatsyscall(AT_FDCWD, ""/lib/x86_64-linux-gnu..."
5,"openatsyscall(AT_FDCWD, ""/etc/ld.so.cache"", O_...","readsyscall(3, ""\177ELF\2\1\1\3\0\0\0\0\0\0\0\..."
6,"newfstatatsyscall(3, """", {st_mode=S_IFREG|0644...","pread64syscall(3, ""\6\0\0\0\4\0\0\0@\0\0\0\0\0..."
7,"mmapsyscall(NULL, 108635, PROT_READ, MAP_PRIVA...","pread64syscall(3, ""\4\0\0\0 \0\0\0\5\0\0\0GNU\..."
8,closesyscall(3)syscall0last_n_syscallsopenatsy...,"pread64syscall(3, ""\4\0\0\0\24\0\0\0\3\0\0\0GN..."
9,"openatsyscall(AT_FDCWD, ""/lib/x86_64-linux-gnu...","newfstatatsyscall(3, """", {st_mode=S_IFREG|0755..."


## Выбор девайса

In [365]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cpu device


In [366]:
from collections import Counter, defaultdict

import numpy as np
import pandas as pd
import torch
from torch.nn import Embedding, Linear, LSTM, Module
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from tqdm import tqdm
import re

## Разбиение данных на токены

In [367]:
import re

def tokenizer(syscalls):
    syscalls = syscalls.split(sep='last_n_syscalls')
    tokens = []
    for syscall in syscalls:
        str = syscall.split(sep='syscall')

        str = [s if len(s) > 0 else '<pad>' for s in str]

        # print(40*'-')
        # print('arguments: ', str[1])
        # print('res: ', str[2])
        # print(40*'-')

        tokens.extend(['syscallname',str[0], 'syscallargs', str[1], 'syscallres', str[2], 'syscallend'])

    return tokens

In [368]:
print(tokenizer('namesyscallsyscall3289last_n_syscallsnamesyscall(wekj)syscall3289'))

['syscallname', 'name', 'syscallargs', '<pad>', 'syscallres', '3289', 'syscallend', 'syscallname', 'name', 'syscallargs', '(wekj)', 'syscallres', '3289', 'syscallend']


In [369]:
# dataset['tokens'] = dataset.apply(lambda row: tokenizer(row['last syscalls']), axis=1)

## Преобразование токены в коды

In [370]:
from collections import Counter

In [371]:
def token_to_sequence(tokens, token2index):
    seq = []
    for token in tokens:
        index = token2index.get(token, 0) # 0 означает неизвестное слово
        # Неизвестные слова не добавляем в выходную последовательность
        if index != 1:
            seq.append(index)
    return seq

## Класс данных системных вызовов для удобного обучения

In [372]:
class SyscallDataset(Dataset):
    def __init__(self, dataset, token_size=15000):
        self.dataset = dataset
        self.token_size = token_size

        # Разбиваем системные вызовы на токены
        self.last_syscalls_tokens = [tokenizer(self.dataset.iloc[i]['last syscalls']) for i in range(len(self.dataset))]
        self.syscall_tokens = [tokenizer(self.dataset.iloc[i]['syscall']) for i in range(len(self.dataset))]


        # Вычисляем наиболее встречающиеся токены и индексируем их
        tokens_counter = Counter()
        for syscalls_token in self.last_syscalls_tokens:
            tokens_counter.update(syscalls_token)

        most_common_token2index = {
            token[0]: i + 2
            for i, token in enumerate(tokens_counter.most_common()[: (token_size - 2)])
        }

        # Словарь, отображающий токены в коды
        self.token2index = dict()
        # Словарь, отображающий коды в токены
        self.index2token = dict()


        self.token2index.update(most_common_token2index)
        self.token2index["<unk>"] = 0
        self.token2index["<pad>"] = 1


        self.index2token = {v: k for k, v in self.token2index.items()}

        l = min(len(self.index2token), token_size)
        self.vocabulary = [self.index2token[i] for i in range(l)]


    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, ix):
        X = torch.LongTensor(
            token_to_sequence(self.last_syscalls_tokens[ix], self.token2index)
        )

        y = token_to_sequence(self.syscall_tokens[ix], self.token2index)

        return X, y[1]


## Класс нейронной сети системных вызовов

In [373]:
class SyscallNetwork(Module):
    def __init__(
        self,
        token_size=15000,
        embedding_dim=2,
        hidden_dim_1=8,
        hidden_dim_2=32,
        max_norm=2,
        n_layers=1,
    ):
        super().__init__()

        self.embedding = Embedding(
                token_size,
                embedding_dim,
                padding_idx=0,
                norm_type=2,
                max_norm=max_norm,
        )
        self.lstm = LSTM(
                embedding_dim, hidden_dim_1, batch_first=True, num_layers=n_layers
        )
        self.linear_1 = Linear(hidden_dim_1, hidden_dim_2)
        self.linear_2 = Linear(hidden_dim_2, token_size)
        self.sm = torch.nn.Softmax(dim=1)


    def forward(self, x, h=None, c=None):
        emb = self.embedding(x)  # (n_samples, window_size, embedding_dim)
        if h is not None and c is not None:
            _, (h, c) = self.lstm(emb, (h, c))
        else:
            _, (h, c) = self.lstm(emb)  # (n_layers, n_samples, hidden_dim)

        h_mean = h.mean(dim=0)  # (n_samples, hidden_dim)
        x = self.linear_1(h_mean)  # (n_samples, dense_dim)
        pred = self.linear_2(x)  # (n_samples, token_size)

        return pred, h, c

def compute_loss_and_accuracy(loss, net, dataloader):
    net.eval()
    all_losses = []
    all_accuracy = []

    for X_batch, y_batch in dataloader:
        pred, _, _ = net(X_batch)

        all_losses.append(loss(pred, y_batch).item())

        print()
        print(80*'-')
        print('pred ', pred)
        print(80*'-')
        print(80*'-')
        # print('y_batch ', y_batch.argmax(dim=1))
        print('y_batch ', y_batch)
        print(80*'-')

        # all_accuracy.append((pred == y_batch.argmax(dim=1)).float())

    return np.mean(all_losses), np.mean(all_accuracy)


In [382]:
syscalldataset = SyscallDataset(dataset)

In [384]:
syscalldataset.index2token

{2: 'syscallname',
 3: 'syscallargs',
 4: 'syscallres',
 5: 'syscallend',
 6: '0',
 7: 'mmap',
 8: 'pread64',
 9: 'mprotect',
 10: 'newfstatat',
 11: 'close',
 12: 'openat',
 13: '3',
 14: '(3)',
 15: 'brk',
 16: '(3, "\\6\\0\\0\\0\\4\\0\\0\\0@\\0\\0\\0\\0\\0\\0\\0@\\0\\0\\0\\0\\0\\0\\0@\\0\\0\\0\\0\\0\\0\\0"..., 784, 64)',
 17: '784',
 18: 'arch_prctl',
 19: '-1',
 20: '(NULL)',
 21: '0x55f6bffa6000',
 22: 'access',
 23: '("/etc/ld.so.preload", R_OK)',
 24: '(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC)',
 25: '(3, "", {st_mode=S_IFREG|0644, st_size=108635, ...}, AT_EMPTY_PATH)',
 26: '(NULL, 108635, PROT_READ, MAP_PRIVATE, 3, 0)',
 27: '0x7fcbef6e3000',
 28: '(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC)',
 29: 'read',
 30: '(3, "\\177ELF\\2\\1\\1\\3\\0\\0\\0\\0\\0\\0\\0\\0\\3\\0>\\0\\1\\0\\0\\0P\\237\\2\\0\\0\\0\\0\\0"..., 832)',
 31: '832',
 32: '(3, "\\4\\0\\0\\0 \\0\\0\\0\\5\\0\\0\\0GNU\\0\\2\\0\\0\\300\\4\\0\\0\\0\\3\\0\\0\\0\\0\\0\\0\\0"..., 48, 848)',


#### Задание гиперпараметров

In [374]:
token_size = 15000 # максимальное количество токенов
embedding_dim = 2
hidden_dim_1 = 32
hidden_dim_2 = 64
n_layers = 1 # кол-во повторяющихся слоев в LSTM
max_norm = 2

n_epochs = 100
train_val_split = 0.8
batch_size = 128

#### Функция потерь

In [375]:
loss_f = torch.nn.CrossEntropyLoss()

## Создание тренировочного и валидационного датасета

In [376]:
syscalldataset = SyscallDataset(dataset, token_size)


n_samples = len(dataset)
split_ix = int(n_samples * train_val_split)

train_indices, val_indices = np.arange(split_ix), np.arange(split_ix, n_samples)

train_dataloader = DataLoader(
        syscalldataset, sampler=SubsetRandomSampler(train_indices),
        batch_size=batch_size
)
val_dataloader = DataLoader(
        syscalldataset, sampler=SubsetRandomSampler(val_indices),
        batch_size=batch_size
)

In [377]:
syscalldataset[0]

(tensor([ 2, 84,  3, 85,  4,  6,  5,  2, 15,  3, 20,  4, 21,  5,  2, 18,  3, 79,
          4, 19,  5,  2,  7,  3, 76,  4, 77,  5,  2, 22,  3, 23,  4, 19,  5]),
 12)

In [378]:
train_dataloader.dataset

<__main__.SyscallDataset at 0x7e29d012ecb0>

## Создание нейронной сети и оптимайзера


In [379]:
net = SyscallNetwork(
        token_size,
        hidden_dim_1=hidden_dim_1,
        n_layers=n_layers,
        hidden_dim_2=hidden_dim_2,
        embedding_dim=embedding_dim,
        max_norm=max_norm,
)

optimizer = torch.optim.Adam(
        net.parameters(),
        lr=1e-2,
)


## Обучение нейронной сети

In [380]:
c = 0

for i in range(len(train_dataloader.dataset)):
    if len(train_dataloader.dataset[i][0]) != 35:
        c += 1
        # print(80*'-')
        # print(len(train_dataloader.dataset[i][0]))
        # print(train_dataloader.dataset[i][0].numpy())
        # for token in train_dataloader.dataset[i][0].numpy():
        #     print(syscalldataset.index2token[token], end=' ; ')
        # print(80*'-')
print(c)

0


In [381]:
emb_history = []

for epoch in range(n_epochs):
    net.train()
    for X_batch, y_batch in tqdm(train_dataloader):
        optimizer.zero_grad()
        pred, _, _ = net(X_batch)
        loss = loss_f(pred, y_batch)
        loss.backward()

        optimizer.step()


    if epoch % 10 == 0:
        train_loss, _ = compute_loss_and_accuracy(loss_f, net, train_dataloader)
        val_loss, val_acc = compute_loss_and_accuracy(loss_f, net, val_dataloader)
        print(f"Epoch: {epoch}, train loss: {train_loss=:.3f}, validation loss {val_loss=:.3f}")
        print(f'Accuracy: {val_acc}')


    # Prepare DataFrame
    weights = net.embedding.weight.detach().clone().numpy()

    df = pd.DataFrame(weights, columns=[f"dim_{i}" for i in range(embedding_dim)])
    df["epoch"] = epoch
    df["token"] = dataset.vocabulary

    emb_history.append(df)

final_df = pd.concat(emb_history)
final_df.to_csv("res.csv", index=False)

100%|██████████| 1/1 [00:00<00:00, 23.37it/s]


--------------------------------------------------------------------------------
pred  tensor([[-0.0258, -0.2608,  0.0710,  ..., -0.0751, -0.1458, -0.0057],
        [-0.0243, -0.2371,  0.0746,  ..., -0.0837, -0.1391,  0.0100],
        [-0.0156, -0.2340,  0.0711,  ..., -0.0769, -0.1387,  0.0066],
        ...,
        [-0.0207, -0.2282,  0.0749,  ..., -0.0837, -0.1394,  0.0144],
        [-0.0184, -0.2310,  0.0732,  ..., -0.0792, -0.1421,  0.0110],
        [-0.0184, -0.2222,  0.0743,  ..., -0.0834, -0.1391,  0.0174]],
       grad_fn=<AddmmBackward0>)
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
y_batch  tensor([ 7,  9,  7, 18, 10, 51, 56, 11,  9, 10, 12, 65,  8,  8,  7, 12, 29,  8,
        11, 15, 54, 15,  8,  7, 63, 61,  7,  9,  9,  7,  7, 12])
--------------------------------------------------------------------------------

-----------------------------------------------


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


AttributeError: 'DataFrame' object has no attribute 'vocabulary'

### Сохранение обученной модели

In [None]:
net.save('syscall-prediction.en-2')

### Загрузка сохраненной модели

In [None]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/My Drive/Colab Notebooks/Diplom2024/Code/dataset

In [None]:
import torch

new_model = torch.load('syscall-prediction.en-2')

### График embeddings

In [None]:
import plotly
import plotly.graph_objects as go
import plotly.express as px

In [None]:
df = pd.read_csv("res.csv")

In [None]:
px.scatter(
    df,
    x='dim_0', y='dim_1',
    text = 'token'
    animation_frame='epoch',
    animation_group='token'
    # size='pop',
    # color='continent',
    # hover_name='',
    # log_x=True,
    # size_max=55,
    # range_x=[100, 100000], range_y=[25, 90]
)