<a href="https://colab.research.google.com/github/ndhpro/Colab_Notebooks/blob/master/Syscall_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! git clone https://github.com/ndhpro/syscall-lstm.git

Cloning into 'syscall-lstm'...
remote: Enumerating objects: 738, done.[K
remote: Counting objects: 100% (738/738), done.[K
remote: Compressing objects: 100% (249/249), done.[K
remote: Total 738 (delta 487), reused 735 (delta 484), pack-reused 0[K
Receiving objects: 100% (738/738), 226.39 KiB | 4.44 MiB/s, done.
Resolving deltas: 100% (487/487), done.


In [2]:
cd syscall-lstm/

/content/syscall-lstm


In [3]:
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
import numpy as np

In [4]:
X, y = [], []
paths = glob('data/*/*')
for path in tqdm(paths, desc='Reading data'):
    with open(path, 'r') as f:
        seq = f.read()
    X.append(seq)
    if 'malware' in path:
        y.append(1)
    else:
        y.append(0)
X = np.array(X)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=2020)
X_mal = X_train[y_train == 1]
X_beg = X_train[y_train == 0]

Reading data: 100%|██████████| 889/889 [00:00<00:00, 19480.26it/s]


In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab = tokenizer.word_index
vocab_size = len(vocab) + 1

X_mal = tokenizer.texts_to_sequences(X_mal)
X_beg = tokenizer.texts_to_sequences(X_beg)

In [20]:
def create_model():
    model = Sequential()
    model.add(LSTM(units=1000, return_sequences=True,
                   input_shape=(100, len(vocab)+1)))
    # model.add(LSTM(units=1000, return_sequences=True))
    # model.add(LSTM(units=1000, return_sequences=True))
    model.add(LSTM(units=1000))
    model.add(Dense(units=len(vocab)+1, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    return model


malicious_model = create_model()
trusted_model = create_model()
print(malicious_model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_8 (LSTM)                (None, 100, 1000)         4552000   
_________________________________________________________________
lstm_9 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense_2 (Dense)              (None, 137)               137137    
Total params: 12,693,137
Trainable params: 12,693,137
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
def prepare_sentence(seq):
    # Pads seq and slides windows
    x = []
    y = []
    for i in range(min(101, len(seq))):
        x_padded = pad_sequences([seq[:i]], maxlen=100, padding='pre')[0]
        x_onehot = [to_categorical(i, num_classes=vocab_size) for i in x_padded]
        x.append(x_onehot)
        y.append(to_categorical(seq[i], num_classes=vocab_size))
    return x, y

In [21]:
# Pad sequences and slide windows
x = []
y = []
for seq in X_mal:
    x_windows, y_windows = prepare_sentence(seq)
    x += x_windows[-10:]
    y += y_windows[-10:]
x = np.array(x)
y = np.array(y)

print(x.shape, y.shape)
malicious_model.fit(x, y, epochs=50, batch_size=128, verbose=2)

(4260, 100, 137) (4260, 137)
Epoch 1/50
34/34 - 6s - loss: 1.4696 - accuracy: 0.6721
Epoch 2/50
34/34 - 6s - loss: 0.9685 - accuracy: 0.6948
Epoch 3/50
34/34 - 6s - loss: 0.8731 - accuracy: 0.7272
Epoch 4/50
34/34 - 6s - loss: 0.8032 - accuracy: 0.7415
Epoch 5/50
34/34 - 6s - loss: 0.7341 - accuracy: 0.7711
Epoch 6/50
34/34 - 6s - loss: 0.6099 - accuracy: 0.8038
Epoch 7/50
34/34 - 6s - loss: 0.5321 - accuracy: 0.8171
Epoch 8/50
34/34 - 6s - loss: 0.4564 - accuracy: 0.8441
Epoch 9/50
34/34 - 6s - loss: 0.3968 - accuracy: 0.8765
Epoch 10/50
34/34 - 6s - loss: 0.3137 - accuracy: 0.9075
Epoch 11/50
34/34 - 6s - loss: 0.2566 - accuracy: 0.9237
Epoch 12/50
34/34 - 6s - loss: 0.2112 - accuracy: 0.9385
Epoch 13/50
34/34 - 6s - loss: 0.1779 - accuracy: 0.9448
Epoch 14/50
34/34 - 6s - loss: 0.1646 - accuracy: 0.9549
Epoch 15/50
34/34 - 6s - loss: 0.1351 - accuracy: 0.9622
Epoch 16/50
34/34 - 6s - loss: 0.1115 - accuracy: 0.9671
Epoch 17/50
34/34 - 6s - loss: 0.1007 - accuracy: 0.9714
Epoch 18/50

<tensorflow.python.keras.callbacks.History at 0x7fd02f4e9cc0>

In [22]:
# Pad sequences and slide windows
x = []
y = []
for seq in X_beg:
    x_windows, y_windows = prepare_sentence(seq)
    x += x_windows[-10:]
    y += y_windows[-10:]
x = np.array(x)
y = np.array(y)

print(x.shape, y.shape)
trusted_model.fit(x, y, epochs=50, batch_size=128, verbose=2)

(1960, 100, 137) (1960, 137)
Epoch 1/50
16/16 - 3s - loss: 3.5486 - accuracy: 0.2056
Epoch 2/50
16/16 - 3s - loss: 2.5735 - accuracy: 0.2495
Epoch 3/50
16/16 - 3s - loss: 2.3739 - accuracy: 0.2944
Epoch 4/50
16/16 - 3s - loss: 2.2654 - accuracy: 0.3102
Epoch 5/50
16/16 - 3s - loss: 2.2498 - accuracy: 0.3372
Epoch 6/50
16/16 - 3s - loss: 2.1692 - accuracy: 0.3306
Epoch 7/50
16/16 - 3s - loss: 1.9803 - accuracy: 0.3709
Epoch 8/50
16/16 - 3s - loss: 1.7773 - accuracy: 0.4612
Epoch 9/50
16/16 - 3s - loss: 1.5985 - accuracy: 0.5699
Epoch 10/50
16/16 - 3s - loss: 1.3428 - accuracy: 0.6286
Epoch 11/50
16/16 - 3s - loss: 1.2059 - accuracy: 0.6434
Epoch 12/50
16/16 - 3s - loss: 1.1170 - accuracy: 0.6689
Epoch 13/50
16/16 - 3s - loss: 1.0074 - accuracy: 0.7071
Epoch 14/50
16/16 - 3s - loss: 0.9556 - accuracy: 0.7015
Epoch 15/50
16/16 - 3s - loss: 0.8653 - accuracy: 0.7281
Epoch 16/50
16/16 - 3s - loss: 0.8174 - accuracy: 0.7342
Epoch 17/50
16/16 - 3s - loss: 0.8313 - accuracy: 0.7393
Epoch 18/50

<tensorflow.python.keras.callbacks.History at 0x7fd032671a58>

In [23]:
malicious_model.save_weights('model/malicious.h5')
trusted_model.save_weights('model/trusted.h5')

In [36]:
vocab_inv = {v: k for k, v in tokenizer.word_index.items()}

correct = 0
for sentence, y_true in zip(X_test, y_test):
  tok = tokenizer.texts_to_sequences([sentence])[0]
  x_test, y = prepare_sentence(tok)
  x_test = np.array(x_test)
  y = np.array(y)

  p_pred = malicious_model.predict(x_test)
  log_p_sentence = 0
  for i, prob in enumerate(p_pred):
      prob_word = prob[np.argmax(y[i])]
      log_p_sentence += np.log(prob_word)
  mal_p = np.exp(log_p_sentence)

  p_pred = trusted_model.predict(x_test)
  log_p_sentence = 0
  for i, prob in enumerate(p_pred):
      prob_word = prob[np.argmax(y[i])]
      log_p_sentence += np.log(prob_word)
  beg_p = np.exp(log_p_sentence)

  pred = 0 if beg_p > mal_p else 1
  print(mal_p, beg_p, pred, y_true)
  if pred == y_true:
    correct += 1
print('Accuracy:', correct / len(X_test))

0.0 3.1685352113609216e-13 0 0
0.0 5.915102987707892e-43 0 0
1.091018461794263e-189 3.3266092514091274e-279 1 0
1.6713030172683961e-186 0.0 1 1
1.7331120916559089e-150 0.0 1 1
2.2844103314271426e-124 3.6805254968051538e-233 1 1
7.797186314862153e-183 7.940347203479314e-122 0 1
0.0 3.552869550046356e-284 0 0
1.2405077283267185e-223 3.766531714320793e-182 0 0
0.0 5.915102987707892e-43 0 0
1.6738806850866854e-186 0.0 1 1
2.418743272345645e-137 2.2253869990592885e-140 1 0
1.6738806850866854e-186 0.0 1 1
8.851983007758856e-155 0.0 1 1
2.9464014623393895e-110 1.3314015614778075e-279 1 1
2.9464014623393895e-110 1.3314015614778075e-279 1 1
8.851983007758856e-155 0.0 1 1
7.167243595154529e-103 2.1840159760194103e-269 1 1
1.6738806850866854e-186 0.0 1 1
1.0455707390813251e-176 1.8955884768529245e-247 1 1
2.0933370202368114e-129 4.0003876988850045e-213 1 1
0.0 2.476617462390339e-46 0 0
2.2912894787250445e-175 1.2701715548288877e-286 1 1
5.8760702835122874e-179 6.495067014e-315 1 1
7.7971863148621