<a href="https://colab.research.google.com/github/ndhpro/Colab_Notebooks/blob/master/Syscall_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! git clone https://github.com/ndhpro/syscall-lstm.git

Cloning into 'syscall-lstm'...
remote: Enumerating objects: 740, done.[K
remote: Counting objects: 100% (740/740), done.[K
remote: Compressing objects: 100% (251/251), done.[K
remote: Total 740 (delta 487), reused 737 (delta 484), pack-reused 0[K
Receiving objects: 100% (740/740), 226.61 KiB | 4.53 MiB/s, done.
Resolving deltas: 100% (487/487), done.


In [2]:
cd syscall-lstm/

/content/syscall-lstm


In [3]:
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np

In [4]:
X, y = [], []
paths = glob('data/*/*')
for path in tqdm(paths, desc='Reading data'):
    with open(path, 'r') as f:
        seq = f.read()
    X.append(seq)
    if 'malware' in path:
        y.append(1)
    else:
        y.append(0)
X = np.array(X)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=2020)
X_mal = X_train[y_train == 1]
X_beg = X_train[y_train == 0]

Reading data: 100%|██████████| 889/889 [00:00<00:00, 20769.90it/s]


In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab = tokenizer.word_index
vocab_size = len(vocab) + 1

X_mal = tokenizer.texts_to_sequences(X_mal)
X_beg = tokenizer.texts_to_sequences(X_beg)

In [9]:
SYSCALL_LEN = 100

def create_model():
    model = Sequential()
    model.add(LSTM(units=1000, return_sequences=True,
                   input_shape=(SYSCALL_LEN, len(vocab)+1)))
    # model.add(LSTM(units=1000, return_sequences=True))
    # model.add(LSTM(units=1000, return_sequences=True))
    model.add(LSTM(units=1000))
    model.add(Dense(units=len(vocab)+1, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    return model


malicious_model = create_model()
trusted_model = create_model()
print(malicious_model.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_4 (LSTM)                (None, 100, 1000)         4564000   
_________________________________________________________________
lstm_5 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense_2 (Dense)              (None, 140)               140140    
Total params: 12,708,140
Trainable params: 12,708,140
Non-trainable params: 0
_________________________________________________________________
None


In [10]:
def prepare_sentence(seq):
    x = []
    y = []
    for i in range(min(SYSCALL_LEN+1, len(seq))):
        x_padded = pad_sequences([seq[:i]], maxlen=SYSCALL_LEN, padding='pre')[0]
        x_onehot = [to_categorical(i, num_classes=vocab_size) for i in x_padded]
        x.append(x_onehot)
        y.append(to_categorical(seq[i], num_classes=vocab_size))
    return x, y

In [11]:
x = []
y = []
for seq in X_mal:
    x_windows, y_windows = prepare_sentence(seq)
    x += x_windows[-10:]
    y += y_windows[-10:]
x = np.array(x)
y = np.array(y)

print(x.shape, y.shape)
mc = ModelCheckpoint('malicious.h5', monitor='loss', save_weights_only=True, save_best_only=True)
malicious_model.fit(x, y, epochs=50, batch_size=512, verbose=2, callbacks=[mc])
malicious_model.load_weights('malicious.h5')

(4260, 100, 140) (4260, 140)
Epoch 1/50
9/9 - 10s - loss: 2.6026 - accuracy: 0.6131
Epoch 2/50
9/9 - 10s - loss: 1.5321 - accuracy: 0.6965
Epoch 3/50
9/9 - 10s - loss: 1.7582 - accuracy: 0.6284
Epoch 4/50
9/9 - 10s - loss: 1.5934 - accuracy: 0.6869
Epoch 5/50
9/9 - 10s - loss: 1.4052 - accuracy: 0.6859
Epoch 6/50
9/9 - 10s - loss: 1.0834 - accuracy: 0.6923
Epoch 7/50
9/9 - 10s - loss: 0.8939 - accuracy: 0.7242
Epoch 8/50
9/9 - 10s - loss: 0.8846 - accuracy: 0.7176
Epoch 9/50
9/9 - 10s - loss: 0.8702 - accuracy: 0.7254
Epoch 10/50
9/9 - 10s - loss: 0.8530 - accuracy: 0.7296
Epoch 11/50
9/9 - 10s - loss: 0.8560 - accuracy: 0.7329
Epoch 12/50
9/9 - 10s - loss: 0.8419 - accuracy: 0.7444
Epoch 13/50
9/9 - 10s - loss: 0.8209 - accuracy: 0.7432
Epoch 14/50
9/9 - 10s - loss: 0.8057 - accuracy: 0.7413
Epoch 15/50
9/9 - 10s - loss: 0.7844 - accuracy: 0.7488
Epoch 16/50
9/9 - 10s - loss: 0.7630 - accuracy: 0.7631
Epoch 17/50
9/9 - 10s - loss: 0.7290 - accuracy: 0.7862
Epoch 18/50
9/9 - 10s - loss

In [12]:
x = []
y = []
for seq in X_beg:
    x_windows, y_windows = prepare_sentence(seq)
    x += x_windows[-10:]
    y += y_windows[-10:]
x = np.array(x)
y = np.array(y)

print(x.shape, y.shape)
mc = ModelCheckpoint('trusted.h5', monitor='loss', save_weights_only=True, save_best_only=True)
trusted_model.fit(x, y, epochs=50, batch_size=512, verbose=2, callbacks=[mc])
trusted_model.load_weights('trusted.h5')

(1960, 100, 140) (1960, 140)
Epoch 1/50
4/4 - 4s - loss: 4.7084 - accuracy: 0.1770
Epoch 2/50
4/4 - 4s - loss: 3.4136 - accuracy: 0.2015
Epoch 3/50
4/4 - 4s - loss: 2.7256 - accuracy: 0.2490
Epoch 4/50
4/4 - 4s - loss: 2.5568 - accuracy: 0.2684
Epoch 5/50
4/4 - 5s - loss: 2.4097 - accuracy: 0.2867
Epoch 6/50
4/4 - 5s - loss: 2.3198 - accuracy: 0.2602
Epoch 7/50
4/4 - 5s - loss: 2.2792 - accuracy: 0.2980
Epoch 8/50
4/4 - 5s - loss: 2.2437 - accuracy: 0.3117
Epoch 9/50
4/4 - 5s - loss: 2.1899 - accuracy: 0.3347
Epoch 10/50
4/4 - 4s - loss: 2.1122 - accuracy: 0.3311
Epoch 11/50
4/4 - 4s - loss: 2.1304 - accuracy: 0.3260
Epoch 12/50
4/4 - 4s - loss: 2.1842 - accuracy: 0.3184
Epoch 13/50
4/4 - 4s - loss: 2.1130 - accuracy: 0.3321
Epoch 14/50
4/4 - 4s - loss: 2.0374 - accuracy: 0.3403
Epoch 15/50
4/4 - 4s - loss: 1.9693 - accuracy: 0.3495
Epoch 16/50
4/4 - 4s - loss: 1.8977 - accuracy: 0.3648
Epoch 17/50
4/4 - 4s - loss: 1.8360 - accuracy: 0.3730
Epoch 18/50
4/4 - 4s - loss: 1.7773 - accurac

In [13]:
pred = []
for sentence in tqdm(X_test):
  tok = tokenizer.texts_to_sequences([sentence])[0]
  x_test, y = prepare_sentence(tok)
  x_test = np.array(x_test)
  y = np.array(y)

  p_pred = malicious_model.predict(x_test)
  log_p_sentence = 0
  for i, prob in enumerate(p_pred):
      prob_word = prob[np.argmax(y[i])]
      log_p_sentence += np.log(prob_word)
  mal_p = np.exp(log_p_sentence)

  p_pred = trusted_model.predict(x_test)
  log_p_sentence = 0
  for i, prob in enumerate(p_pred):
      prob_word = prob[np.argmax(y[i])]
      log_p_sentence += np.log(prob_word)
  beg_p = np.exp(log_p_sentence)

  if beg_p > mal_p:
    pred.append(0)
  else:
    pred.append(1)

100%|██████████| 267/267 [01:49<00:00,  2.44it/s]


In [14]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred, digits=4))

              precision    recall  f1-score   support

           0     0.9759    0.9643    0.9701        84
           1     0.9837    0.9891    0.9864       183

    accuracy                         0.9813       267
   macro avg     0.9798    0.9767    0.9782       267
weighted avg     0.9812    0.9813    0.9812       267

