<a href="https://colab.research.google.com/github/ndhpro/Colab_Notebooks/blob/master/Syscall_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! git clone https://github.com/ndhpro/syscall-lstm.git

Cloning into 'syscall-lstm'...
remote: Enumerating objects: 740, done.[K
remote: Counting objects: 100% (740/740), done.[K
remote: Compressing objects: 100% (251/251), done.[K
remote: Total 740 (delta 487), reused 737 (delta 484), pack-reused 0[K
Receiving objects: 100% (740/740), 226.61 KiB | 4.44 MiB/s, done.
Resolving deltas: 100% (487/487), done.


In [2]:
cd syscall-lstm/

/content/syscall-lstm


In [3]:
from glob import glob
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
import numpy as np

In [4]:
X, y = [], []
paths = glob('data/*/*')
for path in tqdm(paths, desc='Reading data'):
    with open(path, 'r') as f:
        seq = f.read()
    X.append(seq)
    if 'malware' in path:
        y.append(1)
    else:
        y.append(0)
X = np.array(X)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=2020)
X_mal = X_train[y_train == 1]
X_beg = X_train[y_train == 0]

Reading data: 100%|██████████| 889/889 [00:00<00:00, 20734.55it/s]


In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab = tokenizer.word_index
vocab_size = len(vocab) + 1

X_mal = tokenizer.texts_to_sequences(X_mal)
X_beg = tokenizer.texts_to_sequences(X_beg)

In [6]:
def create_model():
    model = Sequential()
    model.add(LSTM(units=1000, return_sequences=True,
                   input_shape=(100, len(vocab)+1)))
    # model.add(LSTM(units=1000, return_sequences=True))
    # model.add(LSTM(units=1000, return_sequences=True))
    model.add(LSTM(units=1000))
    model.add(Dense(units=len(vocab)+1, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam', metrics=['accuracy'])
    return model


malicious_model = create_model()
trusted_model = create_model()
print(malicious_model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 100, 1000)         4552000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense (Dense)                (None, 137)               137137    
Total params: 12,693,137
Trainable params: 12,693,137
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
def prepare_sentence(seq):
    x = []
    y = []
    for i in range(min(101, len(seq))):
        x_padded = pad_sequences([seq[:i]], maxlen=100, padding='pre')[0]
        x_onehot = [to_categorical(i, num_classes=vocab_size) for i in x_padded]
        x.append(x_onehot)
        y.append(to_categorical(seq[i], num_classes=vocab_size))
    return x, y

In [12]:
x = []
y = []
for seq in X_mal:
    x_windows, y_windows = prepare_sentence(seq)
    x += x_windows[-10:]
    y += y_windows[-10:]
x = np.array(x)
y = np.array(y)

print(x.shape, y.shape)
malicious_model.fit(x, y, epochs=50, batch_size=512, verbose=2)
malicious_model.save_weights('malicious.h5')

(4260, 100, 137) (4260, 137)
Epoch 1/50
9/9 - 10s - loss: 0.2916 - accuracy: 0.9329
Epoch 2/50
9/9 - 10s - loss: 0.2798 - accuracy: 0.9347
Epoch 3/50
9/9 - 10s - loss: 0.2742 - accuracy: 0.9345
Epoch 4/50
9/9 - 11s - loss: 0.2713 - accuracy: 0.9347
Epoch 5/50
9/9 - 10s - loss: 0.2509 - accuracy: 0.9415
Epoch 6/50
9/9 - 10s - loss: 0.2400 - accuracy: 0.9441
Epoch 7/50
9/9 - 10s - loss: 0.2318 - accuracy: 0.9451
Epoch 8/50
9/9 - 10s - loss: 0.2399 - accuracy: 0.9439
Epoch 9/50
9/9 - 10s - loss: 0.2131 - accuracy: 0.9509
Epoch 10/50
9/9 - 10s - loss: 1.1303 - accuracy: 0.7526
Epoch 11/50
9/9 - 10s - loss: 1.5794 - accuracy: 0.6516
Epoch 12/50
9/9 - 10s - loss: 1.5706 - accuracy: 0.6516
Epoch 13/50
9/9 - 10s - loss: 1.5895 - accuracy: 0.6509
Epoch 14/50
9/9 - 10s - loss: 1.6365 - accuracy: 0.6521
Epoch 15/50
9/9 - 10s - loss: 1.3844 - accuracy: 0.6737
Epoch 16/50
9/9 - 10s - loss: 1.4299 - accuracy: 0.6777
Epoch 17/50
9/9 - 10s - loss: 1.0313 - accuracy: 0.7002
Epoch 18/50
9/9 - 10s - loss

In [14]:
x = []
y = []
for seq in X_beg:
    x_windows, y_windows = prepare_sentence(seq)
    x += x_windows[-10:]
    y += y_windows[-10:]
x = np.array(x)
y = np.array(y)

print(x.shape, y.shape)
trusted_model.fit(x, y, epochs=50, batch_size=512, verbose=2)
trusted_model.save_weights('trusted.h5')

(1960, 100, 137) (1960, 137)
Epoch 1/50
4/4 - 4s - loss: 0.7349 - accuracy: 0.7638
Epoch 2/50
4/4 - 4s - loss: 0.7172 - accuracy: 0.7653
Epoch 3/50
4/4 - 4s - loss: 0.6851 - accuracy: 0.7745
Epoch 4/50
4/4 - 4s - loss: 0.6408 - accuracy: 0.7923
Epoch 5/50
4/4 - 5s - loss: 0.6169 - accuracy: 0.7934
Epoch 6/50
4/4 - 5s - loss: 0.6024 - accuracy: 0.7883
Epoch 7/50
4/4 - 5s - loss: 0.5797 - accuracy: 0.8041
Epoch 8/50
4/4 - 5s - loss: 0.5790 - accuracy: 0.7985
Epoch 9/50
4/4 - 5s - loss: 0.6265 - accuracy: 0.7995
Epoch 10/50
4/4 - 5s - loss: 0.6301 - accuracy: 0.7903
Epoch 11/50
4/4 - 5s - loss: 0.5910 - accuracy: 0.7934
Epoch 12/50
4/4 - 4s - loss: 0.5485 - accuracy: 0.8097
Epoch 13/50
4/4 - 4s - loss: 0.5224 - accuracy: 0.8240
Epoch 14/50
4/4 - 4s - loss: 0.4764 - accuracy: 0.8388
Epoch 15/50
4/4 - 4s - loss: 0.4563 - accuracy: 0.8403
Epoch 16/50
4/4 - 4s - loss: 0.4286 - accuracy: 0.8520
Epoch 17/50
4/4 - 4s - loss: 0.4007 - accuracy: 0.8719
Epoch 18/50
4/4 - 4s - loss: 0.3800 - accurac

In [15]:
pred = []
for sentence in tqdm(X_test):
  tok = tokenizer.texts_to_sequences([sentence])[0]
  x_test, y = prepare_sentence(tok)
  x_test = np.array(x_test)
  y = np.array(y)

  p_pred = malicious_model.predict(x_test)
  log_p_sentence = 0
  for i, prob in enumerate(p_pred):
      prob_word = prob[np.argmax(y[i])]
      log_p_sentence += np.log(prob_word)
  mal_p = np.exp(log_p_sentence)

  p_pred = trusted_model.predict(x_test)
  log_p_sentence = 0
  for i, prob in enumerate(p_pred):
      prob_word = prob[np.argmax(y[i])]
      log_p_sentence += np.log(prob_word)
  beg_p = np.exp(log_p_sentence)

  if beg_p > mal_p:
    pred.append(0)
  else:
    pred.append(1)

100%|██████████| 267/267 [01:52<00:00,  2.38it/s]


In [17]:
from sklearn.metrics import classification_report
print(classification_report(y_test, pred, digits=4))

              precision    recall  f1-score   support

           0     1.0000    0.9405    0.9693        84
           1     0.9734    1.0000    0.9865       183

    accuracy                         0.9813       267
   macro avg     0.9867    0.9702    0.9779       267
weighted avg     0.9818    0.9813    0.9811       267

