## Import and Prepare the DataSets

### 1. Import Train & Test CSV(s)

In [3]:
import pandas as pd
import json

# Resampling variant that selects which TRAINING split file is read:
#   'sm'  -> SMOTE
#   'smt' -> SMOTETomek
OVER_SAMPLING_TECHNIQUE = 'sm'

# Resampling variant that selects which TESTING split file is read:
#   'rus' -> RandomUnderSampler
#   'tmk' -> TomekLinks
UNDER_SAMPLING_TECHNIQUE = 'rus'

# Training set: the 'opcode' column is stored as JSON text in the CSV,
# so every cell is decoded back into a Python object after loading.
train_csv_path = f'./split_ds/opcode_{OVER_SAMPLING_TECHNIQUE}_TRAIN.csv'
train = pd.read_csv(train_csv_path)
train['opcode'] = train['opcode'].apply(json.loads)

# Testing set: same layout and the same JSON decoding step.
test_csv_path = f'./split_ds/opcode_{UNDER_SAMPLING_TECHNIQUE}_TEST.csv'
test = pd.read_csv(test_csv_path)
test['opcode'] = test['opcode'].apply(json.loads)

print(train, test)

 opcode label
0 [2, 2, 13, 31, 11, 6, 15, 2, 5, 22, 9, 9, 33, ...  0
1 [2, 2, 13, 31, 11, 6, 15, 2, 5, 22, 9, 33, 2, ...  0
2 [2, 2, 2, 18, 18, 18, 55, 5, 16, 2, 76, 2, 24,...  0
3 [2, 2, 13, 31, 11, 6, 15, 2, 5, 22, 9, 6, 5, 6...  0
4 [2, 2, 13, 87, 2, 32, 87, 2, 32, 63, 2, 32, 2,...  0
...  ...  ...
94293 [2, 2, 13, 12, 5, 22, 9, 2, 22, 27, 10, 6, 11,...  1
94294 [2, 2, 13, 31, 5, 11, 5, 15, 2, 5, 22, 9, 4, 3...  1
94295 [2, 2, 13, 22, 4, 16, 31, 11, 10, 12, 18, 8, 7...  1
94296 [2, 2, 2, 18, 18, 18, 55, 5, 16, 2, 76, 2, 24,...  1
94297 [2, 2, 13, 31, 5, 11, 6, 15, 2, 5, 22, 9, 4, 6...  1

[94298 rows x 2 columns] opcode label
0 [2, 2, 13, 2, 2, 2, 6, 28, 7, 19, 7, 2, 26, 36...  0
1 [2, 2, 13, 33, 2, 2, 6, 28, 7, 19, 7, 12, 26, ...  0
2 [2, 2, 2, 18, 18, 18, 55, 5, 16, 2, 76, 2, 24,...  0
3 [2, 2, 13, 31, 5, 11, 6, 15, 2, 5, 22, 9, 4, 3...  1
4 [2, 2, 13, 31, 5, 11, 6, 15, 2, 5, 22, 9, 4, 3...  0
...  ...  ...
12432 [2, 2, 13, 31, 11, 6, 15, 2, 5, 22, 9, 33, 2, ...  0
12433 [2, 2, 13

### 2. Split the Test DataSet into Testing and Validation DataSets

In [4]:
from sklearn.model_selection import train_test_split
# Halve the held-out frame into a test part and a validation part.
# The split is stratified on 'label' and uses a fixed seed so it is repeatable.
# NOTE: `test` is rebound here, so re-running this cell without reloading the
# CSV splits the already-halved frame again.
test, val = train_test_split(
    test,
    test_size=0.5,
    random_state=69,
    shuffle=True,
    stratify=test['label'],
)

### 3. Convert DataFrame(s) to Numpy N-D Array(s)

In [5]:
import numpy as np

# Turn each split's 'opcode' and 'label' columns into NumPy arrays
# (sequences first, labels second), going through plain Python lists.
train_sequences, train_labels = (np.array(train[col].tolist()) for col in ('opcode', 'label'))
test_sequences, test_labels = (np.array(test[col].tolist()) for col in ('opcode', 'label'))
val_sequences, val_labels = (np.array(val[col].tolist()) for col in ('opcode', 'label'))

# Report the shape of every array together with the type of its first element.
summary_rows = (
    ("Train-Sequences", train_sequences),
    ("Train-Labels", train_labels),
    ("Test-Sequences", test_sequences),
    ("Test-Labels", test_labels),
    ("Validiation-Sequences", val_sequences),
    ("Validiation-Labels", val_labels),
)
for caption, arr in summary_rows:
    print(caption, arr.shape, type(arr[0]))

Train-Sequences (94298, 3800) <class 'numpy.ndarray'>
Train-Labels (94298,) <class 'numpy.int64'>
Test-Sequences (6218, 3800) <class 'numpy.ndarray'>
Test-Labels (6218,) <class 'numpy.int64'>
Validiation-Sequences (6219, 3800) <class 'numpy.ndarray'>
Validiation-Labels (6219,) <class 'numpy.int64'>


## Create & Evaluate the Deep-Learning Model (RNN based on LSTM architecture)

### 1. Define the Hyper-Parameters

In [2]:
# Length of each opcode sequence; passed to the Embedding layer as `input_length`.
OPCODE_SEQ_LEN = 3800
# Output width of the Embedding layer.
# NOTE(review): the model summary recorded in this notebook shows an embedding
# width of 64, not 50 — the output looks stale; confirm which value is intended.
EMBEDDING_DIM = 50
# Number of epochs passed to `model.fit`.
# NOTE(review): the recorded training output shows "Epoch 1/1", which does not
# match 128 — confirm which run the saved results belong to.
NUM_EPOCS = 128
# Batch size used by both `model.fit` and `model.evaluate`.
BATCH_SIZE = 64

### 2. Define the Neural Network Structure (Layers)

In [6]:
import tensorflow as tf

# Stacked-LSTM binary classifier, declared as one ordered layer list:
#   embedding -> LSTM(128, emits the full sequence) -> LSTM(64, emits the last state)
#   -> dense(256, relu) -> dropout(0.5) -> single sigmoid unit.
# The Embedding input_dim of 150 presumably covers the opcode vocabulary —
# verify against the largest opcode id in the dataset.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(150, EMBEDDING_DIM, input_length=OPCODE_SEQ_LEN),
    tf.keras.layers.CuDNNLSTM(128, name='lstm1', return_sequences=True),
    tf.keras.layers.CuDNNLSTM(64, name='lstm2'),
    tf.keras.layers.Dense(256, name='hi_layer', activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, name='out_layer', activation='sigmoid'),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 3800, 64)          9600      
_________________________________________________________________
lstm1 (CuDNNLSTM)            (None, 3800, 128)         99328     
_________________________________________________________________
lstm2 (CuDNNLSTM)            (None, 64)                49664     
_________________________________________________________________
hi_layer (Dense)             (None, 256)               16640     
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
out_layer (Dense)            (None, 1)                 257       
Total params: 175,489
Trainable params: 175,489
Non-trainable params: 0
_________________________________________________________________


### 3. Compile the model

In [5]:
# Binary-classification set-up: cross-entropy loss, Adam optimizer,
# accuracy tracked as the only metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### 4. Fit and train the RNN model with Training and Validation Data

In [51]:
# Train on the training arrays and score the validation arrays after each epoch.
# The returned History object is kept for pickling and plotting later on.
history = model.fit(
    train_sequences,
    train_labels,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCS,
    validation_data=(val_sequences, val_labels),
    verbose=1,
)

Train on 94298 samples, validate on 6219 samples
Epoch 1/1


### 5. Evaluate performance of model using Testing DataSet

In [52]:
# Score the trained model on the held-out test arrays. With the compile
# settings used in this notebook the result is printed as [loss, accuracy].
results = model.evaluate(
    test_sequences,
    test_labels,
    batch_size=BATCH_SIZE,
)
print("Test Loss, Test Accuracy:", results)

Test Loss, Test Accuracy: [0.40623094332720466, 0.8295271791381137]


### 6. Save the model as HDF5 file

In [None]:
import os

# Create the output directory up front: on a fresh checkout './models' may not
# exist, and not every Keras release creates missing parent directories when
# writing an HDF5 file.
os.makedirs('./models', exist_ok=True)

# Persist the trained model (architecture + weights) as a single HDF5 file
# whose name carries the over-sampling variant used for training.
model.save(f'./models/model_{OVER_SAMPLING_TECHNIQUE}.h5')

In [None]:
import os
import pickle

# open(..., 'wb') does not create missing parent directories, so make sure
# './models' exists before writing into it.
os.makedirs('./models', exist_ok=True)

# Save History as Pickle (the plain per-epoch metrics dict held in
# `history.history`, not the History object itself)
with open(f'./models/history_{OVER_SAMPLING_TECHNIQUE}.pickle', 'wb') as fh:
    pickle.dump(history.history, fh)

# Save Results as Pickle (the value returned by `model.evaluate`)
with open(f'./models/results_{OVER_SAMPLING_TECHNIQUE}.pickle', 'wb') as fh:
    pickle.dump(results, fh)

### 7. Plot performance metrics of the Deep-Learning Model 

In [4]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme so the matplotlib figures created below use it.
sns.set()

In [None]:
# Accuracy Metrics
# The key under which Keras records the accuracy metric depends on the
# Keras/TensorFlow release ('acc' in older ones, 'accuracy' in newer ones).
# Resolve it from the recorded history instead of hard-coding one spelling,
# which would raise KeyError on the other.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'

plt.plot(history.history[acc_key])
plt.plot(history.history[f'val_{acc_key}'])
plt.xlabel("Epochs")
plt.ylabel('Accuracy')
plt.legend(['accuracy', 'val_accuracy'])
plt.show()

In [None]:
# Loss Metrics: training and validation loss, one point per epoch.
for loss_series in ('loss', 'val_loss'):
    plt.plot(history.history[loss_series])
plt.legend(['loss', 'val_loss'])
plt.ylabel('Loss')
plt.xlabel("Epochs")
plt.show()