## SET ENVIRONMENT VARIABLES

In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "7"

print(os.environ["CUDA_VISIBLE_DEVICES"])

7


## Import and Prepare the DataSets

### 1. Import Train & Test CSV(s)

In [2]:
import pandas as pd
import json

# Set the TRAIN_SAMPLING_TECHNIQUE as follows
# SMOTE ---> 'sm'
# SMOTETomek ---> 'smt'
# ADASYN ---> 'ada'
TRAIN_SAMPLING_TECHNIQUE = 'sm'

# Set the TEST_SAMPLING_TECHNIQUE as follows
# RandomUnderSampler ---> 'rus'
# TomekLinks ---> 'tmk'
TEST_SAMPLING_TECHNIQUE = 'rus'

# Importing Training Set
train = pd.read_csv(f'./split_ds/opcode_{TRAIN_SAMPLING_TECHNIQUE}_TRAIN.csv')
train['opcode'] = train['opcode'].apply(lambda x: json.loads(x))

# Importing Testing Set
test = pd.read_csv(f'./split_ds/opcode_{TEST_SAMPLING_TECHNIQUE}_TEST.csv')
test['opcode'] = test['opcode'].apply(lambda x: json.loads(x))

print(train, test)

                                                  opcode  swc_label
0      [2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...          0
1      [2, 2, 10, 31, 4, 12, 5, 13, 2, 4, 22, 8, 6, 2...          0
2      [2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...          0
3      [2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...          0
4      [2, 2, 10, 2, 30, 62, 3, 32, 25, 11, 4, 25, 26...          1
...                                                  ...        ...
21931  [2, 2, 10, 19, 35, 22, 39, 17, 38, 2, 12, 26, ...          1
21932  [2, 2, 10, 13, 40, 26, 6, 15, 2, 23, 52, 2, 31...          1
21933  [2, 2, 10, 30, 7, 12, 8, 13, 30, 46, 4, 29, 16...          1
21934  [95, 21, 26, 12, 66, 13, 21, 97, 0, 0, 0, 0, 0...          1
21935  [2, 2, 10, 2, 30, 62, 3, 32, 15, 17, 13, 16, 2...          1

[21936 rows x 2 columns]                                                  opcode  swc_label
0     [2, 2, 10, 31, 4, 12, 5, 13, 2, 4, 22, 8, 6, 2...          0
1     [2, 2, 10, 2, 50, 

### 2. Split the Test DataSet into Testing and Validiation DataSet

In [3]:
from sklearn.model_selection import train_test_split
test, val = train_test_split(test, test_size=0.5, random_state=69, shuffle=True, stratify=test['swc_label'])

### 3. Convert DataFrame(s) to Numpy N-D Array(s)

In [4]:
import numpy as np

train_sequences = np.array(train['opcode'].tolist())
train_labels = np.array(train['swc_label'].tolist())

test_sequences = np.array(test['opcode'].tolist())
test_labels = np.array(test['swc_label'].tolist())

val_sequences = np.array(val['opcode'].tolist())
val_labels = np.array(val['swc_label'].tolist())

print("Train-Sequences", train_sequences.shape, type(train_sequences[0]))
print("Train-Labels", train_labels.shape, type(train_labels[0]))
 
print("Test-Sequences", test_sequences.shape, type(test_sequences[0]))
print("Test-Labels", test_labels.shape, type(test_labels[0]))
 
print("Validiation-Sequences", val_sequences.shape, type(val_sequences[0]))
print("Validiation-Labels", val_labels.shape, type(val_labels[0]))

Train-Sequences (21936, 1800) <class 'numpy.ndarray'>
Train-Labels (21936,) <class 'numpy.int64'>
Test-Sequences (1299, 1800) <class 'numpy.ndarray'>
Test-Labels (1299,) <class 'numpy.int64'>
Validiation-Sequences (1299, 1800) <class 'numpy.ndarray'>
Validiation-Labels (1299,) <class 'numpy.int64'>


## Create & Evaluate the Deep-Learning Model (RNN based on LSTM architecture)

### 1. Define the Hyper-Parameters

In [5]:
OPCODE_SEQ_LEN = 1800
EMBEDDING_DIM = 50
NUM_EPOCS = 128
BATCH_SIZE = 1

### 1A. Import Evaluation metrics

In [6]:
from sklearn.metrics import f1_score

def f1(y_true, y_pred):
    return f1_score(y_true, y_pred)

def f1M(y_true, y_pred):
    return f1_score(y_true, y_pred, average='macro')

### 2. Define the Neural Network Structure (Layers)

In [7]:
import tensorflow as tf
from tensorflow.python.keras import layers, models, applications, Input, Model
from tensorflow.keras.layers import Convolution2D, MaxPooling2D, UpSampling2D


input = Input(shape=(OPCODE_SEQ_LEN,))

embd = tf.keras.layers.Embedding(150, EMBEDDING_DIM, input_length=OPCODE_SEQ_LEN)(input)
lstm1 = tf.keras.layers.LSTM(128, name='lstm1', return_sequences=True)(embd)
lstm2 = tf.keras.layers.LSTM(64, name='lstm2', return_sequences=True)(lstm1)
dense1 = tf.keras.layers.Dense(256, name='hi_layer', activation='relu')(lstm2)
drop1 = tf.keras.layers.Dropout(0.5)(dense1)
output = tf.keras.layers.Dense(1, name='out_layer', activation='sigmoid')(drop1)

# model = CustomModel(input, output)
model = Model(input, output)

model.summary()


2022-11-08 19:32:12.616356: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-11-08 19:32:12.616904: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-08 19:32:12.625402: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 1800)]            0         
_________________________________________________________________
embedding (Embedding)        (None, 1800, 50)          7500      
_________________________________________________________________
lstm1 (LSTM)                 (None, 1800, 128)         91648     
_________________________________________________________________
lstm2 (LSTM)                 (None, 1800, 64)          49408     
_________________________________________________________________
hi_layer (Dense)             (None, 1800, 256)         16640     
_________________________________________________________________
dropout (Dropout)            (None, 1800, 256)         0         
_________________________________________________________________
out_layer (Dense)            (None, 1800, 1)           257   

In [8]:
import tensorflow.keras.backend as K

tensor = K.print_tensor(lstm2)

print(tensor)

from model import callback


KerasTensor(type_spec=TensorSpec(shape=(None, 1800, 64), dtype=tf.float32, name=None), name='tf.keras.backend.print_tensor/Identity:0', description="created by layer 'tf.keras.backend.print_tensor'")


### 3. Compile the model

In [9]:
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'], run_eagerly=True)


### 4. Fit and train the RNN model with Training and Validiation Data

In [10]:
history = model.fit(train_sequences, train_labels, 
 epochs=NUM_EPOCS, 
 validation_data=(val_sequences, val_labels),
    batch_size=BATCH_SIZE, verbose=1, callbacks=[callback(model=model, data=train_sequences[0])])


TypeError: __init__() missing 1 required positional argument: 'data'

### 5. Evaluate performance of model using Testing DataSet

In [None]:
results = model.evaluate(test_sequences, test_labels, batch_size=BATCH_SIZE)
print("Test Loss, Test Accuracy:", results)

### 6. Save the model as HDF5 file

In [None]:
model.save(f'./models/model_{TRAIN_SAMPLING_TECHNIQUE}_{TEST_SAMPLING_TECHNIQUE}_{NUM_EPOCS}.h5') 

In [None]:
# Save History as Pickle
import pickle
with open(f'./models/history_{TRAIN_SAMPLING_TECHNIQUE}_{TEST_SAMPLING_TECHNIQUE}_{NUM_EPOCS}.pickle', 'wb') as fh:
 pickle.dump(history.history, fh)

# Save Results as Pickle
import pickle
with open(f'./models/results_{TRAIN_SAMPLING_TECHNIQUE}_{TEST_SAMPLING_TECHNIQUE}_{NUM_EPOCS}.pickle', 'wb') as fh:
 pickle.dump(results, fh)

### 7. Plot performance metrics of the Deep-Learning Model 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
# Accuracy Metrics
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.xlabel("Epochs")
plt.ylabel('Accuracy')
plt.legend(['accuracy', 'val_accuracy'])
plt.show()

In [None]:
# Loss Metrics
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.xlabel("Epochs")
plt.ylabel('Loss')
plt.legend(['loss', 'val_loss'])
plt.show()

In [None]:
pred_test_classes = model.predict_classes(test_sequences, verbose=1, batch_size=128)
pred_train_classes = model.predict_classes(train_sequences, verbose=1, batch_size=128)

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

print('Train Metrics\n-------------------------')
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(train_labels, pred_train_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(train_labels, pred_train_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(train_labels, pred_train_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(train_labels, pred_train_classes)
print('F1 score: %f' % f1)
f1M = f1_score(train_labels, pred_train_classes, average='macro')
print('F1-Macro score: %f' % f1M)
# confusion matrix
matrix = confusion_matrix(train_labels, pred_train_classes)
print(matrix)

print('Test Metrics\n-------------------------')
# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(test_labels, pred_test_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(test_labels, pred_test_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(test_labels, pred_test_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(test_labels, pred_test_classes)
print('F1 score: %f' % f1)
f1M = f1_score(test_labels, pred_test_classes, average='macro')
print('F1-Macro score: %f' % f1M)
# confusion matrix
matrix = confusion_matrix(test_labels, pred_test_classes)
print(matrix)

Train Metrics
-------------------------
Accuracy: 0.997128
Precision: 0.995977
Recall: 0.998285
F1 score: 0.997130
F1-Macro score: 0.997128
[[139006    562]
 [   239 139129]]
Test Metrics
-------------------------
Accuracy: 0.977043
Precision: 0.986637
Recall: 0.967189
F1 score: 0.976816
F1-Macro score: 0.977041
[[8060  107]
 [ 268 7900]]
