## SET ENVIRONMENT VARIABLES

In [1]:
import os
# Restrict this process to GPU index 7 via CUDA_VISIBLE_DEVICES.
# This runs before torch is imported further down the notebook.
os.environ.update({"CUDA_VISIBLE_DEVICES": "7"})

# Echo the value back as a sanity check.
print(os.environ.get("CUDA_VISIBLE_DEVICES"))

7


## Import and Prepare the DataSets

### 1. Import Train & Test CSV(s)

In [2]:
import pandas as pd
import json

# Resampling technique that produced the TRAIN split (selects the CSV file):
#   'sm'  ---> SMOTE
#   'smt' ---> SMOTETomek
#   'ada' ---> ADASYN
TRAIN_SAMPLING_TECHNIQUE = 'sm'

# Resampling technique that produced the TEST split (selects the CSV file):
#   'rus' ---> RandomUnderSampler
#   'tmk' ---> TomekLinks
TEST_SAMPLING_TECHNIQUE = 'rus'

# Training set: the 'opcode' column holds JSON text, so every cell is decoded
# into a Python object after loading.
train_csv_path = f'./split_ds/opcode_{TRAIN_SAMPLING_TECHNIQUE}_TRAIN.csv'
train = pd.read_csv(train_csv_path)
train['opcode'] = train['opcode'].apply(json.loads)

# Testing set: same layout and same decoding step as the training set.
test_csv_path = f'./split_ds/opcode_{TEST_SAMPLING_TECHNIQUE}_TEST.csv'
test = pd.read_csv(test_csv_path)
test['opcode'] = test['opcode'].apply(json.loads)

print(train, test)

                                                  opcode  swc_label
0      [2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...          0
1      [2, 2, 10, 31, 4, 12, 5, 13, 2, 4, 22, 8, 6, 2...          0
2      [2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...          0
3      [2, 2, 10, 2, 50, 33, 5, 13, 2, 30, 62, 3, 32,...          0
4      [2, 2, 10, 2, 30, 62, 3, 32, 25, 11, 4, 25, 26...          1
...                                                  ...        ...
21931  [2, 2, 10, 19, 35, 22, 39, 17, 38, 2, 12, 26, ...          1
21932  [2, 2, 10, 13, 40, 26, 6, 15, 2, 23, 52, 2, 31...          1
21933  [2, 2, 10, 30, 7, 12, 8, 13, 30, 46, 4, 29, 16...          1
21934  [95, 21, 26, 12, 66, 13, 21, 97, 0, 0, 0, 0, 0...          1
21935  [2, 2, 10, 2, 30, 62, 3, 32, 15, 17, 13, 16, 2...          1

[21936 rows x 2 columns]                                                  opcode  swc_label
0     [2, 2, 10, 31, 4, 12, 5, 13, 2, 4, 22, 8, 6, 2...          0
1     [2, 2, 10, 2, 50, 

### 2. Split the Test DataSet into Testing and Validation DataSets

In [3]:
from sklearn.model_selection import train_test_split
# Halve the loaded test set into a test half and a validation half, stratified
# on the label column; the fixed random_state keeps the split reproducible.
# NOTE: `test` is rebound here, so re-running this cell without reloading the
# CSV first would halve the already-halved test set again.
test, val = train_test_split(
    test,
    test_size=0.5,
    random_state=69,
    shuffle=True,
    stratify=test['swc_label'],
)

### 3. Convert DataFrame(s) to PyTorch Tensor(s)

In [4]:
import torch
from torch.autograd import Variable

def pandas2tensor(data, X: bool = True) -> torch.Tensor:
    """Convert a pandas Series into a floating-point ``torch.Tensor``.

    Parameters
    ----------
    data : pandas.Series
        Column to convert. Its ``tolist()`` must yield either equal-length
        numeric lists (feature rows) or plain numbers (labels).
    X : bool, default True
        When True, ``data`` is treated as a feature column and a singleton
        axis is inserted at position 1, giving shape
        ``(n_rows, 1, n_values_per_row)``. When False, the tensor is returned
        with the shape produced by ``tolist()``.

    Returns
    -------
    torch.Tensor
        Tensor in torch's current default floating dtype.

    Raises
    ------
    IndexError
        If ``X`` is True but the converted tensor has fewer than two
        dimensions (for example a column of scalars).
    """
    # torch.autograd.Variable is a deprecated no-op wrapper and the legacy
    # torch.Tensor(list) constructor is discouraged; torch.tensor with an
    # explicit dtype yields the same values in the same (default float) dtype.
    tensor = torch.tensor(data.tolist(), dtype=torch.get_default_dtype())
    if not X:
        return tensor
    n_rows, n_values = tensor.shape[0], tensor.shape[1]
    return tensor.reshape(n_rows, 1, n_values)


  _dtype_to_storage = {data_type(0).dtype: data_type for data_type in _storages}


In [5]:
# Features get the extra singleton axis (X defaults to True); labels stay flat.
train_sequences = pandas2tensor(train['opcode'])
train_labels = pandas2tensor(train['swc_label'], X=False)

test_sequences = pandas2tensor(test['opcode'])
test_labels = pandas2tensor(test['swc_label'], X=False)

val_sequences = pandas2tensor(val['opcode'])
val_labels = pandas2tensor(val['swc_label'], X=False)

# Report the shape of every tensor and the type of its first element.
for _caption, _converted in (
    ("Train-Sequences", train_sequences),
    ("Train-Labels", train_labels),
    ("Test-Sequences", test_sequences),
    ("Test-Labels", test_labels),
    ("Validiation-Sequences", val_sequences),
    ("Validiation-Labels", val_labels),
):
    print(_caption, _converted.shape, type(_converted[0]))


Train-Sequences torch.Size([21936, 1, 1800]) <class 'torch.Tensor'>
Train-Labels torch.Size([21936]) <class 'torch.Tensor'>
Test-Sequences torch.Size([1299, 1, 1800]) <class 'torch.Tensor'>
Test-Labels torch.Size([1299]) <class 'torch.Tensor'>
Validiation-Sequences torch.Size([1299, 1, 1800]) <class 'torch.Tensor'>
Validiation-Labels torch.Size([1299]) <class 'torch.Tensor'>


## Create & Evaluate the Deep-Learning Model (RNN based on LSTM architecture)

### 1. Define the Hyper-Parameters

In [6]:
# --- Data dimensions ---
OPCODE_SEQ_LEN = 1800            # length of each opcode sequence
EMBEDDING_DIM = 50               # NOTE(review): not referenced by any visible cell

# --- Training schedule ---
NUM_EPOCS = 128                  # number of training epochs
BATCH_SIZE = 128                 # samples per batch
LEARNING_RATE = 0.001            # 0.001 lr

# --- LSTM architecture ---
INPUT_SIZE = OPCODE_SEQ_LEN      # number of features (one per opcode position)
HIDDEN_SIZE = 2                  # number of features in hidden state
NUM_LAYERS = 1                   # number of stacked lstm layers
NUM_CLASSES = 1                  # number of output classes

### 1A. Import Evaluation metrics

In [7]:
from sklearn.metrics import f1_score

def f1(y_true, y_pred):
    """Return the F1 score of ``y_pred`` against ``y_true``.

    Delegates to ``sklearn.metrics.f1_score`` with its default arguments.
    """
    score = f1_score(y_true, y_pred)
    return score

def f1M(y_true, y_pred):
    """Return the macro-averaged F1 score of ``y_pred`` against ``y_true``.

    Delegates to ``sklearn.metrics.f1_score`` with ``average='macro'``.
    """
    macro_score = f1_score(y_true, y_pred, average='macro')
    return macro_score

### 2. Define the Neural Network Structure (Layers)

In [8]:
from model import LSTM

# Gather the constructor arguments first so they can be inspected on their own.
lstm_kwargs = dict(
    num_classes=NUM_CLASSES,
    input_size=INPUT_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    # Axis 1 of train_sequences is the singleton axis added by pandas2tensor.
    seq_length=train_sequences.shape[1],
)
model = LSTM(**lstm_kwargs)

# NOTE(review): the recorded run of this cell ended in
# "NameError: name 'X' is not defined". Nothing in this cell references `X`,
# so the error presumably originates inside model.py — confirm there.
model.model

NameError: name 'X' is not defined

### 3. Compile the model

In [None]:
# Prepare the model wrapper for training with the configured learning rate.
# NOTE(review): what compile() sets up (optimizer, loss, ...) is defined in
# model.py, which is not visible from this notebook — confirm there.
model.compile(learning_rate=LEARNING_RATE)

### 4. Fit and train the RNN model with Training and Validation Data

In [None]:
# Train on the training tensors and validate on the validation half each run.
#
# The return value is bound to `history` because the pickle and plotting cells
# further down read `history.history`; without this binding they fail with a
# NameError on a fresh kernel.
# NOTE(review): this assumes LSTM.fit returns a Keras-style History object
# exposing a `.history` dict — confirm against model.py.
#
# batch_size comes from BATCH_SIZE rather than a hard-coded 1, so training
# honours the hyper-parameter cell the same way model.evaluate does.
history = model.fit(
    num_epochs=NUM_EPOCS,
    X=train_sequences,
    y=train_labels,
    batch_size=BATCH_SIZE,
    X_validation=val_sequences,
    y_validation=val_labels,
)


SyntaxError: invalid syntax (2164458196.py, line 2)

### 5. Evaluate performance of model using Testing DataSet

In [None]:
# Score the trained model on the held-out test tensors with the configured batch size.
# NOTE(review): the printed label assumes evaluate() returns (loss, accuracy)
# in that order — confirm against model.py.
results = model.evaluate(test_sequences, test_labels, batch_size=BATCH_SIZE)
print("Test Loss, Test Accuracy:", results)

### 6. Save the model as HDF5 file

In [None]:
# Persist the trained model; the file name encodes both sampling techniques and the epoch count.
# NOTE(review): the .h5 suffix suggests HDF5, but the actual on-disk format is
# decided by LSTM.save in model.py — confirm there.
model.save(f'./models/model_{TRAIN_SAMPLING_TECHNIQUE}_{TEST_SAMPLING_TECHNIQUE}_{NUM_EPOCS}.h5') 

In [None]:
# Persist the training history and the evaluation results next to the model.
# Expects `history` (the object produced by training, exposing `.history`) and
# `results` (from the evaluation cell) to exist already.
import pickle

# Save History as Pickle
history_pickle_path = f'./models/history_{TRAIN_SAMPLING_TECHNIQUE}_{TEST_SAMPLING_TECHNIQUE}_{NUM_EPOCS}.pickle'
with open(history_pickle_path, 'wb') as history_file:
    pickle.dump(history.history, history_file)

# Save Results as Pickle
results_pickle_path = f'./models/results_{TRAIN_SAMPLING_TECHNIQUE}_{TEST_SAMPLING_TECHNIQUE}_{NUM_EPOCS}.pickle'
with open(results_pickle_path, 'wb') as results_file:
    pickle.dump(results, results_file)

### 7. Plot performance metrics of the Deep-Learning Model 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme so the matplotlib figures below pick up its styling.
sns.set()

In [None]:
# Accuracy Metrics: training vs. validation accuracy per epoch.
# Expects `history.history` to hold one series per key plotted below.
for accuracy_key in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[accuracy_key])
plt.xlabel("Epochs")
plt.ylabel('Accuracy')
plt.legend(['accuracy', 'val_accuracy'])
plt.show()

In [None]:
# Loss Metrics: training vs. validation loss per epoch.
# Expects `history.history` to hold one series per key plotted below.
for loss_key in ('loss', 'val_loss'):
    plt.plot(history.history[loss_key])
plt.xlabel("Epochs")
plt.ylabel('Loss')
plt.legend(['loss', 'val_loss'])
plt.show()

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix


def _report_split_metrics(title, y_true, y_pred):
    """Print the classification metrics for one data split and return them.

    Parameters
    ----------
    title : str
        Heading printed above the metrics (a dashed rule is printed under it).
    y_true : array-like
        Ground-truth labels of the split.
    y_pred : array-like
        Predicted class labels for the same samples.

    Returns
    -------
    dict
        Keys 'accuracy', 'precision', 'recall', 'f1', 'f1_macro' and
        'confusion_matrix', holding the values that were printed.
    """
    print(f'{title}\n-------------------------')
    scores = {}
    # accuracy: (tp + tn) / (p + n)
    scores['accuracy'] = accuracy_score(y_true, y_pred)
    print('Accuracy: %f' % scores['accuracy'])
    # precision tp / (tp + fp)
    scores['precision'] = precision_score(y_true, y_pred)
    print('Precision: %f' % scores['precision'])
    # recall: tp / (tp + fn)
    scores['recall'] = recall_score(y_true, y_pred)
    print('Recall: %f' % scores['recall'])
    # f1: 2 tp / (2 tp + fp + fn)
    scores['f1'] = f1_score(y_true, y_pred)
    print('F1 score: %f' % scores['f1'])
    scores['f1_macro'] = f1_score(y_true, y_pred, average='macro')
    print('F1-Macro score: %f' % scores['f1_macro'])
    # confusion matrix
    scores['confusion_matrix'] = confusion_matrix(y_true, y_pred)
    print(scores['confusion_matrix'])
    return scores


# NOTE(review): predict_classes and its verbose/batch_size keywords belong to
# the project's LSTM wrapper (model.py) — confirm they exist with this signature.
pred_test_classes = model.predict_classes(test_sequences, verbose=1, batch_size=BATCH_SIZE)
pred_train_classes = model.predict_classes(train_sequences, verbose=1, batch_size=BATCH_SIZE)

# The scores live in dicts instead of globals named `f1` / `f1M`: those names
# belong to the helper functions defined in the metrics cell, and rebinding
# them to floats would break any later call to f1(...) / f1M(...).
train_metrics = _report_split_metrics('Train Metrics', train_labels, pred_train_classes)
test_metrics = _report_split_metrics('Test Metrics', test_labels, pred_test_classes)

# Keep the remaining scalar names bound to the test-split values, as before,
# for any later cell that reads them.
accuracy = test_metrics['accuracy']
precision = test_metrics['precision']
recall = test_metrics['recall']
matrix = test_metrics['confusion_matrix']