In [13]:
# Import all necessary packages
import torch 
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
from os.path import expanduser
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from datetime import datetime
from torch.utils import data
import random
from tqdm import tqdm

In [14]:
#some necessary helper functions
def load_txt(txt_path):
    """Read the UTF-8 text file at ``txt_path`` and return its full contents as a string."""
    with open(txt_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def save_txt(text, save_path):
    """Write ``text`` to ``save_path`` as UTF-8, replacing any existing file content."""
    with open(save_path, mode='w', encoding='utf-8') as handle:
        handle.write(text)

def load_pkl(file_path):
    """Unpickle and return the object stored in the file at ``file_path``.

    WARNING: unpickling can execute arbitrary code, so only call this on
    files that come from a trusted source.
    """
    with open(file_path, mode='rb') as handle:
        return pickle.load(handle)

def save_pkl(content, folder_path, file_name):
    """Pickle ``content`` into the file ``file_name`` inside ``folder_path``.

    The folder must already exist; an existing file of the same name is overwritten.
    """
    destination = os.path.join(folder_path, file_name)
    with open(destination, mode='wb') as handle:
        pickle.dump(content, handle)

In [15]:
# Index the benign and malware datasets (fasttext dataset, per the original note).
# Only file paths are collected here; the pickled tensors themselves are read
# lazily, batch by batch, during training.
# NOTE: despite their names, the two *_pkls_list variables hold directory paths.
benign_pkls_list = 'tensors/i386_train'
malware_pkls_list = 'tensors/i386_malware'

# os.listdir() returns entries in arbitrary order, so sort them: otherwise the
# seeded train/test split can differ between machines and file systems.
benign_pkls_path_list = [os.path.join(benign_pkls_list, file_name) for file_name in sorted(os.listdir(benign_pkls_list))]
malware_pkls_path_list = [os.path.join(malware_pkls_list, file_name) for file_name in sorted(os.listdir(malware_pkls_list))]

# Optional: uncomment to work on a small random subset while debugging.
# benign_pkls_path_list = random.sample(benign_pkls_path_list, 10)
# malware_pkls_path_list = random.sample(malware_pkls_path_list, 10)

# Labels follow the order of the concatenated path list: 0 = benign, 1 = malware.
x86_training_labels = [0] * len(benign_pkls_path_list) + [1] * len(malware_pkls_path_list)
x86_training_dataset = benign_pkls_path_list + malware_pkls_path_list

# for path in benign_pkls_path_list:
#     print(load_pkl(path).shape)

In [16]:
# Hold out 20% of the samples for evaluation; random_state pins the shuffle
# for a given input order.
X_train, X_test, y_train, y_test = train_test_split(
    x86_training_dataset,
    x86_training_labels,
    test_size=0.2,
    random_state=42,
)
print(y_test)
print(len(y_test))

[0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0]
280


In [17]:
from concurrent.futures import ProcessPoolExecutor
import numpy as np
import torch

def data_generator(X, y, batch_size, loader=None):
    """Yield ``(X_batch, y_batch)`` mini-batches, loading samples from disk lazily.

    Args:
        X: sequence of sample identifiers (file paths when the default loader is used).
        y: sequence of numeric labels, aligned index-by-index with ``X``.
        batch_size: maximum number of samples per batch; the last batch may be smaller.
        loader: callable mapping one element of ``X`` to a tensor. Defaults to
            ``load_pkl``. All tensors in one batch must share a shape so they
            can be stacked.

    Yields:
        Tuples ``(X_batch, y_batch)``: the stacked samples and their labels,
        both converted to float tensors.

    Raises:
        ValueError: if ``batch_size`` is not positive or ``X`` and ``y`` differ
            in length. Because this is a generator, the check runs when the
            first batch is requested.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")
    if len(X) != len(y):
        raise ValueError(f"X and y must have the same length, got {len(X)} and {len(y)}")
    if loader is None:
        loader = load_pkl

    # Slicing clamps at the end of the sequence, so the final batch simply
    # comes out shorter when len(y) is not a multiple of batch_size.
    for start in range(0, len(y), batch_size):
        end = start + batch_size
        loaded_X_data = [loader(item) for item in X[start:end]]
        X_batch = torch.stack(loaded_X_data).float()
        y_batch = torch.from_numpy(np.array(y[start:end])).float()

        yield X_batch, y_batch

# for inputs, targets in data_generator(x86_training_dataset, x86_training_labels):
#     print("inputs", inputs, "shape: ", inputs.shape)
#     print("targets", targets, "shape: ", targets.shape)
#     break

In [18]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [19]:
# Define the model
class RNN(nn.Module):
    """LSTM sequence classifier: stacked LSTM -> max-pool over time -> linear head.

    Args:
        embed_dim: size of the feature vector at each time step (LSTM ``input_size``).
        n_hidden: number of hidden units in every LSTM layer.
        n_rnnlayers: number of stacked LSTM layers.
        n_outputs: number of values the linear head produces per sequence.
    """

    def __init__(self, embed_dim, n_hidden, n_rnnlayers, n_outputs):
        super().__init__()
        self.D = embed_dim
        self.M = n_hidden
        self.K = n_outputs
        self.L = n_rnnlayers

        self.rnn = nn.LSTM(
            input_size=self.D,
            hidden_size=self.M,
            num_layers=self.L,
            batch_first=True)

        self.fc = nn.Linear(self.M, self.K)

    def forward(self, X):
        """Score a batch of sequences.

        Args:
            X: batch-first tensor of shape ``(batch, seq_len, embed_dim)``.

        Returns:
            Tensor of shape ``(batch, n_outputs)`` holding the raw outputs of
            the linear head (no activation is applied here).
        """
        # nn.LSTM starts from zero hidden/cell states when none are passed and
        # creates them on the input's device, so the model does not depend on
        # a notebook-level ``device`` variable.
        out, _ = self.rnn(X)

        # Max-pool each hidden feature over the time dimension (dim 1).
        out, _ = torch.max(out, 1)

        # Map the pooled features to the output scores.
        out = self.fc(out)
        return out

In [20]:
# Hyperparameters of the classifier and of the batching.
embed_dim = 24     # features per time step fed to the LSTM
n_hidden = 8       # hidden units per LSTM layer
n_rnnlayers = 2    # stacked LSTM layers
n_outputs = 1      # one output value per sample
batch_size = 10

# Seed PyTorch's RNG before the layers are created so the initial weights are
# the same on every run (same seed value as the train/test split).
torch.manual_seed(42)
model = RNN(embed_dim, n_hidden, n_rnnlayers, n_outputs)
model.to(device)

RNN(
  (rnn): LSTM(24, 8, num_layers=2, batch_first=True)
  (fc): Linear(in_features=8, out_features=1, bias=True)
)

In [21]:
# Loss and optimizer
# Binary cross-entropy computed directly on the model's raw outputs.
criterion = nn.BCEWithLogitsLoss()
# Adam with its default hyperparameters over every model parameter.
optimizer = torch.optim.Adam(params=model.parameters())

In [22]:
train_gen = lambda: data_generator(X_train, y_train, batch_size)
test_gen = lambda: data_generator(X_test, y_test, batch_size)

In [23]:
def _run_epoch(model, criterion, batches, optimizer=None):
    """Make one pass over ``batches``: train if ``optimizer`` is given, else evaluate.

    Returns:
        ``(mean_loss, accuracy)``: the mean of the per-batch losses and the
        fraction of samples whose prediction matched its target.

    Raises:
        ValueError: if ``batches`` yields no samples.
    """
    training = optimizer is not None
    model.train(training)  # train mode when optimizing, eval mode otherwise

    batch_losses = []
    n_correct = 0.
    n_total = 0.
    # Gradients are only needed while training; disabling them for evaluation
    # avoids building an autograd graph for every test batch.
    with torch.set_grad_enabled(training):
        for inputs, targets in batches:
            targets = targets.view(-1, 1).float()
            # move data to the compute device
            inputs, targets = inputs.to(device), targets.to(device)

            if training:
                optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, targets)

            if training:
                loss.backward()
                optimizer.step()

            batch_losses.append(loss.item())

            # Threshold the raw outputs at 0 (assumes they are logits, for
            # which 0 corresponds to a probability of 0.5).
            predictions = (outputs > 0)
            n_correct += (predictions == targets).sum().item()
            n_total += targets.shape[0]

    if n_total == 0:
        raise ValueError("the batch generator yielded no samples")
    return np.mean(batch_losses), n_correct / n_total


def batch_gd(model, criterion, optimizer, epochs):
    """Train ``model`` for ``epochs`` epochs, evaluating and checkpointing after each one.

    Batches come from the module-level ``train_gen()`` / ``test_gen()`` factories
    and tensors are moved to the module-level ``device``. After every epoch the
    model's ``state_dict`` is saved as ``lstm_it<epoch index>.pt`` (0-based index)
    in a folder named after the module-level hyperparameters.

    Args:
        model: the network to optimize.
        criterion: loss called as ``criterion(outputs, targets)``.
        optimizer: optimizer already bound to the model's parameters.
        epochs: number of passes over the training data.

    Returns:
        ``(train_losses, test_losses, train_accs, test_accs, test_arm_losses,
        test_arm_accs)``, each a NumPy array of length ``epochs``. The two
        ``test_arm_*`` arrays are never filled in and stay all zeros; they are
        returned only to keep the return signature stable.

    Note:
        The model is left in eval mode when this function returns.
    """
    train_losses = np.zeros(epochs)
    train_accs = np.zeros(epochs)

    test_losses = np.zeros(epochs)
    test_accs = np.zeros(epochs)

    test_arm_losses = np.zeros(epochs)
    test_arm_accs = np.zeros(epochs)

    # Create the checkpoint folder once, before training, so a bad path fails
    # immediately instead of after the first (long) epoch.
    save_path = f'x86_training_result/{embed_dim}_{n_hidden}_{n_rnnlayers}_{n_outputs}'
    os.makedirs(save_path, exist_ok=True)

    for it in range(epochs):
        t0 = datetime.now()

        train_loss, train_acc = _run_epoch(
            model, criterion,
            tqdm(train_gen(), desc=f"Epoch {it+1}/{epochs} - Training"),
            optimizer)
        test_loss, test_acc = _run_epoch(
            model, criterion,
            tqdm(test_gen(), desc=f"Epoch {it+1}/{epochs} - Testing"))

        # record this epoch's metrics
        train_losses[it] = train_loss
        test_losses[it] = test_loss
        train_accs[it] = train_acc
        test_accs[it] = test_acc

        dt = datetime.now() - t0
        # Report the epoch 1-based so it matches the progress-bar labels.
        print(f'Epoch {it+1}/{epochs},  Duration: {dt}')
        print(f'Train Loss: {train_loss:.4f},     Train Acc: {train_acc:.4f}')
        print(f'Test Loss: {test_loss:.4f},       Test Acc: {test_acc:.4f}')

        filename = 'lstm_it{}.pt'.format(it)
        torch.save(model.state_dict(), os.path.join(save_path, filename))
    return train_losses, test_losses, train_accs, test_accs, test_arm_losses, test_arm_accs


In [24]:
# Train for 20 epochs and keep the per-epoch metric arrays for later inspection.
(
    train_losses,
    test_losses,
    train_accs,
    test_accs,
    test_arm_losses,
    test_arm_accs,
) = batch_gd(model, criterion, optimizer, epochs=20)

Epoch 1/20 - Training: 112it [11:54,  6.38s/it]
Epoch 1/20 - Testing: 28it [02:06,  4.52s/it]


Epoch 0/20,  Duration: 0:14:00.815858
Train Loss: 0.6896,     Train Acc: 0.5000
Test Loss: 0.6790,       Test Acc: 0.5000


Epoch 2/20 - Training: 112it [12:13,  6.55s/it]
Epoch 2/20 - Testing: 28it [02:06,  4.51s/it]


Epoch 1/20,  Duration: 0:14:19.536371
Train Loss: 0.6397,     Train Acc: 0.7179
Test Loss: 0.5818,       Test Acc: 0.7821


Epoch 3/20 - Training: 112it [12:12,  6.54s/it]
Epoch 3/20 - Testing: 28it [02:06,  4.51s/it]


Epoch 2/20,  Duration: 0:14:19.086642
Train Loss: 0.5423,     Train Acc: 0.7955
Test Loss: 0.5114,       Test Acc: 0.7821


Epoch 4/20 - Training: 112it [12:09,  6.52s/it]
Epoch 4/20 - Testing: 28it [02:06,  4.50s/it]


Epoch 3/20,  Duration: 0:14:15.850179
Train Loss: 0.5198,     Train Acc: 0.7330
Test Loss: 0.4597,       Test Acc: 0.7821


Epoch 5/20 - Training: 112it [12:18,  6.60s/it]
Epoch 5/20 - Testing: 28it [02:08,  4.60s/it]


Epoch 4/20,  Duration: 0:14:27.684728
Train Loss: 0.4497,     Train Acc: 0.7661
Test Loss: 0.4112,       Test Acc: 0.8929


Epoch 6/20 - Training: 112it [12:11,  6.53s/it]
Epoch 6/20 - Testing: 28it [02:06,  4.51s/it]


Epoch 5/20,  Duration: 0:14:18.042227
Train Loss: 0.3706,     Train Acc: 0.9187
Test Loss: 0.3477,       Test Acc: 0.9393


Epoch 7/20 - Training: 112it [12:28,  6.68s/it]
Epoch 7/20 - Testing: 28it [02:06,  4.53s/it]


Epoch 6/20,  Duration: 0:14:34.976640
Train Loss: 0.3967,     Train Acc: 0.8277
Test Loss: 0.3384,       Test Acc: 0.9393


Epoch 8/20 - Training: 112it [12:13,  6.54s/it]
Epoch 8/20 - Testing: 28it [02:08,  4.59s/it]


Epoch 7/20,  Duration: 0:14:21.609811
Train Loss: 0.3108,     Train Acc: 0.9366
Test Loss: 0.3088,       Test Acc: 0.9357


Epoch 9/20 - Training: 112it [12:14,  6.56s/it]
Epoch 9/20 - Testing: 28it [02:04,  4.44s/it]


Epoch 8/20,  Duration: 0:14:18.940083
Train Loss: 0.2861,     Train Acc: 0.9384
Test Loss: 0.2839,       Test Acc: 0.9321


Epoch 10/20 - Training: 26it [02:57,  6.82s/it]


KeyboardInterrupt: 

: 

In [None]:
# n_correct = 0.
# n_total = 0.
# test_loss = []
# for inputs, targets in test_gen():
#     targets = targets.view(-1, 1).float()
#     inputs, targets = inputs.to(device), targets.to(device)
#     outputs = model(inputs)
#     loss = criterion(outputs, targets)
#     test_loss.append(loss.item())
    
#     predictions = (outputs > 0) 
#     n_correct += (predictions == targets).sum().item()
#     # print(n_correct)
#     n_total += targets.shape[0]  
#     print("=====targets=====")
#     print(targets)
#     print("=====outputs=====")
#     print(outputs)
# test_loss = np.mean(test_loss)
# test_acc = n_correct / n_total
# print(test_acc)

In [None]:
#save model



In [None]:
# X_test = [load_pkl(file_path) for file_path in X_test]

# print(X_test[0].shape)

In [None]:
# import sklearn.metrics as metrics
# import matplotlib.pyplot as plt
# # Define a batch generator
# def batched_data_loader(file_paths, labels, batch_size=10):
#     for i in range(0, len(file_paths), batch_size):
#         batch_files = file_paths[i:i+batch_size]
#         batch_data = [load_pkl(file) for file in batch_files]
#         yield torch.stack(batch_data), torch.tensor(labels[i:i+batch_size])

# # Parameters
# batch_size = 20
# all_probs = []

# # Process in batches
# for inputs, targets in batched_data_loader(X_test, y_test, batch_size):
#     inputs, targets = inputs.to(device), targets.to(device)
#     batch_output = model(inputs)
#     batch_probs = torch.sigmoid(batch_output)
#     all_probs.append(batch_probs.detach().cpu().numpy())

# # Concatenate all outputs
# mips_probs = np.concatenate(all_probs, axis=0)
# # print(f"mips_probs {mips_probs}")
# # Now, you can calculate fpr, tpr, etc.
# mips_preds = mips_probs[:,0]
# fpr_mips, tpr_mips, threshold_mips = metrics.roc_curve(y_test, mips_preds)
# roc_auc_mips = metrics.auc(fpr_mips, tpr_mips)

# # method I: plt
# import matplotlib.pyplot as plt
# plt.title('Receiver Operating Characteristic for mips Data')
# plt.plot(fpr_mips, tpr_mips, 'b', label = 'AUC = %0.2f' % roc_auc_mips)
# plt.legend(loc = 'lower right')
# plt.plot([0, 1], [0, 1],'r--')
# plt.xlim([0, 1])
# plt.ylim([0, 1])
# plt.ylabel('True Positive Rate')
# plt.xlabel('False Positive Rate')
# plt.show()

In [None]:
# import sklearn.metrics as metrics
# import matplotlib.pyplot as plt

# def get_model_predictions(model, X, device, batch_size=50):
#     model.eval()  # Set model to evaluation mode
#     all_preds = []

#     # Loop over the dataset in mini-batches
#     for i in range(0, len(X), batch_size):
#         batch_X = X[i:i+batch_size]
#         X_tensor = torch.stack(batch_X).to(device)

#         with torch.no_grad():
#             outputs = model(X_tensor)
#             probs = torch.sigmoid(outputs)
#             preds = probs[:, 0].cpu().numpy()

#         all_preds.extend(preds)

#     return np.array(all_preds)

# # Fetch model predictions using the function
# preds = get_model_predictions(model, X_test, device)
# print(len(preds))
# # Compute ROC curve and AUC
# fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
# roc_auc = metrics.auc(fpr, tpr)
# print(roc_auc)
# # Plotting the ROC curve
# plt.title('Receiver Operating Characteristic')
# plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
# plt.legend(loc = 'lower right')
# plt.plot([0, 1], [0, 1],'r--')
# plt.xlim([0, 1])
# plt.ylim([0, 1])
# plt.ylabel('True Positive Rate')
# plt.xlabel('False Positive Rate')
# plt.show()


In [None]:

# import sklearn.metrics as metrics

# X_test_tensor = torch.stack(X_test).to(device)
# outputs = model(X_test_tensor)
# probs = torch.sigmoid(outputs)
# preds = probs[:,0].detach().cpu().numpy()


# # Compute ROC curve and AUC
# fpr, tpr, threshold = metrics.roc_curve(y_test, preds)
# roc_auc = metrics.auc(fpr, tpr)

# # method I: plt
# import matplotlib.pyplot as plt
# plt.title('Receiver Operating Characteristic')
# plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
# plt.legend(loc = 'lower right')
# plt.plot([0, 1], [0, 1],'r--')
# plt.xlim([0, 1])
# plt.ylim([0, 1])
# plt.ylabel('True Positive Rate')
# plt.xlabel('False Positive Rate')
# plt.show()