In [None]:
import os
import re
import time

import numpy as np
import mxnet as mx
from mxnet import gluon, autograd

from extractor import extract_features
from model_mxnet import custom_model

In [None]:
# Custom dataset that labels files by the directory they come from
class CustomDataset(gluon.data.Dataset):
    """File-backed binary classification dataset.

    Every regular file found directly under ``path_to_b_files`` gets label 0
    and every regular file under ``path_to_m_files`` gets label 1. Features
    are computed lazily in ``__getitem__`` by ``extract_features``.

    Parameters
    ----------
    path_to_b_files : str
        Directory whose files are labelled 0.
    path_to_m_files : str
        Directory whose files are labelled 1.
    features_size : int, default 1024
        Forwarded to ``extract_features`` as ``hash_dim``.

    Attributes
    ----------
    list_files : list of str
        Paths of the label-0 files followed by the label-1 files.
    labels : mx.nd.NDArray
        1-D array of zeros then ones, aligned with ``list_files``.
    length : int
        Number of files in the dataset.
    """

    def __init__(self, path_to_b_files, path_to_m_files, features_size=1024):
        self.features_size = features_size
        b_files = self._list_files(path_to_b_files)
        m_files = self._list_files(path_to_m_files)
        self.list_files = b_files + m_files
        self.length = len(self.list_files)
        # Labels are laid out in the same order as list_files: zeros first, then ones.
        self.labels = mx.nd.concat(mx.nd.zeros(shape=(len(b_files),)),
                                   mx.nd.ones(shape=(len(m_files),)),
                                   dim=0)

    @staticmethod
    def _list_files(directory):
        """Return the regular files directly inside ``directory``, sorted by name.

        ``os.listdir`` returns entries in arbitrary order, so the listing is
        sorted to keep the index -> file mapping stable between runs.
        Sub-directories and other non-file entries are skipped because
        ``__getitem__`` opens every entry as a file.
        """
        candidates = (os.path.join(directory, name)
                      for name in sorted(os.listdir(directory)))
        return [path for path in candidates if os.path.isfile(path)]

    def __getitem__(self, idx):
        """Read file ``idx`` as bytes and return ``(features, label)``.

        The features are whatever ``extract_features`` returns for the raw
        bytes, wrapped in an ``mx.nd.array``; the label is ``self.labels[idx]``.
        """
        with open(self.list_files[idx], 'rb') as f:
            content = f.read()

        data = extract_features(content, hash_dim=self.features_size, split_regex=rb"\s+")
        return mx.nd.array(data), self.labels[idx]

    def __len__(self):
        """Number of files in the dataset."""
        return self.length

In [None]:
# Constants
BATCH_SIZE = 128       # samples per mini-batch
EPOCHS = 2             # number of passes over the training set
LOG_INTERVAL = 100     # print training stats every LOG_INTERVAL batches
VAL_INTERVAL = 1       # run validation every VAL_INTERVAL epochs
FEATURES_SIZE = 1024   # feature dimension passed to the dataset
SEED = 999             # single seed shared by every RNG seeded below

# Fix the seeds for reproducibility.
# mx.random.seed only seeds MXNet's own generators. NumPy's global RNG is
# seeded as well because the Gluon DataLoader's shuffle=True sampler is
# understood to shuffle indices with numpy.random (see the Gluon RandomSampler
# reference), so seeding MXNet alone would leave the batch order unseeded.
mx.random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Build the training and validation dataloaders
def get_dataloader():
    """Create the train/validation ``DataLoader`` pair.

    Both loaders wrap a ``CustomDataset`` built from a benign and a malicious
    directory, use ``BATCH_SIZE`` and 8 workers; only the training loader
    shuffles.

    Returns
    -------
    tuple
        ``(train_dataloader, val_dataloader)``.
    """
    def build_loader(benign_dir, malicious_dir, shuffle):
        # Dataset first, then the loader on top of it.
        dataset = CustomDataset(benign_dir, malicious_dir, FEATURES_SIZE)
        return mx.gluon.data.DataLoader(dataset, batch_size=BATCH_SIZE,
                                        num_workers=8, shuffle=shuffle)

    train_loader = build_loader('data/html/benign_files/training/',
                                'data/html/malicious_files/training/',
                                shuffle=True)
    val_loader = build_loader('data/html/benign_files/validation/',
                              'data/html/malicious_files/validation/',
                              shuffle=False)
    return train_loader, val_loader

In [None]:
# Accuracy of thresholded predictions against binary labels
def facc(label, pred):
    """Return the fraction of predictions that match the labels.

    Both arrays are flattened; a prediction counts as positive when it is
    strictly greater than 0.5, and is correct when that boolean equals the
    corresponding label value.
    """
    predicted_positive = pred.ravel() > 0.5
    matches = predicted_positive == label.ravel()
    return matches.mean()

In [None]:
# Score a model on a dataloader with the facc accuracy metric
def evaluate(model, val_data, ctx):
    """Run ``model`` over ``val_data`` on ``ctx`` and return the metric result.

    Each batch is copied to ``ctx`` if needed, passed through the model, and
    accumulated into an ``mx.metric.CustomMetric`` built on ``facc``.

    Returns
    -------
    tuple
        The ``(name, value)`` pair from ``metric.get()``.
    """
    accuracy = mx.metric.CustomMetric(facc)
    for features, targets in val_data:
        predictions = model(features.as_in_context(ctx))
        accuracy.update(targets.as_in_context(ctx), predictions)
    return accuracy.get()

In [None]:
# Pick the compute context: the first GPU when one is visible, otherwise the CPU.
has_gpu = mx.context.num_gpus() > 0
print("Running the script on single GPU" if has_gpu else "Running the script on CPU")
ctx = mx.gpu(0) if has_gpu else mx.cpu()

In [None]:
# Create a model
# custom_model comes from the project-local model_mxnet module; its
# architecture is not visible in this notebook.
net = custom_model()
# Cast the network to float32.
net.cast('float32')
# Hybridize with static memory allocation and static shapes enabled.
# NOTE(review): static_shape=True presumably expects fixed input shapes, while
# the final batch of an epoch may be smaller than BATCH_SIZE — confirm this is acceptable.
net.hybridize(static_alloc=True, static_shape=True)

In [None]:
# Initialize the network parameters on the selected context using a
# Gaussian Xavier initializer (fan-in scaling, magnitude 2).
initializer = mx.init.Xavier(magnitude=2, factor_type="in", rnd_type='gaussian')
net.initialize(initializer, ctx=ctx)

# SGD with momentum, wrapped in a Gluon Trainer over all network parameters.
optimizer_params = dict(learning_rate=0.02, momentum=0.9)
opt = mx.optimizer.create('sgd', **optimizer_params)
trainer = gluon.Trainer(net.collect_params(), opt)

# Binary cross-entropy loss. from_sigmoid=True tells the loss its input has
# already been passed through a sigmoid.
# NOTE(review): assumes custom_model ends in a sigmoid — confirm in model_mxnet.
loss_fn = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)

In [None]:
# Build the training and validation dataloaders once; later cells reuse them.
train_dataloader, val_dataloader = get_dataloader()

In [None]:
# Function to train the model
def train(net, train_dataloader, val_dataloader):
    """Train ``net`` for ``EPOCHS`` epochs and restore the best checkpoint.

    Uses the notebook-level ``ctx``, ``trainer``, ``loss_fn``, ``EPOCHS``,
    ``LOG_INTERVAL`` and ``VAL_INTERVAL``. Every ``VAL_INTERVAL`` epochs the
    model is scored with ``evaluate``; whenever the validation accuracy
    improves, the parameters are written to ``net.params``. After the last
    epoch the best checkpoint written during *this* call is loaded back into
    ``net``. If no checkpoint was written, the final weights are kept.

    Parameters
    ----------
    net : gluon block
        Model to train; updated in place.
    train_dataloader : iterable
        Yields ``(data, label)`` training batches.
    val_dataloader : iterable
        Yields ``(data, label)`` validation batches.
    """
    best_acc = 0.0
    # Tracks whether this call has written net.params, so we never try to load
    # a missing file or silently load a stale checkpoint from an earlier run.
    checkpoint_saved = False

    train_metric = mx.metric.CustomMetric(facc)
    start = time.time()

    for epoch in range(EPOCHS):
        print('Epoch {}/{}'.format(epoch+1, EPOCHS))
        print('-' * 10)
        tic = time.time()
        # reset metric at beginning of epoch.
        train_metric.reset()
        samples_seen = 0
        for i, (data, label) in enumerate(train_dataloader):
            # Copy data to ctx if necessary
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)

            # Start recording computation graph with record() section.
            # Recorded graphs can then be differentiated with backward.
            with autograd.record():
                output = net(data)
                L = loss_fn(output, label)
            L.backward()

            # Take a gradient step normalized by the actual number of samples
            # in this batch; the last batch of an epoch can be smaller than
            # BATCH_SIZE.
            batch_size = data.shape[0]
            trainer.step(batch_size)
            samples_seen += batch_size
            # update metric at last.
            train_metric.update(label, output)

            if i % LOG_INTERVAL == 0:
                # Only pull the loss back to the host when it is actually printed.
                curr_loss = mx.ndarray.mean(L).asscalar()
                name, acc = train_metric.get()
                print('[Epoch %d Batch %4d] Training loss: %3.4f accuracy: %2.4f' %
                             (epoch +1, i, curr_loss, acc))
        elapsed = time.time() - tic
        # Throughput from the real sample count (also well-defined when the
        # dataloader yields no batches).
        speed = samples_seen / elapsed if elapsed > 0 else 0.0
        print('Epoch %d Speed=%.2f samples/sec Time cost=%f secs'% (epoch+1, speed, elapsed))

        # Evaluate the model
        if (epoch + 1) % VAL_INTERVAL == 0:
            val_name, val_acc = evaluate(net, val_dataloader, ctx)
            # Always checkpoint the first evaluated model, then only improvements.
            if not checkpoint_saved or val_acc > best_acc:
                best_acc = val_acc
                net.save_parameters('net.params')
                checkpoint_saved = True
            print('Validation accuracy: %f' % (val_acc))
            print()
    print()
    time_elapsed = time.time() - start
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best validation accuracy: {:4f}'.format(best_acc))

    # load best model weights (only if this run produced a checkpoint)
    if checkpoint_saved:
        net.load_parameters('net.params', ctx = ctx)
    else:
        print('No checkpoint was saved during this run; keeping the final weights')

In [None]:
# Run training; train() prints progress and returns nothing, and net is updated in place.
train(net, train_dataloader, val_dataloader)