<a href="https://colab.research.google.com/github/mathresearch/mlsec/blob/master/htmlclf_mxnet_aws.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!aws s3 sync s3://mlsec/htmldata.tar.gz .

In [None]:
# Create the target directory (no error if it already exists), then unpack the
# gzip-compressed archive into it.
!mkdir -p data
!tar -xzf htmldata.tar.gz -C data

In [None]:
# mmh3 supplies the hash function that CustomDataset uses to bucket tokens.
!pip install mmh3

In [None]:
import os
import re
import mmh3
import time
import logging

import numpy as np
import mxnet as mx
from mxnet import gluon, autograd

# Show INFO-level records so the logging.info calls made during training are visible.
logging.basicConfig(level=logging.INFO)

In [None]:
# Custom dataset to load the data
class CustomDataset(gluon.data.Dataset):
    """Two-class file dataset whose samples are hashed token-count vectors.

    Every regular file in ``path_to_b_files`` gets label 0 and every regular
    file in ``path_to_m_files`` gets label 1 (the callers in this notebook pass
    benign and malicious directories respectively). Files are read lazily in
    ``__getitem__`` and turned into a fixed-length feature vector.

    Parameters
    ----------
    path_to_b_files : str
        Directory holding the label-0 files.
    path_to_m_files : str
        Directory holding the label-1 files.
    features_size : int
        Length of the feature vector (number of hash buckets).
    """

    def __init__(self, path_to_b_files, path_to_m_files, features_size=1024):
        super().__init__()
        self.features_size = features_size
        b_files = self._list_files(path_to_b_files)
        m_files = self._list_files(path_to_m_files)
        # Label-0 files come first, so labels[i] lines up with list_files[i].
        self.list_files = b_files + m_files
        self.length = len(self.list_files)
        self.labels = mx.nd.concat(mx.nd.zeros(shape=(len(b_files),)),
                                   mx.nd.ones(shape=(len(m_files),)),
                                   dim=0)

    @staticmethod
    def _list_files(directory):
        """Return the regular files directly inside ``directory``, sorted by name.

        ``os.listdir`` returns entries in arbitrary order, so sorting keeps the
        index -> file mapping stable across runs and machines. Non-file entries
        (e.g. sub-directories) are skipped because ``__getitem__`` could not
        ``open`` them.
        """
        candidates = (os.path.join(directory, name)
                      for name in sorted(os.listdir(directory)))
        return [path for path in candidates if os.path.isfile(path)]

    def _extract_features(self, string, hash_dim, split_regex=rb"\s+"):
        """Hash the tokens of ``string`` into ``hash_dim`` buckets and count them.

        ``string`` is split on ``split_regex``; each token is hashed with
        ``mmh3.hash`` and reduced modulo ``hash_dim``. The result is a float64
        numpy array of length ``hash_dim`` where entry ``k`` is the number of
        tokens that landed in bucket ``k``.

        Note: ``re.split`` yields empty tokens when the input starts or ends
        with a separator; those are hashed and counted like any other token.
        """
        tokens = re.split(pattern=split_regex, string=string)
        hash_buckets = [mmh3.hash(token) % hash_dim for token in tokens]
        # Every bucket index is < hash_dim, so minlength fixes the output length
        # at exactly hash_dim; cast back to the float dtype np.zeros would give.
        return np.bincount(hash_buckets, minlength=hash_dim).astype(np.float64)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the file at position ``idx``."""
        # Read raw bytes: the default split pattern is a bytes regex.
        with open(self.list_files[idx], 'rb') as f:
            content = f.read()
        data = self._extract_features(content, hash_dim=self.features_size, split_regex=rb"\s+")
        return mx.nd.array(data), self.labels[idx]

    def __len__(self):
        """Return the total number of files across both classes."""
        return self.length

In [None]:
# Constants
BATCH_SIZE = 128    # mini-batch size
EPOCHS = 10         # number of passes over the training data
LOG_INTERVAL = 100  # log training loss/accuracy every this many batches
VAL_INTERVAL = 1    # evaluate on the validation set every this many epochs

# Fix MXNet's random seed; intended to make runs repeatable.
mx.random.seed(999)

In [None]:
# Function to get train and val dataloader
def get_dataloader(data_root='data/html', features_size=1024,
                   batch_size=None, num_workers=8):
    """Build the training and validation data loaders.

    The data is expected under ``<data_root>/benign_files/<split>/`` and
    ``<data_root>/malicious_files/<split>/`` for the splits ``training`` and
    ``validation``. Calling the function with no arguments uses the same
    paths and settings that were previously hard-coded.

    Parameters
    ----------
    data_root : str
        Directory containing the ``benign_files`` and ``malicious_files`` trees.
    features_size : int
        Length of the hashed feature vector produced by ``CustomDataset``.
    batch_size : int or None
        Mini-batch size; ``None`` means use the module-level ``BATCH_SIZE``
        (looked up at call time).
    num_workers : int
        Number of worker processes each ``DataLoader`` uses.

    Returns
    -------
    tuple
        ``(train_dataloader, val_dataloader)``; only the training loader
        shuffles its samples.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    def _build_loader(split, shuffle):
        # The trailing '' keeps the trailing path separator of the original paths.
        dataset = CustomDataset(
            os.path.join(data_root, 'benign_files', split, ''),
            os.path.join(data_root, 'malicious_files', split, ''),
            features_size)
        return mx.gluon.data.DataLoader(dataset, batch_size=batch_size,
                                        num_workers=num_workers, shuffle=shuffle)

    train_dataloader = _build_loader('training', shuffle=True)
    val_dataloader = _build_loader('validation', shuffle=False)
    return train_dataloader, val_dataloader

In [None]:
# Function to define neural network
def custom_model():
    """Return an uninitialized fully connected binary classifier.

    The network stacks Dense(1024, relu) -> Dense(512, relu) -> Dense(1, sigmoid)
    inside a ``HybridSequential`` container.
    """
    # (units, activation) for each Dense layer, in forward order.
    layer_specs = ((1024, 'relu'), (512, 'relu'), (1, 'sigmoid'))

    model = gluon.nn.HybridSequential()
    with model.name_scope():
        for units, activation in layer_specs:
            model.add(gluon.nn.Dense(units, activation=activation))
    return model

In [None]:
# Function to compute binary classification accuracy (predictions thresholded at 0.5)
def facc(label, pred):
    """Return the fraction of predictions that match the labels.

    Both inputs are flattened first; a prediction counts as class 1 only when
    it is strictly greater than 0.5.
    """
    predicted_positive = pred.ravel() > 0.5
    matches = predicted_positive == label.ravel()
    return matches.mean()

In [None]:
# Function to evaluate accuracy for a model
def evaluate(model, val_data, ctx):
    """Run ``model`` over ``val_data`` and return the accuracy metric.

    Each batch is moved to ``ctx`` before the forward pass. The return value
    is whatever ``CustomMetric.get()`` yields for the ``facc`` metric.
    """
    accuracy = mx.metric.CustomMetric(facc)
    for batch_data, batch_label in val_data:
        batch_data = batch_data.as_in_context(ctx)
        batch_label = batch_label.as_in_context(ctx)
        accuracy.update(batch_label, model(batch_data))
    return accuracy.get()

In [None]:
# Pick the compute context: CPU when MXNet reports no GPU, otherwise the first GPU.
if mx.context.num_gpus() <= 0:
    logging.info("Running the script on CPU")
    ctx = mx.cpu()
else:
    logging.info("Running the script on single GPU")
    ctx = mx.gpu(0)

In [None]:
# Create a model
net = custom_model()
# Use single-precision parameters.
net.cast('float32')
# Hybridize the network; static_alloc/static_shape are passed through to Gluon
# (see HybridBlock.hybridize for their exact effect).
net.hybridize(static_alloc=True, static_shape=True)

In [None]:
# Initialize parameters
# Gaussian Xavier initialization scaled by fan-in, applied on the selected context.
initializer = mx.init.Xavier(magnitude=2, factor_type="in", rnd_type='gaussian')
net.initialize(initializer, ctx=ctx)

# The network already ends in a sigmoid, so the loss is told its inputs are probabilities.
loss_fn = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)

# SGD with momentum drives the parameter updates.
optimizer_params = dict(learning_rate=0.02, momentum=0.9)
opt = mx.optimizer.create('sgd', **optimizer_params)
trainer = gluon.Trainer(net.collect_params(), opt)

In [None]:
# Build the training and validation loaders once; they are passed to train() below.
train_dataloader, val_dataloader = get_dataloader()

In [None]:
# Function to train the model
def train(net, train_dataloader, val_dataloader):
    """Train ``net`` for ``EPOCHS`` epochs, logging progress and validation accuracy.

    Uses the module-level ``ctx``, ``loss_fn`` and ``trainer`` objects.

    Parameters
    ----------
    net : gluon block
        The model to optimize; called as ``net(data)``.
    train_dataloader : iterable
        Yields ``(data, label)`` training batches.
    val_dataloader : iterable
        Yields ``(data, label)`` validation batches, passed to ``evaluate``.

    Logging
    -------
    * every ``LOG_INTERVAL`` batches: loss of the current batch and running
      training accuracy for the epoch;
    * every epoch: throughput in samples/sec and wall-clock time;
    * every ``VAL_INTERVAL`` epochs: validation accuracy;
    * at the end: total wall-clock time.
    """
    train_metric = mx.metric.CustomMetric(facc)
    start = time.time()
    for epoch in range(EPOCHS):
        tic = time.time()
        # Reset the metric at the beginning of each epoch.
        train_metric.reset()
        # Count real samples so the throughput figure is right even when the
        # last batch is smaller than BATCH_SIZE or the loader is empty.
        samples_seen = 0
        for i, (data, label) in enumerate(train_dataloader):
            # Copy data to ctx if necessary.
            data = data.as_in_context(ctx)
            label = label.as_in_context(ctx)
            batch_size = data.shape[0]

            # Record the forward pass so it can be differentiated by backward().
            with autograd.record():
                output = net(data)
                L = loss_fn(output, label)
            L.backward()

            # Normalize the gradient by the actual number of samples in this
            # batch; the final batch of an epoch may be smaller than BATCH_SIZE.
            trainer.step(batch_size)
            samples_seen += batch_size
            train_metric.update(label, output)

            if i % LOG_INTERVAL == 0:
                # asscalar() blocks until the value is ready, so only compute
                # the scalar loss when it is actually going to be logged.
                curr_loss = mx.ndarray.mean(L).asscalar()
                _, acc = train_metric.get()
                logging.info('[Epoch %d Batch %d] Training_Loss: %f Training_Acc: %f',
                             epoch, i, curr_loss, acc)
        elapsed = time.time() - tic
        speed = samples_seen / elapsed if elapsed > 0 else float('inf')
        logging.info('Epoch[%d]\tSpeed=%.2f samples/sec \tTime cost=%f secs',
                     epoch+1, speed, elapsed)

        # Evaluate the model
        if (epoch + 1) % VAL_INTERVAL == 0:
            _, val_acc = evaluate(net, val_dataloader, ctx)
            logging.info('Validation Accuracy: %f', val_acc)
    logging.info('Total:%f', time.time() - start)

In [None]:
# Run the full training loop on the model and loaders created above.
train(net, train_dataloader, val_dataloader)