# Neural network HTML classifier with PyTorch, 21.08.2020


In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem, forcing a fresh mount.
MOUNT_POINT = '/content/gdrive'
drive.mount(MOUNT_POINT, force_remount=True)

# Notebook folder on the mounted Drive (relative path).
base_dir = "gdrive/My Drive/Colab Notebooks/"

In [None]:
# Unpack the HTML corpus archive stored on Drive into ./data.
# The archive path is relative -- assumes the working directory is the one
# that contains the `gdrive` mount (TODO confirm: /content on Colab).
!mkdir -p data
!tar -xzf gdrive/My\ Drive/Colab\ Notebooks/Data/htmldata.tar.gz -C data

In [None]:
# mmh3 provides the hash function used for feature hashing in CustomDataset.
!pip install mmh3

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os, re, time,datetime, copy
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

import mmh3
import logging

# Request INFO-level output on the root logger; the training loop reports
# its progress through logging.info(...).
logging.basicConfig(level=logging.INFO)

In [None]:
class Net(nn.Module):
    """Fully connected binary classifier: input_size -> 512 -> 64 -> 1.

    The two hidden layers use ReLU; the single output unit is passed through
    a sigmoid, so the network returns one value in [0, 1] per input row.
    """

    def __init__(self, input_size):
        """Create the three linear layers.

        Args:
            input_size: Number of features in each input vector.
        """
        super().__init__()
        self.fc1 = nn.Linear(in_features=input_size, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=1)

    def forward(self, x):
        """Return the sigmoid output of the network for the batch ``x``."""
        hidden = F.relu(self.fc1(x))
        hidden = F.relu(self.fc2(hidden))
        return torch.sigmoid(self.fc3(hidden))

In [None]:
# Length of the hashed feature vector fed to the network.
FEATURES_SIZE = 1024

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Build the classifier and move its parameters to the selected device.
model = Net(input_size=FEATURES_SIZE)
model = model.to(device)

# Binary cross-entropy on the network's sigmoid output.
criterion = nn.BCELoss()

# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=1e-2)

# Custom Dataset

In [None]:
class CustomDataset(Dataset):
    """Dataset of files from two directories, labelled 0 and 1.

    Files from ``path_to_b_files`` get label 0 and files from
    ``path_to_m_files`` get label 1. Each item is read from disk on access
    and turned into a fixed-length vector of hashed token counts
    (the "hashing trick").
    """

    def __init__(self, path_to_b_files, path_to_m_files, features_size=1024):
        """Index the sample files and build the label tensor.

        Args:
            path_to_b_files: Directory holding the label-0 samples.
            path_to_m_files: Directory holding the label-1 samples.
            features_size: Length of the feature vector produced per file.

        Raises:
            OSError: If either directory cannot be listed.
        """
        self.features_size = features_size
        b_files = self._list_files(path_to_b_files)
        m_files = self._list_files(path_to_m_files)
        self.list_files = b_files + m_files
        self.length = len(self.list_files)
        # Labels are laid out in the same order as list_files:
        # all label-0 files first, then all label-1 files.
        self.labels = torch.cat((torch.zeros(len(b_files)),
                                 torch.ones(len(m_files))), 0)

    @staticmethod
    def _list_files(directory):
        """Return the regular files in ``directory`` as sorted joined paths.

        os.listdir returns entries in arbitrary order, so the names are
        sorted to make the index -> file mapping reproducible. Non-file
        entries (e.g. sub-directories) are skipped because __getitem__
        could not open them.
        """
        candidates = (os.path.join(directory, name)
                      for name in sorted(os.listdir(directory)))
        return [path for path in candidates if os.path.isfile(path)]

    def _extract_features(self, string, hash_dim, split_regex=rb"\s+"):
        """Return a length-``hash_dim`` float array of hashed token counts.

        ``string`` is split with ``split_regex``; every resulting token
        (including the empty tokens re.split yields for leading/trailing
        separators) is hashed with mmh3 and counted in bucket
        ``hash % hash_dim``.
        """
        tokens = re.split(pattern=split_regex, string=string)
        # Python's % with a positive divisor always lands in [0, hash_dim),
        # even when the hash value is negative.
        hash_buckets = np.fromiter(
            (mmh3.hash(w) % hash_dim for w in tokens), dtype=np.int64)
        # bincount performs the per-bucket counting in one vectorised call;
        # minlength pads the result to exactly hash_dim entries.
        counts = np.bincount(hash_buckets, minlength=hash_dim)
        return counts.astype(np.float64)

    def __getitem__(self, idx):
        """Return ``(features, label)`` for the file at position ``idx``."""
        # Read as bytes: the split regex is a bytes pattern and no text
        # decoding is needed for hashing.
        with open(self.list_files[idx], 'rb') as f:
            content = f.read()
        data = self._extract_features(content, hash_dim=self.features_size,
                                      split_regex=rb"\s+")
        return torch.as_tensor(data, dtype=torch.float32), self.labels[idx]

    def __len__(self):
        """Return the number of indexed sample files."""
        return self.length

# Parameters

In [None]:
# Training hyper-parameters.
EPOCHS = 10         # number of passes over the training data
BATCH_SIZE = 128    # samples per DataLoader batch

# Reporting cadence.
VAL_INTERVAL = 1    # run validation every VAL_INTERVAL epochs
LOG_INTERVAL = 100  # log the running training loss every LOG_INTERVAL batches

# Dataloaders

In [None]:
# Directories of the extracted samples, per class (benign / malicious)
# and per subset (training / validation).
path_to_train_b_files = 'data/html/benign_files/training/'
path_to_train_m_files = 'data/html/malicious_files/training/'
path_to_validation_b_files = 'data/html/benign_files/validation/'
path_to_validation_m_files = 'data/html/malicious_files/validation/'

# Training subset: reshuffled on every epoch.
train_dataset = CustomDataset(
    path_to_b_files=path_to_train_b_files,
    path_to_m_files=path_to_train_m_files,
    features_size=FEATURES_SIZE,
)
train_size = len(train_dataset)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=8,
)

# Validation subset: iterated in a fixed order.
val_dataset = CustomDataset(
    path_to_b_files=path_to_validation_b_files,
    path_to_m_files=path_to_validation_m_files,
    features_size=FEATURES_SIZE,
)
val_size = len(val_dataset)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=8,
)

In [None]:
# Function to train the model
def train(net, device, train_dataloader, val_dataloader, num_epochs,
          opt=None, loss_fn=None, log_interval=None, val_interval=None):
    """Train ``net`` and return it loaded with its best validated weights.

    Each epoch runs one pass over ``train_dataloader``; every
    ``val_interval`` epochs the accuracy on ``val_dataloader`` is measured
    (threshold 0.5 on the network output) and the weights are snapshotted
    when that accuracy improves.

    Args:
        net: Model to train; its output is compared against ``label.unsqueeze(1)``.
        device: Device the batches are moved to.
        train_dataloader: Yields ``(data, label)`` training batches.
        val_dataloader: Yields ``(data, label)`` validation batches.
        num_epochs: Number of training epochs.
        opt: Optimizer; defaults to the module-level ``optimizer``.
        loss_fn: Loss callable; defaults to the module-level ``criterion``.
        log_interval: Batches between running-loss log lines; defaults to
            the module-level ``LOG_INTERVAL``.
        val_interval: Epochs between validation passes; defaults to the
            module-level ``VAL_INTERVAL``.

    Returns:
        ``net`` itself. If at least one validation pass improved on an
        accuracy of 0, the best snapshot is loaded; otherwise the weights
        from the final epoch are kept.
    """
    # Fall back to the notebook-level objects when no override is given.
    opt = optimizer if opt is None else opt
    loss_fn = criterion if loss_fn is None else loss_fn
    log_interval = LOG_INTERVAL if log_interval is None else log_interval
    val_interval = VAL_INTERVAL if val_interval is None else val_interval

    start = time.time()
    best_acc = 0.0
    # Stays None until a validation pass improves on best_acc, so the final
    # restore can be skipped instead of failing on an unbound name.
    best_state = None

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch+1, num_epochs))
        print('-' * 10)
        tic = time.time()

        net.train()
        running_loss = 0.0
        running_corrects = 0
        # Count samples actually processed: the last batch may be shorter
        # than the nominal batch size.
        samples_seen = 0
        for i, (data, label) in enumerate(train_dataloader):
            data, label = data.to(device), label.to(device)
            target = label.unsqueeze(1)

            opt.zero_grad()
            output = net(data)
            loss = loss_fn(output, target)
            loss.backward()
            opt.step()

            batch_len = data.size(0)
            samples_seen += batch_len
            # loss is a batch mean; weight it by the batch length so the
            # epoch figure is a per-sample mean.
            running_loss += loss.item() * batch_len

            pred = (output.detach() > .5).to(target.dtype)
            running_corrects += (pred == target).sum().item()

            if i > 0 and i % log_interval == 0:
                logging.info('[Epoch %d Batch %d] Training_Loss: %f',
                             epoch+1, i, running_loss / samples_seen)

        elapsed = time.time() - tic
        speed = samples_seen / elapsed if elapsed > 0 else float('inf')
        # max(..., 1) keeps an empty dataloader from dividing by zero.
        epoch_loss = running_loss / max(samples_seen, 1)
        epoch_acc = running_corrects / max(samples_seen, 1)

        logging.info(' Training: \tSpeed =%.2f samples/sec \tTime cost =%f secs \tLoss %f \tAccuracy %f',
                     speed, elapsed, epoch_loss, epoch_acc)

        if (epoch + 1) % val_interval == 0:
            net.eval()
            val_corrects = 0
            val_seen = 0
            # No gradients are needed for evaluation.
            with torch.no_grad():
                for data, label in val_dataloader:
                    data, label = data.to(device), label.to(device)
                    target = label.unsqueeze(1)

                    output = net(data)
                    pred = (output > .5).to(target.dtype)
                    val_corrects += (pred == target).sum().item()
                    val_seen += data.size(0)
            val_acc = val_corrects / max(val_seen, 1)

            # Deep copy: state_dict() returns references to the live tensors.
            if val_acc > best_acc:
                best_acc = val_acc
                best_state = copy.deepcopy(net.state_dict())

            logging.info(' Validation: \tAccuracy: %f', val_acc)

    time_elapsed = time.time() - start
    logging.info('Best validation accuracy: %4f', best_acc)
    logging.info('Total:%f', time_elapsed)
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # Load best model weights (into the model that was passed in).
    if best_state is not None:
        net.load_state_dict(best_state)
    return net

In [None]:
# Run the full training schedule and keep the model that train() returns.
model = train(
    net=model,
    device=device,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    num_epochs=EPOCHS,
)