In [1]:
!pip install mmh3

Collecting mmh3
  Downloading mmh3-2.5.1.tar.gz (9.8 kB)
Building wheels for collected packages: mmh3
  Building wheel for mmh3 (setup.py) ... [?25ldone
[?25h  Created wheel for mmh3: filename=mmh3-2.5.1-cp36-cp36m-linux_x86_64.whl size=24922 sha256=31499b3421e0d08f65db4b88d5eabc944ce9893c9aebf4f095a0a561fa47018d
  Stored in directory: /home/ec2-user/.cache/pip/wheels/cc/3a/98/fc5e7f8e1840cf6dcf2435260b29661db90a0b22dbd2739df6
Successfully built mmh3
Installing collected packages: mmh3
Successfully installed mmh3-2.5.1
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p36/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os, re, time,datetime, copy
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader

import mmh3
import logging

logging.basicConfig(level=logging.INFO)

In [3]:
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(input_size,512)
        self.fc2 = nn.Linear(512, 64)
        self.fc3 = nn.Linear(64,1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        x = torch.sigmoid(x)
        return x

In [4]:
FEATURES_SIZE = 1024
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = Net(input_size=FEATURES_SIZE).to(device)
# loss
criterion = nn.BCELoss()
#criterion = nn.BCEWithLogitsLoss()

# optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-2)

cpu


In [5]:
class CustomDataset(Dataset):
    def __init__(self, path_to_b_files, path_to_m_files, features_size=1024):
        self.features_size = features_size
        b_files = [os.path.join(path_to_b_files, f) for f in os.listdir(path_to_b_files)]
        m_files = [os.path.join(path_to_m_files, f) for f in os.listdir(path_to_m_files)]
        self.list_files = b_files + m_files
        self.length = len(self.list_files)
        self.labels = torch.cat((torch.zeros(len(b_files)),
                                 torch.ones(len(m_files))),0)
        
    def _extract_features(self, string, hash_dim, split_regex=rb"\s+"):
        tokens = re.split(pattern=split_regex, string=string)
        hash_buckets = [(mmh3.hash(w) % hash_dim) for w in tokens]
        buckets, counts = np.unique(hash_buckets, return_counts=True)
        feature_values = np.zeros(hash_dim)
        for bucket, count in zip(buckets, counts):
            feature_values[bucket] = count
        return feature_values

    def __getitem__(self, idx):
        with open(self.list_files[idx], 'rb') as f:
            content = f.read()
        data = self._extract_features(content, hash_dim=self.features_size, split_regex=rb"\s+")
        return torch.FloatTensor(data), self.labels[idx]

    def __len__(self):
        return self.length

In [6]:
BATCH_SIZE = 128
EPOCHS = 10
LOG_INTERVAL = 100
VAL_INTERVAL = 1

In [7]:
path_to_train_b_files = 'data/html/benign_files/training/'
path_to_train_m_files = 'data/html/malicious_files/training/'
path_to_validation_b_files = 'data/html/benign_files/validation/'
path_to_validation_m_files = 'data/html/malicious_files/validation/'

train_dataset = CustomDataset(path_to_train_b_files,
                                  path_to_train_m_files,
                                  FEATURES_SIZE)
train_size = len(train_dataset)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                                shuffle=True, num_workers=8)

val_dataset = CustomDataset(path_to_validation_b_files,
                                path_to_validation_m_files,
                                FEATURES_SIZE)

val_size = len(val_dataset)

val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                                shuffle=False, num_workers=8)

In [8]:
# Function to train the model
def train(net, device, train_dataloader, val_dataloader):
#def train_model(model, train_dataloader, val_dataloader, criterion, optimizer,  num_epochs):    
    start = time.time() 
    best_acc  = 0.0
    for epoch in range(EPOCHS):
        print('Epoch {}/{}'.format(epoch+1, EPOCHS))
        print('-' * 10)
        tic = time.time()
        
        net.train() 
        running_loss = 0.0
        running_corrects = 0
        for i, (data, label) in enumerate(train_dataloader):
            data, label = data.to(device), label.to(device)
            optimizer.zero_grad()
            output = net(data)
            loss = criterion(output, label.unsqueeze(1))
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * data.size(0)
            # loss is a tensor, loss.item() the actual value
            # data.size(0): BATCH_SIZE
            
            pred = output.detach()
            running_corrects += ((pred > .5) == label.unsqueeze(1)).sum()



            if i % LOG_INTERVAL == 0:
              logging.info('[Epoch %d Batch %d] Training_Loss: %f' %
                            (epoch+1, i, running_loss/BATCH_SIZE))
        elapsed = time.time() - tic
        speed = i * BATCH_SIZE / elapsed
        epoch_loss = running_loss / len(train_dataset)
        epoch_acc = running_corrects.double() / train_size

        logging.info(' Training: \tSpeed =%.2f samples/sec \tTime cost =%f secs \tLoss %f \tAccuracy %f',
                     speed, elapsed, epoch_loss, epoch_acc)      
        

        
        if (epoch +1) % VAL_INTERVAL == 0:
          net.eval()
          running_corrects = 0
          for data, label in val_dataloader:
            data, label = data.to(device), label.to(device)
            
            output = net(data)
            pred = output.detach()
            running_corrects += ((pred > .5) == label.unsqueeze(1)).sum()
          val_acc = running_corrects.double() / val_size
          # deep copy the model
          if val_acc > best_acc:
            best_acc = val_acc
            best_model = copy.deepcopy(model.state_dict())
            
            
          logging.info(' Validation: \tAccuracy: %f' % val_acc)



    logging.info('Best validation accuracy: {:4f}' % best_acc)        
    logging.info('Total:%f' % (time.time()-start))
    time_elapsed = time.time() - start
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    #Load best model weights
    model.load_state_dict(best_model)
    return model

In [9]:
train(model, device, train_dataloader, val_dataloader)

Epoch 1/10
----------


INFO:root:[Epoch 1 Batch 0] Training_Loss: 0.840272
INFO:root:[Epoch 1 Batch 100] Training_Loss: 50.539741
INFO:root:[Epoch 1 Batch 200] Training_Loss: 76.215772
INFO:root:[Epoch 1 Batch 300] Training_Loss: 96.386491
INFO:root:[Epoch 1 Batch 400] Training_Loss: 113.793617
INFO:root:[Epoch 1 Batch 500] Training_Loss: 132.389906
INFO:root:[Epoch 1 Batch 600] Training_Loss: 150.384531
INFO:root:[Epoch 1 Batch 700] Training_Loss: 167.307657
INFO:root: Training: 	Speed =956.67 samples/sec 	Time cost =94.059897 secs 	Loss 0.238474 	Accuracy 0.914233
INFO:root: Validation: 	Accuracy: 0.893000


Epoch 2/10
----------


INFO:root:[Epoch 2 Batch 0] Training_Loss: 0.086947
INFO:root:[Epoch 2 Batch 100] Training_Loss: 15.895203
INFO:root:[Epoch 2 Batch 200] Training_Loss: 29.035101
INFO:root:[Epoch 2 Batch 300] Training_Loss: 43.056632
INFO:root:[Epoch 2 Batch 400] Training_Loss: 57.368421
INFO:root:[Epoch 2 Batch 500] Training_Loss: 71.373483
INFO:root:[Epoch 2 Batch 600] Training_Loss: 82.869181
INFO:root:[Epoch 2 Batch 700] Training_Loss: 94.370836
INFO:root: Training: 	Speed =1266.71 samples/sec 	Time cost =71.037685 secs 	Loss 0.134460 	Accuracy 0.954200
INFO:root: Validation: 	Accuracy: 0.906200


Epoch 3/10
----------


INFO:root:[Epoch 3 Batch 0] Training_Loss: 0.092000
INFO:root:[Epoch 3 Batch 100] Training_Loss: 9.917153
INFO:root:[Epoch 3 Batch 200] Training_Loss: 19.426199
INFO:root:[Epoch 3 Batch 300] Training_Loss: 30.517117
INFO:root:[Epoch 3 Batch 400] Training_Loss: 41.522685
INFO:root:[Epoch 3 Batch 500] Training_Loss: 51.904216
INFO:root:[Epoch 3 Batch 600] Training_Loss: 62.038521
INFO:root:[Epoch 3 Batch 700] Training_Loss: 73.886837
INFO:root: Training: 	Speed =1260.40 samples/sec 	Time cost =71.393120 secs 	Loss 0.105373 	Accuracy 0.963644
INFO:root: Validation: 	Accuracy: 0.898700


Epoch 4/10
----------


INFO:root:[Epoch 4 Batch 0] Training_Loss: 0.081384
INFO:root:[Epoch 4 Batch 100] Training_Loss: 10.243231
INFO:root:[Epoch 4 Batch 200] Training_Loss: 18.844510
INFO:root:[Epoch 4 Batch 300] Training_Loss: 27.364390
INFO:root:[Epoch 4 Batch 400] Training_Loss: 36.689920
INFO:root:[Epoch 4 Batch 500] Training_Loss: 45.354515
INFO:root:[Epoch 4 Batch 600] Training_Loss: 53.339349
INFO:root:[Epoch 4 Batch 700] Training_Loss: 64.337882
INFO:root: Training: 	Speed =868.47 samples/sec 	Time cost =103.611714 secs 	Loss 0.091664 	Accuracy 0.969633
INFO:root: Validation: 	Accuracy: 0.912000


Epoch 5/10
----------


INFO:root:[Epoch 5 Batch 0] Training_Loss: 0.070947
INFO:root:[Epoch 5 Batch 100] Training_Loss: 9.385656
INFO:root:[Epoch 5 Batch 200] Training_Loss: 21.568529
INFO:root:[Epoch 5 Batch 300] Training_Loss: 30.705186
INFO:root:[Epoch 5 Batch 400] Training_Loss: 38.708645
INFO:root:[Epoch 5 Batch 500] Training_Loss: 49.348228
INFO:root:[Epoch 5 Batch 600] Training_Loss: 60.031523
INFO:root:[Epoch 5 Batch 700] Training_Loss: 67.417498
INFO:root: Training: 	Speed =446.13 samples/sec 	Time cost =201.697758 secs 	Loss 0.096084 	Accuracy 0.970911
INFO:root: Validation: 	Accuracy: 0.919700


Epoch 6/10
----------


INFO:root:[Epoch 6 Batch 0] Training_Loss: 0.050926
INFO:root:[Epoch 6 Batch 100] Training_Loss: 6.930441
INFO:root:[Epoch 6 Batch 200] Training_Loss: 15.691050
INFO:root:[Epoch 6 Batch 300] Training_Loss: 22.602173
INFO:root:[Epoch 6 Batch 400] Training_Loss: 28.033179
INFO:root:[Epoch 6 Batch 500] Training_Loss: 35.890994
INFO:root:[Epoch 6 Batch 600] Training_Loss: 44.094500
INFO:root:[Epoch 6 Batch 700] Training_Loss: 51.842434
INFO:root: Training: 	Speed =353.36 samples/sec 	Time cost =254.651176 secs 	Loss 0.073945 	Accuracy 0.977444
INFO:root: Validation: 	Accuracy: 0.920900


Epoch 7/10
----------


INFO:root:[Epoch 7 Batch 0] Training_Loss: 0.070460
INFO:root:[Epoch 7 Batch 100] Training_Loss: 7.332041
INFO:root:[Epoch 7 Batch 200] Training_Loss: 14.811651
INFO:root:[Epoch 7 Batch 300] Training_Loss: 24.672302
INFO:root:[Epoch 7 Batch 400] Training_Loss: 32.372216
INFO:root:[Epoch 7 Batch 500] Training_Loss: 40.941403
INFO:root:[Epoch 7 Batch 600] Training_Loss: 54.541782
INFO:root:[Epoch 7 Batch 700] Training_Loss: 67.087204
INFO:root: Training: 	Speed =345.33 samples/sec 	Time cost =260.570775 secs 	Loss 0.095787 	Accuracy 0.972311
INFO:root: Validation: 	Accuracy: 0.920600


Epoch 8/10
----------


INFO:root:[Epoch 8 Batch 0] Training_Loss: 0.063554
INFO:root:[Epoch 8 Batch 100] Training_Loss: 8.894175
INFO:root:[Epoch 8 Batch 200] Training_Loss: 17.684133
INFO:root:[Epoch 8 Batch 300] Training_Loss: 25.330633
INFO:root:[Epoch 8 Batch 400] Training_Loss: 31.806198
INFO:root:[Epoch 8 Batch 500] Training_Loss: 40.604515
INFO:root:[Epoch 8 Batch 600] Training_Loss: 47.880516
INFO:root:[Epoch 8 Batch 700] Training_Loss: 55.614714
INFO:root: Training: 	Speed =349.29 samples/sec 	Time cost =257.622295 secs 	Loss 0.079255 	Accuracy 0.978333
INFO:root: Validation: 	Accuracy: 0.918000


Epoch 9/10
----------


INFO:root:[Epoch 9 Batch 0] Training_Loss: 0.070623
INFO:root:[Epoch 9 Batch 100] Training_Loss: 7.099462
INFO:root:[Epoch 9 Batch 200] Training_Loss: 12.491522
INFO:root:[Epoch 9 Batch 300] Training_Loss: 19.206853
INFO:root:[Epoch 9 Batch 400] Training_Loss: 25.432422
INFO:root:[Epoch 9 Batch 500] Training_Loss: 36.366336
INFO:root:[Epoch 9 Batch 600] Training_Loss: 45.444347
INFO:root:[Epoch 9 Batch 700] Training_Loss: 52.075326
INFO:root: Training: 	Speed =322.61 samples/sec 	Time cost =278.922453 secs 	Loss 0.074329 	Accuracy 0.980267
INFO:root: Validation: 	Accuracy: 0.922100


Epoch 10/10
----------


INFO:root:[Epoch 10 Batch 0] Training_Loss: 0.030613
INFO:root:[Epoch 10 Batch 100] Training_Loss: 6.253498
INFO:root:[Epoch 10 Batch 200] Training_Loss: 13.421359
INFO:root:[Epoch 10 Batch 300] Training_Loss: 18.781872
INFO:root:[Epoch 10 Batch 400] Training_Loss: 26.132133
INFO:root:[Epoch 10 Batch 500] Training_Loss: 34.999702
INFO:root:[Epoch 10 Batch 600] Training_Loss: 40.946223
INFO:root:[Epoch 10 Batch 700] Training_Loss: 49.466839
INFO:root: Training: 	Speed =360.96 samples/sec 	Time cost =249.289503 secs 	Loss 0.070597 	Accuracy 0.981189
INFO:root: Validation: 	Accuracy: 0.928500
INFO:root:Best validation accuracy: {:4f}
INFO:root:Total:2151.782611


Training complete in 35m 52s


Net(
  (fc1): Linear(in_features=1024, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
)