# Neural network malsite detector with TensorFlow, 30.5.2020

In [0]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; force_remount=True asks for a fresh
# mount even if one is already present.
drive.mount('/content/gdrive',force_remount=True)
# Notebook folder on the mounted drive, written relative to the working
# directory (presumably /content on Colab -- confirm).
base_dir = "gdrive/My Drive/Colab Notebooks/"

In [0]:
!mkdir -p data
!tar -xzf gdrive/My\ Drive/Colab\ Notebooks/Data/htmldata.tar.gz -C data

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
from tensorflow import keras
print(tf.__version__)

In [0]:
import os, re, time,datetime
import numpy as np

# Feature extraction

In [0]:
!pip install murmurhash3
import mmh3

In [0]:
def extract_features(string, hash_dim = 1024, split_regex = rb"\s+"):
  """Turn `string` into a fixed-length vector of hashed-token counts.

  The input is split on `split_regex`, every resulting token is hashed with
  `mmh3.hash` and reduced modulo `hash_dim`, and the number of tokens landing
  in each bucket is recorded.

  Args:
    string: Data to tokenize. It must be of the same kind (bytes or str) as
      `split_regex`; the default pattern is a bytes pattern.
    hash_dim: Number of hash buckets, i.e. the length of the returned vector.
    split_regex: Regular expression whose matches separate the tokens.

  Returns:
    A NumPy array of length `hash_dim`; entry `i` holds how many tokens were
    hashed into bucket `i`.
  """
  pieces = re.split(split_regex, string)
  bucket_ids = [mmh3.hash(piece) % hash_dim for piece in pieces]
  occupied, occurrences = np.unique(bucket_ids, return_counts=True)
  feature_vector = np.zeros(hash_dim)
  # `occupied` holds distinct bucket indices, so one indexed assignment
  # writes every count into its slot.
  feature_vector[occupied] = occurrences
  return feature_vector

# Data generation

In [0]:
def read_file(file_name, dir):
  """Return the complete contents of `dir`/`file_name` as bytes.

  Args:
    file_name: Name of the file inside `dir`.
    dir: Directory that contains the file.

  Returns:
    The raw bytes of the file (it is opened in binary mode).
  """
  full_path = os.path.join(dir, file_name)
  with open(full_path, 'rb') as handle:
    return handle.read()

In [0]:
def my_generator(path_to_b_files, path_to_m_files, batch_size, features_size=1024):
  """Endlessly yield class-balanced, shuffled batches of hashed-token features.

  Each batch holds `batch_size // 2` files drawn at random (without
  replacement) from each of the two directories. Files from
  `path_to_b_files` are labelled 0 and files from `path_to_m_files` are
  labelled 1. Every file is read with `read_file` and converted with
  `extract_features`.

  Args:
    path_to_b_files: Directory holding the files of class 0.
    path_to_m_files: Directory holding the files of class 1.
    batch_size: Requested batch size. Batches contain
      `2 * (batch_size // 2)` samples, i.e. one fewer than requested when
      `batch_size` is odd.
    features_size: Length of each feature vector (passed to
      `extract_features` as `hash_dim`).

  Yields:
    A tuple `(features, labels)` of NumPy arrays: `features` has one row per
    sample, `labels` holds the matching 0/1 class labels.

  Raises:
    ValueError: If either directory contains fewer than `batch_size // 2`
      files. Raised when the first batch is requested.
  """
  n_samples_per_class = batch_size // 2  # 2 = no. of classes

  # The directory listings do not depend on the batch, so read and validate
  # them once instead of on every iteration.
  b_files = os.listdir(path_to_b_files)
  m_files = os.listdir(path_to_m_files)
  for path, files in ((path_to_b_files, b_files), (path_to_m_files, m_files)):
    if len(files) < n_samples_per_class:
      raise ValueError(
          "%r contains %d files, but %d are needed per batch"
          % (path, len(files), n_samples_per_class))

  # Every batch is laid out as [class 0 ..., class 1 ...] before shuffling.
  labels = np.array([0] * n_samples_per_class + [1] * n_samples_per_class)

  while True:
    b_features = [
        extract_features(read_file(file_name=sha, dir=path_to_b_files), hash_dim=features_size)
        for sha in np.random.choice(b_files, n_samples_per_class, replace=False)
    ]
    m_features = [
        extract_features(read_file(file_name=sha, dir=path_to_m_files), hash_dim=features_size)
        for sha in np.random.choice(m_files, n_samples_per_class, replace=False)
    ]
    all_features = np.array(b_features + m_features)
    # Shuffle with a true permutation: every sample appears exactly once.
    # Sizing it from the actual sample count keeps odd batch sizes in range.
    order = np.random.permutation(len(all_features))
    yield all_features[order], labels[order]

In [0]:
BATCH_SIZE = 128
FEATURES_SIZE = 1024

path_to_training_b_files = 'data/html/benign_files/training/'
path_to_training_m_files = 'data/html/malicious_files/training/'

train_b_files = os.listdir(path_to_training_b_files)
train_m_files = os.listdir(path_to_training_m_files)

# Total number of training samples across both classes.
nbr = len(train_b_files) + len(train_m_files)
print(nbr)
# Number of training steps per epoch: the sample count divided by the batch
# size, rounded up, so one epoch processes at least as many samples as the
# training set holds. Batches are drawn at random by my_generator, so this
# sets the epoch length rather than guaranteeing every file is visited.
steps_per_epoch = (nbr + BATCH_SIZE - 1) // BATCH_SIZE
print(steps_per_epoch)

# Infinite stream of balanced training batches.
training_generator = my_generator(
    path_to_training_b_files,
    path_to_training_m_files,
    BATCH_SIZE,
    FEATURES_SIZE,
)

In [0]:
path_to_validation_b_files = 'data/html/benign_files/validation/'
path_to_validation_m_files = 'data/html/malicious_files/validation/'

val_b_files = os.listdir(path_to_validation_b_files)
val_m_files = os.listdir(path_to_validation_m_files)

# Total number of validation samples across both classes.
nbr = len(val_b_files) + len(val_m_files)
print(nbr)
# Number of validation steps: the sample count divided by the batch size,
# rounded up.
validation_steps = (nbr + BATCH_SIZE - 1) // BATCH_SIZE
print(validation_steps)

# Infinite stream of balanced validation batches.
validation_generator = my_generator(
    path_to_validation_b_files,
    path_to_validation_m_files,
    BATCH_SIZE,
    FEATURES_SIZE,
)

# Definition of the model

In [0]:
def my_model(input_length=1024):
  """Build and compile the fully connected binary classifier.

  The network is a stack of three ReLU dense layers (1024, 512 and 64 units),
  each followed by batch normalization, and a single sigmoid output unit. It
  is compiled with the Adam optimizer, binary cross-entropy loss and the
  accuracy metric.

  Args:
    input_length: Length of each input feature vector. This must match the
      feature size produced by the data generator.

  Returns:
    The compiled `tf.keras.Sequential` model.
  """
  model = tf.keras.Sequential([
    # The input shape follows `input_length` so callers can change the
    # feature size; it was previously fixed at 1024 regardless of the argument.
    tf.keras.layers.Dense(1024, input_shape=(input_length,), dtype='float32', activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    # One sigmoid unit: a single value in (0, 1), matching the 0/1 labels
    # and the binary cross-entropy loss.
    tf.keras.layers.Dense(1, activation='sigmoid')
  ])
  model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
  return model

In [0]:
# Build a throwaway model with the default input length and print its summary.
my_model().summary()

# Train the model

In [0]:
# The measured wall-clock time covers model construction and the whole
# training run.
start = time.time()
model = my_model(FEATURES_SIZE)
EPOCHS = 5

# Both generators are endless, so the step counts decide how many batches
# make up one training epoch and one validation pass.
history = model.fit(
    training_generator,
    steps_per_epoch=steps_per_epoch,
    epochs=EPOCHS,
    verbose=1,
    validation_data=validation_generator,
    validation_steps=validation_steps,
)
stop = time.time()
# Elapsed time in seconds.
print(stop - start)