# MALWARE DETECTION

**Aim:**

  1. To develop a Machine Learning-based framework for malware detection using artificial neural network (ANN) algorithm.
  2. Test and evaluate the results to determine the effectiveness and efficiencies of the machine learning-based algorithms in (1) above

DATA:

### IMPORTATION OF LIBRARIES

In [248]:
# common import for basic EDA
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

# for modelling
import tensorflow as tf
from tensorflow import keras
from tensorflow import feature_column

### Basic EDA and Feature Selection

In [249]:
# Load the dataset
dataset = pd.read_csv('/content/drive/My Drive/Malware Detector/MALWARE_DATASET.csv')
dataset.head()

Unnamed: 0,hash,millisecond,classification,state,usage_counter,prio,static_prio,normal_prio,policy,vm_pgoff,vm_truncate_count,task_size,cached_hole_size,free_area_cache,mm_users,map_count,hiwater_rss,total_vm,shared_vm,exec_vm,reserved_vm,nr_ptes,end_data,last_interval,nvcsw,nivcsw,min_flt,maj_flt,fs_excl_counter,lock,utime,stime,gtime,cgtime,signal_nvcsw
0,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,0,malware,0,0,3069378560,14274,0,0,0,13173,0,0,24,724,6850,0,150,120,124,210,0,120,3473,341974,0,0,120,0,3204448256,380690,4,0,0,0
1,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,1,malware,0,0,3069378560,14274,0,0,0,13173,0,0,24,724,6850,0,150,120,124,210,0,120,3473,341974,0,0,120,0,3204448256,380690,4,0,0,0
2,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,2,malware,0,0,3069378560,14274,0,0,0,13173,0,0,24,724,6850,0,150,120,124,210,0,120,3473,341974,0,0,120,0,3204448256,380690,4,0,0,0
3,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,3,malware,0,0,3069378560,14274,0,0,0,13173,0,0,24,724,6850,0,150,120,124,210,0,120,3473,341974,0,0,120,0,3204448256,380690,4,0,0,0
4,42fb5e2ec009a05ff5143227297074f1e9c6c3ebb9c914...,4,malware,0,0,3069378560,14274,0,0,0,13173,0,0,24,724,6850,0,150,120,124,210,0,120,3473,341974,0,0,120,0,3204448256,380690,4,0,0,0


In [250]:
dataset.classification

0        malware
1        malware
2        malware
3        malware
4        malware
          ...   
99995    malware
99996    malware
99997    malware
99998    malware
99999    malware
Name: classification, Length: 100000, dtype: object

In [251]:
dataset.classification.value_counts()

malware    50000
benign     50000
Name: classification, dtype: int64

In [252]:
# Replace 1 for malware and 0 for benign files
dataset.classification.replace({'malware':1, 'benign': 0}, inplace=True)

In [253]:
dataset.classification

0        1
1        1
2        1
3        1
4        1
        ..
99995    1
99996    1
99997    1
99998    1
99999    1
Name: classification, Length: 100000, dtype: int64

In [254]:
# Looks like our dataset is clustered, let's shuffle it a bit
dataset = dataset.sample(frac=1)

In [255]:
dataset.classification

75721    1
80184    1
19864    0
76699    1
92991    1
        ..
6265     0
54886    0
76820    1
860      1
15795    0
Name: classification, Length: 100000, dtype: int64

In [256]:
# Let's look at the nature of our dataframe
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 75721 to 15795
Data columns (total 35 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   hash               100000 non-null  object
 1   millisecond        100000 non-null  int64 
 2   classification     100000 non-null  int64 
 3   state              100000 non-null  int64 
 4   usage_counter      100000 non-null  int64 
 5   prio               100000 non-null  int64 
 6   static_prio        100000 non-null  int64 
 7   normal_prio        100000 non-null  int64 
 8   policy             100000 non-null  int64 
 9   vm_pgoff           100000 non-null  int64 
 10  vm_truncate_count  100000 non-null  int64 
 11  task_size          100000 non-null  int64 
 12  cached_hole_size   100000 non-null  int64 
 13  free_area_cache    100000 non-null  int64 
 14  mm_users           100000 non-null  int64 
 15  map_count          100000 non-null  int64 
 16  hiwater_rss      

we have only one object datatype column which we will take care of later

In [257]:
dataset.lock.value_counts()

3204448256    100000
Name: lock, dtype: int64

### Lets's split our dataset

In [258]:
from sklearn.model_selection import train_test_split
np.random.seed(43)
tf.random.set_seed(43)
df = dataset.copy()

train_valid, test = train_test_split(df, test_size=0.1)
train, valid = train_test_split(train_valid, test_size=0.2)

if you take a very good look at our dataset, you will observe that most columns has little or no effect to our target columns, but since the dimension of our datset
is not heavy, I don't think it will have effect on our result

## Input Pipeline

Let's build an input pipeline that will batch, shuffle and prefetch our dataset

In [259]:
# A utility method to create a tf.data dataset from a pandas dataframe

def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """
  Takes in a pandas dataframe, converts it to tf.data dataset,
  shuffles, it, batchifies and prefetches it
  """
  np.random.seed(42)
  tf.random.set_seed(42)
  # copies the dataframe
  dataframe = dataframe.copy()
  # returns the classification column
  labels = dataframe['classification']
  # convert to tensors
  dataframe = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  # shuffle if train data
  if shuffle:
    dataframe = dataframe.shuffle(10000)
  dataframe = dataframe.batch(batch_size)
  dataframe = dataframe.prefetch(1)

  return dataframe
  
  

In [260]:
batch_size=5 # for demonstration purposes
train_set = df_to_dataset(train, batch_size=batch_size)
valid_set = df_to_dataset(valid, shuffle=False, batch_size=batch_size)
test_set = df_to_dataset(test, shuffle=False, batch_size=batch_size)

In [261]:
# Let's test the function we just created
for feature_batch, label_batch in train_set.take(1):
  print('Every feature:', list(feature_batch.keys()))
  print('A batch of hash:', feature_batch['hash'])
  print('A batch of targets:', label_batch )

Every feature: ['hash', 'millisecond', 'classification', 'state', 'usage_counter', 'prio', 'static_prio', 'normal_prio', 'policy', 'vm_pgoff', 'vm_truncate_count', 'task_size', 'cached_hole_size', 'free_area_cache', 'mm_users', 'map_count', 'hiwater_rss', 'total_vm', 'shared_vm', 'exec_vm', 'reserved_vm', 'nr_ptes', 'end_data', 'last_interval', 'nvcsw', 'nivcsw', 'min_flt', 'maj_flt', 'fs_excl_counter', 'lock', 'utime', 'stime', 'gtime', 'cgtime', 'signal_nvcsw']
A batch of hash: tf.Tensor(
[b'com.baiwang.PhotoFeeling.apk'
 b'797ca0705a3b8220e671660849c9ab8f030c09e8c00224a375a92802d521fab3'
 b'com.appquiz.educational.games.apk' b'DOCECG2.doctor.apk'
 b'com.fingerprintplay.bysbaseball2015.apk'], shape=(5,), dtype=string)
A batch of targets: tf.Tensor([0 1 0 0 0], shape=(5,), dtype=int64)


## Preprocessing our dataset

Here, we make use of tensorflows's feature_column

In [262]:
#firstly, let's work on the numeric columns
for label, content in dataset.items():
  if pd.api.types.is_numeric_dtype(content):
    print(label)

millisecond
classification
state
usage_counter
prio
static_prio
normal_prio
policy
vm_pgoff
vm_truncate_count
task_size
cached_hole_size
free_area_cache
mm_users
map_count
hiwater_rss
total_vm
shared_vm
exec_vm
reserved_vm
nr_ptes
end_data
last_interval
nvcsw
nivcsw
min_flt
maj_flt
fs_excl_counter
lock
utime
stime
gtime
cgtime
signal_nvcsw


In [263]:
# let's now write a function that will select only our numeric columns and preprocess them
def preprocess_numeric_columns(dataframe, **kwargs):
  """
  Takes in a dataframe as argument, selects the numeric columns and 
  preprocesses them to tf.numeric dtype
  """
  np.random.seed(42)
  tf.random.set_seed(43)
  numerical_columns = []
  # Select numeric columns
  num_cols = dataframe.select_dtypes(include=(np.number))
  # Convert to dictionary
  num_cols = num_cols.to_dict()
  # We need to append the keys to numeric_column
  for key, value in num_cols.items():
    numerical_columns.append(tf.feature_column.numeric_column(key=key))
  return numerical_columns



In [264]:
# Testing our preprocess_numeric_function
x=dataset[:1000]
preprocess_numeric_columns(x)

[NumericColumn(key='millisecond', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='classification', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='state', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='usage_counter', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='prio', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='static_prio', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='normal_prio', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='policy', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='vm_pgoff', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='vm_truncate_count', shape=(1,), default_value=None,

In [265]:
# Let's now write a function to convert categorical columns to embedding columns
def preprocess_categorical_columns(dataframe, **kwargs):
  """
  Takes in a dataframe as argument, preprocesses the categorical columns
  """
  categorical_columns = []
  for key, value in dataframe.items():
    if key == 'hash':
      # Get the unique values
      unique = np.unique(value)
      # Convert it to python's list
      cat_cols = unique.tolist()
      # create a dictionary of key and cat_cols
      cat_cols = {'hash': cat_cols}

  # Iterate through the newly created dictionary
  for feature, vocab in cat_cols.items():
    cat_cols = tf.feature_column.categorical_column_with_vocabulary_list(
        key=feature, vocabulary_list = vocab)
    categorical_columns.append(tf.feature_column.embedding_column(cat_cols, dimension=4))
  
  return categorical_columns
  

In [266]:
# Testing our preprocess_categorical_column function
preprocess_categorical_columns(x)

[EmbeddingColumn(categorical_column=VocabularyListCategoricalColumn(key='hash', vocabulary_list=('024b27972a6b3a1535510e9c0f154fb1a8e3a2afb25d5c30d2f6a9d23424d925', '025c63d266e05d9e3bd57dd9ebd0abe904616f569fe4e2b78cf2ac52493cb460', '0602834d897fe3f3314586ae867aed63f3757be01b7f0354c8626519d8575453', '079277b8b6049c06806b79216901d0e9ff473bfe2c2454aa8a496515167eca40', '1117d14765e9169184cc931f7a417a460898e4b0d8f3c86562065fc82f5866ce', '1119f652a5b1f04e98835e2b2ed56efe990e9073aa7dcb39251c98e21d335abd', '116ae92ecfacb70146fe643d92878e522f71af393702f3b66d2135a06bcff57f', '12fbe832590c8d44b1687b178450d49190c1a9d8e61c80a090f461168dd0bf8f', '156a617d84b92c1611e153ebaa1fc2e9d1af9c6154834c20d1f414c4d61e1983', '1824056efb105d20db233bfeb1f93ee69eeaff81b63eb8cd53d582d7330687ab', '186d3233e77f4a0c64043da385fb7f0dcd195ee0c1f46d3e8f49d4bf8d5d2d1f', '1c5643426b0d13ddcf0c12830e50252011bd2377e3b8003517bdcb96f5f11f8a', '1dec265aeda7b58e4173f47af0641a949937edbf21904ff1b6681c5348642387', '1efc135b8f924076b5

In [267]:
tf.keras.layers.DenseFeatures(preprocess_categorical_columns(x))

<tensorflow.python.feature_column.dense_features_v2.DenseFeatures at 0x7f57a90cff60>

In [268]:
# A funcion to add both numerical and categorical columns to tf.keras.layers.Densefeatures
def preprocessing_layer(categorical_columns, numerical_columns):
  return tf.keras.layers.DenseFeatures(categorical_columns + numerical_columns)

## Train, Evaluate and Save model

In [269]:
# Let's write a function to build our model
input_shape = [35]
def build_model(preprocessing_layer, n_hidden=2, n_neurons=128):
  model = tf.keras.Sequential()
  model.add(preprocessing_layer)
  for layer in range(n_hidden):
    model.add(tf.keras.layers.Dense(n_neurons, activation = 'relu'))
  model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
  model.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.SGD(lr=0.0001),
                metrics=['accuracy'])
  
  return model

In [270]:
# Let's write a function to train our model
import datetime
import os
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=10)
model_checkpoint_cb = tf.keras.callbacks.ModelCheckpoint('chris_model.h5')
log_dir = os.path.join('/content/drive/My Drive/Malware Detector/logs',
                       datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir)
callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb]

def train_model(train_data, valid_data, callbacks, preprocessing_layer):
  """
  Takes in train_data, valid_data, callbacks, preprocessing_layer as
  args and trains our model
  """
  # create the model
  
  model = build_model(preprocessing_layer)
  # Fit the model to the data passing it the callbacks we created
  model.fit(train_data, validation_data=valid_data,
            epochs=100, callbacks=callbacks)
  return model

In [271]:
# Create a function to save a model
def save_model(model, suffix=None):
  """
  Saves a  model in a given model directory and appends a suffix(string).
  """
  # Create a model directory pathname with current time
  modeldir = os.path.join("/content/drive/My Drive/Malware Detector/models",
                           datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
  model_path = modeldir + "-" + suffix + ".h5"  # save format of model
  print(f"Saving model to: {model_path}...")
  model.save(model_path)
  return model_path

  # Create a function to load a trained model
def load_model(model_path):
  """
  Loads a saved model from a specifid path.
  """
  print(f"Loading saved model from: {model_path}...")
  model = tf.keras.models.load_model(model_path,
                                     custom_objects={'KerasLayer': hub.KerasLayer})
  return model


We have created all the functions needed for training, saving and loading our model, let's now use our function to train, save and load the model

In [272]:
categorical_columns = preprocess_categorical_columns(dataset)
numerical_columns = preprocess_numeric_columns(dataset)
preprocessing_layer = preprocessing_layer(categorical_columns, numerical_columns)

In [273]:
preprocessing_layer

<tensorflow.python.feature_column.dense_features_v2.DenseFeatures at 0x7f57a7461390>

In [274]:
build_model(preprocessing_layer)


<tensorflow.python.keras.engine.sequential.Sequential at 0x7f57a909d470>

In [275]:
model = train_model(train_data=train_set, valid_data=valid_set, callbacks=callbacks, preprocessing_layer=preprocessing_layer)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


In [276]:
model.evaluate(valid_set)



[5.9339736679930866e-08, 0.4977777898311615]

In [None]:
test_set = df_to_dataset(test)

In [279]:
# let's make some predictions
predictions = model.predict(test_set)
# Show some results
for prediction, malware in zip(predictions[:10], list(test_set)[0][1][:10]):
  prediction = tf.sigmoid(prediction).numpy()
  print("Predicted malware: {:.2%}".format(prediction[0]),
        " | Actual outcome: ",
        ("BENIGN" if bool(malware) else "MALWARE"))

Predicted malware: 73.11%  | Actual outcome:  MALWARE
Predicted malware: 73.11%  | Actual outcome:  BENIGN
Predicted malware: 73.11%  | Actual outcome:  MALWARE
Predicted malware: 73.11%  | Actual outcome:  MALWARE
Predicted malware: 73.11%  | Actual outcome:  MALWARE
