In [173]:
import os
import numpy as np
import random
from sklearn.model_selection import train_test_split
import matplotlib 
import matplotlib.pyplot as plt # for plotting model loss

#Import svm model
from sklearn import svm
#Import random forest model
from sklearn.ensemble import RandomForestClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing import sequence

# fix random seed for reproducibility
# (only TensorFlow's global seed is set here; the `random` and `numpy`
# modules imported above are not seeded in this cell)
tf.random.set_seed(7)

# NOTE(review): `os` is already imported at the top of this cell; the repeated
# import below is harmless but redundant.
import os
import sys
import json

# Assign main directory to a variable: the parent directory of the first
# entry of sys.path.
# NOTE(review): what sys.path[0] contains depends on how the kernel was
# launched - presumably the notebook's folder; confirm before relying on it.
main_dir=os.path.dirname(sys.path[0])

In [174]:
# parameters
# Real malware families to load; the data-loading cell iterates over this list.
real_malware = ['WinWebSec'] # OnLineGames, Renos, VBInject, WinWebSec, Zbot
# Folder name of the fake family; appended to the fake-sample directory path.
fake_malware = 'wws' # olgames, renos, vbinject, wws, zbot
use_harshit = 0 # 1 to use Harshit's fake samples, other to use Albert's
# Folder under ./code/fake_tests/ holding the self-generated fake samples
# (only read when use_harshit != 1).
select_malware = 'vae_dense_samples' # wgan_gp_samples, vae_cnn_samples, vae_dense_samples
# 'freq' = one frequency rank per known opcode, 'pos' = one opcode index per token.
select_vectorization = 'pos' # freq, pos
unique_opcodes = 1 # 0 = visualize_explore/opcodes, 1 = fake_tests/opdicts
# Length passed as `maxlen` to pad_sequences and as `input_length` to Embedding.
max_sequence_length = 600

# LSTM
# Second argument of the Embedding layer (size of each embedding vector).
embedding_vector_length = 32
# First argument of the Embedding layer (number of distinct input indices).
# NOTE(review): must be larger than every index in the dataset, including the
# "unknown opcode" index emitted by get_opcode_pos - confirm against the
# number of entries in the opcode list actually loaded.
top_opcodes = 22

In [175]:
# function to get unique values
def unique(list1):
    """Return the distinct elements of `list1`, keeping first-seen order.

    Membership is tested with `in` on a list, so elements only need to
    support equality (they do not have to be hashable).
    """
    seen = []
    for candidate in list1:
        if candidate in seen:
            # already collected, nothing to do
            continue
        seen.append(candidate)
    return seen

# Python code to count the number of occurrences
def countX(lst, x):
    """Return how many elements of `lst` compare equal to `x`."""
    return sum(1 for element in lst if element == x)

# opcodes are indexed by overall frequency
def get_opcode_freq(opcode_frequency):
    """Replace each (count, index) entry with the dense rank of its count.

    `opcode_frequency` is a list of (count, index) pairs. The pairs are
    visited from the highest count to the lowest; the rank starts at 1 and
    grows by one each time the count changes, so equal counts share a rank.
    The rank is written into `opcode_frequency[index]`, i.e. the list is
    modified in place, and that same list object is returned.
    """
    # work on a sorted copy so overwriting entries below cannot disturb the walk
    ordered = list(opcode_frequency)
    ordered.sort(reverse=True)

    current_rank = 0
    last_count = -1
    for entry in ordered:
        count, position = entry[0], entry[1]
        # a new count value opens the next rank
        if count != last_count:
            current_rank += 1
        opcode_frequency[position] = current_rank
        last_count = count
    return opcode_frequency

# opcodes are indexed by unique opcode position (opcodes_into_list)
def get_opcode_pos(data_into_list,opcodes_into_list):
    """Translate every token into its index within `opcodes_into_list`.

    Parameters:
        data_into_list: sequence of tokens to encode.
        opcodes_into_list: sequence of known opcodes; a token's code is the
            index of its first occurrence in this sequence.

    Returns:
        A new list with one integer per token. Tokens that do not appear in
        `opcodes_into_list` get the index `len(opcodes_into_list)`.

    The lookup table is built once, so the cost is linear in the number of
    tokens instead of tokens x opcodes. Tokens and opcodes must be hashable
    (they are strings in this notebook).
    """
    first_index = {}
    for position, opcode in enumerate(opcodes_into_list):
        # setdefault keeps the first position of a repeated opcode, which is
        # what a left-to-right scan would have found
        first_index.setdefault(opcode, position)

    # index for unknown opcodes
    unknown_index = len(opcodes_into_list)
    return [first_index.get(token, unknown_index) for token in data_into_list]

# get opcode list from opdict
def get_opcode_dict(malware_fam,file_path):
    """Return the keys of the JSON object stored in a family's opdict file.

    The file name is built by plain string concatenation as
    file_path + 'opdict' + malware_fam + '.json', so `file_path` has to end
    with a path separator. Keys are returned in the order json.load yields
    them.
    """
    opdict_path = ''.join([file_path, 'opdict', malware_fam, '.json'])
    with open(opdict_path) as handle:
        opdict = json.load(handle)
    return [*opdict.keys()]

In [176]:
# initialize variables
dataset = []        # one encoded sample per entry (opcode indices or frequency ranks)
dataset_names = []  # file name of each sample, kept aligned with `dataset`
dataset_ind = []    # label of each sample: 0 = real malware, 1 = fake malware


def _read_tokens(path):
    """Return all whitespace-separated tokens of the text file at `path`.

    str.split() without an argument drops every empty token, so blank lines
    or repeated spaces no longer leak '' into the opcode stream, and a file
    that contains no empty token no longer raises ValueError (both happened
    with split(" ") followed by a single remove('')).
    """
    # the context manager closes the file even if reading fails
    with open(path, "r") as handle:
        return handle.read().split()


def _encode_tokens(tokens, opcodes, vectorization):
    """Encode `tokens` against the opcode list `opcodes`.

    'pos'  -> one index per token (see get_opcode_pos)
    'freq' -> one frequency rank per entry of `opcodes` (see get_opcode_freq)

    Raises ValueError for any other `vectorization`, instead of silently
    skipping the sample and leaving the dataset lists out of step.
    """
    if vectorization == 'pos':
        return get_opcode_pos(tokens, opcodes)
    if vectorization == 'freq':
        # (occurrence count, opcode index) pairs, as get_opcode_freq expects
        counts = [(countX(tokens, opcode), idx) for idx, opcode in enumerate(opcodes)]
        return get_opcode_freq(counts)
    raise ValueError("select_vectorization must be 'freq' or 'pos', got %r" % (vectorization,))


def _add_samples(directory, label, description, opcodes, vectorization, show_lengths=False):
    """Encode every file in `directory` and append it to the dataset lists.

    `label` is stored in dataset_ind, `description` is only used in the
    progress message, and `show_lengths` additionally prints the token count
    and the encoded length of each file.
    """
    # sorted() makes the sample order, and therefore the seeded train/test
    # split, independent of the order in which the filesystem lists files
    for file_name in sorted(os.listdir(directory)):
        sample_path = os.path.join(directory, file_name)
        print('Processing ' + description + sample_path)

        tokens = _read_tokens(sample_path)
        if show_lengths:
            print(len(tokens))

        if not tokens:
            # skip processing if malware file is empty
            print('------------> is empty ... skipping')
            continue

        encoded = _encode_tokens(tokens, opcodes, vectorization)
        if show_lengths:
            print(len(encoded))

        # the three lists are always extended together so they stay aligned
        dataset.append(encoded)
        dataset_names.append(file_name)
        dataset_ind.append(label)


# iterate through families
# NOTE(review): the fake samples are loaded inside this loop, so listing more
# than one real family would load the single `fake_malware` family once per
# real family - confirm whether that is intended.
for real in real_malware:

    # opening list of unique opcodes
    if unique_opcodes == 0:
        my_fp = './code/visualize_explore/opcodes/'
        opcodes_into_list = _read_tokens(my_fp + 'opcodes' + real + '.txt')
    elif unique_opcodes == 1:
        my_fp = main_dir + '/code-20230116T073801Z-001/code/fake_tests/opdicts/'
        opcodes_into_list = get_opcode_dict(real,my_fp)
    else:
        # previously fell through and reused a stale or undefined opcode list
        raise ValueError("unique_opcodes must be 0 or 1, got %r" % (unique_opcodes,))

    # process real malware (0 is indicator for real malware); real samples now
    # use the same vectorization as the fake ones instead of always 'pos'
    my_filepath = "../malware_data/" + real +'/'
    _add_samples(my_filepath, 0, 'real malware', opcodes_into_list,
                 select_vectorization, show_lengths=True)

    # create fake malware dataset (1 is indicator for fake malware)
    if use_harshit == 1:
        my_fake_filepath = './code/fake_tests/fakeSamples/' + fake_malware # Harshit
    else: # self-generated malware
        my_fake_filepath = './code/fake_tests/' + select_malware + '/' + fake_malware + '/'
    _add_samples(my_fake_filepath, 1, 'fake malware', opcodes_into_list,
                 select_vectorization)

# convert dataset to numpy arrays
X = np.array(dataset,dtype=object)
y = np.array(dataset_ind,dtype='int64')

# print
nSamples = len(X)
print('There are', nSamples, 'malware files')

Processing real malware../malware_data/WinWebSec/0009d99691e8eed99c7dd1500e07cda336d54260.asm.txt
617
617
Processing real malware../malware_data/WinWebSec/00113d9802cca3deba19cf9daa17f1c2269de2b8.asm.txt
313
313
Processing real malware../malware_data/WinWebSec/0036d720d8ff6c8f4860b5c69deba7c400e4d356.asm.txt
671
671
Processing real malware../malware_data/WinWebSec/0037c7716f1dc8e5c4e1f9a9f3e9d5aedb7a6979.asm.txt
6547
6547
Processing real malware../malware_data/WinWebSec/003824de7a82d2db9fc877c44ea93f76dd0e5ca9.asm.txt
618
618
Processing real malware../malware_data/WinWebSec/00427746e03afb4d3b28791a82315e52acf66a0b.asm.txt
638
638
Processing real malware../malware_data/WinWebSec/004bb59ba37917bfea49e6904f0551df7b3c719f.asm.txt
1089
1089
Processing real malware../malware_data/WinWebSec/005150c72b9cd08a62bc0d730e3593b4f160534a.asm.txt
404
404
Processing real malware../malware_data/WinWebSec/005231177f706856a0617a2c871d627ddedf54a7.asm.txt
1211
1211
Processing real malware../malware_data/W

In [184]:
# Show the length of the first encoded sample, then the sample itself as a
# single-row integer array.
print(len(dataset[0]))
# reshape(1, -1) lets NumPy infer the row length, so this cell works for any
# first sample rather than only one that is exactly 617 values long.
np.rint(dataset[0]).astype(int).reshape(1, -1)

617


array([[ 1,  0, 16,  1,  1,  1,  0,  0,  0,  0,  0,  0, 11,  9, 21, 21,
         0,  0,  0,  1,  9,  1,  2,  0,  9,  1,  2,  8,  0,  3, 21,  0,
         0,  0,  0,  0,  3,  6,  1,  1,  1,  1,  1,  2,  0,  1,  0,  1,
         2,  8, 11,  4, 21, 21,  1,  0,  1,  1,  2, 11,  4,  4,  4,  0,
         4, 12,  1,  0, 16,  1,  1,  1,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  3,  5,
         0,  0,  0,  1,  0, 21,  0,  4,  0, 10,  0,  0,  0,  1,  0, 16,
         0,  0,  0,  0,  4,  0,  0,  8,  0,  0,  0, 16,  0,  1,  4,  0,
         0,  8,  0,  1,  9,  1,  0,  0,  9,  1,  2,  8,  1,  9,  1,  9,
         1,  2,  8,  0,  0,  0,  8,  0,  0,  8,  0,  1,  9,  1,  0,  0,
         9,  1,  2,  8,  0,  8,  0,  0,  8,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  8,  0,  0,  0, 21,  0,  0,  1,  0,  8,  1,  0,
         1,  2,  8,  0,  8,  0,  0,  0,  0,  0,  8,  0,  0,  0, 16,  0,
         0,  3,  6,  0, 10,  0,  8,  0,  0, 16,  0,  0,  8,  0, 

In [178]:
# train/test split: 33% of the samples are held out, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# truncate and pad input sequences so both splits have max_sequence_length columns
X_train, X_test = (
    sequence.pad_sequences(subset, maxlen=max_sequence_length)
    for subset in (X_train, X_test)
)

In [179]:
# create LSTM model
# Layers: an Embedding over `top_opcodes` input indices, a 100-unit LSTM, and
# a single sigmoid output trained with binary cross-entropy.

model = Sequential()
model.add(Embedding(top_opcodes, embedding_vector_length, input_length=max_sequence_length))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output
model.summary()
# the test split is also passed as validation data during fitting
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=64)

# Final evaluation of the model
scores = model.evaluate(X_test, y_test, verbose=1)
# scores[1] is the 'accuracy' metric requested in compile(); scores[0] is the loss
print("LSTM Accuracy: %.2f%%" % (scores[1]*100))

Model: "sequential_12"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_12 (Embedding)    (None, 600, 32)           704       
                                                                 
 lstm_12 (LSTM)              (None, 100)               53200     
                                                                 
 dense_12 (Dense)            (None, 1)                 101       
                                                                 
Total params: 54,005
Trainable params: 54,005
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
LSTM Accuracy: 42.86%


In [180]:
# Metric used for the final evaluation of the model
from sklearn.metrics import accuracy_score

# Create a linear-kernel SVM classifier and train it on the training set
# (fit() returns the estimator, so construction and training are chained)
model = svm.SVC(kernel='linear').fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = model.predict(X_test)

# Report the share of correctly classified test samples as a percentage
svm_accuracy = accuracy_score(y_test, y_pred)
print("SVM Accuracy: %.2f%%" % (svm_accuracy*100))

SVM Accuracy: 85.71%


In [181]:
# Metric used for the final evaluation of the model
from sklearn.metrics import accuracy_score

# Create a Random Forest classifier (trees limited to depth 2, fixed seed)
# and train it on the training set; fit() returns the estimator itself
model = RandomForestClassifier(max_depth=2, random_state=0).fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = model.predict(X_test)

# Report the share of correctly classified test samples as a percentage
forest_accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Accuracy: %.2f%%" % (forest_accuracy*100))

Random Forest Accuracy: 100.00%
