Malware Opcode Classification, Minimum Samples for 95% accuracy

Authored by David Luong and Albert Giang

LSTM Reference: https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

The Notebook contains both machine learning and deep learning classifiers. The main function calls on each of the classifiers and perform a hybrid of binary search and jump search to find the minimum samples needed for 95% accuracy
This Notebook deals with the Dense and CNN VAE models

In [13]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt # for plotting model loss
import math
from datetime import datetime

#Import svm model
from sklearn import svm
#Import random forest model
from sklearn.ensemble import RandomForestClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.preprocessing import sequence

#Import knn model
from sklearn.neighbors import KNeighborsClassifier

# Final evaluation of the model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from sklearn.naive_bayes import GaussianNB

# fix random seed for reproducibility
tf.random.set_seed(7)
np.random.seed(7)
tf.keras.utils.set_random_seed(7)

import os
import sys
import json

# Assign main directory to a variable
main_dir=os.path.dirname(sys.path[0])

import data
from tensorflow.keras.models import load_model

Define LSTM Model

In [14]:
def lstm(X, y, real_malware, target_names,max_sequence_length, embedding_vector_length, num_unique):
    classifier_name = 'LSTM'
    #print('******%s******' % classifier_name)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
    clf = Sequential()
    clf.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
    clf.add(LSTM(100))
    clf.add(Dense(1, activation='sigmoid'))
    clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])   
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
    clf.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)
    scores = clf.evaluate(X_test, y_test, verbose=0)
    accuracy = scores[1]
    y_pred = np.round(clf.predict(X_test, verbose=0))
    precision = precision_score(y_test, y_pred)
    recall= recall_score(y_test, y_pred)

    
    #fig, ax = plt.subplots(figsize=(4,4))
    #ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=np.array(['real', 'fake']), cmap=plt.cm.Blues, ax=ax)
    #ax.set_title('%s Confusion Matrix \n using %s'%(real_malware,classifier_name))
    #plt.show()
    
    #class_report = classification_report(y_test, y_pred, target_names=target_names) 
    return accuracy, precision, recall

def lstm_kfold(X, y, real_malware, target_names,max_sequence_length, embedding_vector_length, num_unique):
    classifier_name = 'LSTM'
    #print('******%s******' % classifier_name)
    k = 5
    accuracy = []
    precision = []
    recall = []
    best = 0.
    kf = KFold(n_splits=k, shuffle=True, random_state=18245)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)
    


    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # build LSTM model
        clf = Sequential()
        clf.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
        clf.add(LSTM(100))
        clf.add(Dense(1, activation='sigmoid'))
        clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])   
        callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
        clf.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)
        scores = clf.evaluate(X_test, y_test, verbose=0)
        acc = scores[1]
        accuracy.append(acc)
        if acc>best:
            best = acc
            best_train, best_test = train_index, test_index
            
        y_pred = np.round(clf.predict(X_test, verbose=0))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        
    # print('\n%s accuracy: ' % classifier_name, accuracy)
    # print('%s precision: ' % classifier_name, precision)
    # print('%s recall: ' % classifier_name, recall)
    
    # print('\nAvg %s accuracy: %0.2f'% (classifier_name, np.mean(np.array(accuracy))))
    # print('Avg %s precision: %0.2f'% (classifier_name, np.mean(np.array(precision))))
    # print('Avg %s recall: %0.2f\n'% (classifier_name ,np.mean(np.array(recall))))
    
    X_train, X_test, y_train, y_test = X[best_train], X[best_test], y[best_train], y[best_test]
    clf = Sequential()
    clf.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
    clf.add(LSTM(100))
    clf.add(Dense(1, activation='sigmoid'))
    clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])   
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
    clf.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)
    y_pred = np.round(clf.predict(X_test, verbose=0))
    
    #fig, ax = plt.subplots(figsize=(4,4))
    #ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=np.array(['real', 'fake']), cmap=plt.cm.Blues, ax=ax)
    #ax.set_title('%s Confusion Matrix \n using %s'%(real_malware,classifier_name))
    #plt.show()
    
    #class_report = classification_report(y_test, y_pred, target_names=target_names) 

    return np.mean(np.array(accuracy)), np.mean(np.array(precision)), np.mean(np.array(recall))

Define Bidirectional LSTM Classifier

In [15]:
def bi_lstm(X, y, real_malware, target_names,max_sequence_length, embedding_vector_length, num_unique):
    classifier_name = 'Bidirectional_LSTM'
    #print('******%s******' % classifier_name)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
    clf = Sequential()
    clf.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
    #clf.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
    clf.add(Bidirectional(LSTM(100, dropout=0.2)))
    clf.add(Dense(1, activation='sigmoid'))
    clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
    clf.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)
    scores = clf.evaluate(X_test, y_test, verbose=0)
    accuracy = scores[1]
    y_pred = np.round(clf.predict(X_test, verbose=0))
    precision = precision_score(y_test, y_pred)
    recall= recall_score(y_test, y_pred)

    
    #fig, ax = plt.subplots(figsize=(4,4))
    #ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=np.array(['real', 'fake']), cmap=plt.cm.Blues, ax=ax)
    #ax.set_title('%s Confusion Matrix \n using %s'%(real_malware,classifier_name))
    #plt.show()
    
    #class_report = classification_report(y_test, y_pred, target_names=target_names) 

    return accuracy, precision, recall

def bi_lstm_kfold(X, y, real_malware, target_names,max_sequence_length, embedding_vector_length, num_unique):
    classifier_name = 'Bidirectional_LSTM'
    #print('******%s******' % classifier_name)
    k = 5
    accuracy = []
    precision = []
    recall = []
    best = 0.
    kf = KFold(n_splits=k, shuffle=True, random_state=18245)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)
    


    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # build LSTM model
        clf = Sequential()
        clf.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
        #clf.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
        clf.add(Bidirectional(LSTM(100, dropout=0.2)))
        clf.add(Dense(1, activation='sigmoid'))
        clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  
        callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
        clf.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)
        scores = clf.evaluate(X_test, y_test, verbose=0)
        acc = scores[1]
        accuracy.append(acc)
        if acc>best:
            best = acc
            best_train, best_test = train_index, test_index
            
        y_pred = np.round(clf.predict(X_test, verbose=0))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        
    # print('\n%s accuracy: ' % classifier_name, accuracy)
    # print('%s precision: ' % classifier_name, precision)
    # print('%s recall: ' % classifier_name, recall)
    
    # print('\nAvg %s accuracy: %0.2f'% (classifier_name, np.mean(np.array(accuracy))))
    # print('Avg %s precision: %0.2f'% (classifier_name, np.mean(np.array(precision))))
    # print('Avg %s recall: %0.2f\n'% (classifier_name ,np.mean(np.array(recall))))
    
    X_train, X_test, y_train, y_test = X[best_train], X[best_test], y[best_train], y[best_test]
    clf = Sequential()
    clf.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
    #clf.add(Bidirectional(LSTM(100, dropout=0.2, recurrent_dropout=0.2)))
    clf.add(Bidirectional(LSTM(100, dropout=0.2)))
    clf.add(Dense(1, activation='sigmoid'))
    clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])  
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
    clf.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)
    y_pred = np.round(clf.predict(X_test, verbose=0))
    
    #fig, ax = plt.subplots(figsize=(4,4))
    #ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=np.array(['real', 'fake']), cmap=plt.cm.Blues, ax=ax)
    #ax.set_title('%s Confusion Matrix \n using %s'%(real_malware,classifier_name))
    #plt.show()
    
    #class_report = classification_report(y_test, y_pred, target_names=target_names) 

    return np.mean(np.array(accuracy)), np.mean(np.array(precision)), np.mean(np.array(recall))

Define CNN LSTM Classifier

In [16]:
def cnn_lstm(X, y, real_malware, target_names,max_sequence_length, embedding_vector_length, num_unique):
    classifier_name = 'CNN_LSTM'
    #print('******%s******' % classifier_name)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)
    


    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
    clf = Sequential()
    clf.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
    clf.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    clf.add(MaxPooling1D(pool_size=2))
    clf.add(LSTM(100))
    clf.add(Dense(1, activation='sigmoid'))
    clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
    clf.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)
    scores = clf.evaluate(X_test, y_test, verbose=0)
    accuracy = scores[1]
    y_pred = np.round(clf.predict(X_test, verbose=0))
    precision = precision_score(y_test, y_pred)
    recall= recall_score(y_test, y_pred)

    
    #fig, ax = plt.subplots(figsize=(4,4))
    #ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=np.array(['real', 'fake']), cmap=plt.cm.Blues, ax=ax)
    #ax.set_title('%s Confusion Matrix \n using %s'%(real_malware,classifier_name))
    #plt.show()
    
    #class_report = classification_report(y_test, y_pred, target_names=target_names) 

    return accuracy, precision, recall

def cnn_lstm_kfold(X, y, real_malware, target_names,max_sequence_length, embedding_vector_length, num_unique):
    classifier_name = 'CNN_LSTM'
    #print('******%s******' % classifier_name)
    k = 5
    accuracy = []
    precision = []
    recall = []
    best = 0.
    kf = KFold(n_splits=k, shuffle=True, random_state=18245)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)
    


    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        # build LSTM model
        clf = Sequential()
        clf.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
        clf.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
        clf.add(MaxPooling1D(pool_size=2))
        clf.add(LSTM(100))
        clf.add(Dense(1, activation='sigmoid'))
        clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
        clf.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)
        scores = clf.evaluate(X_test, y_test, verbose=0)
        acc = scores[1]
        accuracy.append(acc)
        if acc>best:
            best = acc
            best_train, best_test = train_index, test_index
            
        y_pred = np.round(clf.predict(X_test, verbose=0))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        
    # print('\n%s accuracy: ' % classifier_name, accuracy)
    # print('%s precision: ' % classifier_name, precision)
    # print('%s recall: ' % classifier_name, recall)
    
    # print('\nAvg %s accuracy: %0.2f'% (classifier_name, np.mean(np.array(accuracy))))
    # print('Avg %s precision: %0.2f'% (classifier_name, np.mean(np.array(precision))))
    # print('Avg %s recall: %0.2f\n'% (classifier_name ,np.mean(np.array(recall))))
    
    X_train, X_test, y_train, y_test = X[best_train], X[best_test], y[best_train], y[best_test]
    clf = Sequential()
    clf.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
    clf.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    clf.add(MaxPooling1D(pool_size=2))
    clf.add(LSTM(100))
    clf.add(Dense(1, activation='sigmoid'))
    clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
    clf.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)
    y_pred = np.round(clf.predict(X_test, verbose=0))
    
    #fig, ax = plt.subplots(figsize=(4,4))
    #ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=np.array(['real', 'fake']), cmap=plt.cm.Blues, ax=ax)
    #ax.set_title('%s Confusion Matrix \n using %s'%(real_malware,classifier_name))
    #plt.show()
    
    #class_report = classification_report(y_test, y_pred, target_names=target_names) 

    return np.mean(np.array(accuracy)), np.mean(np.array(precision)), np.mean(np.array(recall))

Define SVM Classifer

In [17]:
def support_vector_machine(X, y, real_malware, target_names,max_sequence_length):
    classifier_name = 'Support Vector Machines'
    #print('******%s******' % classifier_name)
    k = 5
    accuracy = []
    precision = []
    recall = []
    best = 0.

    kf = KFold(n_splits=k, shuffle=True)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = svm.SVC(C=5, kernel='rbf')
        clf.fit(X_train, y_train)
        acc = clf.score(X_test, y_test)
        accuracy.append(acc)
        if acc>best:
            best = acc
            best_train, best_test = train_index, test_index
            
        y_pred = clf.predict(X_test)
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        
    # print('\n%s accuracy: ' % classifier_name, accuracy)
    # print('%s precision: ' % classifier_name, precision)
    # print('%s recall: ' % classifier_name, recall)
    
    # print('\nAvg %s accuracy: %0.2f'% (classifier_name, np.mean(np.array(accuracy))))
    # print('Avg %s precision: %0.2f'% (classifier_name, np.mean(np.array(precision))))
    # print('Avg %s recall: %0.2f\n'% (classifier_name ,np.mean(np.array(recall))))
    
    X_train, X_test, y_train, y_test = X[best_train], X[best_test], y[best_train], y[best_test]
    clf = svm.SVC(C=5, kernel='rbf')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    #fig, ax = plt.subplots(figsize=(4,4))
    #ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=np.array(['real', 'fake']), cmap=plt.cm.Blues, ax=ax)
    #ax.set_title('%s Confusion Matrix \n using %s'%(real_malware,classifier_name))
    #plt.show()
    
    #class_report = classification_report(y_test, y_pred, target_names=target_names) 

    return np.mean(np.array(accuracy)), np.mean(np.array(precision)), np.mean(np.array(recall))

Define Random Forest Classifier

In [18]:
def random_forest(X, y, real_malware, target_names,max_sequence_length):
    classifier_name = 'Random Forest'
    #print('******%s******' % classifier_name)
    k = 5
    accuracy = []
    precision = []
    recall = []
    best = 0.
    kf = KFold(n_splits=k, shuffle=True, random_state=6357)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = RandomForestClassifier(n_estimators=50, max_depth=5)
        clf.fit(X_train, y_train)
        acc = clf.score(X_test, y_test)
        accuracy.append(acc)
        if acc>best:
            best = acc
            best_train, best_test = train_index, test_index
            
        y_pred = clf.predict(X_test)
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        
    # print('\n%s accuracy: ' % classifier_name, accuracy)
    # print('%s precision: ' % classifier_name, precision)
    # print('%s recall: ' % classifier_name, recall)
    
    # print('\nAvg %s accuracy: %0.2f'% (classifier_name, np.mean(np.array(accuracy))))
    # print('Avg %s precision: %0.2f'% (classifier_name, np.mean(np.array(precision))))
    # print('Avg %s recall: %0.2f\n'% (classifier_name ,np.mean(np.array(recall))))
    
    X_train, X_test, y_train, y_test = X[best_train], X[best_test], y[best_train], y[best_test]
    clf = RandomForestClassifier(n_estimators=50, max_depth=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    #fig, ax = plt.subplots(figsize=(4,4))
    #ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=np.array(['real', 'fake']), cmap=plt.cm.Blues, ax=ax)
    #ax.set_title('%s Confusion Matrix \n using %s'%(real_malware,classifier_name))
    #plt.show()
    
    #class_report = classification_report(y_test, y_pred, target_names=target_names) 

    return np.mean(np.array(accuracy)), np.mean(np.array(precision)), np.mean(np.array(recall))

Define k-Nearest Neighbor Classifier

In [19]:
def k_nearest_neighbors(X, y, real_malware, target_names,max_sequence_length):
    classifier_name = 'K-Nearest Neighbors'
    #print('******%s******' % classifier_name)
    k = 5
    accuracy = []
    precision = []
    recall = []
    best = 0.
    kf = KFold(n_splits=k, shuffle=True, random_state=18245)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)
    
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf = KNeighborsClassifier(n_neighbors=1, p=2)
        clf.fit(X_train, y_train)
        acc = clf.score(X_test, y_test)
        accuracy.append(acc)
        if acc>best:
            best = acc
            best_train, best_test = train_index, test_index
            
        y_pred = clf.predict(X_test)
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        
    # print('\n%s accuracy: ' % classifier_name, accuracy)
    # print('%s precision: ' % classifier_name, precision)
    # print('%s recall: ' % classifier_name, recall)
    
    # print('\nAvg %s accuracy: %0.2f'% (classifier_name, np.mean(np.array(accuracy))))
    # print('Avg %s precision: %0.2f'% (classifier_name, np.mean(np.array(precision))))
    # print('Avg %s recall: %0.2f\n'% (classifier_name ,np.mean(np.array(recall))))
    
    X_train, X_test, y_train, y_test = X[best_train], X[best_test], y[best_train], y[best_test]
    clf = KNeighborsClassifier(n_neighbors=1, p=2)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    #fig, ax = plt.subplots(figsize=(4,4))
    #ConfusionMatrixDisplay.from_predictions(y_test, y_pred, display_labels=np.array(['real', 'fake']), cmap=plt.cm.Blues, ax=ax)
    #ax.set_title('%s Confusion Matrix \n using %s'%(real_malware,classifier_name))
    #plt.show()
    
    #class_report = classification_report(y_test, y_pred, target_names=target_names) 

    return np.mean(np.array(accuracy)), np.mean(np.array(precision)), np.mean(np.array(recall))

Main Code

In [20]:
def combined_search(real_malware, function, present_num_sample = 0):
    #### Define Run Settings ####
    tf.random.set_seed(7)
    np.random.seed(7)
    tf.keras.utils.set_random_seed(7)

    max_sequence_length = 600
    use_deep_classifiers = 1 # 1+ = use, 0 do not use 
    target_names = ['Real', 'Fake']
    embedding_vector_length = 32
    num_tries = 20
    save_epoch = []
    latent_dim = 100
    architecture = 'WGANGP'
    deep = ['cnn_lstm_kfold','bi_lstm_kfold','lstm_kfold','cnn_lstm','bi_lstm','lstm']


    if real_malware == 'AllFiveFamilies':
        epoch_num = 1000
    else:
        epoch_num = 10000
    

    #### Load Real Samples ####
    data_obj = data.DATA(real_malware, 600)
    real_data, num_unique = data_obj.load_data(True)
    print('Total valid samples for %s: %i' % (real_malware, len(real_data)))

    ### Load Trained WGAN-GP Model ###
    gen_samples = np.load('C:/Users/Albert/Desktop/CMPE_295/Code/FC_VAEAllFiveFamilies_.npy')
            

    ### Start Binary Search ###
    linear_end_flag = 0
    binary_start_flag = 0
    binary_end_flag = 0
    binary_continue_flag = 0
    mid = 0
    while binary_end_flag == 0:
        low = 0
        count = 0
  
        if present_num_sample > 0 and binary_start_flag == 0:
            high = present_num_sample
            binary_start_flag = 1
        elif binary_continue_flag == 1:
            high = mid
        else:
            high = len(real_data)

        saved_high = high
        while low <= high:
            mid = (high + low) // 2

            if mid < 20: # Make sure num_samples do not go under 5 because of k-folds = 5
                binary_end_flag == 1
                mid = 20
                break

            num_samples = mid
            #print('Generated Samples: %s      Acutal Malware Files Used: %s' % (num_samples,num_samples))

            idx = np.random.randint(0, real_data.shape[0], num_samples)
            real_samples = real_data[idx]

            #### Generate Samples ####
            fake_samples = gen_samples[idx]

            #print(fake_samples.shape)

            #### Define Training and Test Datasets ####
            X = np.concatenate([real_samples, fake_samples])
            y= [1]*num_samples + [0]*num_samples
            X = np.array(X,dtype=object)
            y = np.array(y,dtype='int64')

            if function.__name__ in deep:
                accuracy, precision, recall = function(X,y,real_malware,target_names,max_sequence_length, embedding_vector_length, num_unique)
            else:
                accuracy, precision, recall = function(X,y,real_malware,target_names,max_sequence_length)
            if accuracy < 0.95:
                low = mid + 1

            elif accuracy > 0.95:
                high = mid -1
                count += 1


        if count <= 1: # if binary search does not find a num_samples that classifys with .95 accuracy, return highest value
            mid = saved_high
            binary_end_flag = 1
            if binary_continue_flag == 0:
                linear_end_flag = 1 #dont do linear search if binary search doesnt find any num_sample values to get to .95



        if binary_end_flag == 0:
            binary_continue_flag = 1
            now = datetime.now()
            current_time = now.strftime("%H:%M:%S")
            print('%s: Binary search in progress, Accuracy = %0.2f,  Num_Samples = %i' % (current_time, accuracy, mid))


    now = datetime.now()
    current_time = now.strftime("%H:%M:%S")
    print('%s: Binary search result for %s: %i samples needed for .95 accuracy' % (current_time, function.__name__, mid))

    ### Start Jump Search ###
    if linear_end_flag == 1:
        num_samples = mid
    else:
        num_samples = mid - 1

    final_loop = 0
    decrement = 10
    while linear_end_flag == 0:
        for loop_num in range(num_tries):

            #print('Generated Samples: %s      Acutal Malware Files Used: %s' % (num_samples,num_samples))

            idx = np.random.randint(0, real_data.shape[0], num_samples)
            real_samples = real_data[idx]

            #### Generate Samples ####
            fake_samples = gen_samples[idx]

            #print(fake_samples.shape)

            #### Define Training and Test Datasets ####
            X = np.concatenate([real_samples, fake_samples])
            y= [1]*num_samples + [0]*num_samples
            X = np.array(X,dtype=object)
            y = np.array(y,dtype='int64')

            if function.__name__ in deep:
                accuracy, precision, recall = function(X,y,real_malware,target_names,max_sequence_length, embedding_vector_length, num_unique)
            else:
                accuracy, precision, recall = function(X,y,real_malware,target_names,max_sequence_length)

            if num_samples <= 10:
                linear_end_flag = 1
                break
            
            if accuracy > 0.95:
                if final_loop == 1:
                    linear_end_flag = 1
                    #### Display All Classification Results ####
                    now = datetime.now()
                    current_time = now.strftime("%H:%M:%S")
                    print('%s: Linear search in progress, Accuracy = %0.2f,  Num_Samples = %i,  Number_of_Tries = %i' % (current_time, accuracy, num_samples, (loop_num+1)))
                    print('%s: Linear search result for %s is complete: %i samples needed for .95 accuracy' % (current_time, function.__name__, num_samples))
                    break
                elif num_samples <= 20 and linear_end_flag == 0:
                    num_samples -= 1
                    now = datetime.now()
                    current_time = now.strftime("%H:%M:%S")
                    print('%s: Linear search in progress, Accuracy = %0.2f,  Num_Samples = %i,  Number_of_Tries = %i' % (current_time, accuracy, num_samples, (loop_num+1)))
                else:
                    num_samples -= decrement
                    now = datetime.now()
                    current_time = now.strftime("%H:%M:%S")
                    print('%s: Linear search in progress, Accuracy = %0.2f,  Num_Samples = %i,  Number_of_Tries = %i' % (current_time, accuracy, num_samples, (loop_num+1)))
                break
            if loop_num == (num_tries - 1):
                final_loop = 1
                num_samples += 1
                now = datetime.now()
                current_time = now.strftime("%H:%M:%S")
                print('%s: Linear search in progress, Accuracy = %0.2f,  Num_Samples = %i,  Number_of_Tries = %i' % (current_time, accuracy, num_samples, (loop_num+1)))
                                   
        now = datetime.now()
        current_time = now.strftime("%H:%M:%S")
        print('%s: Linear search result for %s is complete: %i samples needed for .95 accuracy' % (current_time, function.__name__, num_samples))

    return num_samples

In [21]:
#malware_familes = ['AllFiveFamilies','OnLineGames', 'Renos', 'VBInject', 'WinWebSec', 'Zbot']
malware_familes = ['AllFiveFamilies']

for real_malware in malware_familes:
    svm_num_samples  = combined_search(real_malware, support_vector_machine) 
    rf_num_samples  = combined_search(real_malware, random_forest) 
    knn_num_samples  = combined_search(real_malware, k_nearest_neighbors) 

    lstm_num_samples  = combined_search(real_malware, lstm_kfold, 100)
    bi_lstm_num_samples  = combined_search(real_malware, bi_lstm_kfold)
    cnn_lstm_num_samples  = combined_search(real_malware, cnn_lstm_kfold)
    
    print(real_malware)
    print('-------------------------------------------------------------')
    print("SVM Epochs:                     ", svm_num_samples) #10
    print("Random Forest Epochs:           ", rf_num_samples) #31
    print("k-Nearest Neighbor Epochs:      ", knn_num_samples) #11335

    print("Standard LSTM Epochs:           ", lstm_num_samples) #10
    print("Bidirectional LSTM Epochs:      ", bi_lstm_num_samples) #10
    print("CNN LSTM Epochs:                ", cnn_lstm_num_samples) #10
    print('-------------------------------------------------------------')

Total samples for AllFiveFamilies: 12271
Mappings in JSON file: 40
Total valid samples for AllFiveFamilies: 11335
01:02:10: Binary search in progress, Accuracy = 0.95,  Num_Samples = 132
01:02:10: Binary search in progress, Accuracy = 0.97,  Num_Samples = 20
01:02:10: Binary search result for support_vector_machine: 20 samples needed for .95 accuracy
01:02:10: Linear search in progress, Accuracy = 0.97,  Num_Samples = 18,  Number_of_Tries = 1
01:02:10: Linear search in progress, Accuracy = 1.00,  Num_Samples = 17,  Number_of_Tries = 7


  _warn_prf(average, modifier, msg_start, len(result))


01:02:10: Linear search in progress, Accuracy = 0.97,  Num_Samples = 16,  Number_of_Tries = 4
01:02:10: Linear search in progress, Accuracy = 0.97,  Num_Samples = 15,  Number_of_Tries = 1
01:02:10: Linear search in progress, Accuracy = 0.97,  Num_Samples = 14,  Number_of_Tries = 4
01:02:10: Linear search in progress, Accuracy = 0.96,  Num_Samples = 13,  Number_of_Tries = 9


  _warn_prf(average, modifier, msg_start, len(result))


01:02:10: Linear search in progress, Accuracy = 1.00,  Num_Samples = 12,  Number_of_Tries = 3
01:02:10: Linear search in progress, Accuracy = 0.96,  Num_Samples = 11,  Number_of_Tries = 14


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


01:02:11: Linear search in progress, Accuracy = 0.96,  Num_Samples = 10,  Number_of_Tries = 3
01:02:11: Linear search result for support_vector_machine: 10 samples needed for .95 accuracy
Total samples for AllFiveFamilies: 12271
Mappings in JSON file: 40
Total valid samples for AllFiveFamilies: 11335
01:02:54: Binary search in progress, Accuracy = 0.93,  Num_Samples = 50
01:02:56: Binary search result for random_forest: 50 samples needed for .95 accuracy
01:02:57: Linear search in progress, Accuracy = 0.98,  Num_Samples = 39,  Number_of_Tries = 4
01:03:01: Linear search in progress, Accuracy = 0.96,  Num_Samples = 29,  Number_of_Tries = 10
01:03:08: Linear search in progress, Accuracy = 0.81,  Num_Samples = 30,  Number_of_Tries = 20
01:03:16: Linear search in progress, Accuracy = 0.87,  Num_Samples = 31,  Number_of_Tries = 20
01:03:20: Linear search result for random_forest: 31 samples needed for .95 accuracy
Total samples for AllFiveFamilies: 12271
Mappings in JSON file: 40
Total vali

  _warn_prf(average, modifier, msg_start, len(result))


01:29:31: Linear search in progress, Accuracy = 0.97,  Num_Samples = 16,  Number_of_Tries = 8
01:30:36: Linear search in progress, Accuracy = 0.97,  Num_Samples = 15,  Number_of_Tries = 2
01:34:20: Linear search in progress, Accuracy = 0.97,  Num_Samples = 14,  Number_of_Tries = 8
01:36:24: Linear search in progress, Accuracy = 0.97,  Num_Samples = 13,  Number_of_Tries = 4
01:45:11: Linear search in progress, Accuracy = 1.00,  Num_Samples = 12,  Number_of_Tries = 17
01:46:45: Linear search in progress, Accuracy = 0.96,  Num_Samples = 11,  Number_of_Tries = 3
01:47:46: Linear search in progress, Accuracy = 1.00,  Num_Samples = 10,  Number_of_Tries = 2
01:48:29: Linear search result for lstm_kfold: 10 samples needed for .95 accuracy
Total samples for AllFiveFamilies: 12271
Mappings in JSON file: 40
Total valid samples for AllFiveFamilies: 11335
02:41:19: Binary search in progress, Accuracy = 0.97,  Num_Samples = 154
02:43:13: Binary search in progress, Accuracy = 0.99,  Num_Samples = 20


  _warn_prf(average, modifier, msg_start, len(result))


02:50:29: Linear search in progress, Accuracy = 1.00,  Num_Samples = 17,  Number_of_Tries = 6


  _warn_prf(average, modifier, msg_start, len(result))


02:56:34: Linear search in progress, Accuracy = 0.97,  Num_Samples = 16,  Number_of_Tries = 7
02:57:23: Linear search in progress, Accuracy = 0.97,  Num_Samples = 15,  Number_of_Tries = 1
03:06:13: Linear search in progress, Accuracy = 0.97,  Num_Samples = 14,  Number_of_Tries = 11
03:20:48: Linear search in progress, Accuracy = 0.97,  Num_Samples = 13,  Number_of_Tries = 17


  _warn_prf(average, modifier, msg_start, len(result))


03:23:25: Linear search in progress, Accuracy = 1.00,  Num_Samples = 12,  Number_of_Tries = 3
03:24:16: Linear search in progress, Accuracy = 0.96,  Num_Samples = 11,  Number_of_Tries = 1
03:27:37: Linear search in progress, Accuracy = 0.96,  Num_Samples = 10,  Number_of_Tries = 4
03:28:38: Linear search result for bi_lstm_kfold: 10 samples needed for .95 accuracy
Total samples for AllFiveFamilies: 12271
Mappings in JSON file: 40
Total valid samples for AllFiveFamilies: 11335
03:50:53: Binary search in progress, Accuracy = 0.91,  Num_Samples = 22
03:50:53: Binary search result for cnn_lstm_kfold: 22 samples needed for .95 accuracy
03:51:22: Linear search in progress, Accuracy = 0.95,  Num_Samples = 11,  Number_of_Tries = 1


  _warn_prf(average, modifier, msg_start, len(result))


03:53:12: Linear search in progress, Accuracy = 0.96,  Num_Samples = 10,  Number_of_Tries = 4
03:53:45: Linear search result for cnn_lstm_kfold: 10 samples needed for .95 accuracy
AllFiveFamilies
-------------------------------------------------------------
SVM Epochs:                      10
Random Forest Epochs:            31
k-Nearest Neighbor Epochs:       11335
Standard LSTM Epochs:            10
Bidirectional LSTM Epochs:       10
CNN LSTM Epochs:                 10
-------------------------------------------------------------
