Malware Opcode Classification 

Authored by David Luong

Reference: https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/

In [None]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt # for plotting model loss
import math
from datetime import datetime

#Import svm model
from sklearn import svm
#Import random forest model
from sklearn.ensemble import RandomForestClassifier

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LeakyReLU
from tensorflow.keras.preprocessing import sequence

#Import knn model
from sklearn.neighbors import KNeighborsClassifier

# Final evaluation of the model
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.metrics import ConfusionMatrixDisplay, classification_report
from sklearn.metrics import confusion_matrix, recall_score, precision_score
from sklearn.naive_bayes import GaussianNB

# fix random seed for reproducibility
tf.random.set_seed(7)
np.random.seed(7)
tf.keras.utils.set_random_seed(7)

import os
import sys
import json

# Assign main directory to a variable
main_dir=os.path.dirname(sys.path[0])

import data
from tensorflow.keras.models import load_model

Define LSTM Model

In [None]:
def lstm(X, y, real_malware, target_names,max_sequence_length, embedding_vector_length, num_unique):
    """Train an Embedding -> LSTM binary classifier on a single 80/20 split.

    Args:
        X: Token sequences; padded/truncated to ``max_sequence_length``.
        y: Labels aligned with ``X`` (the sigmoid/binary-crossentropy head
            implies two classes).
        real_malware: Not used by this function.
        target_names: Class display names forwarded to
            ``classification_report`` (sorted-label order).
        max_sequence_length: Length every sequence is padded/truncated to.
        embedding_vector_length: Output dimension of the embedding layer.
        num_unique: Input dimension (vocabulary size) of the embedding layer.

    Returns:
        Tuple ``(accuracy, precision, recall, class_report)`` computed on the
        held-out 20% of the data.
    """
    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

    clf = Sequential()
    clf.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
    clf.add(LSTM(100))
    clf.add(Dense(1, activation='sigmoid'))
    clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Early stopping tracks the *training* loss (monitor='loss'); the
    # validation data is only evaluated per epoch, it does not drive stopping.
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
    clf.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)

    # evaluate() returns [loss, accuracy] because metrics=['accuracy'].
    accuracy = clf.evaluate(X_test, y_test, verbose=0)[1]
    y_pred = np.round(clf.predict(X_test, verbose=0))
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Pin the label set to the classes present in the full data set. Without
    # it, a test split that happens to contain (and predict) one class makes
    # classification_report raise because len(target_names) no longer matches.
    class_report = classification_report(y_test, y_pred, labels=np.unique(y), target_names=target_names)
    return accuracy, precision, recall, class_report

def lstm_kfold(X, y, real_malware, target_names,max_sequence_length, embedding_vector_length, num_unique):
    """Evaluate an Embedding -> LSTM binary classifier with 5-fold CV.

    A fresh model is trained for every fold. Afterwards a model is retrained
    on the split of the best-scoring fold and its predictions are used for the
    classification report.

    Args:
        X: Token sequences; padded/truncated to ``max_sequence_length``.
        y: Label array aligned with ``X``; must support integer-array indexing.
        real_malware: Not used by this function.
        target_names: Class display names forwarded to
            ``classification_report`` (sorted-label order).
        max_sequence_length: Length every sequence is padded/truncated to.
        embedding_vector_length: Output dimension of the embedding layer.
        num_unique: Input dimension (vocabulary size) of the embedding layer.

    Returns:
        Tuple ``(mean_accuracy, mean_precision, mean_recall, class_report)``
        where the means are taken over the 5 folds.
    """
    def _fit(X_train, y_train, X_test, y_test):
        # Build, compile and train one fresh LSTM model.
        model = Sequential()
        model.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
        model.add(LSTM(100))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # Early stopping tracks the training loss, not the validation loss.
        callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
        model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)
        return model

    kf = KFold(n_splits=5, shuffle=True, random_state=18245)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)

    accuracy, precision, recall = [], [], []
    # Start below any attainable accuracy so a fold is always selected; with
    # the previous start of 0.0 an all-zero run left best_train/best_test unbound.
    best = -1.0
    best_train = best_test = None

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = _fit(X_train, y_train, X_test, y_test)
        # evaluate() returns [loss, accuracy] because metrics=['accuracy'].
        acc = clf.evaluate(X_test, y_test, verbose=0)[1]
        accuracy.append(acc)
        if acc > best:
            best = acc
            best_train, best_test = train_index, test_index

        y_pred = np.round(clf.predict(X_test, verbose=0))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))

    # Retrain on the best fold's split to produce the report.
    X_train, X_test, y_train, y_test = X[best_train], X[best_test], y[best_train], y[best_test]
    clf = _fit(X_train, y_train, X_test, y_test)
    y_pred = np.round(clf.predict(X_test, verbose=0))

    # Pin the label set so a single-class test fold does not make the report
    # raise on a target_names length mismatch.
    class_report = classification_report(y_test, y_pred, labels=np.unique(y), target_names=target_names)

    return np.mean(np.array(accuracy)), np.mean(np.array(precision)), np.mean(np.array(recall)), class_report

Define Bidirectional LSTM Classifier

In [None]:
def bi_lstm(X, y, real_malware, target_names,max_sequence_length, embedding_vector_length, num_unique):
    """Train an Embedding -> Bidirectional LSTM classifier on one 80/20 split.

    Args:
        X: Token sequences; padded/truncated to ``max_sequence_length``.
        y: Labels aligned with ``X`` (the sigmoid/binary-crossentropy head
            implies two classes).
        real_malware: Not used by this function.
        target_names: Class display names forwarded to
            ``classification_report`` (sorted-label order).
        max_sequence_length: Length every sequence is padded/truncated to.
        embedding_vector_length: Output dimension of the embedding layer.
        num_unique: Input dimension (vocabulary size) of the embedding layer.

    Returns:
        Tuple ``(accuracy, precision, recall, class_report)`` computed on the
        held-out 20% of the data.
    """
    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

    clf = Sequential()
    clf.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
    # Input dropout only; the recurrent_dropout variant was disabled by the author.
    clf.add(Bidirectional(LSTM(100, dropout=0.2)))
    clf.add(Dense(1, activation='sigmoid'))
    clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Early stopping tracks the *training* loss (monitor='loss'); the
    # validation data is only evaluated per epoch, it does not drive stopping.
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
    clf.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)

    # evaluate() returns [loss, accuracy] because metrics=['accuracy'].
    accuracy = clf.evaluate(X_test, y_test, verbose=0)[1]
    y_pred = np.round(clf.predict(X_test, verbose=0))
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Pin the label set to the classes present in the full data set. Without
    # it, a test split that happens to contain (and predict) one class makes
    # classification_report raise because len(target_names) no longer matches.
    class_report = classification_report(y_test, y_pred, labels=np.unique(y), target_names=target_names)

    return accuracy, precision, recall, class_report

def bi_lstm_kfold(X, y, real_malware, target_names,max_sequence_length, embedding_vector_length, num_unique):
    """Evaluate an Embedding -> Bidirectional LSTM classifier with 5-fold CV.

    A fresh model is trained for every fold. Afterwards a model is retrained
    on the split of the best-scoring fold and its predictions are used for the
    classification report.

    Args:
        X: Token sequences; padded/truncated to ``max_sequence_length``.
        y: Label array aligned with ``X``; must support integer-array indexing.
        real_malware: Not used by this function.
        target_names: Class display names forwarded to
            ``classification_report`` (sorted-label order).
        max_sequence_length: Length every sequence is padded/truncated to.
        embedding_vector_length: Output dimension of the embedding layer.
        num_unique: Input dimension (vocabulary size) of the embedding layer.

    Returns:
        Tuple ``(mean_accuracy, mean_precision, mean_recall, class_report)``
        where the means are taken over the 5 folds.
    """
    def _fit(X_train, y_train, X_test, y_test):
        # Build, compile and train one fresh bidirectional LSTM model.
        model = Sequential()
        model.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
        # Input dropout only; the recurrent_dropout variant was disabled by the author.
        model.add(Bidirectional(LSTM(100, dropout=0.2)))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # Early stopping tracks the training loss, not the validation loss.
        callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
        model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)
        return model

    kf = KFold(n_splits=5, shuffle=True, random_state=18245)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)

    accuracy, precision, recall = [], [], []
    # Start below any attainable accuracy so a fold is always selected; with
    # the previous start of 0.0 an all-zero run left best_train/best_test unbound.
    best = -1.0
    best_train = best_test = None

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = _fit(X_train, y_train, X_test, y_test)
        # evaluate() returns [loss, accuracy] because metrics=['accuracy'].
        acc = clf.evaluate(X_test, y_test, verbose=0)[1]
        accuracy.append(acc)
        if acc > best:
            best = acc
            best_train, best_test = train_index, test_index

        y_pred = np.round(clf.predict(X_test, verbose=0))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))

    # Retrain on the best fold's split to produce the report.
    X_train, X_test, y_train, y_test = X[best_train], X[best_test], y[best_train], y[best_test]
    clf = _fit(X_train, y_train, X_test, y_test)
    y_pred = np.round(clf.predict(X_test, verbose=0))

    # Pin the label set so a single-class test fold does not make the report
    # raise on a target_names length mismatch.
    class_report = classification_report(y_test, y_pred, labels=np.unique(y), target_names=target_names)

    return np.mean(np.array(accuracy)), np.mean(np.array(precision)), np.mean(np.array(recall)), class_report

Define CNN LSTM Classifier

In [None]:
def cnn_lstm(X, y, real_malware, target_names,max_sequence_length, embedding_vector_length, num_unique):
    """Train an Embedding -> Conv1D -> MaxPooling1D -> LSTM classifier on one 80/20 split.

    Args:
        X: Token sequences; padded/truncated to ``max_sequence_length``.
        y: Labels aligned with ``X`` (the sigmoid/binary-crossentropy head
            implies two classes).
        real_malware: Not used by this function.
        target_names: Class display names forwarded to
            ``classification_report`` (sorted-label order).
        max_sequence_length: Length every sequence is padded/truncated to.
        embedding_vector_length: Output dimension of the embedding layer.
        num_unique: Input dimension (vocabulary size) of the embedding layer.

    Returns:
        Tuple ``(accuracy, precision, recall, class_report)`` computed on the
        held-out 20% of the data.
    """
    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)

    clf = Sequential()
    clf.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
    clf.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    clf.add(MaxPooling1D(pool_size=2))
    clf.add(LSTM(100))
    clf.add(Dense(1, activation='sigmoid'))
    clf.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Early stopping tracks the *training* loss (monitor='loss'); the
    # validation data is only evaluated per epoch, it does not drive stopping.
    callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
    clf.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)

    # evaluate() returns [loss, accuracy] because metrics=['accuracy'].
    accuracy = clf.evaluate(X_test, y_test, verbose=0)[1]
    y_pred = np.round(clf.predict(X_test, verbose=0))
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)

    # Pin the label set to the classes present in the full data set. Without
    # it, a test split that happens to contain (and predict) one class makes
    # classification_report raise because len(target_names) no longer matches.
    class_report = classification_report(y_test, y_pred, labels=np.unique(y), target_names=target_names)

    return accuracy, precision, recall, class_report

def cnn_lstm_kfold(X, y, real_malware, target_names,max_sequence_length, embedding_vector_length, num_unique):
    """Evaluate an Embedding -> Conv1D -> MaxPooling1D -> LSTM classifier with 5-fold CV.

    A fresh model is trained for every fold. Afterwards a model is retrained
    on the split of the best-scoring fold and its predictions are used for the
    classification report.

    Args:
        X: Token sequences; padded/truncated to ``max_sequence_length``.
        y: Label array aligned with ``X``; must support integer-array indexing.
        real_malware: Not used by this function.
        target_names: Class display names forwarded to
            ``classification_report`` (sorted-label order).
        max_sequence_length: Length every sequence is padded/truncated to.
        embedding_vector_length: Output dimension of the embedding layer.
        num_unique: Input dimension (vocabulary size) of the embedding layer.

    Returns:
        Tuple ``(mean_accuracy, mean_precision, mean_recall, class_report)``
        where the means are taken over the 5 folds.
    """
    def _fit(X_train, y_train, X_test, y_test):
        # Build, compile and train one fresh CNN-LSTM model.
        model = Sequential()
        model.add(Embedding(num_unique, embedding_vector_length, input_length=max_sequence_length))
        model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
        model.add(MaxPooling1D(pool_size=2))
        model.add(LSTM(100))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        # Early stopping tracks the training loss, not the validation loss.
        callback = tf.keras.callbacks.EarlyStopping(monitor='loss', min_delta = 0.01, patience=10, restore_best_weights=True)
        model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=64, callbacks=[callback], verbose=0)
        return model

    kf = KFold(n_splits=5, shuffle=True, random_state=18245)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)

    accuracy, precision, recall = [], [], []
    # Start below any attainable accuracy so a fold is always selected; with
    # the previous start of 0.0 an all-zero run left best_train/best_test unbound.
    best = -1.0
    best_train = best_test = None

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = _fit(X_train, y_train, X_test, y_test)
        # evaluate() returns [loss, accuracy] because metrics=['accuracy'].
        acc = clf.evaluate(X_test, y_test, verbose=0)[1]
        accuracy.append(acc)
        if acc > best:
            best = acc
            best_train, best_test = train_index, test_index

        y_pred = np.round(clf.predict(X_test, verbose=0))
        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))

    # Retrain on the best fold's split to produce the report.
    X_train, X_test, y_train, y_test = X[best_train], X[best_test], y[best_train], y[best_test]
    clf = _fit(X_train, y_train, X_test, y_test)
    y_pred = np.round(clf.predict(X_test, verbose=0))

    # Pin the label set so a single-class test fold does not make the report
    # raise on a target_names length mismatch.
    class_report = classification_report(y_test, y_pred, labels=np.unique(y), target_names=target_names)

    return np.mean(np.array(accuracy)), np.mean(np.array(precision)), np.mean(np.array(recall)), class_report

Define SVM Classifier

In [None]:
def support_vector_machine(X, y, real_malware, target_names,max_sequence_length):
    """Evaluate an RBF-kernel SVM (C=5) with shuffled 5-fold cross-validation.

    Args:
        X: Token sequences; padded/truncated to ``max_sequence_length``.
        y: Label array aligned with ``X``; must support integer-array indexing.
        real_malware: Not used by this function.
        target_names: Class display names forwarded to
            ``classification_report`` (sorted-label order).
        max_sequence_length: Length every sequence is padded/truncated to.

    Returns:
        Tuple ``(mean_accuracy, mean_precision, mean_recall, class_report)``;
        the means are over the 5 folds and the report describes the
        best-scoring fold.
    """
    # NOTE: unlike the other classifiers no random_state is fixed here, so the
    # fold assignment depends on NumPy's global RNG state.
    kf = KFold(n_splits=5, shuffle=True)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)

    accuracy, precision, recall = [], [], []
    # Start below any attainable accuracy so a fold is always selected; with
    # the previous start of 0.0 an all-zero run left the best fold unbound.
    best = -1.0
    best_y_test = best_y_pred = None

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = svm.SVC(C=5, kernel='rbf')
        clf.fit(X_train, y_train)

        # Predict once and derive accuracy from it; clf.score() would run the
        # same prediction a second time.
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        accuracy.append(acc)
        if acc > best:
            best = acc
            # SVC fitting is deterministic, so refitting on this split later
            # would reproduce exactly these predictions; keep them instead.
            best_y_test, best_y_pred = y_test, y_pred

        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))

    # Pin the label set so a single-class test fold does not make the report
    # raise on a target_names length mismatch.
    class_report = classification_report(best_y_test, best_y_pred, labels=np.unique(y), target_names=target_names)

    return np.mean(np.array(accuracy)), np.mean(np.array(precision)), np.mean(np.array(recall)), class_report

Define Random Forest Classifier

In [None]:
def random_forest(X, y, real_malware, target_names,max_sequence_length):
    """Evaluate a random forest (50 trees, depth 5) with shuffled 5-fold CV.

    After cross-validation a new forest is fitted on the best-scoring fold's
    split and its predictions are used for the classification report.

    Args:
        X: Token sequences; padded/truncated to ``max_sequence_length``.
        y: Label array aligned with ``X``; must support integer-array indexing.
        real_malware: Not used by this function.
        target_names: Class display names forwarded to
            ``classification_report`` (sorted-label order).
        max_sequence_length: Length every sequence is padded/truncated to.

    Returns:
        Tuple ``(mean_accuracy, mean_precision, mean_recall, class_report)``
        where the means are taken over the 5 folds.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=6357)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)

    accuracy, precision, recall = [], [], []
    # Start below any attainable accuracy so a fold is always selected; with
    # the previous start of 0.0 an all-zero run left best_train/best_test unbound.
    best = -1.0
    best_train = best_test = None

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = RandomForestClassifier(n_estimators=50, max_depth=5)
        clf.fit(X_train, y_train)

        # Predict once and derive accuracy from it; clf.score() would run the
        # same prediction a second time.
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        accuracy.append(acc)
        if acc > best:
            best = acc
            best_train, best_test = train_index, test_index

        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))

    # Refit on the best fold's split for the report. The forest has no fixed
    # random_state, so this refit is kept to leave the global RNG sequence
    # (and therefore downstream sampling) unchanged.
    X_train, X_test, y_train, y_test = X[best_train], X[best_test], y[best_train], y[best_test]
    clf = RandomForestClassifier(n_estimators=50, max_depth=5)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    # Pin the label set so a single-class test fold does not make the report
    # raise on a target_names length mismatch.
    class_report = classification_report(y_test, y_pred, labels=np.unique(y), target_names=target_names)

    return np.mean(np.array(accuracy)), np.mean(np.array(precision)), np.mean(np.array(recall)), class_report

Define k-Nearest Neighbor Classifier

In [None]:
def k_nearest_neighbors(X, y, real_malware, target_names,max_sequence_length):
    """Evaluate a 1-nearest-neighbour classifier (Euclidean, p=2) with 5-fold CV.

    Args:
        X: Token sequences; padded/truncated to ``max_sequence_length``.
        y: Label array aligned with ``X``; must support integer-array indexing.
        real_malware: Not used by this function.
        target_names: Class display names forwarded to
            ``classification_report`` (sorted-label order).
        max_sequence_length: Length every sequence is padded/truncated to.

    Returns:
        Tuple ``(mean_accuracy, mean_precision, mean_recall, class_report)``;
        the means are over the 5 folds and the report describes the
        best-scoring fold.
    """
    kf = KFold(n_splits=5, shuffle=True, random_state=18245)

    # truncate and pad input sequences
    X = sequence.pad_sequences(X, maxlen=max_sequence_length)

    accuracy, precision, recall = [], [], []
    # Start below any attainable accuracy so a fold is always selected; with
    # the previous start of 0.0 an all-zero run left the best fold unbound.
    best = -1.0
    best_y_test = best_y_pred = None

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf = KNeighborsClassifier(n_neighbors=1, p=2)
        clf.fit(X_train, y_train)

        # Predict once and derive accuracy from it; clf.score() would run the
        # same (expensive) neighbour search a second time.
        y_pred = clf.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        accuracy.append(acc)
        if acc > best:
            best = acc
            # KNN is deterministic, so refitting on this split later would
            # reproduce exactly these predictions; keep them instead.
            best_y_test, best_y_pred = y_test, y_pred

        precision.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))

    # Pin the label set so a single-class test fold does not make the report
    # raise on a target_names length mismatch.
    class_report = classification_report(best_y_test, best_y_pred, labels=np.unique(y), target_names=target_names)

    return np.mean(np.array(accuracy)), np.mean(np.array(precision)), np.mean(np.array(recall)), class_report

Main Code

In [None]:
def combined_search(real_malware, function):
    """Search for the sample count at which ``function`` reaches 0.95 accuracy.

    Real samples are drawn (with replacement) from the loaded data set and an
    equal number of samples is produced by the saved generator; ``function``
    is then asked to tell them apart. A binary search over the sample count
    gives a starting point, and a linear search walks downward from there
    until ``num_tries`` consecutive attempts fail to exceed the threshold.

    Args:
        real_malware: Malware family name; selects the data set and the
            generator checkpoint to load.
        function: One of the classifier functions of this notebook. Functions
            whose ``__name__`` is listed in ``deep`` are called with the two
            extra embedding arguments.

    Returns:
        The sample count at which the linear search stopped.
    """
    #### Define Run Settings ####
    tf.random.set_seed(7)
    np.random.seed(7)
    tf.keras.utils.set_random_seed(7)

    max_sequence_length = 600
    min_samples = 5   # floor on the sample count because the classifiers use k-folds = 5
    threshold = 0.95
    # Labels below are 0 = generated, 1 = real, and classification_report
    # assigns display names in sorted-label order, so 'Fake' must come first.
    target_names = ['Fake', 'Real']
    embedding_vector_length = 32
    num_tries = 20
    latent_dim = 100
    architecture = 'WGANGP'
    deep = ['cnn_lstm_kfold','bi_lstm_kfold','lstm_kfold','cnn_lstm','bi_lstm','lstm']

    epoch_num = 1000 if real_malware == 'AllFiveFamilies' else 10000

    #### Load Real Samples ####
    data_obj = data.DATA(real_malware, max_sequence_length)
    real_data, num_unique = data_obj.load_data(True)
    # NOTE(review): machine-specific absolute path; adjust for your environment.
    generator = load_model('C:/Users/Albert/Desktop/CMPE_295/%s_Results/Models_test/%s/%s_generator_%d.hdf5' % (architecture, real_malware, real_malware, epoch_num), compile=False, custom_objects={'LeakyReLU': LeakyReLU})

    def _timestamp():
        # Current wall-clock time for progress messages.
        return datetime.now().strftime("%H:%M:%S")

    def _evaluate(num_samples):
        # Build a balanced real/generated data set of 2*num_samples rows and
        # return the accuracy `function` achieves on it.
        idx = np.random.randint(0, real_data.shape[0], num_samples)
        real_samples = real_data[idx]

        #### Generate Samples ####
        noise = np.random.normal(0, 1, (num_samples, latent_dim))
        gen_samples = generator.predict(noise, verbose=0)
        # presumably the generator emits values in [-1, 1]; this rescales
        # them to token ids in [0, num_unique] -- TODO confirm
        gen_samples = (gen_samples + 1) * num_unique/2
        gen_samples = np.rint(gen_samples).astype(int)
        gen_samples = np.reshape(gen_samples, (num_samples, max_sequence_length))

        #### Define Training and Test Datasets ####
        X = np.array(np.concatenate([real_samples, gen_samples]), dtype=object)
        y = np.array([1]*num_samples + [0]*num_samples, dtype='int64')

        if function.__name__ in deep:
            accuracy, _, _, _ = function(X, y, real_malware, target_names, max_sequence_length, embedding_vector_length, num_unique)
        else:
            accuracy, _, _, _ = function(X, y, real_malware, target_names, max_sequence_length)
        return accuracy

    ### Start Binary Search ###
    low = 0
    mid = 0
    count = 0
    high = len(real_data) - 1
    saved_high = high

    while low <= high:
        # Make sure num_samples do not go under 5 because of k-folds = 5
        mid = max((high + low) // 2, min_samples)

        accuracy = _evaluate(mid)
        if accuracy < threshold:
            low = mid + 1
        else:
            # Includes accuracy == threshold: previously neither bound moved
            # in that case and the loop re-ran the same size.
            high = mid - 1
            count += 1

    # if binary search does not find a num_samples that classifys with .95 accuracy, return highest value
    # NOTE(review): `count <= 1` also discards a search with exactly one hit -- confirm this is intended.
    if count <= 1:
        mid = saved_high

    print('%s: Binary search result for %s: %i samples needed for .95 accuracy' % (_timestamp(), function.__name__, mid))

    ### Start Linear Search ###
    # Walk downward one sample at a time, never below min_samples.
    num_samples = max(mid - 1, min_samples)
    searching = True
    while searching:
        for _ in range(num_tries):
            accuracy = _evaluate(num_samples)
            if accuracy > threshold:
                if num_samples > min_samples:
                    num_samples -= 1
                else:
                    # Already at the floor: cannot shrink further.
                    searching = False
                break
        else:
            # Every try failed at this size: the previous size is the answer.
            searching = False
            num_samples += 1
        if (mid - num_samples) % 10 == 0:
            print('%s: Linear search in progress, Accuracy = %0.2f,  Num_Samples = %i' % (_timestamp(), accuracy, num_samples))

    #### Display All Classification Results ####
    print('%s: Linear search result for %s: %i samples needed for .95 accuracy' % (_timestamp(), function.__name__, num_samples))
    return num_samples

In [None]:
# Full list of available families:
#malware_familes = ['AllFiveFamilies','OnLineGames', 'Renos', 'VBInject', 'WinWebSec', 'Zbot']
malware_familes = ['AllFiveFamilies']

# Run the sample-count search for every classifier on each family and print
# a summary table per family.
for real_malware in malware_familes:
    svm_num_samples  = combined_search(real_malware, support_vector_machine)
    rf_num_samples  = combined_search(real_malware, random_forest)
    knn_num_samples  = combined_search(real_malware, k_nearest_neighbors)

    lstm_num_samples  = combined_search(real_malware, lstm_kfold)
    bi_lstm_num_samples  = combined_search(real_malware, bi_lstm_kfold)
    cnn_lstm_num_samples  = combined_search(real_malware, cnn_lstm_kfold)

    # combined_search returns a sample count, so the rows are labelled
    # "Samples" (they were previously mislabelled "Epochs").
    print(real_malware)
    print('-------------------------------------------------------------')
    print("SVM Samples:                    ", svm_num_samples)
    print("Random Forest Samples:          ", rf_num_samples)
    print("k-Nearest Neighbor Samples:     ", knn_num_samples)

    print("Standard LSTM Samples:          ", lstm_num_samples)
    print("Bidirectional LSTM Samples:     ", bi_lstm_num_samples)
    print("CNN LSTM Samples:               ", cnn_lstm_num_samples)
    print('-------------------------------------------------------------')

Visualize Real and Fake Malware Samples

In [None]:
# Render one padded sequence from each half of X as a 20x30 image.
# NOTE(review): X is not assigned at module level in the visible cells (it is
# local to combined_search) -- confirm where this cell gets it from.
# NOTE(review): rows 4 and 199 are presumably a real and a generated sample
# (real rows are concatenated first); that only holds for suitable X sizes.
tmp = sequence.pad_sequences(
    X,
    maxlen=600,
    dtype='int32',
    padding='pre',
    truncating='pre',
    value=0.,
)

print('Real Malware - ' + real_malware)
real_image = np.array(tmp[4]).reshape(20, 30)
plt.matshow(real_image)
#plt.show()

print('Fake Malware - ' + real_malware)
fake_image = np.array(tmp[199]).reshape(20, 30)
plt.matshow(fake_image)
#plt.show()