Notebook Credit: Albert Giang

This Notebook takes the trained WGAN-GP and produces syntenic opcode files to be used as input for classifiers

In [1]:
import numpy as np
import data
import json
import tensorflow
import random
import data_mod
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import LeakyReLU
from collections import Counter

In [4]:
family = 'AllFiveFamilies'
latent_dim = 100
num_samples = 5000
architecture = 'WGANGP'
epoch_num = 100000


In [5]:
data_obj = data_mod.DATA(family, 600)
data_samples, num_unique, unique_opcodes = data_obj.load_data(False)
print('Num unique: %d'%num_unique)
with open('opdict'+family+'.json', "r") as json_file:
    mapping = json.load(json_file)
factor = num_unique/2

Total samples for AllFiveFamilies: 12271
Mappings in JSON file: 40
Total Usable samples for AllFiveFamilies: 11335
Num unique: 40


In [8]:
WinWebSec = json.load(open('opdictWinWebSec.json', "r"))
VBInject = json.load(open('opdictVBInject.json', "r"))
Renos = json.load(open('opdictRenos.json', "r"))
OnLineGames = json.load(open('opdictOnLineGames.json', "r"))
Zbot = json.load(open('opdictZbot.json', "r"))

In [11]:
keys_set = set()
for dict_name in ['WinWebSec', 'VBInject', 'Renos', 'OnLineGames', 'Zbot']:
    keys_set = keys_set.union(set(eval(dict_name).keys()))

print(keys_set)


{'jz', 'cmc', 'push', 'cmp', 'insb', 'xchg', 'test', 'fadds', 'je', 'out', 'jb', 'lods', 'daa', 'movb', 'ja', 'shr', 'sub', 'stos', 'mov', 'data16', 'dec', 'jnz', 'scas', 'lea', 'adc', 'pop', 'in', 'inc', 'jle', 'xor', 'add', 'leave', 'sbb', 'imul', 'iret', 'retn', 'call', 'jmp', 'and', 'or'}


In [13]:
new_dict = {key: i for i, key in enumerate(keys_set)}

print(new_dict)
with open('opdictAllFiveFamilies.json', 'w') as f:
    # Write the dictionary to the file in JSON format
    json.dump(new_dict, f)

{'jz': 0, 'cmc': 1, 'push': 2, 'cmp': 3, 'insb': 4, 'xchg': 5, 'test': 6, 'fadds': 7, 'je': 8, 'out': 9, 'jb': 10, 'lods': 11, 'daa': 12, 'movb': 13, 'ja': 14, 'shr': 15, 'sub': 16, 'stos': 17, 'mov': 18, 'data16': 19, 'dec': 20, 'jnz': 21, 'scas': 22, 'lea': 23, 'adc': 24, 'pop': 25, 'in': 26, 'inc': 27, 'jle': 28, 'xor': 29, 'add': 30, 'leave': 31, 'sbb': 32, 'imul': 33, 'iret': 34, 'retn': 35, 'call': 36, 'jmp': 37, 'and': 38, 'or': 39}


In [4]:
Counter(data_samples.flatten()).most_common()

[('mov', 825793),
 ('push', 453543),
 ('call', 199512),
 ('add', 134940),
 ('pop', 128627),
 ('cmp', 83801),
 ('sub', 63151),
 ('retn', 49295),
 ('jnz', 46423),
 ('lea', 45087),
 ('jmp', 43811),
 ('xor', 32719),
 ('test', 32416),
 ('jz', 31159),
 ('and', 20321),
 ('jb', 17141),
 ('inc', 10511),
 ('shr', 10110),
 ('movzx', 9930),
 ('rep', 8895),
 ('jbe', 8299),
 ('or', 7561),
 ('dec', 6679),
 ('cld', 6387),
 ('std', 6308),
 ('shl', 6283),
 ('imul', 5617),
 ('jnb', 4891),
 ('nop', 4768),
 ('int', 3640),
 ('adc', 3121),
 ('jge', 3083),
 ('sbb', 2852),
 ('leave', 2446),
 ('xchg', 2188),
 ('jl', 2150),
 ('neg', 2017),
 ('div', 1890),
 ('jle', 1192),
 ('fstp', 1004),
 ('setnz', 970),
 ('in', 970),
 ('out', 907),
 ('ja', 863),
 ('popa', 802),
 ('ror', 748),
 ('cwde', 701),
 ('pusha', 627),
 ('movsx', 590),
 ('rcr', 579),
 ('jo', 565),
 ('retf', 544),
 ('fld', 523),
 ('xlat', 515),
 ('jp', 413),
 ('sar', 413),
 ('js', 396),
 ('jns', 392),
 ('clc', 381),
 ('rol', 379),
 ('aad', 377),
 ('sahf', 

In [5]:
print(unique_opcodes)

{'popa', 'cmovo', 'psllw', 'stosw', 'fcmove', 'jno', 'fcmovb', 'setnp', 'cwde', 'ficomp', 'pusha', 'fbstp', 'movsd', 'faddp', 'movntps', 'setnb', 'fxch4', 'jle', 'fist', 'fstsw', 'cmpxchg', 'fisub', 'lahf', 'por', 'fisubr', 'in', 'pslld', 'popf', 'fyl2x', 'stc', 'jnp', 'insd', 'shl', 'rol', 'paddusw', 'fsubr', 'mul', 'svts', 'cmpps', 'fnstsw', 'hlt', 'aam', 'loopwne', 'icebp', 'loop', 'setnl', 'svdc', 'aad', 'fsave', 'pcmpeqb', 'frstor', 'arpl', 'setnz', 'stosb', 'fucomi', 'rcr', 'cld', 'scasd', 'rcpps', 'pmovmskb', 'cli', 'idiv', 'fnstenv', 'setz', 'orps', 'aas', 'jnb', 'fsub', 'fcmovnb', 'div', 'pushf', 'ucomiss', 'fistp', 'fcmovbe', 'loope', 'fldlg2', 'fyl2xp1', 'fadd', 'scas', 'btc', 'lds', 'lock', 'bsf', 'xadd', 'leavew', 'psrad', 'ror', 'scasb', 'pminsw', 'addps', 'punpckhdq', 'js', 'lods', 'cmova', 'movups', 'rcl', 'sahf', 'fldenv', 'rsqrtps', 'movsw', 'fld1', 'outsb', 'fcomip', 'movs', 'iret', 'fstcw', 'fcmovnbe', 'cmovnb', 'ja', 'fimul', 'f2xm1', 'fdiv', 'das', 'jl', 'movsb', 

In [6]:
generator = load_model('C:/Users/Albert/Desktop/CMPE_295/%s_Results/Models_test/%s/%s_generator_%d.hdf5' % (architecture, family, family, epoch_num), custom_objects={'LeakyReLU': LeakyReLU})
noise = np.random.normal(0, 1, (num_samples, latent_dim))
gen_samples = generator.predict(noise)
gen_samples = (gen_samples + 1) * factor
gen_samples = np.rint(gen_samples)
gen_samples = gen_samples.astype(int)
gen_samples = np.reshape(gen_samples, (num_samples, 600))



In [None]:
print(mapping)
print(gen_samples)

In [7]:
def countX(lst, x):
    count = 0
    for ele in lst:
        if (ele == x):
            count = count + 1
    return count

In [8]:
num_twoone = []
row_length = []
for row in gen_samples:
    #print(row)
    row_length.append(len(row))
    num_twoone.append(countX(row, 21))

print(num_twoone)
print(row_length)

[1, 3, 0, 0, 0, 0, 7, 20, 0, 1, 12, 101, 50, 63, 17, 2, 0, 2, 6, 42, 0, 0, 0, 0, 0, 0, 0, 2, 60, 38, 20, 1, 48, 0, 0, 72, 10, 5, 0, 18, 0, 0, 12, 43, 1, 21, 48, 0, 1, 0, 17, 1, 37, 64, 2, 4, 2, 0, 62, 0, 10, 0, 12, 0, 2, 0, 1, 0, 10, 78, 0, 0, 0, 91, 4, 8, 40, 0, 0, 8, 11, 48, 52, 2, 0, 38, 0, 0, 10, 0, 2, 0, 73, 0, 2, 9, 12, 0, 59, 34, 34, 87, 0, 20, 13, 34, 0, 37, 2, 22, 0, 0, 4, 21, 1, 36, 54, 0, 0, 3, 33, 54, 16, 14, 6, 2, 84, 0, 0, 0, 0, 0, 0, 0, 13, 0, 0, 14, 67, 0, 0, 0, 50, 1, 14, 59, 48, 1, 14, 0, 0, 29, 0, 1, 16, 5, 13, 0, 68, 40, 22, 0, 10, 3, 19, 1, 0, 1, 36, 0, 0, 0, 2, 12, 0, 0, 17, 15, 0, 38, 1, 28, 24, 0, 0, 3, 0, 34, 0, 16, 54, 23, 64, 0, 59, 104, 0, 0, 0, 0, 4, 67, 26, 0, 9, 10, 0, 0, 0, 2, 0, 47, 19, 5, 0, 5, 43, 0, 6, 0, 0, 0, 0, 3, 0, 3, 0, 9, 2, 0, 0, 0, 0, 0, 0, 37, 0, 83, 0, 5, 0, 0, 5, 0, 0, 0, 47, 28, 7, 3, 0, 0, 57, 7, 20, 6, 1, 0, 0, 4, 7, 20, 0, 0, 0, 46, 1, 7, 20, 0, 31, 2, 0, 8, 1, 17, 58, 3, 0, 61, 3, 3, 7, 198, 0, 0, 0, 34, 2, 0, 0, 0, 9, 0, 20, 1, 6, 4

In [11]:
# Main Code to Generate Malware Files
unique_opcodes_list = list(unique_opcodes)
for count, row in enumerate(gen_samples):
    current_file = []
    for cell in row:
        if cell < len(mapping):
            current_file.append(list(mapping.keys())[cell])
        else:
            current_file.append(random.choice(unique_opcodes_list))
    with open('Output/generated_'+family+'_'+str(count+1)+'.txt', "w") as out_file:
        for opcode in current_file:
            out_file.write(opcode+'\n')


In [None]:
list(mapping.keys())[0]

In [None]:
np.shape(data_samples)
