In [1]:
import tensorflow as tf
import tensorflow.contrib.layers as layers

import functools
import os
import pickle

import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
dir_path = os.getcwd()

In [3]:
#load embendings

embeddings = np.loadtxt(os.path.join(dir_path,'cache','embeddings.vec'))

pickle_in = open('cache/database.dict', 'rb')
db = pickle.load(pickle_in)
pickle_in.close()

pickle_in = open('cache/traindata.list', 'rb')
traindata = pickle.load(pickle_in)
pickle_in.close()

pickle_in = open('cache/data2onehot.dict', 'rb')
dictionary = pickle.load(pickle_in)
pickle_in.close()

In [4]:
from JavaClassParser import ByteCode
reverse_dictionary = dict(zip(dictionary.values(),dictionary.keys()))
X_train=[]
Y_train=[]
X_train_long=[]
Y_train_long=[]
X_labels = []
Y_labels = []

for dclass in db.values():
    for method in dclass.values():
        instructions = method['x']
        labels = method['y']
        byteIndex = method['index']
        
        
        #
        #seperating by labels

        cur_section = []
        cur_label = labels[0]
        if len(instructions)<100:
            X_train.append(instructions)
            Y_train.append(labels)
        else:
            X_train_long.append(instructions)
            Y_train_long.append(labels)
        for instruction, label in zip(instructions,labels):
            if label != cur_label:
                X_labels.append(cur_section)
                Y_labels.append(cur_label)
                cur_section = []
                cur_label = label
            cur_section.append(dictionary.get(instruction,0))

X_train = np.array(X_train)
Y_train = np.array(Y_train)
X_train_long = np.array(X_train_long)
Y_train_long = np.array(Y_train_long)
X_labels = np.array(X_labels)
Y_labels = np.array(Y_labels)

In [5]:
num_bits = 16
zero_value = 2**(num_bits-1)
def gray_code(n):
    n -= 1
    return format(n^(n >> 1), '0'+str(num_bits)+'b')

def bin_code(n):
    return format(n, '0'+str(num_bits)+'b')

def encode_arg(value):
    n=zero_value+value
    return np.array(list(map(lambda x: int(x), list(gray_code(n)))))

In [6]:
num_instructions = len(ByteCode.mnemonicMap)
encoding_len = num_instructions + num_bits
encode_dict = dict( zip(ByteCode.mnemonicMap.keys(),range(num_instructions)) )

X_encoded = []
Y_encoded = []

for x_seq,y_seq in zip(X_train,Y_train):
    encoded_x_seq = []
    encoded_y_seq = []
    for x,y in zip(x_seq, y_seq):
        x_vector = np.zeros(encoding_len)
        if x in encode_dict:
            onehot = np.zeros(num_instructions)
            onehot[encode_dict[x]] = 1
            x_vector[:num_instructions] = onehot
        else:
            try:
                value=int(x)
                x_vector[num_instructions:] = encode_arg(value)
            except ValueError as e:
                continue
        encoded_x_seq.append(x_vector)
        encoded_y_seq.append(y)
    X_encoded.append(encoded_x_seq)
    Y_encoded.append(encoded_y_seq)

In [7]:
sequence_length = 100
batch_size = 32

def batch_iterator(batch_size, seq_len = sequence_length,enc_len = encoding_len, X=X_encoded, Y=Y_encoded):
    batch_index = 0
    while(True):
        current_size = 0
        batch_X = np.zeros((batch_size, seq_len, enc_len,1))
        batch_Y = np.zeros((batch_size, seq_len))
        while(current_size < batch_size):
            if batch_index == len(X):
                batch_index = 0
            if len(X[batch_index]) <= sequence_length:
                batch_X[current_size,:len(X[batch_index]),:,0] = np.array(X[batch_index])
                batch_Y[current_size,:len(X[batch_index])] = np.array(Y[batch_index])[:,1]
                current_size += 1
            batch_index += 1
            yield batch_X, batch_Y
        

In [8]:

def doublewrap(function):
    """
    A decorator decorator, allowing to use the decorator to be used without
    parentheses if not arguments are provided. All arguments must be optional.
    """
    @functools.wraps(function)
    def decorator(*args, **kwargs):
        if len(args) == 1 and len(kwargs) == 0 and callable(args[0]):
            return function(args[0])
        else:
            return lambda wrapee: function(wrapee, *args, **kwargs)
    return decorator


@doublewrap
def define_scope(function, scope=None, *args, **kwargs):
    """
    A decorator for functions that define TensorFlow operations. The wrapped
    function will only be executed once. Subsequent calls to it will directly
    return the result so that operations are added to the graph only once.
    The operations added by the function live within a tf.variable_scope(). If
    this decorator is used with arguments, they will be forwarded to the
    variable scope. The scope name defaults to the name of the wrapped
    function.
    """
    attribute = '_cache_' + function.__name__
    name = scope or function.__name__
    @property
    @functools.wraps(function)
    def decorator(self):
        if not hasattr(self, attribute):
            with tf.variable_scope(name, *args, **kwargs):
                setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return decorator


class ConvolutionModel:

    
    def __init__(self, image, label,
                 learning_rate=3e-2, 
                 num_epochs=10,
                 weight_decay = 1e-2,
                 conv1sz = 4,
                 conv2sz = 8,
                 fc3sz = 512,
                 num_instructions = 100,
                 representation_size = encoding_len):
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.weight_decay = weight_decay
        self.conv1sz = conv1sz
        self.conv2sz = conv2sz
        self.fc3sz = fc3sz
        
        self.image = image
        self.label = label
        
        self.num_instructions = num_instructions
        self.representation_size = representation_size
        
        self.prediction
        self.optimize
        self.error

    @define_scope
    def prediction(self):
        inputs = self.image
        weight_decay = self.weight_decay
        conv1sz = self.conv1sz
        conv2sz = self.conv2sz
        fc3sz = self.fc3sz
        num_instructions = self.num_instructions
        representation_size = self.representation_size
        
        with tf.contrib.framework.arg_scope([layers.convolution2d],
          kernel_size=[2, representation_size], stride=1, padding='VALID', activation_fn=tf.nn.relu,
          weights_initializer=layers.variance_scaling_initializer(),
          weights_regularizer=layers.l2_regularizer(weight_decay)):

            net1 = layers.convolution2d(inputs, conv1sz, scope='conv1_1')
        
        with tf.contrib.framework.arg_scope([layers.convolution2d],
          kernel_size=[2, 1], stride=1, padding='VALID', activation_fn=tf.nn.relu,
          weights_initializer=layers.variance_scaling_initializer(),
          weights_regularizer=layers.l2_regularizer(weight_decay)):
            
            net1 = layers.convolution2d(net1, conv2sz, scope='conv2_1')
            net1 = layers.max_pool2d(net1, [2,1], representation_size, scope='pool1')
            
        with tf.contrib.framework.arg_scope([layers.convolution2d],
          kernel_size=[3, representation_size], stride=1, padding='VALID', activation_fn=tf.nn.relu,
          weights_initializer=layers.variance_scaling_initializer(),
          weights_regularizer=layers.l2_regularizer(weight_decay)):
            
            net2 = layers.convolution2d(inputs, conv1sz, scope='conv1_2')
        
        with tf.contrib.framework.arg_scope([layers.convolution2d],
          kernel_size=[3, 1], stride=1, padding='VALID', activation_fn=tf.nn.relu,
          weights_initializer=layers.variance_scaling_initializer(),
          weights_regularizer=layers.l2_regularizer(weight_decay)):
            
            net2 = layers.convolution2d(net2, conv2sz, scope='conv2_2')
            net2 = layers.max_pool2d(net2, [2,1], representation_size, scope='pool2')
            
        with tf.contrib.framework.arg_scope([layers.convolution2d],
          kernel_size=[4, representation_size], stride=1, padding='VALID', activation_fn=tf.nn.relu,
          weights_initializer=layers.variance_scaling_initializer(),
          weights_regularizer=layers.l2_regularizer(weight_decay)):
            
            net3 = layers.convolution2d(inputs, conv1sz, scope='conv1_3')
        
        with tf.contrib.framework.arg_scope([layers.convolution2d],
          kernel_size=[4, 1], stride=1, padding='VALID', activation_fn=tf.nn.relu,
          weights_initializer=layers.variance_scaling_initializer(),
          weights_regularizer=layers.l2_regularizer(weight_decay)):
            
            net3 = layers.convolution2d(net3, conv2sz, scope='conv2_3')
            net3 = layers.max_pool2d(net3, [2,1], representation_size, scope='pool3')
    
        with tf.contrib.framework.arg_scope([layers.convolution2d],
          kernel_size=[5, representation_size], stride=1, padding='VALID', activation_fn=tf.nn.relu,
          weights_initializer=layers.variance_scaling_initializer(),
          weights_regularizer=layers.l2_regularizer(weight_decay)):
            
            net4 = layers.convolution2d(inputs, conv1sz, scope='conv1_4')
        
        with tf.contrib.framework.arg_scope([layers.convolution2d],
          kernel_size=[5, 1], stride=1, padding='VALID', activation_fn=tf.nn.relu,
          weights_initializer=layers.variance_scaling_initializer(),
          weights_regularizer=layers.l2_regularizer(weight_decay)):
            
            net4 = layers.convolution2d(net4, conv2sz, scope='conv2_4')
            net4 = layers.max_pool2d(net4, [2,1], representation_size, scope='pool4')
        
        inputs = [net1, net2, net3, net4]
            
        with tf.contrib.framework.arg_scope([layers.fully_connected],
          activation_fn=tf.nn.relu,
          weights_initializer=layers.variance_scaling_initializer(),
          weights_regularizer=layers.l2_regularizer(weight_decay)):

            inputs = list(map(lambda net: layers.flatten(net), inputs))
            net = tf.concat(inputs, axis=1)
            net = layers.fully_connected(net, fc3sz, scope='fc3')
        
        instructions_predict = layers.fully_connected(net, num_instructions, activation_fn=tf.nn.sigmoid, scope='instructions')
        return instructions_predict

    @define_scope
    def optimize(self):
        mse = tf.losses.mean_squared_error(self.label, self.prediction, scope ='loss')
        optimizer = tf.train.RMSPropOptimizer(self.learning_rate)
        return optimizer.minimize(mse)

    @define_scope
    def error(self):
        mistakes = tf.not_equal(
            self.label, tf.round(self.prediction) )
        return tf.reduce_mean(tf.cast(mistakes, tf.float32))


In [9]:
image = tf.placeholder(tf.float32, [None, sequence_length, encoding_len, 1])
label = tf.placeholder(tf.float32, [None, sequence_length])
model = ConvolutionModel(image, label)
batch = batch_iterator(batch_size)

In [10]:
N=len(X_encoded)
num_epochs = 10
n_steps = N//batch_size
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for _ in range(num_epochs):
    avg_error = 0
    for _ in range(n_steps):
        images, labels = next(batch)
        error = sess.run(model.error, {image: images, label: labels})
        avg_error += error
    print(avg_error/n_steps)

0.212285043128
0.232957009011
0.23038656039
0.223713873304
0.231031430341
0.211320448215
0.230083093149
0.227006864115
0.225341401887
0.237722182415
0.222975071875
0.229705563693
0.21961705174
0.223609104575
0.237149566606
0.213368858291
0.231625722323
0.220592484836
0.22351156068
0.229850072774
0.217599349446
0.229692919104
0.218477239579
0.227028540508
0.237201950631
0.228171966076
0.23328757215
0.221325866959
0.227951589606
0.229257587338
0.229913294504
0.244105852096
0.211018785942
0.233143063364
0.229004696388
0.223190028517
0.231143424347
0.210074060742
0.229141980646
0.226053106765
0.226486633053
0.237924493478
0.222257947705
0.232028540595
0.222814305712
0.222398843998
0.240917630343
0.216089234515
0.232189306241
0.219176300995
0.224454479615
0.227687860898
0.215301661558
0.227731213641
0.217536127216
0.226033236288
0.234611633024
0.228206286481
0.23119580919
0.216526372089
0.230881503713
0.230339595725
0.226392702092
0.244216040563
0.214528540424
0.231305996726
0.230899565735
