In [1]:
import json
import networkx as nx
from networkx import json_graph

import pandas as pd
from pandas.plotting import table
import numpy as np
import pickle
import time
import re

import matplotlib.pyplot as plt
import matplotlib.image as pltimg
%matplotlib inline

In [2]:
# Read the blind-test dataset; every line of the file is parsed later
# as one JSON document.
# The context manager closes the file handle as soon as the lines are read
# (the bare open(...).readlines() left it open until garbage collection).
with open("dataset/blindtest.json", 'r') as dataset_file:
    lines = dataset_file.readlines()

# Number of samples = number of lines in the file
data_size = len(lines)
print("data size = ", data_size)

data size =  757


In [3]:
# Build, for each CFG graph, the number of nodes, edges, connected
# components and cycles (size of the cycle basis of the undirected graph).

n_nodes = []
n_edges = []
n_conncomps = []
n_cycles = []

for item in lines:
    d = json.loads(item)
    nx_graph = json_graph.adjacency_graph(d['cfg'])

    # Ask the graph object directly instead of parsing the text returned by
    # nx.info(): that summary changed format between networkx releases and
    # the function was removed in networkx 3.0, so the string unpacking was
    # fragile (and it rebound the builtin name `type`).
    n_nodes.append(nx_graph.number_of_nodes())
    n_edges.append(nx_graph.number_of_edges())

    # Connected components and the cycle basis are computed on the
    # undirected version of the graph.
    u_graph = nx_graph.to_undirected()
    n_conncomps.append(nx.number_connected_components(u_graph))
    n_cycles.append(len(nx.cycle_basis(u_graph)))

In [4]:
# Cyclomatic complexity of every sample: E - N + 2P, where E, N and P are
# the edge, node and connected-component counts gathered for that sample.
cycl_complex = [
    n_edges[idx] - n_nodes[idx] + 2 * n_conncomps[idx]
    for idx in range(data_size)
]

In [5]:
# Collect, for every sample, the third value of its JSON record.
# NOTE(review): the lookup is positional (values()[2]) rather than by key;
# presumably this is the assembly listing -- confirm against the dataset
# schema, since it depends on the key order inside each JSON line.
asm_lists = [list(json.loads(record).values())[2] for record in lines]

In [9]:
# Substring patterns counted for each instruction-category feature.
#
# The counts are plain substring matches on the stringified sample, so:
#   * patterns overlap ('or' also matches inside 'xor', 'mov' inside 'movs',
#     'sub' inside 'subs', 'div' inside 'divs', ...);
#   * 'j' matches every letter "j" in the text, not only jump mnemonics;
#   * 'cvtsi2s' is listed twice, so each occurrence is counted twice.
# NOTE(review): these quirks are kept on purpose so the feature values stay
# exactly as before; the pickled model used later was presumably trained on
# features computed this way -- confirm before changing any pattern.
INSTRUCTION_PATTERNS = {
    'n_memory_ins': ('mov', 'push', 'pop', 'cwtl', 'cltq', 'cqto'),
    'n_arithmetic_ins': ('lea', 'inc', 'dec', 'neg', 'add', 'sub', 'imul', 'div'),
    'n_bitwise_ins': ('not', 'xor', 'or', 'and', 'sal', 'sar', 'shr'),
    'n_float_ins': ('xmm', 'movs', 'cvtss2sd', 'cvtsd2ss', 'cvtsi2s', 'cvtsi2s',
                    'cvtts', 'adds', 'subs', 'muls', 'divs', 'maxs', 'mins',
                    'sqrts', 'ucomis'),
    'n_jumps': ('j',),
    'n_calls': ('call',),
    'n_comparisons': ('cmp', 'test', 'set'),
}


def count_instruction_patterns(asm):
    """Count the substring patterns of every category in one sample.

    Parameters
    ----------
    asm : object
        The sample's assembly data; it is converted with ``str()`` once and
        the patterns are searched in that text.

    Returns
    -------
    dict
        Maps each key of ``INSTRUCTION_PATTERNS`` to the total number of
        non-overlapping occurrences of its patterns in ``str(asm)``.
    """
    # Stringify once per sample instead of once per pattern.
    text = str(asm)
    # For these literal patterns str.count gives the same non-overlapping
    # count as len(re.findall(pattern, text)).
    return {
        category: sum(text.count(pattern) for pattern in patterns)
        for category, patterns in INSTRUCTION_PATTERNS.items()
    }


# One dict of category counts per sample, in dataset order.
instruction_counts = [
    count_instruction_patterns(asm_lists[i]) for i in range(data_size)
]

# Per-category feature lists, one entry per sample.
n_memory_ins = [counts['n_memory_ins'] for counts in instruction_counts]
n_arithmetic_ins = [counts['n_arithmetic_ins'] for counts in instruction_counts]
n_float_ins = [counts['n_float_ins'] for counts in instruction_counts]
n_bitwise_ins = [counts['n_bitwise_ins'] for counts in instruction_counts]
n_jumps = [counts['n_jumps'] for counts in instruction_counts]
n_comparisons = [counts['n_comparisons'] for counts in instruction_counts]
n_calls = [counts['n_calls'] for counts in instruction_counts]

In [10]:
# Column order of the feature table; the same list drives both the dict
# keys and the DataFrame columns so the two cannot drift apart.
feature_columns = ['n_nodes', 'cyclomatic_complexity', 'n_cycles',
                   'n_memory_ins', 'n_arithmetic_ins', 'n_float_ins',
                   'n_bitwise_ins', 'n_jumps', 'n_comparisons', 'n_calls']

# Per-sample feature lists, listed in the same order as feature_columns.
feature_values = [n_nodes, cycl_complex, n_cycles,
                  n_memory_ins, n_arithmetic_ins, n_float_ins,
                  n_bitwise_ins, n_jumps, n_comparisons, n_calls]

data = dict(zip(feature_columns, feature_values))

# One row per sample, one column per feature.
df = pd.DataFrame(data, columns=feature_columns)

In [11]:
# Preview the first ten rows of the feature table as a quick sanity check.
df.head(10)

Unnamed: 0,n_nodes,cyclomatic_complexity,n_cycles,n_memory_ins,n_arithmetic_ins,n_float_ins,n_bitwise_ins,n_jumps,n_comparisons,n_calls
0,93,77,68,241,67,56,198,52,39,41
1,7,1,0,7,5,3,12,4,1,2
2,9,1,0,8,0,0,7,5,1,3
3,38,16,15,63,10,5,35,23,16,11
4,6,3,2,15,3,37,6,5,1,1
5,19,8,7,43,8,0,42,14,7,5
6,3,1,0,19,5,0,14,1,0,2
7,10,6,4,76,23,5,91,6,3,3
8,8,8,5,10,3,1,10,8,7,0
9,15,6,5,54,34,299,49,11,2,3


In [13]:
# Load the previously saved model and predict a label for every feature row.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load 'final_model.sav' if it comes from a trusted source.
# The context manager closes the file handle once the model is loaded
# (pickle.load(open(...)) left it open).
with open('final_model.sav', 'rb') as model_file:
    model = pickle.load(model_file)

y_pred = model.predict(df)

In [15]:
# Write the predictions to the output file, one value per line.
with open("1739846.txt", 'w') as out_file:
    out_file.writelines(str(prediction) + '\n' for prediction in y_pred)