In [47]:
import sys
sys.path.append("/home/luiz/Work/ING/experimentation-day/ai-attack/Attacks using macros/python_code/vba2graph/")

from vba2graph import *

def extract_features(input_file_name="path_to_file", macro_file=False, output_folder="output"):

    # ****************************************************************************
    # *                              Argument Parser                             *
    # ****************************************************************************

    input_vba_content = ""

    # handle files
    if os.path.isfile(input_file_name):
        if macro_file is False:
            input_vba_content = handle_input(input_file_name, is_piped=False)
        else:
            input_vba_content = vba2graph_from_vba_object(input_file_name)

    # handle wrong input
    else:
        logger.error("Invalid input path")
        sys.exit(1)

    input_file_name = os.path.basename(input_file_name)
    
    return vba2graph_gen(input_vba_content, output_folder, input_file_name, color_scheme)

def vba2graph_gen(input_vba_content, output_folder="output", input_file_name="vba2graph", color_scheme=color_scheme):

    """ Generage graph from processed vba macros
    Args:
        input_vba_content (string): data generated by handle_olevba_input
        output_folder (string): output folder
        input_file_name (string): base filename
        color_scheme (int): color scheme number [0, 1, 2, 3] (default: 0 - B&W)
    """

    # ****************************************************************************
    # *                               Process Input                              *
    # ****************************************************************************

    vba_content_lines = vba_seperate_lines(input_vba_content)
    vba_content_lines_no_whitespace = vba_clean_whitespace(vba_content_lines)
    vba_content_lines_no_metadata = vba_clean_metadata(
        vba_content_lines_no_whitespace)
    vba_content_deobfuscated = vba_deobfuscation(vba_content_lines_no_metadata)
    vba_func_dict = vba_extract_functions(vba_content_deobfuscated)
    vba_prop_dict = vba_extract_properties(vba_content_lines_no_metadata)

    # treat properties like functions and merge both dictionaries
    vba_func_dict = dict(vba_func_dict.items() + vba_prop_dict.items())

    ##############################################################################
    # at this point, vba_func_dict should contain the code of functions and
    # properties, without comments or whitespaces.
    ##############################################################################

    # ****************************************************************************
    # *                              Generate Graph                              *
    # ****************************************************************************

    DG = create_call_graph(vba_func_dict)
    DG = find_keywords_in_graph(vba_func_dict, DG)
    DG = find_change_flow(vba_func_dict, DG)
    return DG

In [63]:
def find_keywords_in_graph(vba_func_dict, DG):
    """Find and highlight possible malicious keywords in graph
    
    Args:
        vba_func_dict (dict[func_name]=func_code): Functions dictionary
        DG (networkx.DiGraph): Generated directed graph
    
    Returns:
        networkx.DiGraph: Directed Graph with keywords highlighted in red
    """
    # analyze function calls
    for func_name in vba_func_dict:

        func_code = vba_func_dict[func_name]
        # split function code into lines
        func_code_lines = filter(None, re.split("\n", func_code))

        # handle malicious keywords
        keywords_re_sensetive = "(" + ")|(".join(lst_mal_case_sensetive) + ")"
        keywords_re_insensetive = "(" + ")|(".join(lst_mal_case_insensetive) + ")"

        # iterate over all the words in func_code and match mal_regexes
        dict_items = {}
        for token in func_code_lines:
            match_findall_sensetive = re.findall(keywords_re_sensetive, token)
            match_findall_insensetive = re.findall(keywords_re_insensetive, token, re.IGNORECASE)
            match_findall = match_findall_sensetive + match_findall_insensetive
            if match_findall:
                for match in match_findall:
                    match_list = list(match)

                    # use dictionary dict_items to count occurances of keywords
                    for list_item in match_list:
                        if list_item != "":
                            if list_item not in dict_items:
                                dict_items[list_item] = 1
                            else:
                                dict_items[list_item] = dict_items[list_item] + 1

        # add keywords to graph
        for dic_key in dict_items:
            if dic_key in lst_obfuscation_keywords:
                keyword_color = color_scheme["COLOR_OBFUSCATION_KEYWORD"]
                sensitivity = "OBFUSCATED"
            else:
                keyword_color = color_scheme["COLOR_REGULAR_KEYWORD"]
                sensitivity = 'REGULAR'
                

            keyword_count = dict_items[dic_key]
            if DG.node[func_name]["keywords"] != "":
                DG.node[func_name]["keywords"] = DG.node[func_name]["keywords"] + ","

            DG.node[func_name]["keywords"] = DG.node[func_name]["keywords"] + "<font color='" + keyword_color + "'>" + dic_key + "[" + str(keyword_count) + "]" + "</font>"
            
            call = {'name': dic_key, 
                    "count": keyword_count,
                    "sensitivity": sensitivity,}

            if "calls" in DG.node[func_name]:
                DG.node[func_name]["calls"].append(call)
            else:
                DG.node[func_name]["calls"] = [call]

        # handle autorun keywords
        keywords_re = "(" + ")|(".join(lst_autorun) + ")"
        if re.match(keywords_re, func_name, re.IGNORECASE):
            DG.node[func_name]["color"] = color_scheme["COLOR_AUTORUN_FUNCTIONS"]

    return DG

In [106]:
path = '/home/luiz/Work/ING/experimentation-day/ai-attack/Attacks using macros/data/'
data = []
for label in ('good', 'bad'):
    for f in os.listdir(path + label):
        f_path = os.path.join(path, label, f)
        print f_path
        if ".xls" in f:
            dg = extract_features(f_path, macro_file=False)
        else:
            dg = extract_features(f_path, macro_file=False)


        print dg
        for node in dg.nodes(data=True):
            calls = node[1]['calls']
            for c in calls:
                d = c
                d["function_name"] = node[0]
                d["file_name"] = f
                d["label"] = label
                data.append(d)


data_pd = pd.DataFrame(data)

/home/luiz/Work/ING/experimentation-day/ai-attack/Attacks using macros/data/good/DiversenBerekeningen.xlsm

/home/luiz/Work/ING/experimentation-day/ai-attack/Attacks using macros/data/good/VBAatje.xlsm

/home/luiz/Work/ING/experimentation-day/ai-attack/Attacks using macros/data/good/DiversenGrafieken.xlsm

/home/luiz/Work/ING/experimentation-day/ai-attack/Attacks using macros/data/good/FormulierOpgave.xlsm

/home/luiz/Work/ING/experimentation-day/ai-attack/Attacks using macros/data/good/MSEAnder.xlsm

/home/luiz/Work/ING/experimentation-day/ai-attack/Attacks using macros/data/good/OpmaakOpgave.xlsm

/home/luiz/Work/ING/experimentation-day/ai-attack/Attacks using macros/data/good/Lijsten.xlsm

/home/luiz/Work/ING/experimentation-day/ai-attack/Attacks using macros/data/good/DiversenGrafiekenFunctieAnalyse (1).xlsm

/home/luiz/Work/ING/experimentation-day/ai-attack/Attacks using macros/data/good/QatifAbstract.xlsx

/home/luiz/Work/ING/experimentation-day/ai-attack/Attacks using macros/dat

KeyError: 'calls'

In [112]:
dg = extract_features("/home/luiz/Work/ING/experimentation-day/ai-attack/Attacks using macros/data/good/DiversenBerekeningen.xlsm",
                      macro_file=False)
dg.nodes()

NodeView(())

In [114]:
data_pd

Unnamed: 0,count,file_name,function_name,label,name,sensitivity
0,20,99eb1d90eb5f0d012f35fcc2a7dedd2229312794354843...,zxOInozC,bad,Right,OBFUSCATED
1,16,99eb1d90eb5f0d012f35fcc2a7dedd2229312794354843...,zxOInozC,bad,StrConv,OBFUSCATED
2,22,99eb1d90eb5f0d012f35fcc2a7dedd2229312794354843...,zxOInozC,bad,Mid,OBFUSCATED
3,35,99eb1d90eb5f0d012f35fcc2a7dedd2229312794354843...,zxOInozC,bad,Replace,OBFUSCATED
4,35,99eb1d90eb5f0d012f35fcc2a7dedd2229312794354843...,zxOInozC,bad,StrReverse,OBFUSCATED
5,12,99eb1d90eb5f0d012f35fcc2a7dedd2229312794354843...,zxOInozC,bad,Left,OBFUSCATED
6,31,99eb1d90eb5f0d012f35fcc2a7dedd2229312794354843...,Document_Open,bad,Right,OBFUSCATED
7,1,99eb1d90eb5f0d012f35fcc2a7dedd2229312794354843...,Document_Open,bad,Log,OBFUSCATED
8,40,99eb1d90eb5f0d012f35fcc2a7dedd2229312794354843...,Document_Open,bad,StrConv,OBFUSCATED
9,23,99eb1d90eb5f0d012f35fcc2a7dedd2229312794354843...,Document_Open,bad,Mid,OBFUSCATED


In [96]:
for node in dg.nodes(data=True):
    print(node)

('Document_Open', {'keywords': "<font color='black'>Shell[1]</font>,<font color='#666699'>Chr[8]</font>,<font color='#666699'>StrReverse[251]</font>,<font color='#666699'>Replace[307]</font>", 'color': 'red', 'calls': [{'count': 1, 'sensitivity': 'REGULAR', 'name': 'Shell'}, {'count': 8, 'sensitivity': 'OBFUSCATED', 'name': 'Chr'}, {'count': 251, 'sensitivity': 'OBFUSCATED', 'name': 'StrReverse'}, {'count': 307, 'sensitivity': 'OBFUSCATED', 'name': 'Replace'}]})
('pAKEfogafk', {'keywords': "<font color='#666699'>StrReverse[114]</font>,<font color='#666699'>StrConv[2]</font>,<font color='#666699'>Replace[114]</font>", 'calls': [{'count': 114, 'sensitivity': 'OBFUSCATED', 'name': 'StrReverse'}, {'count': 2, 'sensitivity': 'OBFUSCATED', 'name': 'StrConv'}, {'count': 114, 'sensitivity': 'OBFUSCATED', 'name': 'Replace'}]})


In [97]:
print node[0]
print node[1]['calls'][0]

pAKEfogafk
{'count': 114, 'sensitivity': 'OBFUSCATED', 'name': 'StrReverse'}


In [98]:
import pandas as pd

In [99]:
data = []
for node in dg.nodes(data=True):
    calls = node[1]['calls']
    for c in calls:
        d = c
        d["function_name"] = node[0]
        data.append(d)
data

[{'count': 1,
  'function_name': 'Document_Open',
  'name': 'Shell',
  'sensitivity': 'REGULAR'},
 {'count': 8,
  'function_name': 'Document_Open',
  'name': 'Chr',
  'sensitivity': 'OBFUSCATED'},
 {'count': 251,
  'function_name': 'Document_Open',
  'name': 'StrReverse',
  'sensitivity': 'OBFUSCATED'},
 {'count': 307,
  'function_name': 'Document_Open',
  'name': 'Replace',
  'sensitivity': 'OBFUSCATED'},
 {'count': 114,
  'function_name': 'pAKEfogafk',
  'name': 'StrReverse',
  'sensitivity': 'OBFUSCATED'},
 {'count': 2,
  'function_name': 'pAKEfogafk',
  'name': 'StrConv',
  'sensitivity': 'OBFUSCATED'},
 {'count': 114,
  'function_name': 'pAKEfogafk',
  'name': 'Replace',
  'sensitivity': 'OBFUSCATED'}]

In [100]:
import pandas as pd

In [101]:
pd.DataFrame(data)

Unnamed: 0,count,function_name,name,sensitivity
0,1,Document_Open,Shell,REGULAR
1,8,Document_Open,Chr,OBFUSCATED
2,251,Document_Open,StrReverse,OBFUSCATED
3,307,Document_Open,Replace,OBFUSCATED
4,114,pAKEfogafk,StrReverse,OBFUSCATED
5,2,pAKEfogafk,StrConv,OBFUSCATED
6,114,pAKEfogafk,Replace,OBFUSCATED


In [None]:
for 