In [76]:
import json
import pandas as pd


def read_data(path, limiter):
    """
    Read at most ``limiter`` samples from the beginning of a JSON Lines file.

    Args:
        path: path to dataset in jsonl file format (one JSON document per line)
        limiter: maximum number of rows to read; ``None`` reads the whole file,
            zero or a negative number reads nothing

    Returns:
        data: list with one decoded JSON object per line that was read; shorter
            than ``limiter`` when the file has fewer lines

    Raises:
        json.JSONDecodeError: if a line that was read is not valid JSON
    """
    from itertools import islice

    # range() treats negative counts as empty while islice() rejects them,
    # so clamp to keep the "negative means nothing" behaviour.
    stop = None if limiter is None else max(limiter, 0)
    with open(path, encoding="utf-8") as file:
        # islice simply stops at end of file instead of leaking StopIteration
        # out of next() when the file is shorter than the requested limit.
        return [json.loads(line) for line in islice(file, stop)]


def transform_dll_imports(json_sample):
    """
    Turn the "imports" section of one sample into flat boolean features.

    Args:
        json_sample: one sample of dataset; must contain an "imports" key
            mapping a dll name to a list of imported function names

    Returns:
        functions_dict: dict with one "<lowercased dll name>-<function name>"
            key per imported function, every value being True

    Raises:
        KeyError: if the sample has no "imports" key
    """
    # Read from the argument, not from a module-level `sample` variable.
    imports = json_sample["imports"]
    functions_dict = {}
    for dll_name, functions in imports.items():
        prefix = dll_name.lower() + "-"
        for f_name in functions:
            functions_dict[prefix + f_name] = True
    return functions_dict


def transform_dict(json_dict):
    """
    Args:
        json_dict: dict whose values are lists of strings

    Returns:
        dict with one "<lowercased key>-<list item>" entry per list item,
        each mapped to True
    """
    return {
        group.lower() + "-" + item: True
        for group, items in json_dict.items()
        for item in items
    }


def transform_list(json_list):
    """
    Args:
        json_list: list of hashable values

    Returns:
        dict with every list value as a key mapped to True
        (repeated values collapse into a single key)
    """
    return dict.fromkeys(json_list, True)


def flatten_json(y, separator=''):
    """
    Flatten a nested structure of dicts and lists into a single-level dict.

    Args:
        y: json object (dict, list or scalar, nested to any depth)
        separator: despite its name, this is the prefix every generated key
            starts with (e.g. 'histogram.'); nested levels are always joined
            with '_'

    Returns:
        out: dict mapping the joined path of every scalar leaf to its value.
            Dict keys and list indexes form the path. Empty dicts and lists
            contribute no entries.
    """
    out = {}

    def flatten(x, name=separator):
        # isinstance (rather than an exact type check) so dict/list
        # subclasses such as OrderedDict are descended into as well.
        if isinstance(x, dict):
            for key in x:
                flatten(x[key], name + key + '_')
        elif isinstance(x, list):
            for index, item in enumerate(x):
                flatten(item, name + str(index) + '_')
        else:
            # Drop the trailing '_' appended by the parent level. When y itself
            # is a scalar this drops the last character of the prefix instead.
            out[name[:-1]] = x

    flatten(y)
    return out


# 1. load the first rows of the dataset
data_path = '/sample_data_6k.jsonl'
data = read_data(data_path, limiter=2)

# 2. turn every sample into one flat feature dict
#    (the naive alternative is flatten_json(sample) over the whole sample)
flatten_dataset = []
for sample in data:
    # Later entries override earlier ones on key collisions, so the order
    # below matters and mirrors the order the features are produced in.
    transformed = {
        "label": sample['label'],
        "sha256": sample['sha256'],
        "md5": sample['md5'],
        "appeared": sample['appeared'],
        "avclass": sample['avclass'],
        **flatten_json(sample['histogram'], 'histogram.'),
        **flatten_json(sample['byteentropy'], 'byteentropy.'),
        **transform_dict(sample["imports"]),
        **transform_list(sample["exports"]),
        **flatten_json(sample['general']),
        **flatten_json(sample['strings']),
        **flatten_json(sample['header']),
        **flatten_json(sample['datadirectories']),
    }
    flatten_dataset.append(transformed)

# 3. build a dataframe from the flat samples; keys missing in a sample become NaN
df = pd.DataFrame(flatten_dataset)
df


Unnamed: 0,label,sha256,md5,appeared,avclass,histogram.0,histogram.1,histogram.2,histogram.3,histogram.4,...,wintrust.dll-CryptCATAdminReleaseCatalogContext,wintrust.dll-WTHelperProvDataFromStateData,wintrust.dll-WinVerifyTrust,wintrust.dll-WTHelperGetProvSignerFromChain,wintrust.dll-CryptCATAdminReleaseContext,ws2_32.dll-ordinal116,ws2_32.dll-getaddrinfo,ws2_32.dll-ordinal115,coff_characteristics_5,optional_dll_characteristics_0
0,0,0abb4fda7d5b13801d63bee53e5e256be43e141faa077a...,63956d6417f8f43357d9a8e79e52257e,2006-12,,45521,13095,12167,12496,12429,...,,,,,,,,,,
1,0,c9cafff8a596ba8a80bafb4ba8ae6f2ef3329d95b85f15...,6f7bde7a1126debf0cc359a54953efc1,2007-01,,93059,15789,2871,3005,4107,...,True,True,True,True,True,True,True,True,AGGRESSIVE_WS_TRIM,TERMINAL_SERVER_AWARE
