In [2]:
import json
import pandas as pd


def read_data(path, limiter):
    """
    Args:
        path: path to dataset in jsonl file format
        limiter: number of rows

    Returns:
        data: json dict, where every line is one sample as json
    """
    with open(path) as file:
        data = [json.loads(next(file)) for x in range(limiter)]
    return data


def transform_dll_imports(json_sample):
    """
    Args:
        json_sample: one sample of dataset

    Returns:
        functions_dict: dict with all dll functions name with value True
    """
    imports = sample["imports"]
    functions_dict = {}
    for key in imports.keys():
        functions = imports[key]
        functions_with_values = {key.lower() + "-" + f_name: True for f_name in functions}
        functions_dict.update(functions_with_values)
    return functions_dict


def transform_dict(json_dict):
    """
    Args:
        json_dict: json dict with nested key-value, where value is list

    Returns:
        functions_dict: dict with value as a key and default value True
    """
    result_dict = {}
    for key in json_dict.keys():
        keys = json_dict[key]
        values_with_default = {key.lower() + "-" + f_name: True for f_name in keys}
        result_dict.update(values_with_default)
    return result_dict


def transform_list(json_list):
    """
    Args:
        json_list: json list of values

    Returns:
        functions_dict: dict with all dll functions name with value True
    """
    result_dict = {}
    result_dict.update({i: True for i in json_list})
    return result_dict


def flatten_json(y, separator=''):
    """
    Args:
        y: json object
        separator: separator

    Returns:
        functions_dict: dict with flatten values
    """
    out = {}

    def flatten(x, name=separator):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out


# 1. read datas
data_path = '/sample_data_6k.jsonl'
data = read_data(data_path, limiter=2)
# 2. iterate over dataset
flatten_dataset = []
for sample in data:
    # version 1 dummy way
    # flat = flatten_json(sample)
    # flatten_dataset.append(flat)
    # collect transformed values for each sample

    # version 2
    transformed = {}
    transformed.update({"label": sample['label']})
    transformed.update({"sha256": sample['sha256']})
    transformed.update({"md5": sample['md5']})
    transformed.update({"appeared": sample['appeared']})
    transformed.update({"avclass": sample['avclass']})
    transformed.update(flatten_json(sample['histogram'], 'histogram.'))
    transformed.update(flatten_json(sample['byteentropy'], 'byteentropy.'))
    transformed.update(transform_dict(sample["imports"]))
    transformed.update(transform_list(sample["exports"]))
    transformed.update(flatten_json(sample['general']))
    transformed.update(flatten_json(sample['strings']))
    transformed.update(flatten_json(sample['header']))
    transformed.update(flatten_json(sample['datadirectories']))

    # here we fill with transform data
    flatten_dataset.append(transformed)

# change all transformed dataset to dataframe object
df = pd.DataFrame(flatten_dataset)
df


FileNotFoundError: [Errno 2] No such file or directory: '/sample_data_6k.jsonl'

In [3]:
import json
import pandas as pd

In [4]:
def read_data(path, limiter):
    """
    Args:
        path: path to dataset in jsonl file format
        limiter: number of rows

    Returns:
        data: json dict, where every line is one sample as json
    """
    with open(path) as file:
        data = [json.loads(next(file)) for x in range(limiter)]
    return data


def transform_dll_imports(json_sample):
    """
    Args:
        json_sample: one sample of dataset

    Returns:
        functions_dict: dict with all dll functions name with value True
    """
    imports = sample["imports"]
    functions_dict = {}
    for key in imports.keys():
        functions = imports[key]
        functions_with_values = {key.lower() + "-" + f_name: True for f_name in functions}
        functions_dict.update(functions_with_values)
    return functions_dict


def transform_dict(json_dict):
    """
    Args:
        json_dict: json dict with nested key-value, where value is list

    Returns:
        functions_dict: dict with value as a key and default value True
    """
    result_dict = {}
    for key in json_dict.keys():
        keys = json_dict[key]
        values_with_default = {key.lower() + "-" + f_name: True for f_name in keys}
        result_dict.update(values_with_default)
    return result_dict


def transform_list(json_list):
    """
    Args:
        json_list: json list of values

    Returns:
        functions_dict: dict with all dll functions name with value True
    """
    result_dict = {}
    result_dict.update({i: True for i in json_list})
    return result_dict


def flatten_json(y, separator=''):
    """
    Args:
        y: json object
        separator: separator

    Returns:
        functions_dict: dict with flatten values
    """
    out = {}

    def flatten(x, name=separator):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out



In [30]:
# 1. read datas
data_path = 'sample_data_6k.jsonl'
data = read_data(data_path, limiter=6000)
# 2. iterate over dataset
flatten_dataset = []

In [31]:
data[0]['section']['sections']

[{'name': '.text',
  'size': 26624,
  'entropy': 6.532239617101003,
  'vsize': 26134,
  'props': ['CNT_CODE', 'MEM_EXECUTE', 'MEM_READ']},
 {'name': '.rdata',
  'size': 6656,
  'entropy': 5.433081641309689,
  'vsize': 6216,
  'props': ['CNT_INITIALIZED_DATA', 'MEM_READ']},
 {'name': '.data',
  'size': 512,
  'entropy': 1.7424160994148217,
  'vsize': 172468,
  'props': ['CNT_INITIALIZED_DATA', 'MEM_READ', 'MEM_WRITE']},
 {'name': '.rsro',
  'size': 0,
  'entropy': -0.0,
  'vsize': 135168,
  'props': ['CNT_UNINITIALIZED_DATA', 'MEM_READ', 'MEM_WRITE']},
 {'name': '.rsrc',
  'size': 27648,
  'entropy': 5.020929764194735,
  'vsize': 28672,
  'props': ['CNT_INITIALIZED_DATA', 'MEM_READ']}]

In [32]:
flatten_dataset = []
for sample in data:
    
    transformed = {}
    transformed.update(flatten_json(sample['section']['sections']))
    flatten_dataset.append(transformed)

# change all transformed dataset to dataframe object
df = pd.DataFrame(flatten_dataset)
df

Unnamed: 0,0_name,0_size,0_entropy,0_vsize,0_props_0,0_props_1,0_props_2,1_name,1_size,1_entropy,...,29_props_2,29_props_3,30_name,30_size,30_entropy,30_vsize,30_props_0,30_props_1,30_props_2,30_props_3
0,.text,26624.0,6.532240,26134.0,CNT_CODE,MEM_EXECUTE,MEM_READ,.rdata,6656.0,5.433082,...,,,,,,,,,,
1,.text,455680.0,6.822879,455304.0,CNT_CODE,MEM_EXECUTE,MEM_READ,.data,8192.0,6.204700,...,,,,,,,,,,
2,.text,81920.0,6.459014,80594.0,CNT_CODE,MEM_EXECUTE,MEM_READ,.data,4096.0,1.589853,...,,,,,,,,,,
3,CODE,36864.0,6.599333,36628.0,CNT_CODE,MEM_EXECUTE,MEM_READ,DATA,1024.0,4.128593,...,,,,,,,,,,
4,.text,34304.0,6.591424,33908.0,CNT_CODE,MEM_EXECUTE,MEM_READ,.data,1024.0,4.099097,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,.code,1841.0,5.338839,1841.0,CNT_CODE,MEM_EXECUTE,MEM_READ,.text,6552.0,6.191553,...,,,,,,,,,,
5996,CODE,90624.0,7.996807,204800.0,CNT_INITIALIZED_DATA,MEM_READ,MEM_WRITE,DATA,1536.0,6.598744,...,,,,,,,,,,
5997,.text,104448.0,6.610239,104384.0,CNT_CODE,MEM_EXECUTE,MEM_READ,.rdata,17920.0,4.452120,...,,,,,,,,,,
5998,,2886656.0,7.985471,6692864.0,CNT_INITIALIZED_DATA,MEM_EXECUTE,MEM_READ,.rsrc,81920.0,7.996871,...,,,,,,,,,,


In [36]:
frame = df.loc[:,df.columns.str.startswith('0')]

In [45]:
frame[frame['0_name']=='.text']

Unnamed: 0,0_name,0_size,0_entropy,0_vsize,0_props_0,0_props_1,0_props_2,0_props_3,0_props_4,0_props_5,0_props_6,0_props_7,0_props_8,0_props_9,0_props_10,0_props_11,0_props_12,0_props_13,0_props_14,0_props_15,0_props_16
0,.text,26624.00,6.53,26134.00,CNT_CODE,MEM_EXECUTE,MEM_READ,,,,,,,,,,,,,,
1,.text,455680.00,6.82,455304.00,CNT_CODE,MEM_EXECUTE,MEM_READ,,,,,,,,,,,,,,
2,.text,81920.00,6.46,80594.00,CNT_CODE,MEM_EXECUTE,MEM_READ,,,,,,,,,,,,,,
4,.text,34304.00,6.59,33908.00,CNT_CODE,MEM_EXECUTE,MEM_READ,,,,,,,,,,,,,,
5,.text,8704.00,6.27,8259.00,CNT_CODE,MEM_EXECUTE,MEM_READ,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5990,.text,1706496.00,7.19,1706326.00,CNT_CODE,MEM_LOCKED,MEM_READ,MEM_WRITE,,,,,,,,,,,,,
5993,.text,2559488.00,6.52,2559196.00,CNT_CODE,MEM_EXECUTE,MEM_READ,,,,,,,,,,,,,,
5994,.text,0.00,-0.00,96153.00,CNT_CODE,MEM_EXECUTE,MEM_READ,,,,,,,,,,,,,,
5997,.text,104448.00,6.61,104384.00,CNT_CODE,MEM_EXECUTE,MEM_READ,,,,,,,,,,,,,,


In [None]:
for s

In [29]:
df.columns

Index(['0_name', '0_size', '0_entropy', '0_vsize', '0_props_0', '0_props_1',
       '0_props_2', '1_name', '1_size', '1_entropy', '1_vsize', '1_props_0',
       '1_props_1', '2_name', '2_size', '2_entropy', '2_vsize', '2_props_0',
       '2_props_1', '2_props_2', '3_name', '3_size', '3_entropy', '3_vsize',
       '3_props_0', '3_props_1', '3_props_2', '4_name', '4_size', '4_entropy',
       '4_vsize', '4_props_0', '4_props_1', '1_props_2'],
      dtype='object')

In [33]:
# this helps with scrolling df
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format', '{:.2f}'.format)