# Search for elements and addresses in heap dump mem files

In [1]:
import os
import json
import glob
from dataclasses import dataclass
from graphviz import Source

In [2]:

@dataclass
class ProgramParams:
    """
    Wrapper class for program parameters.
    """
    XXD_LINE_BLOCK_BYTE_SIZE = 16
    POINTER_BYTE_SIZE = 8 # 64-bit, ex: C0 03 7B 09 2A 56 00 00

    TEST_JSON_TEST_FILE_PATH = os.environ['HOME'] + "/Documents/code/phdtrack/phdtrack_project_3/data/302-1644391327.json"
    TEST_HEAP_DUMP_RAW_FILE_PATH = os.environ['HOME'] + "/Documents/code/phdtrack/phdtrack_project_3/data/302-1644391327-heap.raw"
    TEST_DATA_DIR = os.environ['HOME'] + "/Documents/code/phdtrack/phdtrack_project_3/data/graphs"
    TEST_GRAPH_DATA_FILENAME = "graph_302-1644391327.gv"

    DATA_DIR_PATH = os.environ['HOME'] + "/Documents/code/phdtrack/phdtrack_data/Training/Training/scp/V_7_8_P1/16"

    def __init__(self, **kwargs):
        if (
            self.check_path_exists(self.TEST_JSON_TEST_FILE_PATH) and
            self.check_path_exists(self.TEST_HEAP_DUMP_RAW_FILE_PATH) and
            self.check_path_exists(self.TEST_DATA_DIR) and
            self.check_path_exists(self.DATA_DIR_PATH)
        ):
            print("Program paths are OK.")
        else:
            print("Program paths are NOT OK.")
            exit(1)
    
    def check_path_exists(self, path: str):
        """
        Check if the path exists. Return True if it exists, False otherwise.
        """
        if not os.path.exists(path):
            print('WARNING: Path does not exist: %s' % path)
            return False
        return True



PARAMS = ProgramParams()

Program paths are OK.


In [3]:
# read the JSON file and get all pair of addresses and keys
@dataclass
class KeyData:
    """
    Wrapper class for key data.
    """
    name: str
    key: bytes
    addr: bytes
    len: int
    real_len: int

heap_start_addr = None
addr_key_pairs: dict[int, KeyData] = {} # key addr (int in base 16 - hex) -> key data (KeyData)

with open(PARAMS.TEST_JSON_TEST_FILE_PATH, 'r') as f:
    json_data = json.load(f)

    heap_start_addr = bytes.fromhex(json_data["HEAP_START"])
    
    for json_key_name in json_data:
        # match json key names that start with 'KEY_' and are followed by a single letter
        if json_key_name.startswith('KEY_') and len(json_key_name) == 5:
            real_key_addr = bytes.fromhex(json_data[json_key_name + "_ADDR"])
            addr_key_pairs[int.from_bytes(real_key_addr, byteorder='big', signed=False)] = KeyData(
                name=json_key_name,
                key=bytes.fromhex(json_data[json_key_name]),
                addr=real_key_addr,
                len=int(json_data[json_key_name + "_LEN"]),
                real_len=int(json_data[json_key_name + "_REAL_LEN"])
            )
            # print(
            #     'addr: ', hex(int.from_bytes(real_key_addr, byteorder='big', signed=False)), 
            #     'real key addr: ', json_data[json_key_name + "_ADDR"]
            # )

# print nb of keys
print("Nb of keys: %d" % len(addr_key_pairs))

Nb of keys: 6


In [4]:
# read the heap dump file and search for the keys
with open(PARAMS.TEST_HEAP_DUMP_RAW_FILE_PATH, 'rb') as f:
    heap_dump = f.read()

    # split the heap dump into lines of 16 bytes
    heap_dump_lines = [heap_dump[i:i+PARAMS.XXD_LINE_BLOCK_BYTE_SIZE] for i in range(0, len(heap_dump), PARAMS.XXD_LINE_BLOCK_BYTE_SIZE)]
    
    # print first 5 lines
    for i in range(5):
        print(heap_dump_lines[i].hex())
    
    print("Number of dump lines: %d" % len(heap_dump_lines), "of size:", PARAMS.XXD_LINE_BLOCK_BYTE_SIZE, "bytes")

    # go to known key addresses and check if the key is there
    for key_addr in addr_key_pairs:
        key_data = addr_key_pairs[key_addr]

        # get the line index of the key address
        # WARN: Need to divide the line index by 16 because the heap dump is in bytes
        # and line addresses is the address of the first byte of the line.
        # so each line address is 16 bytes apart.
        line_index = (int.from_bytes(key_data.addr, byteorder='big', signed=False) - int.from_bytes(heap_start_addr, byteorder='big', signed=False)) // PARAMS.XXD_LINE_BLOCK_BYTE_SIZE
        print("key name:", key_data.name, "index:", line_index, "index in hex:", hex(line_index))
        if (heap_dump_lines[line_index] == key_data.key):
            print("Key found: %s" % key_data.name)
        else:
            print("Key NOT found: %s" % key_data.name)
        
    

00000000000000005102000000000000
02040706070704070504070204060106
06070107060702020201000000000001
03010001000000000000000000000001
00000000030200000000000000000000
Number of dump lines: 17408 of size: 16 bytes
key name: KEY_A index: 5537 index in hex: 0x15a1
Key found: KEY_A
key name: KEY_B index: 4533 index in hex: 0x11b5
Key found: KEY_B
key name: KEY_C index: 5546 index in hex: 0x15aa
Key found: KEY_C
key name: KEY_D index: 4537 index in hex: 0x11b9
Key found: KEY_D
key name: KEY_E index: 6069 index in hex: 0x17b5
Key found: KEY_E
key name: KEY_F index: 3620 index in hex: 0xe24
Key found: KEY_F


In [24]:
# follow the pointers and build the graphs
def follow_pointers_and_build_graph(
    raw_heap_dump_file_path: str, 
    pointer_byte_size=PARAMS.POINTER_BYTE_SIZE,
    debug=False
):
    with open(raw_heap_dump_file_path, 'rb') as f:
        heap_dump = f.read()

        # split the heap dump into lines of POINTER_BYTE_SIZE bytes
        heap_dump_lines = [heap_dump[i:i+pointer_byte_size] for i in range(0, len(heap_dump), pointer_byte_size)]
        
        # print some lines
        if debug: 
            for i in range(100, 105):
                print(heap_dump_lines[i].hex(), "int:", int.from_bytes(heap_dump_lines[i], byteorder='big', signed=False))
        
            print("Number of dump lines: %d" % len(heap_dump_lines), "of size:", pointer_byte_size, "bytes")
        
        graph_file_as_string = ""
        graph_file_as_string += "digraph %s {\n" % str(os.path.basename(raw_heap_dump_file_path)).replace(".raw", "")

        # get HEAP_START from the JSON file
        heap_start_addr = None
        with open(raw_heap_dump_file_path.replace("-heap.raw", ".json"), 'r') as json_file:
            json_data = json.load(json_file)
            heap_start_addr = bytes.fromhex(json_data["HEAP_START"])
        assert heap_start_addr is not None

        # get the min and max address of the heap
        min_addr = int.from_bytes(heap_start_addr, byteorder='big', signed=False) # HEAP_START
        max_addr = min_addr + len(heap_dump_lines) * pointer_byte_size

        if debug:
            print("min_addr: %d, max_addr: %d" % (min_addr, max_addr))
            print("hex min_addr: %s, hex max_addr: %s" % (hex(min_addr), hex(max_addr)))

        # go through all the potential pointers in the heap dump
        counter = 0
        for i, potential_ptr in enumerate(heap_dump_lines):
            potential_ptr_int = int.from_bytes(potential_ptr, byteorder='big', signed=False)
            if potential_ptr_int <= max_addr and potential_ptr_int > 0 and potential_ptr_int % 16 == 0:
                print("found potential_ptr_int: %d, hex potential_ptr_int: %s" % (potential_ptr_int, hex(potential_ptr_int)))

            # check is the potential pointer is in range of the heap
            if potential_ptr_int >= min_addr and potential_ptr_int <= max_addr:
                current_ptr_addr = i * pointer_byte_size + min_addr

                # write the pointer to the graph file
                graph_file_as_string += "    {} -> {};\n".format(
                    hex(current_ptr_addr), 
                    hex(potential_ptr_int)
                )
                
                counter += 1

        # end of graph
        graph_file_as_string += "}"

        if counter > 0:
            # open .gv file
            save_file_path = os.path.join(
                PARAMS.TEST_DATA_DIR, 
                str(os.path.basename(raw_heap_dump_file_path)).replace('.raw', '.gv')
            )

            # save the graph file
            with open(save_file_path, 'w') as graph_file:
                graph_file.write(graph_file_as_string)

            print("Writing graph to file: %s done." % PARAMS.TEST_DATA_DIR + PARAMS.TEST_GRAPH_DATA_FILENAME)
            print("Nb of found potential pointers: %d" % counter)


In [27]:
# follow the pointers and build the graphs for all files
file_paths = glob.glob(os.path.join(PARAMS.DATA_DIR_PATH, '*.raw'), recursive=False)
file_paths: list[str] = list(set(file_paths)) # remove duplicates
print("Nb of files to process: %d" % len(file_paths))

# print first 4 file_paths
for i in range(4):
    print(file_paths[i])

count = 0
for file_path in file_paths:
    if count == 30:
        follow_pointers_and_build_graph(file_path, debug=True)
    count += 1

Nb of files to process: 1065
/home/onyr/Documents/code/phdtrack/phdtrack_data/Training/Training/scp/V_7_8_P1/16/32668-1644391327-heap.raw
/home/onyr/Documents/code/phdtrack/phdtrack_data/Training/Training/scp/V_7_8_P1/16/1495-1644391327-heap.raw
/home/onyr/Documents/code/phdtrack/phdtrack_data/Training/Training/scp/V_7_8_P1/16/31270-1644391327-heap.raw
/home/onyr/Documents/code/phdtrack/phdtrack_data/Training/Training/scp/V_7_8_P1/16/551-1644391327-heap.raw
a014ddeb84550000 int: 11535088549209899008
e014ddeb84550000 int: 16146774567637286912
0015ddeb84550000 int: 6154978118139904
2015ddeb84550000 int: 2311997987331833856
4015ddeb84550000 int: 4617840996545527808
Number of dump lines: 35328 of size: 8 bytes
min_addr: 94029381177344, max_addr: 94029381459968
hex min_addr: 0x5584ebdd1000, hex max_addr: 0x5584ebe16000
found potential_ptr_int: 50462720, hex potential_ptr_int: 0x3020000
found potential_ptr_int: 1099511627776, hex potential_ptr_int: 0x10000000000
found potential_ptr_int: 3435

In [7]:
# read a .gv file and display it
with open(os.path.join(PARAMS.TEST_DATA_DIR, PARAMS.TEST_GRAPH_DATA_FILENAME), 'r') as f:
    graph = f.read()
    graph_png_file_path = str(os.path.join(PARAMS.TEST_DATA_DIR, PARAMS.TEST_GRAPH_DATA_FILENAME)).replace('.gv', '.png')
    s = Source(graph)
    s.render(outfile=graph_png_file_path, format='png', view=True)


FileNotFoundError: [Errno 2] No such file or directory: '/home/onyr/Documents/code/phdtrack/phdtrack_project_3/data/graphs/graph_302-1644391327.gv'