In [17]:
!CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python==0.2.27

Collecting llama-cpp-python==0.2.27
  Downloading llama_cpp_python-0.2.27.tar.gz (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25ldone
[?25h  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.27-cp311-cp311-macosx_13_0_arm64.whl size=2013018 sha256=8bfa0760389a7154c5d7b8fc39a20351a5eb6fbf5f48c9c91a45bf5e6ebff344
  Stored in directory: /Users/junan/Library/Caches/pip/wheels/0e/27/a4/13df52c36a09d5eaab1bd43ccce1bdee5d5a4e282537267fdd
Successfully built llama-cpp-python
Installing collected packages: llama-cpp-python
  Attempting unins

In [1]:
import ast
import inspect


def extract_functions(code):
    tree = ast.parse(code)
    functions = []
    for node in tree.body:
        if isinstance(node, ast.FunctionDef):
            function_def = ast.get_source_segment(code, node)
            functions.append(function_def)
    return functions


# Example usage:
code = """
@my_decorator
def add(a, b):
    return a + b

def subtract(a, b):
    return a - b
"""
functions = extract_functions(code)
for function in functions:
    print(function)

def add(a, b):
    return a + b
def subtract(a, b):
    return a - b


In [2]:
from agent import LLM_Agent
from models import MODEL_IDENTIFIERS
from config import load_user_config


class FunctionNameGPT:
    """
    FunctionNameGPT facilitates querying a local Large Language Model (LLM) to suggest function names based on
    decompiler output. This functionality is particularly useful in the context of reverse engineering, where accurate
    and meaningful function names can significantly enhance the readability and understanding of disassembled code.
    Decompiler output from tools like Ghidra, Binary Ninja, or IDA Pro can be fed into this class to

    generate suggestions. The parameters used for querying the LLM are empirically determined
    to offer a balanced trade-off between the quality of the suggestions and the analysis time required.
    """

    def __init__(self, config):
        """
        Initializes the FunctionNameGPT instance with specific configurations for querying the LLM model.

        The configuration includes selecting the appropriate model from MODEL_IDENTIFIERS, setting
        a context limit, and defining generation parameters aimed at optimizing the name suggestion process.

        Parameters:
        - config (dict): A configuration dictionary to be passed to the LLM_Agent.
        """
        # Overrides specific configuration settings for FunctionNameGPT usage
        # Model identifier for the LLM
        config["model_identifier"] = MODEL_IDENTIFIERS["deepseek-coder-6.7b-instruct"]
        # Context length limit to manage large functions

        # Define generation kwargs with empirically determined values for optimal performance
        config["generation_kwargs"] = {
            # Limit model output to prevent overly verbose responses
            "max_tokens": 5000,
            # Token indicating the end of the model's output
            "stop": ["</s>"],
            # Minimum probability threshold for token generation
            "min_p": 0.1,
            # Sampling temperature for diversity
            "temperature": 0.3,
            # Penalty for repeated token generation to encourage diversity
            "repeat_penalty": 1,
        }

        # Instantiate LLM_Agent with the modified configuration
        self.agent = LLM_Agent(config)

    def build_prompt(self, code):
        """
        Constructs a custom prompt tailored for querying the LLM to suggest function names based on decompiler output.

        Parameters:
        - code (str): The decompiler output for a given function.

        Returns:
        - str: A formatted prompt for the LLM including the instruction and the decompiled code.
        """
        # Constructing a detailed prompt to guide the LLM in generating a suitable function name
        user_prompt = f"""### Instruction:

            Given the following decompiler output for a function, \
            analyze its operations, logic, and any identifiable patterns to suggest a suitable function name. \
            Your response should strictly be the function name suggestion and up to 20 characters. \
            Discard all explanations or content, only return the suggested function name. An example output would be "hello_world".

            Here's the code:\n\n {code}"""
        # prompt = "write a snake game"
        # user_prompt = """
        # You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.
        # ### Instruction:
        # {prompt}
        # """
        return self.agent.build_prompt(user_prompt)

    def query_gpt_for_function_name_suggestion(self, code):
        """
        Directly queries the GPT model for a function name suggestion based on the provided decompiler output.

        Parameters:
        - code (str): The decompiler output for a given function.

        Returns:
        - The raw output from the LLM model as a response to the query.
        """
        # Passes the custom prompt to the LLM_Agent and returns the raw response
        prompt = self.build_prompt(code)
        print(prompt)
        return self.agent.generate_response(prompt)

    def get_function_name_suggestion(self, code):
        """
        Attempts to get a function name suggestion from the LLM. If the suggestion process fails
        (e.g., due to the code being too long), it raises an exception.

        Parameters:
        - code (str): The decompiler output for the function.

        Returns:
        - str: The suggested function name or the original name if suggestion fails.
        """
        # try:
        # Attempts to query the LLM for a name suggestion and filter the output
        suggested_name = self.query_gpt_for_function_name_suggestion(code)
        return self.filter_output(suggested_name)
        # except:
        #     # Raise an error
        #     raise ValueError(
        #         "Failed to query the LLM for a name suggestion. The input code may exceed the maximum token limit supported by the LLM."
        #     )

    @staticmethod
    def filter_output(output):
        """
        Cleans the model's response by removing any additional explanations and normalizing the function name format.
        Specifically, it ensures function names containing underscores are correctly formatted without
        escape characters.

        Parameters:
        - output (str): The raw model output containing the function name suggestion.

        Returns:
        - str: The filtered and normalized function name.
        """
        # Process the model's output to extract and normalize the function name
        filtered_output = output.strip().split("\n")[0].strip().replace("\\_", "_")
        return filtered_output

In [3]:
config = load_user_config("example_config.toml")
gpt = FunctionNameGPT(config)

llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /Users/junan/.cache/lm-studio/models/TheBloke/deepseek-coder-6.7B-instruct-GGUF/deepseek-coder-6.7b-instruct.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = deepseek-ai_deepseek-coder-6.7b-instruct
llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.r

In [4]:
for function in functions:
    print(gpt.filter_output(gpt.get_function_name_suggestion(function)))

### Instruction:

            Given the following decompiler output for a function,             analyze its operations, logic, and any identifiable patterns to suggest a suitable function name.             Your response should strictly be the function name suggestion and up to 20 characters.             Discard all explanations or content, only return the suggested function name. An example output would be "hello_world".

            Here's the code:

 def add(a, b):
    return a + b

 ### Response:
return "add"
### Instruction:

            Given the following decompiler output for a function,             analyze its operations, logic, and any identifiable patterns to suggest a suitable function name.             Your response should strictly be the function name suggestion and up to 20 characters.             Discard all explanations or content, only return the suggested function name. An example output would be "hello_world".

            Here's the code:

 def subtract(a, b):
    r

In [4]:
!pwd

/Users/junan/Downloads


In [6]:
import llvmlite.binding as llvm


def parse_c_code(filename):
    # Initialize LLVM
    llvm.initialize()
    llvm.initialize_native_target()
    llvm.initialize_native_asmprinter()

    # Create LLVM context and module
    context = llvm.Context()
    module = llvm.parse_assembly(open(filename).read())

    # Print LLVM IR (Intermediate Representation)
    print(module)


# Example usage:
parse_c_code("secret.c")

AttributeError: module 'llvmlite.binding' has no attribute 'Context'

In [2]:
import clang.cindex


def traverse(node, depth=0):
    print("  " * depth + str(node.kind) + " " + str(node.spelling))
    for child in node.get_children():
        traverse(child, depth + 1)


def parse_c_file(filename):
    index = clang.cindex.Index.create()
    translation_unit = index.parse(filename)
    root = translation_unit.cursor
    traverse(root)


# Example usage:
parse_c_file("secret.c")

LibclangError: dlsym(0x93466370, clang_CXXMethod_isDeleted): symbol not found. Please ensure that your python bindings are compatible with your libclang.so version.

In [5]:
!pip3 install clang --upgrade



In [19]:
from pycparser import preprocess_file

preprocess_file("/Users/junan/Downloads/memmgr.c")

    // that if nbytes is a multiple of nquantas, we don't allocate too much
                                                       ^
                // its prev's next to its next
                           ^


'# 1 "/Users/junan/Downloads/memmgr.c"\n# 1 "<built-in>" 1\n# 1 "<built-in>" 3\n# 399 "<built-in>" 3\n# 1 "<command line>" 1\n# 1 "<built-in>" 2\n# 1 "/Users/junan/Downloads/memmgr.c" 2\n# 1 "/Users/junan/Downloads/memmgr.h" 1\n\n\n\n\n\n\n\n\n\ntypedef unsigned char byte;\ntypedef unsigned long ulong;\n\n\nvoid memmgr_init();\n\nvoid* memmgr_alloc(ulong nbytes);\n\nvoid memmgr_free(void* ap);\n\nvoid memmgr_print_stats();\n\n\n\n# 2 "/Users/junan/Downloads/memmgr.c" 2\n\ntypedef ulong Align;\n\nunion mem_header_union\n{\n    struct\n    {\n        // Pointer to the next block in the free list\n        //\n        union mem_header_union* next;\n\n        // Size of the block (in quantas of sizeof(mem_header_t))\n        //\n        ulong size;\n    } s;\n\n    // Used to align headers in memory to a boundary\n    //\n    Align align_dummy;\n};\n\ntypedef union mem_header_union mem_header_t;\n\n// Initial empty list\n//\nstatic mem_header_t base;\n\n// Start of free list\n//\nstatic mem

In [23]:
from pycparser import c_parser, c_lexer


def extract_comments(filename):
    parser = c_parser.CParser()
    with open(filename, "r") as f:
        code = f.read()
    tokens = list(c_lexer.CLexer().lex(code))
    comments = [token for token in tokens if token.type == "COMMENT"]
    return comments


# Example usage:
comments = extract_comments("/Users/junan/Downloads/memmgr.c")
for comment in comments:
    print(comment.value)

TypeError: CLexer.__init__() missing 4 required positional arguments: 'error_func', 'on_lbrace_func', 'on_rbrace_func', and 'type_lookup_func'

In [33]:
import sys

# This is not required if you've installed pycparser into
# your site-packages/ with setup.py
sys.path.extend([".", ".."])

from pycparser import c_ast, parse_file


# A simple visitor for FuncDef nodes that prints the names and
# locations of function definitions.
class FuncDefVisitor(c_ast.NodeVisitor):

    def __init__(self):
        self.function_bodies = []

    def visit_FuncDef(self, node):
        print("Function:", node.decl.name)
        if node.body is not None:
            self.function_bodies.append(node.body)


def show_func_defs(filename):
    # Note that cpp is used. Provide a path to your own cpp or
    # make sure one exists in PATH.
    ast = parse_file(
        filename,
        use_cpp=True,
        cpp_path="gcc",
        cpp_args=["-E", r"-Iutils/fake_libc_include"],
    )

    v = FuncDefVisitor()
    v.visit(ast)
    return v.function_bodies


show_func_defs("/Users/junan/Downloads/memmgr.c")

Function: memmgr_init
Function: get_mem_from_pool
Function: memmgr_alloc
Function: memmgr_free


[Compound(block_items=[Assignment(op='=',
                                  lvalue=StructRef(name=StructRef(name=ID(name='base'
                                                                          ),
                                                                  type='.',
                                                                  field=ID(name='s'
                                                                           )
                                                                  ),
                                                   type='.',
                                                   field=ID(name='next'
                                                            )
                                                   ),
                                  rvalue=Constant(type='int',
                                                  value='0'
                                                  )
                                  ),
                       Assign

In [6]:
from pycparser import c_parser, preprocess_file


def print_ast(node, indent=""):
    print(indent + str(node.__class__.__name__))
    for name, child in node.children():
        print_ast(child, indent + "  ")


# Preprocess a C file
preprocessed_code = preprocess_file("/Users/junan/Downloads/secret.c")

# Parse the preprocessed code into AST
parser = c_parser.CParser()
ast = parser.parse(preprocessed_code)

# Print the AST
print_ast(ast)

In file included from /Users/junan/Downloads/secret.c:1:
In file included from /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/stdio.h:64:
In file included from /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/_stdio.h:69:
In file included from /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/Availability.h:166:
/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/AvailabilityInternal.h:142:2: error: #endif without #if
#endif /* #if defined(__has_builtin) */
 ^
/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/AvailabilityInternal.h:150:2: error: #endif without #if
#endif /* __ENABLE_LEGACY_IPHONE_AVAILABILITY */
 ^
/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/AvailabilityInternal.h:153:32: error: missing '(' after '__has_attribute'
    #if defined(__has_attribute) && defined(__has_feature)
                               ^
/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/Availabilit

CalledProcessError: Command '['cpp', '/Users/junan/Downloads/secret.c']' returned non-zero exit status 1.