In [1]:
from typing import List

from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.llms import LlamaCpp
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [2]:
# Define your desired data structure.
class CodeAnalyzer(BaseModel):
    function_name: str = Field(description="Suggested name for the function")
    function_description: str = Field(description="Description of the function")

In [50]:
model = LlamaCpp(
    model_path="/Users/junan/Desktop/text-generation-webui/models/deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
    n_gpu_layers=-1,
    n_threads=4,
    f16_kv=True,
    use_mlock=True,
    use_mmap=True,
    n_ctx=1600,
    n_batch=512,
    rope_freq_base=0,
    rope_freq_scale=0,
    max_tokens=2000,
    stop=["</s>", "<|EOT|>", "###", "\n"],
    temperature=0.2,
    repeat_penalty=1.1,
    seed=-1,
    callbacks=[StreamingStdOutCallbackHandler()],
)

llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /Users/junan/Desktop/text-generation-webui/models/deepseek-coder-6.7b-instruct.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = deepseek-ai_deepseek-coder-6.7b-instruct
llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32       

In [52]:
# Set up a parser + inject instructions into the prompt template.
parser = JsonOutputParser(pydantic_object=CodeAnalyzer)
format_instructions = parser.get_format_instructions()
# And a query intented to prompt a language model to populate the data structure.
response = '{{"function_name": "hello_world()", "function_description": "This function prints the string hello world."}}'
examples = [
    {
        "Instruction": """                
        ### Instruction:
        
        Analyze the following code's operations, logic, and any identifiable patterns to suggest a suitable function name, do not return the original function name.
                        
        Only return the suggested name and description of the following code function, strictly a JSON object.
                        
        Do not include any unnecessary information or symbols beyond the JSON output.
        
        {format_instructions}
        
        Code:
                
        def xxx():
            print("hello world")
            
        ### Response:
        """,
        "Response": response,
    }
]
example_prompt = PromptTemplate(
    input_variables=["Instruction", "Response"],
    template="{Instruction}\n{Response}",
)


fewshotprompt = FewShotPromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
    suffix="""
        {input} 
        """,
    input_variables=["input"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

# fewshotprompt = prompt.format(input=input)
print(fewshotprompt)

code_list = [
    """
    def test1(a, b): 
        return a+b
    """,
    """
    def test2(a, b): 
        return a-b""",
]

chain = fewshotprompt | model | parser

# prompt = """
#         ### Instruction:

#         Analyze the following code's operations, logic, and any identifiable patterns to suggest a suitable function name, do not return the original function name.

#         Only return the suggested name and description of the following code function, strictly a JSON object.

#         Do not include any unnecessary information or symbols beyond the JSON output.

#         Code:

#         def test1(a,b):
#             return a+b

#         ### Response:
#         """

for code in code_list:
    code_query = """
    Below is an instruction that describes a task. Write a response that appropriately completes the request.

    ### Instruction:

    Analyze the following code's operations, logic, and any identifiable patterns to suggest a suitable function name, do not return the original function name.

    Only return the suggested name and description of the following code function, strictly a JSON object.

    Do not include any unnecessary information or symbols beyond the JSON output.

    {format_instructions}
    
    Code:

    {code}

    ### Response:
    """

    prompt = PromptTemplate.from_template(
        code_query,
        partial_variables={"format_instructions": parser.get_format_instructions()},
    ).format(code=code)
    # print(prompt)

    # for text in chain.stream(
    #     prompt
    # ):
    #     print(text, flush=True)
    json_output = chain.invoke({"input": prompt})
    print(json_output)

input_variables=['input'] partial_variables={'format_instructions': 'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"function_name": {"title": "Function Name", "description": "Suggested name for the function", "type": "string"}, "function_description": {"title": "Function Description", "description": "Description of the function", "type": "string"}}, "required": ["function_name", "function_description"]}\n```'} examples=[{'Instruction': '                \n        ### Instruction:\n        \n        Analyze the following code\'s operations, logic, and any identifiable

Llama.generate: prefix-match hit


 {"function_name": "add(a, b)", "function_description": "This function takes two arguments and returns their sum."}{'function_name': 'add(a, b)', 'function_description': 'This function takes two arguments and returns their sum.'}



llama_print_timings:        load time =    5064.30 ms
llama_print_timings:      sample time =      22.37 ms /    30 runs   (    0.75 ms per token,  1341.38 tokens per second)
llama_print_timings: prompt eval time =    4575.62 ms /    30 tokens (  152.52 ms per token,     6.56 tokens per second)
llama_print_timings:        eval time =    3241.11 ms /    29 runs   (  111.76 ms per token,     8.95 tokens per second)
llama_print_timings:       total time =    9374.88 ms
Llama.generate: prefix-match hit


 {"function_name": "subtract(a, b)", "function_description": "This function subtracts the second parameter from the first."}{'function_name': 'subtract(a, b)', 'function_description': 'This function subtracts the second parameter from the first.'}



llama_print_timings:        load time =    5064.30 ms
llama_print_timings:      sample time =      30.88 ms /    34 runs   (    0.91 ms per token,  1100.89 tokens per second)
llama_print_timings: prompt eval time =     334.82 ms /    23 tokens (   14.56 ms per token,    68.69 tokens per second)
llama_print_timings:        eval time =    3865.32 ms /    33 runs   (  117.13 ms per token,     8.54 tokens per second)
llama_print_timings:       total time =    6583.32 ms


In [59]:
!pip install pygccxml

Collecting pygccxml
  Downloading pygccxml-2.5.0-py3-none-any.whl (121 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.4/121.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pygccxml
Successfully installed pygccxml-2.5.0


In [72]:
import pygccxml
from pygccxml import parser, declarations, utils


def extract_function_names(cpp_file):
    # Find the location of the xml generator (castxml or gccxml)
    generator_path, generator_name = utils.find_xml_generator()

    # Configure the xml generator
    xml_generator_config = parser.xml_generator_configuration_t(
        xml_generator_path=generator_path, xml_generator=generator_name
    )

    # The c++ file we want to parse
    # filename = "example.hpp"

    # Parse the c++ file
    decls = parser.parse([cpp_file], xml_generator_config)

    # Get access to the global namespace
    global_namespace = declarations.get_global_namespace(decls)

    # Get access to the 'ns' namespace
    ns = global_namespace.namespace("ns")

    # Extract function names
    function_names = []
    for decl in decls:
        if isinstance(decl, declarations.calldef.free_templated_t):
            # If it's a templated function, skip
            continue
        if decl.location.file_name != cpp_file:
            # If it's from a different file, skip
            continue
        if isinstance(decl, declarations.namespace.namespace_t):
            # If it's a namespace, recursively extract from it
            function_names.extend(extract_function_names(cpp_file))
        elif isinstance(decl, declarations.calldef.free_function_t):
            # If it's a free function, extract its name
            function_names.append(decl.name)

    return function_names


# Example usage
function_names = extract_function_names(
    "/Users/junan/Desktop/auto-naming-ai/data/test.cpp"
)

print("Function names:", function_names)

INFO Parsing source file "/Users/junan/Desktop/auto-naming-ai/data/test.cpp" ... 


   33 |     for (int num : vec) {
      |                  ^


declaration_not_found_t: Unable to find declaration. Matcher: [(decl type==namespace_t) and (name==ns)]

In [77]:
from pygccxml import utils
from pygccxml import declarations
from pygccxml import parser

# Find the location of the xml generator (castxml or gccxml)
generator_path, generator_name = utils.find_xml_generator()

# Configure the xml generator
xml_generator_config = parser.xml_generator_configuration_t(
    xml_generator_path=generator_path, xml_generator=generator_name
)

# Write a string containing some c++ code
decls = parser.parse(
    ["/Users/junan/Desktop/auto-naming-ai/data/test.cpp"], xml_generator_config
)
global_namespace = declarations.get_global_namespace(decls)
# Iterate over all declarations in the global namespace
for decl in global_namespace.declarations:
    # Check if the declaration is a free function
    if isinstance(decl, declarations.free_function_t):
        # Print the name of the function
        print("Function:", decl.name)
        # Print information about function arguments
        print("Function Body", decl.body)
        for arg in decl.arguments:
            print("Argument:", arg.name, str(arg.decl_type))

INFO Parsing source file "/Users/junan/Desktop/auto-naming-ai/data/test.cpp" ... 


   33 |     for (int num : vec) {
      |                  ^


Function: renameat
Argument: arg0 int
Argument: arg1 char const *
Argument: arg2 int
Argument: arg3 char const *
Function: renamex_np
Argument: arg0 char const *
Argument: arg1 char const *
Argument: arg2 unsigned int
Function: renameatx_np
Argument: arg0 int
Argument: arg1 char const *
Argument: arg2 int
Argument: arg3 char const *
Argument: arg4 unsigned int
Function: clearerr
Argument: arg0 FILE *
Function: fclose
Argument: arg0 FILE *
Function: feof
Argument: arg0 FILE *
Function: ferror
Argument: arg0 FILE *
Function: fflush
Argument: arg0 FILE *
Function: fgetc
Argument: arg0 FILE *
Function: fgetpos
Argument: arg0 FILE *
Argument: arg1 fpos_t *
Function: fgets
Argument: arg0 char *
Argument: arg1 int
Argument: arg2 FILE *
Function: fopen
Argument: __filename char const *
Argument: __mode char const *
Function: fprintf
Argument: arg0 FILE *
Argument: arg1 char const *
Argument:  ...
Function: fputc
Argument: arg0 int
Argument: arg1 FILE *
Function: fputs
Argument: arg0 char const

In [74]:
global_ns

<pygccxml.declarations.namespace.namespace_t at 0x17459d010>

In [43]:
json_output["function_name"]

'subtract(a, b)'

In [41]:
parser.parse(json_output)

ValidationError: 1 validation error for Generation
text
  str type expected (type=type_error.str)

In [30]:
parser.get_format_instructions()

'The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"setup": {"title": "Setup", "description": "question to set up a joke", "type": "string"}, "punchline": {"title": "Punchline", "description": "answer to resolve the joke", "type": "string"}}, "required": ["setup", "punchline"]}\n```'

In [51]:
# And a query intented to prompt a language model to populate the data structure.
code_query = """
            ### Instruction:

            Analyze the following code's operations, logic, and any identifiable patterns to suggest a suitable function name, do not return the original function name.
                            
            Only return the suggested name and description of the following code function, strictly a JSON object.
                            
            Do not include any unnecessary information or symbols beyond the JSON output.
            
            
            def test1(a,b):
                return a+b
                
            ### Response:
            """

# Set up a parser + inject instructions into the prompt template.
parser = JsonOutputParser(pydantic_object=CodeAnalyzer)

PromptTemplate.from_template("Say {foo}")
prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

chain = prompt | model | parser

print(prompt.format(code_query))
# json_output = chain.invoke({"query": prompt})

TypeError: PromptTemplate.format() takes 1 positional argument but 2 were given

In [18]:
json_output

{'function_name': 'subtract(a, b)',
 'function_description': 'This function subtracts the second parameter from the first and returns the result.'}

In [28]:
import json
import ast

json_string = """
            {
                'name': 'add_numbers(a, b)',
                'description': 'This function takes two numbers as arguments and returns their sum.',
            }
"""

json_object = ast.literal_eval(json_string.strip())

print(json_object["name"])

add_numbers(a, b)


In [10]:
import ast
import inspect


def extract_functions(code):
    tree = ast.parse(code)
    functions = []
    for node in tree.body:
        if isinstance(node, ast.FunctionDef):
            function_def = ast.get_source_segment(code, node)
            functions.append(function_def)
    return functions


# Example usage:
code = """
    void xxx(void* ap)
    {
        mem_header_t* block;
        mem_header_t* p;

        // acquire pointer to block header
        block = ((mem_header_t*) ap) - 1;

        // Find the correct place to place the block in (the free list is sorted by
        // address, increasing order)
        //
        for (p = freep; !(block > p && block < p->s.next); p = p->s.next)
        {
            // Since the free list is circular, there is one link where a
            // higher-addressed block points to a lower-addressed block.
            // This condition checks if the block should be actually
            // inserted between them
            //
            if (p >= p->s.next && (block > p || block < p->s.next))
                break;
        }

        // Try to combine with the higher neighbor
        //
        if (block + block->s.size == p->s.next)
        {
            block->s.size += p->s.next->s.size;
            block->s.next = p->s.next->s.next;
        }
        else
        {
            block->s.next = p->s.next;
        }

        // Try to combine with the lower neighbor
        //
        if (p + p->s.size == block)
        {
            p->s.size += block->s.size;
            p->s.next = block->s.next;
        }
        else
        {
            p->s.next = block;
        }

        freep = p;
    }
"""
# functions = extract_functions(code)
# for function in functions:
#     print(function)

In [11]:
from agent import LLM_Agent
from models import MODEL_IDENTIFIERS
from config import load_user_config


class FunctionNameGPT:
    """
    FunctionNameGPT facilitates querying a local Large Language Model (LLM) to suggest function names based on
    decompiler output. This functionality is particularly useful in the context of reverse engineering, where accurate
    and meaningful function names can significantly enhance the readability and understanding of disassembled code.
    Decompiler output from tools like Ghidra, Binary Ninja, or IDA Pro can be fed into this class to

    generate suggestions. The parameters used for querying the LLM are empirically determined
    to offer a balanced trade-off between the quality of the suggestions and the analysis time required.
    """

    def __init__(self, config):
        """
        Initializes the FunctionNameGPT instance with specific configurations for querying the LLM model.

        The configuration includes selecting the appropriate model from MODEL_IDENTIFIERS, setting
        a context limit, and defining generation parameters aimed at optimizing the name suggestion process.

        Parameters:
        - config (dict): A configuration dictionary to be passed to the LLM_Agent.
        """
        # Overrides specific configuration settings for FunctionNameGPT usage
        # Model identifier for the LLM
        config["model_identifier"] = MODEL_IDENTIFIERS["deepseek-coder-6.7b-instruct"]
        # Context length limit to manage large functions

        # Define generation kwargs with empirically determined values for optimal performance
        config["generation_kwargs"] = {
            # Limit model output to prevent overly verbose responses
            "max_tokens": 5000,
            # Token indicating the end of the model's output
            "stop": ["</s>"],
            # Minimum probability threshold for token generation
            "min_p": 0.1,
            # Sampling temperature for diversity
            "temperature": 0.3,
            # Penalty for repeated token generation to encourage diversity
            "repeat_penalty": 1,
        }

        # Instantiate LLM_Agent with the modified configuration
        self.agent = LLM_Agent(config)

    def build_prompt(self, code):
        """
        Constructs a custom prompt tailored for querying the LLM to suggest function names based on decompiler output.

        Parameters:
        - code (str): The decompiler output for a given function.

        Returns:
        - str: A formatted prompt for the LLM including the instruction and the decompiled code.
        """
        # Constructing a detailed prompt to guide the LLM in generating a suitable function name
        user_prompt = f"""### Instruction:

            Given the following decompiler output for a function, \
            analyze its operations, logic, and any identifiable patterns to suggest a suitable function name. \
            Your response should strictly be the function name suggestion and up to 20 characters. \
            Discard all explanations or content, only return the suggested function name. An example output would be "hello_world".

            Here's the code:\n\n {code}"""
        # prompt = "write a snake game"
        # user_prompt = """
        # You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer.
        # ### Instruction:
        # {prompt}
        # """
        return self.agent.build_prompt(user_prompt)

    def query_gpt_for_function_name_suggestion(self, code):
        """
        Directly queries the GPT model for a function name suggestion based on the provided decompiler output.

        Parameters:
        - code (str): The decompiler output for a given function.

        Returns:
        - The raw output from the LLM model as a response to the query.
        """
        # Passes the custom prompt to the LLM_Agent and returns the raw response
        prompt = self.build_prompt(code)
        print(prompt)
        return self.agent.generate_response(prompt)

    def get_function_name_suggestion(self, code):
        """
        Attempts to get a function name suggestion from the LLM. If the suggestion process fails
        (e.g., due to the code being too long), it raises an exception.

        Parameters:
        - code (str): The decompiler output for the function.

        Returns:
        - str: The suggested function name or the original name if suggestion fails.
        """
        # try:
        # Attempts to query the LLM for a name suggestion and filter the output
        suggested_name = self.query_gpt_for_function_name_suggestion(code)
        return self.filter_output(suggested_name)
        # except:
        #     # Raise an error
        #     raise ValueError(
        #         "Failed to query the LLM for a name suggestion. The input code may exceed the maximum token limit supported by the LLM."
        #     )

    @staticmethod
    def filter_output(output):
        """
        Cleans the model's response by removing any additional explanations and normalizing the function name format.
        Specifically, it ensures function names containing underscores are correctly formatted without
        escape characters.

        Parameters:
        - output (str): The raw model output containing the function name suggestion.

        Returns:
        - str: The filtered and normalized function name.
        """
        # Process the model's output to extract and normalize the function name
        filtered_output = output.strip().split("\n")[0].strip().replace("\\_", "_")
        return filtered_output

In [12]:
config = load_user_config("example_config.toml")
gpt = FunctionNameGPT(config)

llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from /Users/junan/.cache/lm-studio/models/TheBloke/deepseek-coder-6.7B-instruct-GGUF/deepseek-coder-6.7b-instruct.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = deepseek-ai_deepseek-coder-6.7b-instruct
llama_model_loader: - kv   2:                       llama.context_length u32              = 16384
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.r

In [13]:
print(gpt.filter_output(gpt.get_function_name_suggestion(code)))

### Instruction:

            Given the following decompiler output for a function,             analyze its operations, logic, and any identifiable patterns to suggest a suitable function name.             Your response should strictly be the function name suggestion and up to 20 characters.             Discard all explanations or content, only return the suggested function name. An example output would be "hello_world".

            Here's the code:

 
    void xxx(void* ap)
    {
        mem_header_t* block;
        mem_header_t* p;

        // acquire pointer to block header
        block = ((mem_header_t*) ap) - 1;

        // Find the correct place to place the block in (the free list is sorted by
        // address, increasing order)
        //
        for (p = freep; !(block > p && block < p->s.next); p = p->s.next)
        {
            // Since the free list is circular, there is one link where a
            // higher-addressed block points to a lower-addressed block.
        

The function name suggests that it is related to memory management in a heap-based system. The function deallocates a block of memory that was previously allocated by a similar function. It checks for neighboring free blocks and combines them if possible. The function name could be something like "deallocate_block", "free_memory", or "release_block".


In [4]:
!pwd

/Users/junan/Downloads


In [6]:
import llvmlite.binding as llvm


def parse_c_code(filename):
    # Initialize LLVM
    llvm.initialize()
    llvm.initialize_native_target()
    llvm.initialize_native_asmprinter()

    # Create LLVM context and module
    context = llvm.Context()
    module = llvm.parse_assembly(open(filename).read())

    # Print LLVM IR (Intermediate Representation)
    print(module)


# Example usage:
parse_c_code("secret.c")

AttributeError: module 'llvmlite.binding' has no attribute 'Context'

In [2]:
import clang.cindex


def traverse(node, depth=0):
    print("  " * depth + str(node.kind) + " " + str(node.spelling))
    for child in node.get_children():
        traverse(child, depth + 1)


def parse_c_file(filename):
    index = clang.cindex.Index.create()
    translation_unit = index.parse(filename)
    root = translation_unit.cursor
    traverse(root)


# Example usage:
parse_c_file("secret.c")

LibclangError: dlsym(0x93466370, clang_CXXMethod_isDeleted): symbol not found. Please ensure that your python bindings are compatible with your libclang.so version.

In [5]:
!pip3 install clang --upgrade



In [19]:
from pycparser import preprocess_file

preprocess_file("/Users/junan/Downloads/memmgr.c")

    // that if nbytes is a multiple of nquantas, we don't allocate too much
                                                       ^
                // its prev's next to its next
                           ^


'# 1 "/Users/junan/Downloads/memmgr.c"\n# 1 "<built-in>" 1\n# 1 "<built-in>" 3\n# 399 "<built-in>" 3\n# 1 "<command line>" 1\n# 1 "<built-in>" 2\n# 1 "/Users/junan/Downloads/memmgr.c" 2\n# 1 "/Users/junan/Downloads/memmgr.h" 1\n\n\n\n\n\n\n\n\n\ntypedef unsigned char byte;\ntypedef unsigned long ulong;\n\n\nvoid memmgr_init();\n\nvoid* memmgr_alloc(ulong nbytes);\n\nvoid memmgr_free(void* ap);\n\nvoid memmgr_print_stats();\n\n\n\n# 2 "/Users/junan/Downloads/memmgr.c" 2\n\ntypedef ulong Align;\n\nunion mem_header_union\n{\n    struct\n    {\n        // Pointer to the next block in the free list\n        //\n        union mem_header_union* next;\n\n        // Size of the block (in quantas of sizeof(mem_header_t))\n        //\n        ulong size;\n    } s;\n\n    // Used to align headers in memory to a boundary\n    //\n    Align align_dummy;\n};\n\ntypedef union mem_header_union mem_header_t;\n\n// Initial empty list\n//\nstatic mem_header_t base;\n\n// Start of free list\n//\nstatic mem

In [23]:
from pycparser import c_parser, c_lexer


def extract_comments(filename):
    parser = c_parser.CParser()
    with open(filename, "r") as f:
        code = f.read()
    tokens = list(c_lexer.CLexer().lex(code))
    comments = [token for token in tokens if token.type == "COMMENT"]
    return comments


# Example usage:
comments = extract_comments("/Users/junan/Downloads/memmgr.c")
for comment in comments:
    print(comment.value)

TypeError: CLexer.__init__() missing 4 required positional arguments: 'error_func', 'on_lbrace_func', 'on_rbrace_func', and 'type_lookup_func'

In [33]:
import sys

# This is not required if you've installed pycparser into
# your site-packages/ with setup.py
sys.path.extend([".", ".."])

from pycparser import c_ast, parse_file


# A simple visitor for FuncDef nodes that prints the names and
# locations of function definitions.
class FuncDefVisitor(c_ast.NodeVisitor):

    def __init__(self):
        self.function_bodies = []

    def visit_FuncDef(self, node):
        print("Function:", node.decl.name)
        if node.body is not None:
            self.function_bodies.append(node.body)


def show_func_defs(filename):
    # Note that cpp is used. Provide a path to your own cpp or
    # make sure one exists in PATH.
    ast = parse_file(
        filename,
        use_cpp=True,
        cpp_path="gcc",
        cpp_args=["-E", r"-Iutils/fake_libc_include"],
    )

    v = FuncDefVisitor()
    v.visit(ast)
    return v.function_bodies


show_func_defs("/Users/junan/Downloads/memmgr.c")

Function: memmgr_init
Function: get_mem_from_pool
Function: memmgr_alloc
Function: memmgr_free


[Compound(block_items=[Assignment(op='=',
                                  lvalue=StructRef(name=StructRef(name=ID(name='base'
                                                                          ),
                                                                  type='.',
                                                                  field=ID(name='s'
                                                                           )
                                                                  ),
                                                   type='.',
                                                   field=ID(name='next'
                                                            )
                                                   ),
                                  rvalue=Constant(type='int',
                                                  value='0'
                                                  )
                                  ),
                       Assign

In [6]:
from pycparser import c_parser, preprocess_file


def print_ast(node, indent=""):
    print(indent + str(node.__class__.__name__))
    for name, child in node.children():
        print_ast(child, indent + "  ")


# Preprocess a C file
preprocessed_code = preprocess_file("/Users/junan/Downloads/secret.c")

# Parse the preprocessed code into AST
parser = c_parser.CParser()
ast = parser.parse(preprocessed_code)

# Print the AST
print_ast(ast)

In file included from /Users/junan/Downloads/secret.c:1:
In file included from /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/stdio.h:64:
In file included from /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/_stdio.h:69:
In file included from /Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/Availability.h:166:
/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/AvailabilityInternal.h:142:2: error: #endif without #if
#endif /* #if defined(__has_builtin) */
 ^
/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/AvailabilityInternal.h:150:2: error: #endif without #if
#endif /* __ENABLE_LEGACY_IPHONE_AVAILABILITY */
 ^
/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/AvailabilityInternal.h:153:32: error: missing '(' after '__has_attribute'
    #if defined(__has_attribute) && defined(__has_feature)
                               ^
/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/include/Availabilit

CalledProcessError: Command '['cpp', '/Users/junan/Downloads/secret.c']' returned non-zero exit status 1.