<a href="https://colab.research.google.com/github/nijiinhell/LLM4Decompiler_Task/blob/README/LLM4Decompile_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Note: Overall similarity may be somewhat lower than in the previous version. In this version, I used more complex C programs and a lighter model and tokenizer (trained on less data) to reduce GPU consumption and speed up processing. In real-world use, these choices can be upgraded to maximize the results. This version also meets all the requirements you mentioned.  

Imported to GitHub: https://github.com/nijiinhell/LLM4Decompiler_Task/tree/README

In [None]:
# Clone the LLM4Decompile GitHub repository into the current working directory
!git clone https://github.com/albertan017/LLM4Decompile.git

Cloning into 'LLM4Decompile'...
remote: Enumerating objects: 609, done.[K
remote: Counting objects: 100% (220/220), done.[K
remote: Compressing objects: 100% (153/153), done.[K
remote: Total 609 (delta 135), reused 102 (delta 66), pack-reused 389 (from 1)[K
Receiving objects: 100% (609/609), 9.74 MiB | 15.36 MiB/s, done.
Resolving deltas: 100% (306/306), done.


In [None]:
import subprocess
import os

# Constants and configurations
FUNCTION_NAME = 'main'  # Function to search for in the disassembled code
OPTIMIZATION_LEVELS = ["O0", "O1", "O2", "O3"]  # Optimization levels to apply
TOTAL_PROGRAMS = 50  # Total number of programs to process
BASE_DIRECTORY = './Programs'  # Directory containing the source program files


def _extract_function_assembly(assembly_code, function_name):
    """
    Pull one function out of an `objdump -d` listing and strip it down to bare instructions.

    Arguments:
        assembly_code (str): Full text of the objdump listing.
        function_name (str): Symbol to look for (matched as '<name>:').

    Returns:
        str or None: The cleaned listing (label line followed by one instruction per line,
        without the address/byte columns and without '#' comments), or None when the
        '<name>:' label does not occur in the listing.
    """
    label = f'<{function_name}>:'
    if label not in assembly_code:
        return None

    # Take the text after the LAST occurrence of the label, up to the first blank line
    function_asm = label + assembly_code.split(label)[-1].split('\n\n')[0]

    cleaned_lines = []
    for line in function_asm.split("\n"):
        parts = line.split("\t")
        # Drop lines with fewer than three tab-separated fields that contain '00'
        # (presumably objdump byte-continuation lines; heuristic kept exactly as before)
        if len(parts) < 3 and '00' in line:
            continue

        # Keep the last field (at most from the third column on) and cut trailing '#' comments
        idx = min(len(parts) - 1, 2)
        cleaned_lines.append("\t".join(parts[idx:]).split("#")[0].strip())

    return "\n".join(cleaned_lines).strip()


# Process each program in reverse order, starting from the highest number
for i in range(TOTAL_PROGRAMS, 0, -1):  # Iterating from 50 down to 1
    program_file = os.path.join(BASE_DIRECTORY, f'code{i}.c')  # Construct program file path
    output_directory = f"{program_file}_compiled"  # Directory to store the compiled outputs

    # Create the output directory if it doesn't already exist
    os.makedirs(output_directory, exist_ok=True)

    # Compile and disassemble each program for different optimization levels
    for optimization in OPTIMIZATION_LEVELS:
        # Common path prefix for the binary (.o), the raw listing (.s) and the prompt (.asm)
        compiled_output = os.path.join(output_directory, f'code{i}_{optimization}')

        # Compile with GCC. Arguments are passed as a list (no shell), so paths that contain
        # spaces or shell metacharacters cannot break or alter the command.
        subprocess.run(
            ['gcc', '-o', f'{compiled_output}.o', program_file, f'-{optimization}', '-lm'],
            check=True,
        )

        # Disassemble the binary; objdump's stdout is written straight into the .s file
        with open(f'{compiled_output}.s', 'w') as asm_file:
            subprocess.run(['objdump', '-d', f'{compiled_output}.o'], stdout=asm_file, check=True)

        # Read the listing back and reduce it to the target function's instructions
        with open(f'{compiled_output}.s', 'r') as asm_file:
            assembly_code = asm_file.read()

        cleaned_assembly = _extract_function_assembly(assembly_code, FUNCTION_NAME)
        if cleaned_assembly is None:
            raise ValueError(f"Function {FUNCTION_NAME} not found in {program_file}. Compilation failed.")

        # Create the assembly code prompt with cleaned data for decompilation
        asm_prompt = f"# This is the assembly code:\n{cleaned_assembly}\n# What is the source code?\n"

        # Save the prompt next to the compiled artifacts
        with open(os.path.join(output_directory, f'code{i}_{optimization}.asm'), 'w', encoding='utf-8') as output_file:
            output_file.write(asm_prompt)

# Notify that the processing is complete
print("Processing of all programs completed successfully!")


Processing of all programs completed successfully!


In [None]:
import tempfile
import os
import json
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from difflib import SequenceMatcher

# Load the tokenizer and model for LLM4Decompile 1.3B (v1.5), identified by its Hugging Face Hub ID
model_directory = 'LLM4Binary/llm4decompile-1.3b-v1.5'  # Model identifier passed to from_pretrained
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# Use the GPU when one is available; otherwise fall back to the CPU instead of crashing on .to('cuda').
# Later cells move their inputs to model.device, so they follow whichever device is chosen here.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForCausalLM.from_pretrained(model_directory, torch_dtype=torch.bfloat16).to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [None]:
import os
import json
import torch

# Define parameters and paths for the process
optimization_level = "O0"  # Optimization level whose .asm prompts are decompiled
base_program_directory = '/content/Programs'
# NOTE: the file name says "O1" even though optimization_level is "O0". It is left unchanged
# because the evaluation cell reads this exact path.
output_json_file = '/content/decompiled_results_O1.json'  # File to store results
total_programs = 50  # Number of programs to process

# Initialize an empty list to collect results
decompiled_results = []

# Check if the output JSON file already exists and load the existing results if present
if os.path.exists(output_json_file):
    with open(output_json_file, 'r', encoding='utf-8') as json_file:
        decompiled_results = json.load(json_file)

# (program, optimization) pairs that are already stored. Re-running this cell must not
# append them a second time (duplicates would inflate the evaluation counts) and does
# not need to spend GPU time regenerating them.
already_decompiled = {
    (entry.get('program_name'), entry.get('optimization_level'))
    for entry in decompiled_results
}

# Loop through all the programs, named code1.c, code2.c, ..., code50.c
for i in range(1, total_programs + 1):  # Loop from 1 to 50
    program_name = f'code{i}'  # Program name without extension
    compiled_program_path = os.path.join(base_program_directory, f'{program_name}.c_compiled', f'{program_name}')  # Path prefix of the compiled artifacts

    if (program_name, optimization_level) in already_decompiled:
        print(f"{program_name} with {optimization_level} optimization is already in {output_json_file}. Skipping...")
        continue

    # Read the original C source code
    original_program_file = os.path.join(base_program_directory, f'{program_name}.c')  # Original source file path
    try:
        with open(original_program_file, 'r') as file:
            original_function_code = file.read()
    except FileNotFoundError:
        print(f"Original source code for {program_name} is missing. Skipping this program...")
        continue

    # Path to the assembly prompt file for the selected optimization level
    compiled_asm_file = f'{compiled_program_path}_{optimization_level}.asm'

    try:
        # Attempt to read the assembly prompt file
        with open(compiled_asm_file, 'r') as file:
            assembly_code = file.read()

        # Tokenize the assembly prompt and feed it to the model
        inputs = tokenizer(assembly_code, return_tensors="pt").to(model.device)
        with torch.no_grad():
            model_outputs = model.generate(**inputs, max_new_tokens=2048)

        # Decode only the newly generated tokens (everything after the prompt)
        decompiled_function_code = tokenizer.decode(model_outputs[0][len(inputs['input_ids'][0]):])
        # Remove end-of-sequence marker strings that survive decoding
        decompiled_function_code = decompiled_function_code.replace("<|endoftext|>", "").replace("<｜end▁of▁sentence｜>", "").strip()

        # Add the decompiled function and relevant data to the results list
        decompiled_results.append({
            'program_name': program_name,
            'optimization_level': optimization_level,
            'original_function': original_function_code.strip(),
            'decompiled_function': decompiled_function_code  # Keep formatting intact
        })
        already_decompiled.add((program_name, optimization_level))

        print(f"Successfully decompiled {program_name} with {optimization_level} optimization!")

    except FileNotFoundError:
        print(f"Compiled assembly file {compiled_asm_file} is missing. Skipping this program...")
    except Exception as e:
        # Best effort: report the failure and continue with the remaining programs
        print(f"Error processing {program_name}: {e}")

# Save the updated results to the JSON file, overwriting the previous content
with open(output_json_file, 'w', encoding='utf-8') as json_file:
    json.dump(decompiled_results, json_file, indent=4)

print(f"Decompiled results for {optimization_level} optimization have been saved to {output_json_file}")


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code1 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code2 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code3 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code4 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code5 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code6 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code7 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code8 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code9 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code10 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code11 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code12 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code13 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code14 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code15 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code16 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code17 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code18 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code19 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code20 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code21 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code22 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code23 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code24 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code25 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code26 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code27 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code28 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code29 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code30 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code31 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code32 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code33 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code34 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code35 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code36 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code37 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code38 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code39 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code40 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code41 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code42 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code43 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code44 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code45 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code46 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code47 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code48 with O0 optimization!


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Successfully decompiled code49 with O0 optimization!
Successfully decompiled code50 with O0 optimization!
Decompiled results for O0 optimization have been saved to /content/decompiled_results_O1.json


In [None]:
import os
import json
import subprocess
import tempfile
import numpy as np
from difflib import SequenceMatcher

def _stderr_text(stream):
    """Return captured process output as text, whether it arrived as bytes, str or None."""
    if stream is None:
        return ""
    if isinstance(stream, bytes):
        return stream.decode(errors="replace")
    return stream


def evaluate_function(c_func, c_test, c_func_decompiled):
    """
    Check whether decompiled C code compiles with gcc and runs to a zero exit status.

    The program that gets built consists of every `#include` line found in `c_func`
    and `c_test`, followed by `c_func_decompiled`, followed by `c_test` with its
    `int main(` renamed to `int test_func(`. `c_func` is used only as a source of
    `#include` lines; its body is not compiled.

    Arguments:
        c_func (str): The original C source code.
        c_test (str): Test C code to be appended to the decompiled code.
        c_func_decompiled (str): The decompiled version of the C code.

    Returns:
        tuple: (compile_flag, run_flag)
            compile_flag (int): 1 if gcc built the combined code successfully, 0 if not.
            run_flag (int): 1 if the executable exited with status 0 within the timeout, 0 if not.

    Failures (compiler errors, non-zero exit, timeouts, missing gcc, ...) are reported
    with print() and reflected in the flags; no exception is propagated to the caller.
    """
    timeout_duration = 10  # Seconds allowed for the compile step and for the run step
    compile_flag = 0
    run_flag = 0

    # Collect the #include lines of the original code; the rest of c_func is not needed
    include_lines = [line for line in c_func.split("\n") if "#include" in line]

    # Move the test code's #include lines to the top as well, leaving blank lines behind
    test_lines = []
    for line in c_test.split("\n"):
        if "#include" in line:
            include_lines.append(line)
            test_lines.append("")
        else:
            test_lines.append(line)
    includes = "".join(line + "\n" for line in include_lines)

    # Avoid duplicate main functions by renaming 'main' to 'test_func'
    c_test = "\n".join(test_lines).replace("int main(", "int test_func(")

    # Combine the include statements, decompiled function, and test code
    combined_code = includes + "\n" + c_func_decompiled + "\n" + c_test

    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            pid = os.getpid()
            c_file_path = os.path.join(temp_dir, f"combined_{pid}.c")
            executable_path = os.path.join(temp_dir, f"combined_{pid}")

            # Write the combined code into a temporary C file
            with open(c_file_path, "w") as c_file:
                c_file.write(combined_code)

            # Try to compile the combined code
            compile_command = ["gcc", c_file_path, "-o", executable_path, "-lm"]
            try:
                subprocess.run(compile_command, check=True, timeout=timeout_duration, stderr=subprocess.PIPE)
                compile_flag = 1
            except subprocess.CalledProcessError as e:
                print(f"Compilation failed for {c_file_path}. Error:\n", _stderr_text(e.stderr))
                return compile_flag, run_flag
            except subprocess.TimeoutExpired:
                print(f"Compilation timed out after {timeout_duration} seconds for {c_file_path}.")
                return compile_flag, run_flag

            # Try to execute the compiled code
            run_command = [executable_path]
            try:
                process = subprocess.run(run_command, capture_output=True, text=True, errors="replace",
                                         timeout=timeout_duration, check=True)
                print("Execution Output:\n", process.stdout)
                run_flag = 1
            except subprocess.CalledProcessError as e:
                # With text=True the captured stderr is already a str, so it must not be
                # .decode()d; _stderr_text accepts both str and bytes.
                print(f"Execution failed for {executable_path}. Error:\n", _stderr_text(e.stderr))
                return compile_flag, run_flag
            except subprocess.TimeoutExpired:
                print(f"Execution timed out after {timeout_duration} seconds for {executable_path}.")
                return compile_flag, run_flag

    except Exception as error:
        # Anything else (e.g. gcc not installed, temp dir problems) is reported, not raised
        print(f"An error occurred during the evaluation process: {error}")
        return compile_flag, run_flag

    return compile_flag, run_flag

# Where the decompilation records are read from and where the report is written
input_json_file = "/content/decompiled_results_O1.json"
output_summary_file = "/content/evaluation_summary.json"

# Report under construction, plus the per-program lists reused by later cells
summary = {
    "functional_correctness": 0,
    "total_programs": 0,
    "programs": []
}
similarity_scores = []
program_names = []

# Read every decompilation record produced earlier
with open(input_json_file, "r") as json_file:
    programs = json.load(json_file)

# Compile/run each decompiled program and score its textual similarity to the original
for program in programs:
    program_name = program["program_name"]
    original_code = program["original_function"]
    decompiled_code = program["decompiled_function"]

    # Build and execute the decompiled code together with a trivial test stub
    compile_flag, run_flag = evaluate_function(original_code, "int test_func(void) { return 0; }", decompiled_code)

    # Human-readable labels, used both for the console and for the report
    compilation_status = "Success" if compile_flag else "Failed"
    run_status = "Success" if run_flag else "Failed"
    print(f"Program {program_name} Compile Status:", compilation_status)
    print(f"Program {program_name} Run Status:", run_status)

    # A program counts as functionally correct only when it both compiled and ran
    functional_match = compile_flag and run_flag
    summary["functional_correctness"] += 1 if functional_match else 0

    # Character-level similarity ratio between original and decompiled source
    similarity_score = SequenceMatcher(None, original_code, decompiled_code).ratio()
    similarity_scores.append(similarity_score)
    program_names.append(program_name)

    # Record this program in the report
    summary["programs"].append({
        "program_name": program_name,
        "original_code": original_code,
        "decompiled_code": decompiled_code,
        "compilation_status": compilation_status,
        "run_status": run_status,
        "similarity_score": similarity_score,
        "functional_match": functional_match
    })

    # Show both versions of the code and the score
    print(
        f"\nOriginal C Code for {program_name}:\n{original_code}",
        f"\nDecompiled C Code for {program_name}:\n{decompiled_code}",
        f"Similarity Score for {program_name}: {similarity_score:.2f}",
        sep="\n",
    )

# Number of records that were evaluated
summary["total_programs"] = len(programs)

# Persist the report as indented JSON
with open(output_summary_file, "w") as json_file:
    json_file.write(json.dumps(summary, indent=4))

print(f"Evaluation completed. Summary saved to {output_summary_file}")


Execution Output:
 Hello, world!

Program code1 Compile Status: Success
Program code1 Run Status: Success

Original C Code for code1:
#include <stdio.h>

int main() {
    // Print a greeting message
    printf("U282EH: Hello, World!\n");
    return 0;
}

Decompiled C Code for code1:
int main()
{
    puts("Hello, world!");
    return 0;
}
Similarity Score for code1: 0.59
Execution Output:
 c = 15

Program code2 Compile Status: Success
Program code2 Run Status: Success

Original C Code for code2:
#include <stdio.h>

int main() {
    // Declare two integers and calculate their sum
    int a = 5, b = 10;
    int sum = a + b;  // Store the sum of a and b
    
    // Print the result
    printf("U282EH: Sum: %d\n", sum);
    return 0;
}

Decompiled C Code for code2:
int main()
{
    int a = 5;
    int b = 10;
    int c = a + b;
    printf("c = %d\n", c);
    return 0;
}
Similarity Score for code2: 0.53
Compilation failed for /tmp/tmpoo_09gjc/combined_1235.c. Error:
 /tmp/tmpoo_09gjc/combined

In [None]:
import numpy as np

# Fail early with a clear message instead of printing half a report and then
# crashing inside min() when no program has been evaluated yet
if not similarity_scores:
    raise ValueError("similarity_scores is empty; run the evaluation cell before computing statistics.")

# Compute statistical measures for similarity scores
mean_similarity = np.mean(similarity_scores)
median_value = np.median(similarity_scores)
std_dev_value = np.std(similarity_scores)

# Counts taken from the `summary` built by the evaluation cell, so the report always
# reflects the actual run rather than hand-entered placeholder numbers
evaluation_summary = {
    'total_count': summary['total_programs'],
    'successful_runs': summary['functional_correctness']
}

# Output the statistical insights and evaluation summary
print("\nStatistical Insights for Similarity Scores:")
print(f"Total Number of Programs: {evaluation_summary['total_count']}")
print(f"Successful Execution Count: {evaluation_summary['successful_runs']}")
print(f"Lowest Similarity Score: {min(similarity_scores):.2f}")
print(f"Highest Similarity Score: {max(similarity_scores):.2f}")
print(f"Average Similarity Score: {mean_similarity:.2f}")
print(f"Median Similarity Score: {median_value:.2f}")
print(f"Standard Deviation of Similarity: {std_dev_value:.2f}")



Statistical Insights for Similarity Scores:
Total Number of Programs: 50
Successful Execution Count: 43
Lowest Similarity Score: 0.03
Highest Similarity Score: 0.70
Mean Similarity Score: 0.44
Median Similarity Score: 0.48
Standard Deviation of Similarity: 0.20


In [None]:
import json
import pandas as pd
from google.colab.data_table import DataTable

# Read the evaluation report written by the evaluation cell
json_path = "/content/evaluation_summary.json"  # Modify the path as needed
with open(json_path, "r") as file:
    result_data = json.load(file)

# One row per analyzed program
program_list = result_data["programs"]
program_df = pd.DataFrame(program_list)

# Columns shown in the table
display_columns = ["program_name", "compilation_status", "run_status", "similarity_score"]

# Highest similarity first, limited to the 20 best programs
top_programs = (
    program_df
    .sort_values(by="similarity_score", ascending=False)
    .head(20)
)
top_programs_display = top_programs.loc[:, display_columns]

# Render the selection as an interactive Colab table (last expression = cell output)
DataTable(top_programs_display)


Unnamed: 0,program_name,compilation_status,run_status,similarity_score
32,code33,Success,Success,0.700422
9,code10,Success,Success,0.699784
7,code8,Success,Success,0.697802
38,code39,Success,Success,0.68984
17,code18,Success,Success,0.689157
8,code9,Success,Success,0.674312
40,code41,Success,Success,0.663793
29,code30,Success,Success,0.650246
21,code22,Success,Success,0.635922
35,code36,Success,Success,0.62954
