In [1]:
import os
import re

In [None]:
files = []
directory = "../decompiler_workspace"

In [3]:
def list_top_level_subdirectories(root_dir):
    # List only the top-level subdirectories
    for entry in os.listdir(root_dir):
        full_path = os.path.join(root_dir, entry)
        if os.path.isdir(full_path):
            files.append(entry)

In [4]:
def has_user_patches_subdir(folder_path):
    return os.path.isdir(os.path.join(folder_path, 'user_patches'))

In [5]:
# Ask user for a directory path
if os.path.isdir(directory):
    # print(f"Subdirectories inside '{directory}':\n")
    list_top_level_subdirectories(directory)
else:
    print("Invalid directory path.")

In [6]:
def get_error_word_message_from_content(filepath):
    FILE_LINE_RE = re.compile(r'^\s*File "([^"]+)", line (\d+)(?:, in (.+))?')
    ERROR_LINE_RE = re.compile(r'^\s*(\w+(?:Error|Exception))(?:\s*:\s*(.*))?$')
    SORRY_LINE_RE = re.compile(r'^\s*Sorry:\s*(\w+(?:Error|Exception))\s*:\s*(.*?)\s*\(([^,]+),\s*line\s*(\d+)\)\s*$')
    error_word = []
    messages = []
    last_file = None
    last_line = None
    last_ctx = None
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            s = line.rstrip("\n")
            m_file = FILE_LINE_RE.match(s)
            if m_file:
                last_file = m_file.group(1)
                try:
                    last_line = int(m_file.group(2))
                except ValueError:
                    last_line = None
                last_ctx = m_file.group(3)
                continue
            m_err = ERROR_LINE_RE.match(s)
            if m_err:
                error_word.append(m_err.group(1))
                messages.append((m_err.group(2) or "").strip())
                last_file = None
                last_line = None
                last_ctx = None
                break
            # 1) Handle 'Sorry:' one-liners immediately
            m_sorry = SORRY_LINE_RE.match(line)
            if m_sorry:
                error_word.append(m_sorry.group(1))
                messages.append((m_sorry.group(2) or "").strip())
                last_file = None
                last_line = None
                last_ctx = None
                break
            
    error_word = error_word[0] if len(error_word) > 0 else None
    message = messages[0] if len(messages) > 0 else None
    # print(f"File: {filepath}, Error Word: {error_word}, Message: {message}")
    return error_word, message


In [7]:
def search_success_equal_line(filepath,content):
    has_module_line = False
    error_word, error_message = get_error_word_message_from_content(filepath)
    # print(error_word, error_message)
    for line in content.splitlines():
        if line.startswith('***<module>'):
            has_module_line = True
            break
    return (error_word, error_message, has_module_line)

In [8]:
def get_error_description_from_content(content, error_word):
    import re
    lines = content.splitlines()
    description_lines = []
    for line in lines:
        description_lines.append(line)
        if error_word and error_word in line:
            break
    description = ' '.join(description_lines)
    description = re.sub(r'\s+', ' ', description)
    return description.strip()

In [9]:
def get_error_message_after_word(content, error_word):
    import re
    for line in content.splitlines():
        if error_word and error_word in line:
            idx = line.find(error_word)
            if idx != -1:
                # Remove colon and whitespace after error word
                after = line[idx + len(error_word):]
                after = re.sub(r'^\s*:\s*', '', after)
                return after.strip()
    return None

In [10]:
def get_module_lines(content):
    module_lines = []
    for line in content.splitlines():
        if '***<module>' in line:
            module_lines.append(line)
    return module_lines

In [11]:
def clean_error_message(error_message):
    import re
    if not error_message:
        return None
    # Convert to lowercase
    cleaned_message = error_message.lower()
    # Remove leading and trailing whitespace
    cleaned_message = cleaned_message.strip()
    # Remove contents inside parentheses (including parentheses)
    cleaned_message = re.sub(r'\(.*?\)', '', cleaned_message)
    cleaned_message = cleaned_message.strip()
    return cleaned_message

In [None]:
import pandas as pd

dataframe = pd.DataFrame()
def parse_highest_equivalence_report(subdir):
    file_name = subdir
    user_patches = has_user_patches_subdir(os.path.join("../decompiler_workspace", subdir))
    subdir_path = os.path.join("../decompiler_workspace", subdir, "decompiler_output")
    pattern = re.compile(r"equivalence_report_(\d+)\.txt")
    max_num = -1
    max_file = None
    
    if not os.path.isdir(subdir_path):
        # print(f"Directory not found: {subdir_path}")
        return
    for fname in os.listdir(subdir_path):
        error_type = None
        match = pattern.match(fname)
        if match:
            num = int(match.group(1))
            if num > max_num:
                max_num = num
                max_file = fname
    if max_file:
        with open(os.path.join(subdir_path, max_file), "r") as f:
            content = f.read()
        error_word, error_message, has_module_line = search_success_equal_line(os.path.join(subdir_path, max_file), content)
        equivalence = not (error_word or has_module_line)
        if error_word:
            error_type = "syntactic_error"
        elif has_module_line and not error_word:
            error_type = "semantic_error"
        error_description = None
        if error_word:
            error_description = get_error_description_from_content(content, error_word)
            # if error_description:
            #     error_message = get_error_message_after_word(content, error_word)
        module_lines = get_module_lines(content)
        parsing_metadata = {"file_hash": file_name, "equivalence": equivalence, "error_type": error_type, "syntactic_error_word": error_word, "syntactic_error_message": error_message if error_message else None,  "precessed_error_message": clean_error_message (error_message) if error_message else None, "syntactic_error_description": error_description if error_word else None, "user_patches": user_patches, "semantic_error_lines": module_lines}
        new_df = pd.DataFrame([parsing_metadata])
        global dataframe
        dataframe = pd.concat([dataframe, new_df], ignore_index=True)

In [13]:
for file in files:
    parse_highest_equivalence_report(file)

In [14]:
dataframe

Unnamed: 0,file_hash,equivalence,error_type,syntactic_error_word,syntactic_error_message,precessed_error_message,syntactic_error_description,user_patches,semantic_error_lines
0,42c022a2ec74e489bd030de03d6136bef5ad4a6f248994...,True,,,,,,False,[]
1,3cff2c6ae2e1cd68d7482fe7899f6b06560bdea1ba9884...,False,syntactic_error,IndentationError,unexpected indent,unexpected indent,Sorry: IndentationError: unexpected indent (in...,True,[***<module>: Failure detected at line number ...
2,29bcf17560cf73d238cb88d698b3592ebe82e294ac4258...,True,,,,,,False,[]
3,bb8012635e13ebabe986b66d605cbc49af8e487bda654b...,False,syntactic_error,SyntaxError,invalid syntax,invalid syntax,"File ""/decompiler_workspace/bb8012635e13ebabe9...",True,[***<module>.TestTypingError.test_unknown_func...
4,2e09dff2125573b7e081a6c999f5891e7a9d1b6d0c01f5...,False,syntactic_error,SyntaxError,unterminated string literal (detected at line 78),unterminated string literal,"File ""/decompiler_workspace/2e09dff2125573b7e0...",False,[]
...,...,...,...,...,...,...,...,...,...
294091,baa6f65a2914e3a382bf4fc330a913f894f2c44bcdd946...,False,syntactic_error,SyntaxError,expected 'except' or 'finally' block,expected 'except' or 'finally' block,"File ""/decompiler_workspace/baa6f65a2914e3a382...",False,"[***<module>: Failure: Compilation Error, ***<..."
294092,eb7d245db118647fa19686380712bd4679d5fbdcb29449...,False,semantic_error,,,,,False,[***<module>.main: Failure detected at line nu...
294093,b4a2263a647ec8a7e2290dadc6f12c186da8b8b008613a...,True,,,,,,False,[]
294094,115f55bc80a38414aee6ba13e260dff0b25d805c7443a7...,False,syntactic_error,SyntaxError,invalid syntax,invalid syntax,"File ""/decompiler_workspace/115f55bc80a38414ae...",False,"[***<module>: Failure: Compilation Error, ***<..."


In [15]:
# Replace None with 'N/A' in the dataframe
dataframe = dataframe.fillna("None")

# Example usage after replacement
print(dataframe)

                                                file_hash  equivalence  \
0       42c022a2ec74e489bd030de03d6136bef5ad4a6f248994...         True   
1       3cff2c6ae2e1cd68d7482fe7899f6b06560bdea1ba9884...        False   
2       29bcf17560cf73d238cb88d698b3592ebe82e294ac4258...         True   
3       bb8012635e13ebabe986b66d605cbc49af8e487bda654b...        False   
4       2e09dff2125573b7e081a6c999f5891e7a9d1b6d0c01f5...        False   
...                                                   ...          ...   
294091  baa6f65a2914e3a382bf4fc330a913f894f2c44bcdd946...        False   
294092  eb7d245db118647fa19686380712bd4679d5fbdcb29449...        False   
294093  b4a2263a647ec8a7e2290dadc6f12c186da8b8b008613a...         True   
294094  115f55bc80a38414aee6ba13e260dff0b25d805c7443a7...        False   
294095  3af7bf15a5f2a1e7d4a53d239deb287c932316efe1f785...        False   

             error_type syntactic_error_word  \
0                  None                 None   
1       syntact

In [16]:
def summarize_dataframe(df):
    if df.empty:
        print("The dataframe is empty. No summaries to display.")
        return

    # Group by error type
    grouped = df.groupby("error_type").agg(
        total_data=("file_hash", "count"),
        user_patches=("user_patches", "sum"),
        equivalence=("equivalence", "sum")
    )

    print("Summary by Error Type:")
    print(grouped)

# Example usage
summarize_dataframe(dataframe)

Summary by Error Type:
                 total_data  user_patches  equivalence
error_type                                            
None                 168856            25       168856
semantic_error        44420          2318            0
syntactic_error       80820          4828            0


In [17]:
def save_dataframe_to_csv(df, file_path):
    try:
        df.to_csv(file_path, index=False)
        print(f"Dataframe successfully saved to {file_path}")
    except Exception as e:
        print(f"An error occurred while saving the dataframe: {e}")

In [18]:
save_dataframe_to_csv(dataframe, "dataset_summary.csv")

Dataframe successfully saved to dataset_summary.csv


In [19]:
files_df = pd.DataFrame(files, columns=['file_hash'])

In [20]:
# Find files in files_df that are not in dataframe and make it a DataFrame
files_not_processed_list = list(set(files_df['file_hash']) - set(dataframe['file_hash']))
files_not_processed_df = pd.DataFrame(files_not_processed_list, columns=['file_hash'])
print({"total_files": len(files), "files_processed": len(dataframe), "files_not_processed": len(files_not_processed_df)})

{'total_files': 305501, 'files_processed': 294096, 'files_not_processed': 11405}


In [21]:
import pandas as pd

cross_dataframe = pd.DataFrame()
def validate_missing_files(subdir):
    file_name = subdir
    subdir_path = os.path.join("decompiler_workspace", subdir, "decompiler_output")
    max_num = -1
    max_file = None
    report_found = False
    pattern = re.compile(r"equivalence_report_(\d+)\.txt")
    if not os.path.isdir(subdir_path):
        print(f"Directory not found: {subdir_path}")
        return
    else:
        print(f"Processing directory: {subdir_path}")
    for fname in os.listdir(subdir_path):
        error_type = None
        match = pattern.match(fname)
        if match:
            num = int(match.group(1))
            if num > max_num:
                max_num = num
                max_file = fname
    if max_file:
       cross_dataframe = pd.DataFrame([{"file_hash": file_name, "report_found": True}])
    else:
        print("No equivalence_report_{number}.txt file found.")

In [22]:
for file_hash in files_not_processed_df['file_hash']:
    validate_missing_files(file_hash)

Processing directory: decompiler_workspace/f8dfa2ad14ccff1b58d48e2383734c86bf93a0ff83dfd1bae6aa8c64e4739946/decompiler_output
No equivalence_report_{number}.txt file found.
Directory not found: decompiler_workspace/f01d745bc47ffa42fcb21c8f82adc2695c4b9ea967c37762fdaec94c5e241bd3/decompiler_output
Directory not found: decompiler_workspace/bf5503ef0101f01a97f7b1fac8414536250f4e4b149922367b9fb1e1a252ddf3/decompiler_output
Directory not found: decompiler_workspace/de52369e1e6690c6b0334cbd00e3a8439374b354f23dba486c89521ef8fa1f94/decompiler_output
Processing directory: decompiler_workspace/449a133fb4c7f40cad5d229176292dd2258f58807b323bf53f7f9347291fbb71/decompiler_output
No equivalence_report_{number}.txt file found.
Directory not found: decompiler_workspace/db2c1c43e7a3e3b0b83223e439b48b2d89111070d59773ebba73c570565091c6/decompiler_output
Directory not found: decompiler_workspace/5e7f261c6f4208bedf0ca3a077ef59cccb14d0db94b3b29bbfc0d47c27e71b62/decompiler_output
Directory not found: decompil

In [23]:
if(len(cross_dataframe)==0):
    print("No reports found for the missing files.")
else:
    print(cross_dataframe)

No reports found for the missing files.


In [24]:
user_patches_df = dataframe[(dataframe['user_patches'] == True)]
print(user_patches_df)

                                                file_hash  equivalence  \
1       3cff2c6ae2e1cd68d7482fe7899f6b06560bdea1ba9884...        False   
3       bb8012635e13ebabe986b66d605cbc49af8e487bda654b...        False   
22      1b071182ed192b46c3663a602507bca58c08890a1a6a0f...        False   
158     11b1becb1c340a39803513371a438fe2568976dbaa1364...        False   
185     fb220588ba8326ea6ab280a69757f69a7188edf5da1108...        False   
...                                                   ...          ...   
293820  cd1ff614cf27d8e231563312a0a701452d51e566ad45f6...        False   
293882  d4bd2da9c4f235cf10cb398a1677b96b6308f91bb6a518...        False   
293961  76b97d50c7b6eb3b66e8eba6e93eec261bd1c63866d15a...        False   
294036  807053a2b8989d887d093dad92ee0b622ac21a8b4dd9c4...        False   
294075  dea86a39d440448626a63f860412c80da11bd6500c2c7f...        False   

             error_type syntactic_error_word  \
1       syntactic_error     IndentationError   
3       syntact

In [None]:
import pandas as pd

user_patch_dataframe = pd.DataFrame()
def parse_highest_equivalence_report_user_patches(subdir):
    file_name = subdir
    subdir_path = os.path.join("../decompiler_workspace", subdir, "user_patches")
    if not os.path.isdir(subdir_path):
        print(f"User patches directory not found: {subdir_path}")
        return
    user_patch_paths = os.listdir(subdir_path)
    for user_patch in user_patch_paths:
        patch_path = os.path.join(subdir_path, user_patch)
        if os.path.isfile(patch_path) and user_patch == "successful_patches.txt":
            with open(patch_path, "r") as f:
                lines = f.readlines()
                patch_list = [line.strip() for line in lines if line.strip()]
                patch_list_str = ", ".join(patch_list)
                new_df = pd.DataFrame([{"file_hash": file_name, "successful_patches": patch_list_str}])
                global user_patch_dataframe
                user_patch_dataframe = pd.concat([user_patch_dataframe, new_df], ignore_index=True)

In [26]:
for _, file in user_patches_df.iterrows():
    parse_highest_equivalence_report_user_patches(file["file_hash"])

In [27]:
user_patch_dataframe

Unnamed: 0,file_hash,successful_patches
0,1b071182ed192b46c3663a602507bca58c08890a1a6a0f...,090083fc164802921b7a3f2f831630a7688419d21500e1...
1,11b1becb1c340a39803513371a438fe2568976dbaa1364...,0c6c22354361d136b981a91f3b4fead472dd05781e5857...
2,f6a25baadda3fa3fdaae3edc080251abeb540e10b97524...,9f5ef2e81fbb379fea38d9caea6dc197a249449a5ffb75...
3,e25c38320dba6e5c7979d9818b3d82ccc7f0c0938f5b1b...,d847e3b74fb8cf82a1752ca272c4b57e4f3994afcbf5f8...
4,93c7ea3a0911ff7bfb80d6017b07715e000d634e2967d6...,6e22671bdd9cad2d1b3dbdf411ad48b47242a7421d53f7...
...,...,...
2060,f2268e6e160be5d043faaf33fbce932e09a08a9c79219b...,9d2fcf88663ee091c12558fa6b2d1c7d9307c78ae20ff5...
2061,f9d844fc0e5ab4caeb5a729a59217a9888b59ce607f759...,8bc2fcbffd7821690f546d2de15f48fed9d77032772b0c...
2062,76b97d50c7b6eb3b66e8eba6e93eec261bd1c63866d15a...,4e628f2ebc0b1afbc4e8b57232d4d56de0de03be582c67...
2063,807053a2b8989d887d093dad92ee0b622ac21a8b4dd9c4...,0f03fb861d35b3279259265e1b83309a22bbe25f21ab1d...


In [28]:
user_patch_dataframe.to_csv("successfull_user_patches_mapping.csv", index=False)

In [29]:
def summarize_dataframe(df):
    if df.empty:
        print("The dataframe is empty. No summaries to display.")
        return

    # Group by error type
    grouped = df.groupby("precessed_error_message").agg(
        total_data=("file_hash", "count"),
    )

    print("Summary by precessed_error_message:")
    print(grouped)

# Example usage
summarize_dataframe(dataframe)

Summary by precessed_error_message:
                                           total_data
precessed_error_message                              
'(' was never closed                             1722
':' expected after dictionary key                 314
'[' was never closed                             1481
'async for' outside async function                  1
'async with' outside async function                 2
...                                               ...
unmatched ']'                                     201
unmatched '}'                                      94
unterminated f-string literal                      42
unterminated string literal                      5386
unterminated triple-quoted string literal           5

[420 rows x 1 columns]


In [30]:
# Get unique values of syntactic_error_word
unique_error_words = dataframe["syntactic_error_word"].unique()
print("Unique syntactic_error_word values:", len(list(unique_error_words)))
module_error_count = sum(1 for word in unique_error_words if str(word).startswith("<module>"))
print(f"Entries containing <module>: {module_error_count}")
dot_not_module_count = sum(1 for word in unique_error_words if "." in str(word) and "<module>" not in str(word))
print(f"Entries containing '.' but not '<module>': {dot_not_module_count}")

Unique syntactic_error_word values: 4
Entries containing <module>: 0
Entries containing '.' but not '<module>': 0


In [31]:
unique_error_words = dataframe["syntactic_error_word"].unique()
print("Unique syntactic_error_word values:", list(unique_error_words))

Unique syntactic_error_word values: ['None', 'IndentationError', 'SyntaxError', 'PermissionError']
