In [141]:
import os
import re
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import difflib
import json


In [142]:
def read_csv_file(file_name: str) -> pd.DataFrame:
    """Load a CSV file into a DataFrame, printing the resolved path and shape.

    When ``__file__`` is defined (the code runs as a script/module) a relative
    ``file_name`` is resolved against that file's directory; otherwise (e.g. in
    a notebook kernel) it is resolved against the current working directory.

    Args:
        file_name: Path of the CSV file. May contain directory components,
            e.g. ``../dataset/x.csv``.

    Returns:
        The DataFrame parsed by ``pandas.read_csv``.
    """
    if '__file__' in globals():
        # Path.with_name() raises ValueError for names that contain a path
        # separator, so join onto the containing directory instead.
        csv_path = Path(__file__).parent / file_name
    else:
        csv_path = Path(file_name)
    print(f"Trying to read: {csv_path.resolve()}")
    df = pd.read_csv(csv_path)
    print('DataFrame shape:', df.shape)
    return df

In [143]:
# Load the dataset summary first, then the file-hash → successful-patch mapping.
df_all_data, df_all_data_successful_patches = (
    read_csv_file(csv_name)
    for csv_name in (
        '../dataset/dataset_summary.csv',
        '../dataset/successfull_user_patches_mapping.csv',
    )
)

Trying to read: /home/diogenes/pylingual_colaboration/pylingual_download/code/dataset/dataset_summary.csv
DataFrame shape: (294096, 9)
Trying to read: /home/diogenes/pylingual_colaboration/pylingual_download/code/dataset/successfull_user_patches_mapping.csv
DataFrame shape: (2065, 2)


In [144]:
# Rows whose error is syntactic and that are flagged as having user patches.
is_syntactic_error = df_all_data['error_type'] == 'syntactic_error'
has_user_patches = df_all_data['user_patches'] == True
df_user_patch_with_syntax_error = df_all_data[is_syntactic_error & has_user_patches]

In [145]:
# (rows, columns) of the syntactic-error rows that have user patches.
df_user_patch_with_syntax_error.shape

(4828, 9)

In [146]:
# Keep only the files that appear in the successful-patch mapping.
# .copy() makes the result an independent frame, so the column assignment and
# column drop performed on it afterwards do not raise SettingWithCopyWarning.
df_user_patch_with_syntax_error = df_user_patch_with_syntax_error[
    df_user_patch_with_syntax_error['file_hash'].isin(
        df_all_data_successful_patches['file_hash']
    )
].copy()


In [147]:
# (rows, columns) after restricting to files with a successful patch.
df_user_patch_with_syntax_error.shape

(1152, 9)

In [148]:
# Preview the first rows of the filtered frame.
df_user_patch_with_syntax_error.head()

Unnamed: 0,file_hash,equivalence,error_type,syntactic_error_word,syntactic_error_message,precessed_error_message,syntactic_error_description,user_patches,semantic_error_lines
22,1b071182ed192b46c3663a602507bca58c08890a1a6a0f...,False,syntactic_error,SyntaxError,'{' was never closed,'{' was never closed,"File ""/decompiler_workspace/1b071182ed192b46c3...",True,['***<module>.getip: Failure: Different byteco...
158,11b1becb1c340a39803513371a438fe2568976dbaa1364...,False,syntactic_error,SyntaxError,invalid syntax,invalid syntax,"File ""/decompiler_workspace/11b1becb1c340a3980...",True,['***<module>.console: Failure: Different byte...
214,e25c38320dba6e5c7979d9818b3d82ccc7f0c0938f5b1b...,False,syntactic_error,SyntaxError,EOL while scanning string literal,eol while scanning string literal,"File ""/decompiler_workspace/e25c38320dba6e5c79...",True,"['***<module>: Failure: Compilation Error', '*..."
1044,3127c51d682432be364e4374ad2c5baacf8ff0c9935b2e...,False,syntactic_error,SyntaxError,expected 'except' or 'finally' block,expected 'except' or 'finally' block,"File ""/decompiler_workspace/3127c51d682432be36...",True,"['***<module>: Failure: Compilation Error', '*..."
1287,37406cf3f9b0fb17f909d1a77c862541d8c9b1daacdae7...,False,syntactic_error,IndentationError,unexpected indent,unexpected indent,Sorry: IndentationError: unexpected indent (in...,True,['***<module>.XBOX: Failure detected at line n...


In [149]:
# Workspace root. The cells below read BASE_DIR/<file_hash>/decompiler_output
# and BASE_DIR/<file_hash>/user_patches. The path is relative to the current
# working directory.
BASE_DIR = Path("../../decompiler_workspace")

In [150]:
def extract_code_block(filepath: str, line_range: tuple):
    """Return the text of a 1-based, inclusive line span of a UTF-8 file.

    Args:
        filepath: File to read.
        line_range: ``(first_line, last_line)`` pair, both 1-based.

    Returns:
        The selected lines joined into one string, line endings included.
        A span that lies beyond the end of the file is truncated.
    """
    first_line, last_line = line_range
    with open(filepath, "r", encoding="utf-8") as handle:
        every_line = list(handle)
    # Shift the 1-based start to a 0-based index; the inclusive end already
    # equals the exclusive slice stop.
    selected = every_line[first_line - 1 : last_line]
    return "".join(selected)

In [151]:
def extract_hunks(old_path, new_path):
    """Diff two UTF-8 text files and return the unified-diff hunks.

    Args:
        old_path: Path of the original file.
        new_path: Path of the modified file.

    Returns:
        A list with one dict per hunk, in diff order, with keys:
          - "old_range": (first, last) 1-based inclusive line span in the old file
          - "new_range": (first, last) 1-based inclusive line span in the new file
          - "old_code": removed lines, without the leading "-"
          - "new_code": added lines, without the leading "+"
        Identical files yield an empty list.
    """
    def read_lines(path):
        # Iterating the file breaks lines only at newlines, which gives the
        # same line numbers as the readlines() call in extract_code_block.
        # str.splitlines() would also break on form feeds and other
        # separators and shift the numbering.
        with open(path, "r", encoding="utf-8") as f:
            return [line.rstrip("\n") for line in f]

    old_lines = read_lines(old_path)
    new_lines = read_lines(new_path)

    diff = difflib.unified_diff(old_lines, new_lines, lineterm="")

    # difflib omits ",<len>" from a range whose length is exactly 1
    # (e.g. "@@ -1 +1,2 @@"), so the length groups must be optional.
    hunk_header = re.compile(
        r"@@ -(?P<old_start>\d+)(?:,(?P<old_len>\d+))? "
        r"\+(?P<new_start>\d+)(?:,(?P<new_len>\d+))? @@"
    )

    hunks = []
    current = None

    for line in diff:
        m = hunk_header.match(line)
        if m:
            if current is not None:
                hunks.append(current)

            old_start = int(m.group("old_start"))
            old_len = int(m.group("old_len") or 1)  # omitted length means 1
            new_start = int(m.group("new_start"))
            new_len = int(m.group("new_len") or 1)

            current = {
                "old_range": (old_start, old_start + old_len - 1),
                "new_range": (new_start, new_start + new_len - 1),
                "old_code": [],
                "new_code": [],
            }
            continue

        # The "---"/"+++" file header lines precede the first hunk header and
        # are skipped because no hunk is open yet.
        if current is not None:
            if line.startswith("-"):
                current["old_code"].append(line[1:])
            elif line.startswith("+"):
                current["new_code"].append(line[1:])

    if current is not None:
        hunks.append(current)

    return hunks

In [152]:
from typing import Optional, Tuple
INDENTED_RE = re.compile(r"^indented_(\d+)(?:\.[^.]+)?$")  # matches indented_12 or indented_12.py

def highest_indented_file(
    directory: Path | str,
    pattern: str = "indented_*"  # change to "indented_*.py" to require .py
) -> Optional[Tuple[Path, int]]:
    """
    Return (path, idx) for the highest-numbered file named like 'indented_{idx}[.ext]'
    in the given directory. Returns None if none found.
    """
    directory = Path(directory)
    best: Optional[Tuple[Path, int]] = None

    for p in directory.glob(pattern):
        if not p.is_file():
            continue
        m = INDENTED_RE.match(p.name)
        if not m:
            continue
        n = int(m.group(1))
        if best is None or n > best[1]:
            best = (p, n)
    return best

In [153]:
def serialize_hunks(hunks, old_file_path=None, new_file_path=None):
    """
    Turn diff hunks into plain dictionaries that json.dump can write.

    Line ranges become lists and the diff line lists are passed through
    unchanged. When both file paths are given, each entry also carries the
    full text of the hunk's line span in the old and in the new file.
    """
    with_full_code = bool(old_file_path and new_file_path)

    def to_entry(hunk):
        record = {
            "old_range": [*hunk["old_range"]],
            "new_range": [*hunk["new_range"]],
            "old_code_diff": hunk["old_code"],
            "new_code_diff": hunk["new_code"],
        }
        if with_full_code:
            # Full code blocks are only available when both files are known.
            record["old_code_full"] = extract_code_block(old_file_path, hunk["old_range"])
            record["new_code_full"] = extract_code_block(new_file_path, hunk["new_range"])
        return record

    return [to_entry(hunk) for hunk in hunks]


In [154]:
def process_files(df_user_patch_with_syntax_error, df_all_data_successful_patches, BASE_DIR, max_files=1):
    """Collect the diff hunks between decompiler output and the successful patch.

    For each row, the "old" file is the highest-numbered ``indented_*.py`` in
    ``BASE_DIR/<file_hash>/decompiler_output`` and the "new" file is
    ``BASE_DIR/<file_hash>/user_patches/<patch>/source.py``. Rows for which
    either file is missing are skipped.

    Args:
        df_user_patch_with_syntax_error: Frame with a ``file_hash`` column.
        df_all_data_successful_patches: Frame with ``file_hash`` and
            ``successful_patches`` columns; the first matching row is used.
        BASE_DIR: ``Path`` of the workspace root.
        max_files: Stop after this many files have produced a result. The
            default of 1 keeps the previous behaviour of returning after the
            first processed file; pass ``None`` to process every row.

    Returns:
        A list of ``{"file_hash", "patch_hash", "chunks"}`` dicts, where
        ``chunks`` holds one dict per hunk with the ranges, the diff lines and
        the full code of each range.

    Raises:
        IndexError: If a ``file_hash`` has no row in
            ``df_all_data_successful_patches``.
    """
    results = []

    for _, row in df_user_patch_with_syntax_error.iterrows():
        if max_files is not None and len(results) >= max_files:
            break

        file_hash = row["file_hash"]

        old_dir = BASE_DIR / file_hash / "decompiler_output"
        res = highest_indented_file(old_dir, pattern="indented_*.py")
        if res is None:
            continue
        old_file_path, _ = res
        success_patch = df_all_data_successful_patches.loc[
            df_all_data_successful_patches["file_hash"] == file_hash,
            "successful_patches"
        ].iloc[0]

        new_file_path = BASE_DIR / file_hash / "user_patches" / success_patch / "source.py"

        if not (old_file_path.exists() and new_file_path.exists()):
            continue

        hunks = extract_hunks(str(old_file_path), str(new_file_path))

        enriched = [
            {
                "old_range": h["old_range"],
                "new_range": h["new_range"],
                "old_code_diff": h["old_code"],
                "new_code_diff": h["new_code"],
                "old_code_full": extract_code_block(str(old_file_path), h["old_range"]),
                "new_code_full": extract_code_block(str(new_file_path), h["new_range"]),
            }
            for h in hunks
        ]

        results.append({
            "file_hash": file_hash,
            "patch_hash": success_patch,
            "chunks": enriched,
        })

    return results

In [155]:
# For every file, diff the highest-numbered indented_*.py of the decompiler
# output against the successful user patch and store the hunks as
# BASE_DIR/<file_hash>/user_patches/source_diffs.json.
for _, row in df_user_patch_with_syntax_error.iterrows():
    file_hash = row['file_hash']

    # Locate old file
    old_dir = BASE_DIR / file_hash / "decompiler_output"
    res = highest_indented_file(old_dir, pattern="indented_*.py")
    if res is None:
        continue

    old_file_path, _ = res

    # Find matching successful patch
    successfull_patch_hash = df_all_data_successful_patches[
        df_all_data_successful_patches['file_hash'] == file_hash
    ]['successful_patches'].iloc[0]

    new_file_path = BASE_DIR / file_hash / "user_patches" / successfull_patch_hash / "source.py"

    # Check both files exist
    if not (old_file_path.exists() and new_file_path.exists()):
        continue

    print(f"Processing file_hash={file_hash}, patch_hash={successfull_patch_hash}")

    # Extract diff hunks
    diffs = extract_hunks(str(old_file_path), str(new_file_path))

    if len(diffs) == 0:
        continue

    # Serialize before opening the output file, so that a failure here cannot
    # leave an empty or truncated JSON file behind.
    serialized = serialize_hunks(diffs, str(old_file_path), str(new_file_path))

    output_path = BASE_DIR / file_hash / "user_patches" / "source_diffs.json"

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(serialized, f, indent=4, ensure_ascii=False)


Processing file_hash=1b071182ed192b46c3663a602507bca58c08890a1a6a0fffd52f3787123d49f8, patch_hash=090083fc164802921b7a3f2f831630a7688419d21500e1a05283b1861178c1ee
Processing file_hash=11b1becb1c340a39803513371a438fe2568976dbaa1364f52d20225b987ef5d8, patch_hash=0c6c22354361d136b981a91f3b4fead472dd05781e5857d14ce5324e390b96d3
Processing file_hash=e25c38320dba6e5c7979d9818b3d82ccc7f0c0938f5b1b8cae4576c0f66e9750, patch_hash=d847e3b74fb8cf82a1752ca272c4b57e4f3994afcbf5f89c9e9f9c37e6faf368
Processing file_hash=3127c51d682432be364e4374ad2c5baacf8ff0c9935b2eb9394ad8b678485272, patch_hash=ef06bbb5a0cb33dab4a7f217fbb41cd1cc45dff85c89e193c2e3f15e1a2da763
Processing file_hash=37406cf3f9b0fb17f909d1a77c862541d8c9b1daacdae7010facef916b50637e, patch_hash=3a7a87210567ebdb0d7b26b27c4c1d97172154669bfa323b052f5072e0d07555
Processing file_hash=59516f7c4c7059f5566ee3e12896bfc654b4468452e415845eee04c388243149, patch_hash=2743c570ea75991e334799ee654bf02f586df86344e7762c485a379d2a3701f1
Processing file_hash=6

In [156]:
def extract_line_number(error_msg: str):
    """Return the first line number mentioned in an error message.

    Args:
        error_msg: Error text containing a fragment like ``line 131``.

    Returns:
        The number following the first ``line`` occurrence as an ``int``, or
        ``None`` when the message has no such fragment or is not a string.
    """
    # A non-string value (e.g. NaN for a missing description) would make
    # re.search raise TypeError; treat it as "no line number".
    if not isinstance(error_msg, str):
        return None
    # Find patterns like: "line 131"
    m = re.search(r"line\s+(\d+)", error_msg)
    return int(m.group(1)) if m else None

In [157]:
# Parse the line number out of every error description into its own column.
error_descriptions = df_user_patch_with_syntax_error['syntactic_error_description']
df_user_patch_with_syntax_error['error_line_number'] = error_descriptions.apply(extract_line_number)

In [158]:
# Number of rows for which no line number could be extracted.
df_user_patch_with_syntax_error['error_line_number'].isnull().sum()

np.int64(0)

In [159]:
# Remove the columns that the remaining cells do not use. The result is
# reassigned rather than dropped with inplace=True: an in-place drop on a
# frame obtained by filtering another frame raises SettingWithCopyWarning.
df_user_patch_with_syntax_error = df_user_patch_with_syntax_error.drop(
    columns=['equivalence', 'error_type', 'user_patches', 'semantic_error_lines']
)

In [160]:
# Preview the frame after adding error_line_number and dropping columns.
df_user_patch_with_syntax_error.head()

Unnamed: 0,file_hash,syntactic_error_word,syntactic_error_message,precessed_error_message,syntactic_error_description,error_line_number
22,1b071182ed192b46c3663a602507bca58c08890a1a6a0f...,SyntaxError,'{' was never closed,'{' was never closed,"File ""/decompiler_workspace/1b071182ed192b46c3...",68
158,11b1becb1c340a39803513371a438fe2568976dbaa1364...,SyntaxError,invalid syntax,invalid syntax,"File ""/decompiler_workspace/11b1becb1c340a3980...",101
214,e25c38320dba6e5c7979d9818b3d82ccc7f0c0938f5b1b...,SyntaxError,EOL while scanning string literal,eol while scanning string literal,"File ""/decompiler_workspace/e25c38320dba6e5c79...",33
1044,3127c51d682432be364e4374ad2c5baacf8ff0c9935b2e...,SyntaxError,expected 'except' or 'finally' block,expected 'except' or 'finally' block,"File ""/decompiler_workspace/3127c51d682432be36...",74
1287,37406cf3f9b0fb17f909d1a77c862541d8c9b1daacdae7...,IndentationError,unexpected indent,unexpected indent,Sorry: IndentationError: unexpected indent (in...,93


In [161]:
# For every file, find the first stored hunk whose old-file line span contains
# the reported error line, and remember that hunk's before/after code.
matched_old_code = {}
matched_new_code = {}

for index, row in df_user_patch_with_syntax_error.iterrows():
    file_hash = row['file_hash']
    diff_json_path = BASE_DIR / file_hash / "user_patches" / "source_diffs.json"

    # Skip rows without a stored diff or without a parsed line number
    # (int() below would fail on a missing value).
    if not diff_json_path.exists() or pd.isna(row['error_line_number']):
        continue

    with open(diff_json_path, "r", encoding="utf-8") as f:
        diff_hunks = json.load(f)

    error_line = int(row['error_line_number'])

    for hunk in diff_hunks:
        old_start, old_end = hunk['old_range']

        if old_start <= error_line <= old_end:
            matched_old_code[index] = hunk['old_code_full']
            matched_new_code[index] = hunk['new_code_full']
            break   # found the matching hunk → stop scanning

# Assign both columns in one step so that they always exist, even when no row
# matched; rows without a matching hunk hold NaN.
df_user_patch_with_syntax_error["old_code_full"] = (
    pd.Series(matched_old_code, dtype="object").reindex(df_user_patch_with_syntax_error.index)
)
df_user_patch_with_syntax_error["new_code_full"] = (
    pd.Series(matched_new_code, dtype="object").reindex(df_user_patch_with_syntax_error.index)
)


In [164]:
# (rows, columns) of the frame that is written to CSV in the next cell.
df_user_patch_with_syntax_error.shape

(1152, 8)

In [165]:
# Write the frame to CSV; the DataFrame index is not included in the file.
finetune_dataset_csv = '../dataset/syntax_error_user_patches_finetune_dataset.csv'
df_user_patch_with_syntax_error.to_csv(finetune_dataset_csv, index=False)