In [71]:
import os
import re
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import difflib
import json


In [54]:
def read_csv_file(file_name: str) -> pd.DataFrame:
    """
    Load a CSV file into a DataFrame, printing the resolved path and the shape.

    Parameters
    ----------
    file_name : str
        Path of the CSV file. When ``__file__`` is defined (code run as a
        script/module) a relative path is taken relative to that file's
        directory; otherwise (e.g. in a notebook) it is passed to pandas as-is.

    Returns
    -------
    pd.DataFrame
        The parsed CSV contents.
    """
    if '__file__' in globals():
        # Join onto the parent directory instead of using Path.with_name():
        # with_name() raises ValueError for names containing path separators,
        # such as '../dataset/dataset_summary.csv'.
        csv_path = Path(__file__).parent / file_name
    else:
        csv_path = Path(file_name)
    print(f"Trying to read: {csv_path.resolve()}")
    df = pd.read_csv(csv_path)
    print('DataFrame shape:', df.shape)
    return df

In [55]:
# Load the dataset summary and the table that maps a file hash to its
# successful user patch (columns 'file_hash' and 'successful_patches' are used later).
df_all_data = read_csv_file('../dataset/dataset_summary.csv')
df_all_data_successful_patches = read_csv_file('../dataset/successfull_user_patches_mapping.csv')

Trying to read: /home/diogenes/pylingual_colaboration/pylingual_download/code/dataset/dataset_summary.csv
DataFrame shape: (294096, 9)
Trying to read: /home/diogenes/pylingual_colaboration/pylingual_download/code/dataset/successfull_user_patches_mapping.csv
DataFrame shape: (2065, 2)


In [56]:
# Rows whose error type is 'syntactic_error' and whose 'user_patches' flag is True.
df_user_patch_with_syntax_error = df_all_data.loc[
    (df_all_data['error_type'] == 'syntactic_error')
    & (df_all_data['user_patches'] == True)
]

In [57]:
# Size of the subset: syntactic-error rows flagged as having user patches.
df_user_patch_with_syntax_error.shape

(4828, 9)

In [58]:
# Keep only rows whose file hash also appears in the successful-patch mapping.
df_user_patch_with_syntax_error = df_user_patch_with_syntax_error.loc[
    df_user_patch_with_syntax_error['file_hash'].isin(
        df_all_data_successful_patches['file_hash'].tolist()
    )
]


In [59]:
# Size after restricting to hashes present in the successful-patch mapping.
df_user_patch_with_syntax_error.shape

(1152, 9)

In [60]:
# Preview the first rows of the filtered frame.
df_user_patch_with_syntax_error.head()

Unnamed: 0,file_hash,equivalence,error_type,syntactic_error_word,syntactic_error_message,precessed_error_message,syntactic_error_description,user_patches,semantic_error_lines
22,1b071182ed192b46c3663a602507bca58c08890a1a6a0f...,False,syntactic_error,SyntaxError,'{' was never closed,'{' was never closed,"File ""/decompiler_workspace/1b071182ed192b46c3...",True,['***<module>.getip: Failure: Different byteco...
158,11b1becb1c340a39803513371a438fe2568976dbaa1364...,False,syntactic_error,SyntaxError,invalid syntax,invalid syntax,"File ""/decompiler_workspace/11b1becb1c340a3980...",True,['***<module>.console: Failure: Different byte...
214,e25c38320dba6e5c7979d9818b3d82ccc7f0c0938f5b1b...,False,syntactic_error,SyntaxError,EOL while scanning string literal,eol while scanning string literal,"File ""/decompiler_workspace/e25c38320dba6e5c79...",True,"['***<module>: Failure: Compilation Error', '*..."
1044,3127c51d682432be364e4374ad2c5baacf8ff0c9935b2e...,False,syntactic_error,SyntaxError,expected 'except' or 'finally' block,expected 'except' or 'finally' block,"File ""/decompiler_workspace/3127c51d682432be36...",True,"['***<module>: Failure: Compilation Error', '*..."
1287,37406cf3f9b0fb17f909d1a77c862541d8c9b1daacdae7...,False,syntactic_error,IndentationError,unexpected indent,unexpected indent,Sorry: IndentationError: unexpected indent (in...,True,['***<module>.XBOX: Failure detected at line n...


In [61]:
# Workspace root; the loop below reads <BASE_DIR>/<file_hash>/decompiler_output
# and <BASE_DIR>/<file_hash>/user_patches/<patch_hash>/source.py.
BASE_DIR = Path("../../decompiler_workspace")

In [66]:
def extract_hunks(old_file_path, new_file_path):
    """
    Diff two text files and return the changed lines grouped by unified-diff hunk.

    Parameters
    ----------
    old_file_path, new_file_path : str or path-like
        Files to compare. Both are read as UTF-8.

    Returns
    -------
    list[dict]
        One dict per hunk, in diff order, with keys:

        - ``"old_range"``: ``(first, last)`` line numbers taken from the hunk
          header for the old file (``last == first - 1`` when the side is empty)
        - ``"new_range"``: same for the new file
        - ``"old_code"``: removed lines, leading ``-`` stripped, line ending kept
        - ``"new_code"``: added lines, leading ``+`` stripped, line ending kept

        Context lines are not stored. Identical files yield an empty list.
    """
    with open(old_file_path, 'r', encoding='utf-8') as f:
        old_lines = f.readlines()
    with open(new_file_path, 'r', encoding='utf-8') as f:
        new_lines = f.readlines()

    # lineterm="" keeps the generated control lines (---, +++, @@) free of a
    # trailing newline; content lines keep whatever ending readlines() returned.
    diff = difflib.unified_diff(
        old_lines, new_lines,
        lineterm=""
    )

    hunks = []
    current_hunk = None

    # difflib omits ",<len>" when a side of the hunk spans exactly one line
    # (e.g. "@@ -1 +1 @@"), so the length groups must be optional.
    hunk_header_pattern = re.compile(
        r"@@ -(?P<old_start>\d+)(?:,(?P<old_len>\d+))? "
        r"\+(?P<new_start>\d+)(?:,(?P<new_len>\d+))? @@"
    )

    for line in diff:
        match = hunk_header_pattern.match(line)
        if match:
            # A new header closes the hunk collected so far.
            if current_hunk:
                hunks.append(current_hunk)

            old_start = int(match.group("old_start"))
            new_start = int(match.group("new_start"))
            # A missing length means a single line; "0" is a non-empty string,
            # so an explicit zero length is preserved by the `or`.
            old_len = int(match.group("old_len") or 1)
            new_len = int(match.group("new_len") or 1)

            current_hunk = {
                "old_range": (old_start, old_start + old_len - 1),
                "new_range": (new_start, new_start + new_len - 1),
                "old_code": [],
                "new_code": [],
            }

        elif current_hunk:
            # The "---"/"+++" file headers come before the first hunk and are
            # skipped because current_hunk is still None at that point.
            if line.startswith("-"):
                current_hunk["old_code"].append(line[1:])
            elif line.startswith("+"):
                current_hunk["new_code"].append(line[1:])
            # Anything else is an unchanged context line and is ignored.

    if current_hunk:
        hunks.append(current_hunk)

    return hunks


In [67]:
from typing import Optional, Tuple
INDENTED_RE = re.compile(r"^indented_(\d+)(?:\.[^.]+)?$")  # matches indented_12 or indented_12.py

def highest_indented_file(
    directory: Path | str,
    pattern: str = "indented_*"  # change to "indented_*.py" to require .py
) -> Optional[Tuple[Path, int]]:
    """
    Return (path, idx) for the highest-numbered file named like 'indented_{idx}[.ext]'
    in the given directory. Returns None if none found.
    """
    directory = Path(directory)
    best: Optional[Tuple[Path, int]] = None

    for p in directory.glob(pattern):
        if not p.is_file():
            continue
        m = INDENTED_RE.match(p.name)
        if not m:
            continue
        n = int(m.group(1))
        if best is None or n > best[1]:
            best = (p, n)
    return best

In [72]:
def serialize_hunks(hunks):
    """
    Build JSON-friendly copies of the hunk dicts.

    The "old_range"/"new_range" values are converted to lists; the
    "old_code"/"new_code" values are passed through unchanged (same objects).
    """
    return [
        {
            "old_range": list(hunk["old_range"]),  # tuple -> list
            "new_range": list(hunk["new_range"]),
            "old_code": hunk["old_code"],
            "new_code": hunk["new_code"],
        }
        for hunk in hunks
    ]


In [None]:
# For every remaining file hash, diff the highest-numbered decompiler output
# against the source of its successful user patch and store the hunks as JSON.
for file_hash in df_user_patch_with_syntax_error['file_hash']:
    hash_dir = BASE_DIR / file_hash

    res = highest_indented_file(hash_dir / "decompiler_output", pattern="indented_*.py")
    if res is None:
        # No indented_<n>.py output to compare against.
        continue
    old_file_path = res[0]

    # First patch hash listed for this file in the mapping table.
    patch_rows = df_all_data_successful_patches[
        df_all_data_successful_patches['file_hash'] == file_hash
    ]
    successfull_patch_hash = patch_rows['successful_patches'].iloc[0]

    new_file_path = hash_dir / "user_patches" / successfull_patch_hash / "source.py"
    if not (old_file_path.exists() and new_file_path.exists()):
        continue

    print(f"Processing file_hash={file_hash}, patch_hash={successfull_patch_hash}")
    diffs = extract_hunks(str(old_file_path), str(new_file_path))
    if not diffs:
        # Nothing changed between the two files; no JSON is written.
        continue

    # One JSON file per file hash, overwritten on every run.
    diff_json_path = hash_dir / "user_patches" / "source_diffs.json"
    with open(diff_json_path, 'w', encoding='utf-8') as f:
        json.dump(serialize_hunks(diffs), f, indent=4, ensure_ascii=False)