In [1]:
import re
from pathlib import Path
import pandas as pd

In [2]:
# Compiled patterns for the lines of interest in the edit-distance log.

# Header line of a block; captures the hexadecimal hash.
HASH_LINE_RE = re.compile(r"Comparing files for hash:\s+([0-9a-fA-F]+)")

# "File N : <name>" lines; each captures everything after the colon.
FILE1_RE, FILE2_RE, FILE3_RE = (
    re.compile(pattern)
    for pattern in (
        r"File\s*1\s*:\s*(.+)",
        r"File\s*2\s*:\s*(.+)",
        r"File\s*3\s*:\s*(.+)",
    )
)

# Edit-distance result lines; each captures the integer distance as digits.
DIST_DECOMP_RE, DIST_REPAIRED_RE = (
    re.compile(pattern)
    for pattern in (
        r"Edit Distance \(lookup vs decompiled\)\s*:\s*(\d+)",
        r"Edit Distance \(lookup vs repaired\)\s*:\s*(\d+)",
    )
)

In [3]:
def parse_edit_distance_log(log_path: str | Path) -> list[dict[str, str | int]]:
    """
    Parse the edit-distance log into one record per complete hash block.

    A block starts at a line matching HASH_LINE_RE and runs until the next
    such line or the end of the file. Within a block, a later "File 2",
    "File 3" or edit-distance line overwrites an earlier one of the same kind.
    "File 1" lines are recognised and skipped; their value is not part of the
    output.

    Each returned dict has the keys:
        - file_hash                (hex string captured from the hash line)
        - decompiled_file_name     (File 2)
        - raw_file_name            (File 3)
        - d_lookup_vs_decompiled   (int)
        - d_lookup_vs_repaired     (int)

    Blocks that lack a File 2 name, a File 3 name, or either distance are
    skipped without any warning.

    Raises:
        FileNotFoundError: if log_path does not point to an existing file.
    """
    log_path = Path(log_path)
    if not log_path.is_file():
        raise FileNotFoundError(f"Log file not found: {log_path}")

    rows: list[dict[str, str | int]] = []

    # State of the block currently being read.
    current_hash = None
    file2 = file3 = None
    d_decomp = d_repaired = None

    def flush_block() -> None:
        """Append the current block to rows if every required field was seen."""
        # Distances are compared against None because 0 is a valid distance.
        if current_hash and file2 and file3 and d_decomp is not None and d_repaired is not None:
            rows.append({
                "file_hash": current_hash,
                "decompiled_file_name": file2,
                "raw_file_name": file3,
                "d_lookup_vs_decompiled": d_decomp,
                "d_lookup_vs_repaired": d_repaired,
            })

    with log_path.open(encoding="utf-8", errors="replace") as f:
        for line in f:
            line = line.rstrip("\n")

            # --- HASH line: closes the previous block and opens a new one ---
            m_hash = HASH_LINE_RE.search(line)
            if m_hash:
                flush_block()
                current_hash = m_hash.group(1)
                file2 = file3 = None
                d_decomp = d_repaired = None
                continue

            # --- File name lines ---
            # File 1 is not reported, but its line is still consumed here so it
            # is never tested against the remaining patterns.
            if FILE1_RE.search(line):
                continue

            m2 = FILE2_RE.search(line)
            if m2:
                file2 = m2.group(1).strip()
                continue

            m3 = FILE3_RE.search(line)
            if m3:
                file3 = m3.group(1).strip()
                continue

            # --- Distances ---
            m_dec = DIST_DECOMP_RE.search(line)
            if m_dec:
                d_decomp = int(m_dec.group(1))
                continue

            m_rep = DIST_REPAIRED_RE.search(line)
            if m_rep:
                d_repaired = int(m_rep.group(1))
                continue

    # --- Final flush at EOF: the last block has no following hash line ---
    flush_block()

    return rows


In [4]:
def parse_log_to_dataframe(log_path: str | Path) -> pd.DataFrame:
    """
    Parse an edit-distance log and return its records as a DataFrame.

    The frame has one row per record returned by parse_edit_distance_log and
    always carries the same five columns, even when no record was parsed.

    Raises:
        FileNotFoundError: propagated from parse_edit_distance_log when
            log_path does not point to an existing file.
    """
    # Pin the schema explicitly: without it an empty parse result produces a
    # frame with no columns, and a CSV written from it would have no header.
    columns = [
        "file_hash",
        "decompiled_file_name",
        "raw_file_name",
        "d_lookup_vs_decompiled",
        "d_lookup_vs_repaired",
    ]
    rows = parse_edit_distance_log(log_path)
    return pd.DataFrame(rows, columns=columns)


In [5]:
# Edit-distance log to parse; adjust the path as needed.
log_file = "edit_distance_results.txt"

# Parse the log into a DataFrame.
df_parsed = parse_log_to_dataframe(log_file)

# Optional inspection while developing:
#   display(df_parsed.head())
#   print(f"Parsed {len(df_parsed)} records.")

# Persist the parsed records; index=False keeps the row index out of the CSV.
df_parsed.to_csv(path_or_buf="edit_distance_results_new.csv", index=False)
