In [137]:
import datetime
from pathlib import Path
from typing import Dict, List

import pandas as pd

# --- Configuration ---
# Input CSV files to process, in order. main() derives metadata from each file
# name by splitting it on '_': fields 2 and 3 are joined into the solidity
# version (e.g. '0' and '8' -> '0.8') and field 4, up to the first '.', is the
# input type ('source' or 'bin').
INPUT_FILES: List[Path] = [
    Path('results_handcrafted_0_8_source_llm.csv'),
    Path('results_handcrafted_0_8_source.csv'),
    Path('results_handcrafted_0_5_source.csv'),
    Path('results_handcrafted_0_4_source.csv'),
    Path('results_handcrafted_0_8_bin.csv'),
    Path('results_handcrafted_0_5_bin.csv'),
    Path('results_handcrafted_0_4_bin.csv'),
]
# NOTE(review): OUTPUT_FILE is not referenced anywhere in the visible code -- confirm it is still needed.
OUTPUT_FILE = Path('reentrancy_metrics_data_combined.csv')
# Columns requested from every input CSV; the loader additionally reads 'fails' when a file has it.
COLUMNS_TO_USE = ['filename', 'toolid', 'findings']
# NOTE(review): not referenced in the visible code -- purpose unclear, confirm before relying on it.
HIERARCHICAL_MAX_DEPTH = 30

# Maps each tool id to the finding label(s) that count as a reentrancy report.
# Several labels for one tool are written as one comma-separated string (see
# 'slither-0.11.3'); add_predictions() splits on ',' and flags a row when any
# label occurs as a substring of the row's 'findings' text.
# The keys are also used by load_and_prepare_data() to decide which tool rows to keep.
REENTRANCY_LABELS: Dict[str, str] = {
    'ccc': 'Reentrancy_Vulnerability',
    'confuzzius': 'Reentrancy',
    'conkas': 'Reentrancy',
    'mythril-0.24.7': 'State_access_after_external_call_SWC_107',
    'oyente+-2acaf2e': 'Re_Entrancy_Vulnerability',
    'securify': 'DAO',
    'sfuzz': 'Reentrancy',
    'slither-0.11.3': 'reentrancy_eth,reentrancy_no_eth',
    'solhint-6.0.0': 'reentrancy',
    'ethor-2023': 'insecure',
    'oyente+-060ca34': 'Callstack_Depth_Attack_Vulnerability',
    'vandal': 'ReentrantCall',
    'gpt-oss': 'reentrant',
    'gpt-5-mini': 'reentrant',
    'gpt-5': 'reentrant',
    'gpt-5-nano': 'reentrant'
}


# --- Functions ---

def log(message: str, level: str = "INFO"):
    """Write ``message`` to stdout, tagged with ``level`` and the current local time."""
    now = datetime.datetime.now()
    stamp = f"{now:%Y-%m-%d %H:%M:%S}"
    print("[{}] [{}] {}".format(level, stamp, message))


def load_and_prepare_data(filepath: Path, use_cols: List[str]) -> pd.DataFrame:
    """
    Loads one results CSV and prepares it for the reentrancy analysis.

    Steps:
      * reads ``use_cols`` plus the optional 'fails' column when the file has one;
      * keeps only rows whose 'toolid' occurs as a substring of at least one key
        of ``REENTRANCY_LABELS`` (rows with a missing/non-string toolid are dropped);
      * guarantees a 'fails' column, using "{}" where no value is recorded;
      * adds 'true_reentrancy' (filename contains '_ree', case-insensitive) and
        normalises 'findings' to a string column without missing values.

    Args:
        filepath: CSV file to read.
        use_cols: Columns that must be read from the file.

    Returns:
        The prepared DataFrame (original row labels are preserved), or an empty
        DataFrame when the file is missing or cannot be read/filtered; in that
        case a WARN message is logged instead of raising.
    """
    log("Loading data...")
    try:
        log(f"--> Reading '{filepath}'")
        # Read the header only, so 'fails' is requested just when it exists.
        all_cols = pd.read_csv(filepath, nrows=0).columns.tolist()
        cols_to_read = list(use_cols)
        if 'fails' in all_cols:
            cols_to_read.append('fails')

        df = pd.read_csv(filepath, usecols=cols_to_read)

        # Decide once per distinct tool id instead of dropping rows one by one.
        # The isinstance guard keeps a stray NaN tool id from discarding the whole file.
        supported_tools = [
            tool for tool in df['toolid'].dropna().unique()
            if isinstance(tool, str) and any(tool in key for key in REENTRANCY_LABELS)
        ]
        # .copy() so the column assignments below do not write into a slice of the raw frame.
        df = df[df['toolid'].isin(supported_tools)].copy()

        if 'fails' not in df.columns:
            df['fails'] = "{}"
        else:
            df['fails'] = df['fails'].fillna("{}")

    except FileNotFoundError:
        log(f"File not found at '{filepath}'. Skipping.", level="WARN")
        return pd.DataFrame()
    except Exception as e:
        # Deliberate best effort: one unreadable file must not abort the whole run.
        log(f"Could not read file '{filepath}'. Error: {e}. Skipping.", level="WARN")
        return pd.DataFrame()

    df['true_reentrancy'] = df['filename'].str.contains(r'_ree', case=False)
    df['findings'] = df['findings'].fillna('').astype(str)
    return df


def add_predictions(df: pd.DataFrame, labels_map: Dict[str, str]) -> pd.DataFrame:
    """
    Flags every row whose findings mention one of its tool's reentrancy labels.

    ``labels_map`` maps a tool id to a comma-separated string of labels. A row
    is marked True in the new 'predicted_reentrancy' column when any of the
    (whitespace-stripped) labels of its 'toolid' is a substring of 'findings';
    rows of tools absent from the map are marked False.

    The column is added to ``df`` in place and the same object is returned.
    """
    labels_by_tool = {}
    for tool, raw_labels in labels_map.items():
        labels_by_tool[tool] = [piece.strip() for piece in raw_labels.split(',')]

    def _flags_reentrancy(record: pd.Series) -> bool:
        candidates = labels_by_tool.get(record['toolid'])
        if candidates:
            findings = record['findings']
            for candidate in candidates:
                if candidate in findings:
                    return True
        return False

    df['predicted_reentrancy'] = df.apply(_flags_reentrancy, axis=1)
    return df


def calculate_metrics_for_group(group: pd.DataFrame) -> pd.Series:
    """
    Builds the confusion matrix and derived scores for one group of results.

    When a filename occurs more than once, its rows are first merged into one:
    the distinct 'fails' values are joined with spaces, and the prediction and
    the ground truth are each reduced with ``max``.

    A row whose 'fails' value differs from "{}" is a failed run. Failed runs are
    never credited as correct: they count as a false negative when the file is
    truly reentrant and as a false positive otherwise.

    Returns a Series with Accuracy, Precision, Recall, F1_Score, TP, TN, FP, FN,
    Fails and Count (number of rows after merging). Every ratio falls back to 0
    when its denominator is 0.
    """
    distinct_files = group.filename.unique().shape[0]
    if distinct_files != len(group):
        group = group.groupby("filename", as_index=False).agg({
            "fails": lambda values: " ".join(map(str, sorted(set(values)))),
            "predicted_reentrancy": "max",
            "true_reentrancy": "max",
        })

    has_failed = group['fails'] != "{}"
    failed = group[has_failed]
    completed = group[~has_failed]

    # Explicit comparisons with True/False: a missing value matches neither side.
    actually_positive = completed['true_reentrancy'] == True
    actually_negative = completed['true_reentrancy'] == False
    flagged = completed['predicted_reentrancy'] == True
    cleared = completed['predicted_reentrancy'] == False

    tp = (actually_positive & flagged).sum()
    tn = (actually_negative & cleared).sum()
    fp = (actually_negative & flagged).sum()
    fn = (actually_positive & cleared).sum()

    # Failed runs are booked as the error matching the file's ground truth.
    fp += (failed['true_reentrancy'] == False).sum()
    fn += (failed['true_reentrancy'] == True).sum()

    total = tp + fp + tn + fn
    accuracy = ((tp + tn) / total) if total > 0 else 0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0

    return pd.Series({
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1_Score': f1_score,
        'TP': tp,
        'TN': tn,
        'FP': fp,
        'FN': fn,
        'Fails': has_failed.sum(),
        'Count': len(group)
    })


def calculate_and_display_metrics(df: pd.DataFrame):
    """
    Prints accuracy, precision, recall and F1 for every tool in ``df``.

    Metrics are computed per 'toolid' with calculate_metrics_for_group. Tools
    whose F1 score is 0 are left out of the report; the number of failed runs
    is printed only when it is greater than zero.
    """
    banner = "=" * 50
    for text in (banner, "Reentrancy Metrics:", banner):
        log(text)

    per_tool = df.groupby('toolid').apply(calculate_metrics_for_group, include_groups=False)
    scored = per_tool.loc[per_tool['F1_Score'] > 0].sort_index()

    for tool, data in scored.iterrows():
        report = [
            f"  Tool: {tool}",
            f"    Accuracy: {data['Accuracy']:.2f}",
            f"    Precision: {data['Precision']:.2f}",
            f"    Recall:    {data['Recall']:.2f}",
            f"    F1 Score:  {data['F1_Score']:.2f}",
        ]
        if 'Fails' in data and data['Fails'] > 0:
            report.append(f"    Fails:     {int(data['Fails'])}")
        for line in report:
            print(line)


def filename2dirname(filename: str) -> Path:
    """
    Derives the directory a result file is grouped under.

    The last path component (or, for a '.hex' file that sits inside a
    directory, the name of that directory) is cut at the first '_sol_' and
    split on '_'. The last two tokens are treated as the name part; every
    earlier token becomes an additional directory level appended to the
    existing parent directories.

    Example: 'dir/a_b_Send_ree1_sol_C/x.hex' -> Path('dir/a/b').
    """
    location = Path(filename)
    # A nested '.hex' file is identified by its enclosing directory, not by its own name.
    if location.parts[-1].endswith(".hex") and len(location.parts) > 1:
        location = location.parent

    *parent_dirs, leaf = location.parts
    tokens = leaf.split("_sol_")[0].split('_')
    stem = '_'.join(tokens[-2:]).split('.')[0]
    # Rebuild '<parents>/<leading tokens>/<stem>' and return everything above the stem.
    return Path("/".join([*parent_dirs, *tokens[:-2], stem])).parent


def clean_filename(filename: str) -> str:
    """
    Reduces a result path to a short name.

    For a '.hex' file inside a directory, the name of that directory is
    returned unchanged. Otherwise the last path component is cut at the first
    '_sol_', its last two '_'-separated tokens are re-joined with '_', and
    everything from the first '.' on is dropped.

    Example: 'a_b_c_Send_ree1_sol_Contract.hex' -> 'Send_ree1'.
    """
    parts = Path(filename).parts
    if filename.endswith(".hex") and len(parts) > 1:
        return parts[-2]

    base = parts[-1].split("_sol_")[0]
    name_tokens = base.split('_')[-2:]
    return '_'.join(name_tokens).split('.')[0]


In [138]:
def calculate_metrics_by_path_and_version(df: pd.DataFrame):
    """
    Aggregates metrics per input type, directory, solidity version and toolid.

    Args:
        df: Combined results. Must contain 'filename', 'findings', 'type',
            'solidity_version' and 'toolid', plus the columns read by
            calculate_metrics_for_group ('fails', 'predicted_reentrancy',
            'true_reentrancy'). The frame is NOT modified; all work happens on
            a derived copy, so the function can safely be called repeatedly.

    Returns:
        A ``(source_df, bin_df)`` tuple holding the metric rows whose 'type' is
        "source" and "bin" respectively, each without the 'type' column.
    """
    prepared = (
        df.assign(
            # Both helper columns are derived from the original, uncleaned filename.
            directory=df['filename'].apply(lambda f: str(filename2dirname(f))),
            filename=df['filename'].apply(clean_filename),
        )
        # Rows that differ only in their raw findings text collapse into one.
        .drop(columns=['findings'])
        .drop_duplicates()
    )

    grouped = (
        prepared.groupby(['type', 'directory', 'solidity_version', 'toolid'])
        .apply(calculate_metrics_for_group, include_groups=False)
        .reset_index()
    )

    source_df = grouped[grouped['type'] == "source"].drop(columns=["type"])
    bin_df = grouped[grouped['type'] == "bin"].drop(columns=["type"])
    return source_df, bin_df


def display_path_version_tables(source_df: pd.DataFrame, bin_df: pd.DataFrame):
    """
    Prints one combined text table of the source and bin metrics.

    For every directory a header with the largest 'Count' of that directory is
    printed, followed by one line per (tool, input type) combination that has
    data. Each line lists, per solidity version found in the directory, the
    accuracy and the TP/TN/FP/FN/Fails counts, or dashes when that version has
    no row for the combination.

    Args:
        source_df: Metric rows for source inputs.
        bin_df: Metric rows for binary inputs.
        Neither frame is modified.
    """

    def display_table(df: pd.DataFrame, title: str):
        log("=" * 70)
        log(f"{title}")
        log("=" * 70)
        for directory in sorted(df['directory'].unique()):
            dir_df = df[df['directory'] == directory]
            versions = sorted(dir_df['solidity_version'].unique())
            # Use the largest group size, matching the LaTeX table, not whichever row is first.
            print(f"\n📂 Path: {directory} - N={int(dir_df['Count'].max())}")
            for tool in sorted(dir_df["toolid"].unique()):
                for input_type in sorted(dir_df["input"].unique()):
                    tool_df = dir_df[(dir_df['toolid'] == tool) & (dir_df['input'] == input_type)]
                    # Skip combinations without data, however many versions the directory has.
                    if tool_df.empty:
                        continue
                    prefix = f"    - Tool ({input_type}): {tool:<20} | "
                    # Start a fresh list per line so earlier tools' cells are not repeated.
                    row_str = []
                    for version in versions:
                        version_df = tool_df[tool_df['solidity_version'] == version]
                        if version_df.empty:
                            version_result = f"v{version} -- A: -, TP: -, TN: -, FP: -, FN: -, Fails: -"
                        else:
                            row = version_df.iloc[0]
                            version_result = f"v{version} -- A: {row['Accuracy']:.2f}, TP: {int(row['TP'])}, TN: {int(row['TN'])}, FP: {int(row['FP'])}, FN: {int(row['FN'])}, Fails: {int(row['Fails'])}"
                        row_str.append(version_result)
                    print(prefix + " || ".join(row_str))

    # assign() returns tagged copies, leaving the caller's frames untouched.
    final_df = pd.concat([source_df.assign(input="source"), bin_df.assign(input="bin")])

    display_table(final_df, "FINAL TABLE")


def display_path_version_tables_latex(source_df: pd.DataFrame, bin_df: pd.DataFrame,
                                      output_path: str = 'table.tex'):
    """
    Writes the per-directory metrics as LaTeX table rows to a file.

    For every directory a ``\\multicolumn`` header is emitted, followed by one
    row per (tool, input type) combination that has data. A row starts with the
    tool id and the first letter of the input type, then holds six cells
    (accuracy, TP, TN, FP, FN, Fails) per solidity version found in the
    directory; versions without data are filled with dashes.

    Args:
        source_df: Metric rows for source inputs.
        bin_df: Metric rows for binary inputs.
        output_path: File to write; defaults to 'table.tex' in the working directory.

    Neither input frame is modified. Nothing is returned.
    """
    no_data_cells = " - & - & - & - & - & -"

    # assign() returns tagged copies, leaving the caller's frames untouched.
    final_df = pd.concat([source_df.assign(input="source"), bin_df.assign(input="bin")])

    lines = []
    for directory in sorted(final_df['directory'].unique()):
        dir_df = final_df[final_df['directory'] == directory]
        versions = sorted(dir_df['solidity_version'].unique())
        path = f"{{Path: {directory} - N={int(max(dir_df['Count'].unique()))}}}"
        lines.append(f"\\multicolumn{{20}}{{c}}{{\\textbf{path}}} \n\\\\\\midrule")
        for tool in sorted(dir_df["toolid"].unique()):
            for input_type in sorted(dir_df["input"].unique()):
                tool_df = dir_df[(dir_df['toolid'] == tool) & (dir_df['input'] == input_type)]
                # Skip combinations without data, however many versions the directory has.
                if tool_df.empty:
                    continue
                cells = []
                for version in versions:
                    version_df = tool_df[tool_df['solidity_version'] == version]
                    if version_df.empty:
                        cells.append(no_data_cells)
                    else:
                        row = version_df.iloc[0]
                        cells.append(f"{row['Accuracy']:.2f} & {int(row['TP'])} & {int(row['TN'])} & {int(row['FP'])} & {int(row['FN'])} & {int(row['Fails'])}")
                prefix = f"{tool} & {input_type[0]}"
                lines.append(prefix + " & " + " & ".join(cells) + "\\\\")
        lines.extend(["\\midrule[\\heavyrulewidth]", "", "% -----------", ""])

    # Explicit encoding so the file does not depend on the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{line}\n" for line in lines)

def display_path_version_tables_latex_new(source_df: pd.DataFrame, bin_df: pd.DataFrame,
                                          output_path: str = 'table_new.tex'):
    """
    Writes a compact per-version LaTeX summary of the metrics to a file.

    For every solidity version a ``\\multicolumn`` header and a line listing the
    version's directories (joined with ' | ') are emitted, followed by one row
    per (tool, input type) combination that has data for that version. Each row
    holds one mark per directory: "V" for accuracy 1, "X" for accuracy 0, "~V"
    for anything else, and "-" when the combination has no row for the directory.

    Args:
        source_df: Metric rows for source inputs.
        bin_df: Metric rows for binary inputs.
        output_path: File to write; defaults to 'table_new.tex' in the working directory.

    Neither input frame is modified. Nothing is returned.
    """

    def accuracy_mark(accuracy) -> str:
        # Collapse an accuracy value into the three-symbol scale used in the table.
        if accuracy == 1:
            return "V"
        if accuracy == 0:
            return "X"
        return "~V"

    # assign() returns tagged copies, leaving the caller's frames untouched.
    final_df = pd.concat([source_df.assign(input="source"), bin_df.assign(input="bin")])

    lines = []
    for version in sorted(final_df['solidity_version'].unique()):
        version_df = final_df[final_df['solidity_version'] == version]
        directories = sorted(version_df['directory'].unique())
        header = f"{{Solidity version: {version}}}"
        lines.append(f"\\multicolumn{{20}}{{c}}{{\\textbf{header}}} \n\\\\\\midrule")
        lines.append(" | ".join(directories))
        for tool in sorted(version_df["toolid"].unique()):
            for input_type in sorted(version_df["input"].unique()):
                tool_df = version_df[(version_df['toolid'] == tool) & (version_df['input'] == input_type)]
                if tool_df.empty:
                    continue
                marks = []
                for directory in directories:
                    dir_df = tool_df[tool_df['directory'] == directory]
                    if dir_df.empty:
                        marks.append("-")
                    else:
                        marks.append(accuracy_mark(dir_df.iloc[0]['Accuracy']))
                tool_str = f"{{Tool: {tool} ({input_type})}}"
                lines.append(tool_str + " & " + " & ".join(marks) + "\\\\")
        lines.extend(["\\midrule[\\heavyrulewidth]", "", "% -----------", ""])

    # Explicit encoding so the file does not depend on the platform default.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{line}\n" for line in lines)


def main():
    """
    Runs the whole pipeline over ``INPUT_FILES``.

    Each file is loaded, annotated with predictions and tagged with the
    solidity version and input type encoded in its name
    ('<a>_<b>_<major>_<minor>_<type>[...].csv'). Files that cannot be loaded or
    whose name does not follow that pattern are skipped with a log message. The
    combined data is aggregated per directory/version/tool and written as a
    LaTeX summary by display_path_version_tables_latex_new.
    """
    all_data = []
    for input_file in INPUT_FILES:
        # Parse the bare file name so a directory containing '_' cannot shift the fields.
        filename_parts = Path(input_file).name.split("_")
        if len(filename_parts) < 5:
            log(f"Cannot derive solidity version and type from '{input_file}'. Skipping.", level="WARN")
            continue
        version = ".".join(filename_parts[2:4])
        data_type = filename_parts[4].split(".")[0]
        print("\n\n")
        log("=" * 50)
        log(f"Processing input file: {input_file}")
        log("=" * 50)
        df = load_and_prepare_data(input_file, COLUMNS_TO_USE)
        if df.empty:
            continue
        df = add_predictions(df, REENTRANCY_LABELS)
        df["solidity_version"] = version
        df["type"] = data_type
        all_data.append(df)

    if not all_data:
        log("No data loaded. Exiting.", level="ERROR")
        return

    combined_df = pd.concat(all_data, ignore_index=True)

    # Existing reports
    # calculate_and_display_metrics(combined_df)

    # NEW: Path + Solidity version tables (source/bin separated)
    source_df, bin_df = calculate_metrics_by_path_and_version(combined_df)
    # display_path_version_tables(source_df, bin_df)
    display_path_version_tables_latex_new(source_df, bin_df)


In [139]:
# Run the pipeline: process every file in INPUT_FILES and write the LaTeX summary table.
main()




[INFO] [2025-09-07 16:34:03] Processing input file: results_handcrafted_0_8_source_llm.csv
[INFO] [2025-09-07 16:34:03] Loading data...
[INFO] [2025-09-07 16:34:03] --> Reading 'results_handcrafted_0_8_source_llm.csv'



[INFO] [2025-09-07 16:34:04] Processing input file: results_handcrafted_0_8_source.csv
[INFO] [2025-09-07 16:34:04] Loading data...
[INFO] [2025-09-07 16:34:04] --> Reading 'results_handcrafted_0_8_source.csv'



[INFO] [2025-09-07 16:34:04] Processing input file: results_handcrafted_0_5_source.csv
[INFO] [2025-09-07 16:34:04] Loading data...
[INFO] [2025-09-07 16:34:04] --> Reading 'results_handcrafted_0_5_source.csv'



[INFO] [2025-09-07 16:34:04] Processing input file: results_handcrafted_0_4_source.csv
[INFO] [2025-09-07 16:34:04] Loading data...
[INFO] [2025-09-07 16:34:04] --> Reading 'results_handcrafted_0_4_source.csv'



[INFO] [2025-09-07 16:34:04] Processing input file: results_handcrafted_0_8_bin.csv
[INFO] [2025-09-07 16:34:04] Loading data...
[INFO] [