In [23]:
import datetime
from pathlib import Path
from typing import Dict, List

import pandas as pd

# --- Configuration ---
# Specify a list of input CSV files to process.
# main() iterates over this list and reports on each file separately.
# The paths are relative, so they resolve against the current working directory.
INPUT_FILES: List[Path] = [
    Path('results_handcrafted_0_8_source_llm.csv'),
    Path('results_handcrafted_0_8_source.csv'),
    Path('results_handcrafted_0_5_source.csv'),
    Path('results_handcrafted_0_4_source.csv'),
    Path('results_handcrafted_0_8_bin.csv'),
    Path('results_handcrafted_0_5_bin.csv'),
    Path('results_handcrafted_0_4_bin.csv'),
]
# Destination CSV for the processed rows, including the prediction column (written by main()).
OUTPUT_FILE = Path('reentrancy_metrics_data_combined.csv')
# Columns requested from every input CSV; load_and_prepare_data() additionally
# reads a 'fails' column when the file provides one.
COLUMNS_TO_USE = ['filename', 'toolid', 'findings']
# Upper bound on the number of directory levels shown by the hierarchical report.
HIERARCHICAL_MAX_DEPTH = 30

# A dictionary mapping each tool to its label(s) for a reentrancy finding.
# Keys are matched against the 'toolid' column. A value may hold several labels
# separated by commas; a row counts as a reentrancy prediction when any of the
# labels occurs as a substring of its 'findings' text.
# Rows whose toolid is not a key here are left out of the metric reports.
REENTRANCY_LABELS: Dict[str, str] = {
    'ccc': 'Reentrancy_Vulnerability',
    'confuzzius': 'Reentrancy',
    'conkas': 'Reentrancy',
    'mythril-0.24.7': 'State_access_after_external_call_SWC_107',
    'oyente+-2acaf2e': 'Re_Entrancy_Vulnerability',
    'securify': 'DAO',
    'sfuzz': 'Reentrancy',
    'slither-0.11.3': 'reentrancy_eth,reentrancy_no_eth',
    'solhint-6.0.0': 'reentrancy',
    'ethor-2023': 'insecure',
    # NOTE(review): this label names a call-stack-depth finding rather than
    # reentrancy — confirm it is the intended label for this oyente build.
    'oyente+-060ca34': 'Callstack_Depth_Attack_Vulnerability',
    'vandal': 'ReentrantCall',
    'gpt-oss': 'reentrant',
    'gpt-5-mini': 'reentrant',
    'gpt-5': 'reentrant',
    'gpt-5-nano': 'reentrant'
}


# --- Functions ---

def log(message: str, level: str = "INFO"):
    """Writes ``message`` to stdout, prefixed with ``level`` and the current local time."""
    # The format spec after the colon is handed to strftime by datetime.__format__.
    stamp = f"{datetime.datetime.now():%Y-%m-%d %H:%M:%S}"
    print("[{}] [{}] {}".format(level, stamp, message))


def load_and_prepare_data(filepath: Path, use_cols: List[str]) -> pd.DataFrame:
    """
    Loads a single results CSV and prepares the DataFrame for analysis.

    Args:
        filepath: Path of the CSV file to read.
        use_cols: Columns to load. The caller's list is not modified. It must
            include 'filename' and 'findings', which are used below.

    Returns:
        A DataFrame holding the requested columns plus:
          * 'fails'           - taken from the file when present (missing values
                                replaced by "{}"), otherwise "{}" for every row;
          * 'true_reentrancy' - True when the filename contains '_ree'
                                (case-insensitive), False otherwise;
          * 'findings'        - missing values replaced by '' and cast to str.
        An empty DataFrame is returned when the file is missing or cannot be
        read; the problem is logged as a warning instead of being raised.
    """
    log("Loading data...")
    try:
        log(f"--> Reading '{filepath}'")
        # Read only the header first so the optional 'fails' column can be
        # requested when (and only when) the file actually provides it.
        available_cols = pd.read_csv(filepath, nrows=0).columns.tolist()
        cols_to_read = list(use_cols)
        if 'fails' in available_cols and 'fails' not in cols_to_read:
            cols_to_read.append('fails')

        df = pd.read_csv(filepath, usecols=cols_to_read)
    except FileNotFoundError:
        log(f"File not found at '{filepath}'. Skipping.", level="WARN")
        return pd.DataFrame()
    except Exception as e:
        # Deliberately broad: one unreadable input should not abort the run.
        log(f"Could not read file '{filepath}'. Error: {e}. Skipping.", level="WARN")
        return pd.DataFrame()

    if 'fails' in df.columns:
        df['fails'] = df['fails'].fillna("{}")
    else:
        df['fails'] = "{}"

    # na=False keeps the column strictly boolean: a missing filename counts as
    # "not reentrant" rather than NaN, which would match neither the True nor
    # the False comparison when metrics are computed.
    df['true_reentrancy'] = df['filename'].str.contains(r'_ree', case=False, na=False)
    df['findings'] = df['findings'].fillna('').astype(str)
    return df


def add_predictions(df: pd.DataFrame, labels_map: Dict[str, str]) -> pd.DataFrame:
    """
    Adds a boolean 'predicted_reentrancy' column based on tool findings.

    Args:
        df: Frame with 'toolid' and 'findings' columns. It is modified in place.
        labels_map: Maps a tool id to a comma-separated string of labels.

    Returns:
        The same DataFrame object, with 'predicted_reentrancy' set to True for
        rows whose findings text contains (as a substring) at least one label
        configured for the row's tool. Rows of tools without configured labels,
        or whose findings are not a string, are set to False.
    """
    # Split and strip once up front. Empty fragments (e.g. from a trailing
    # comma) are dropped because '' is a substring of every string and would
    # flag every row of that tool.
    labels_by_tool: Dict[str, List[str]] = {
        tool: [label.strip() for label in labels.split(',') if label.strip()]
        for tool, labels in labels_map.items()
    }

    def is_flagged(toolid, findings) -> bool:
        tool_labels = labels_by_tool.get(toolid)
        if not tool_labels or not isinstance(findings, str):
            return False
        return any(label in findings for label in tool_labels)

    # Iterating the two columns directly (instead of df.apply(axis=1)) also
    # works for an empty frame, where apply returns a DataFrame rather than a
    # Series and the column assignment fails.
    flags = [is_flagged(toolid, findings) for toolid, findings in zip(df['toolid'], df['findings'])]
    df['predicted_reentrancy'] = pd.Series(flags, index=df.index, dtype=bool)
    return df


def calculate_metrics_for_group(group: pd.DataFrame) -> pd.Series:
    """
    Builds the confusion counts and derived scores for one group of rows.

    A row is a failed run when its 'fails' value differs from "{}". Failed
    runs never count as correct: a failed run on a reentrant file is a false
    negative and a failed run on any other file is a false positive. Every
    ratio falls back to 0 when its denominator is 0.
    """

    def ratio(numerator, denominator):
        return numerator / denominator if denominator > 0 else 0

    crashed = group['fails'] != "{}"
    completed = group[~crashed]
    truth_of_crashed = group.loc[crashed, 'true_reentrancy']

    actual_pos = completed['true_reentrancy'].eq(True)
    actual_neg = completed['true_reentrancy'].eq(False)
    flagged = completed['predicted_reentrancy'].eq(True)
    cleared = completed['predicted_reentrancy'].eq(False)

    tp = (actual_pos & flagged).sum()
    tn = (actual_neg & cleared).sum()
    # Misclassifications from completed runs plus the penalty for failed runs.
    fp = (actual_neg & flagged).sum() + truth_of_crashed.eq(False).sum()
    fn = (actual_pos & cleared).sum() + truth_of_crashed.eq(True).sum()

    precision = ratio(tp, tp + fp)
    recall = ratio(tp, tp + fn)

    return pd.Series({
        'Accuracy': ratio(tp + tn, tp + fp + tn + fn),
        'Precision': precision,
        'Recall': recall,
        'F1_Score': ratio(2 * (precision * recall), precision + recall),
        'TP': tp,
        'FP': fp,
        'FN': fn,
        'Fails': crashed.sum(),
        'Count': len(group)
    })


def calculate_and_display_metrics(df: pd.DataFrame):
    """
    Prints accuracy, precision, recall and F1 for every tool found in ``df``.

    Rows are grouped by 'toolid'. Tools whose F1 score is 0 are left out, and
    the remaining tools are listed in sorted order. The number of failed runs
    is printed only when it is greater than zero.
    """
    banner = "=" * 50
    for text in (banner, "Reentrancy Metrics:", banner):
        log(text)

    per_tool = df.groupby('toolid').apply(calculate_metrics_for_group, include_groups=False)
    nonzero = per_tool.loc[per_tool['F1_Score'] > 0].sort_index()

    for tool, stats in nonzero.to_dict(orient='index').items():
        lines = [
            f"  Tool: {tool}",
            f"    Accuracy: {stats['Accuracy']:.2f}",
            f"    Precision: {stats['Precision']:.2f}",
            f"    Recall:    {stats['Recall']:.2f}",
            f"    F1 Score:  {stats['F1_Score']:.2f}",
        ]
        if 'Fails' in stats and stats['Fails'] > 0:
            lines.append(f"    Fails:     {int(stats['Fails'])}")
        print("\n".join(lines))

def _reporting_directory(filename: str) -> Path:
    """Maps a result filename to the directory path under which it is reported."""
    path = Path(filename)
    leaf = path.name
    if leaf.endswith(".hex"):
        # For .hex files the file name and its immediate directory are dropped.
        return path.parent.parent
    # For any other file, all but the last two '_'-separated tokens of the file
    # name act as additional directory levels below the file's real directory.
    # Path(*parts) is used instead of "/".join so an absolute path keeps a
    # single root instead of turning into '//...'.
    return Path(*path.parts[:-1], *leaf.split('_')[:-2])


def calculate_and_display_hierarchical_metrics(df: pd.DataFrame, max_depth: int):
    """
    Expands data by directory paths and calculates metrics for each level up to max_depth.

    Every row is counted once for each ancestor directory of its reporting
    directory (limited to the first ``max_depth`` levels), so the metrics of a
    directory cover everything below it. Rows without any directory component
    are reported under '<root>'.

    Args:
        df: Frame with 'filename', 'toolid', 'fails', 'true_reentrancy' and
            'predicted_reentrancy' columns.
        max_depth: Maximum number of directory levels to report. With a value
            of 0 or less nothing is reported and a warning is logged.
    """
    log("=" * 50)
    log(f"Hierarchical Reentrancy Metrics (by dir, up to depth {max_depth}):")
    log("=" * 50)

    expanded_rows = []
    for _, row in df.iterrows():
        dir_parts = _reporting_directory(row['filename']).parts
        depth_limit = min(len(dir_parts), max_depth)
        level_paths = [Path(*dir_parts[:depth]) for depth in range(1, depth_limit + 1)]

        if not level_paths and max_depth > 0:
            level_paths = [Path('<root>')]

        record = row.to_dict()
        for level_path in level_paths:
            # as_posix() keeps '/' as the separator on every platform; the
            # sorting and indentation below count '/' characters.
            expanded_rows.append({**record, 'directory_level': level_path.as_posix()})

    # Only real rows are collected above (no placeholder entries), so this
    # guard reliably detects "nothing to report".
    if not expanded_rows:
        log("No directory structure found to analyze.", level="WARN")
        return

    hierarchical_df = pd.DataFrame(expanded_rows)
    metrics = (
        hierarchical_df.groupby(['directory_level', 'toolid'])
        .apply(calculate_metrics_for_group, include_groups=False)
        .reset_index()
    )
    # Shallow levels first, alphabetical within the same depth.
    sorted_levels = sorted(metrics['directory_level'].unique(), key=lambda x: (x.count('/'), x))

    for level in sorted_levels:
        level_metrics = metrics[metrics['directory_level'] == level]
        indentation = ".." * level.count('/')

        total_tp = level_metrics['TP'].sum()
        total_fp = level_metrics['FP'].sum()
        total_fn = level_metrics['FN'].sum()
        total_count = level_metrics['Count'].sum()
        total_fails = level_metrics['Fails'].sum()

        print(
            f"\n{indentation}📁 {level} (Total Findings: {int(total_count)}, TP:{int(total_tp)}, FP:{int(total_fp)}, FN:{int(total_fn)}, Fails:{int(total_fails)})")

        for _, tool_data in level_metrics.iterrows():
            # Tools that scored an F1 of zero at this level are not listed.
            if tool_data['F1_Score'] > 0:
                fails_str = f", Fails: {int(tool_data['Fails'])}" if tool_data['Fails'] > 0 else ""
                print(
                    f"{indentation}  - Tool: {tool_data['toolid']:<18} | "
                    f"A: {tool_data['Accuracy']:.2f}, "
                    f"F1: {tool_data['F1_Score']:.2f}, "
                    f"P: {tool_data['Precision']:.2f}, "
                    f"R: {tool_data['Recall']:.2f} "
                    f"({int(tool_data['Count'])} files{fails_str})"
                )


def main():
    """
    Main script to load data, process it, and print metrics.

    Each file in INPUT_FILES is loaded, labelled with predictions and reported
    on its own. A file that cannot be loaded, or that has no rows for the tools
    in REENTRANCY_LABELS, is skipped so the remaining files are still processed.
    After the loop the processed rows of all loaded files are written to
    OUTPUT_FILE in one CSV, with a 'source_file' column naming the input each
    row came from.
    """
    processed_frames: List[pd.DataFrame] = []

    for input_file in INPUT_FILES:
        print("\n\n")
        log("=" * 50)
        log(f"Processing input file: {input_file}")
        log("=" * 50)
        df = load_and_prepare_data(input_file, COLUMNS_TO_USE)

        if df.empty:
            # One missing or unreadable input must not stop the other reports.
            log(f"No data loaded from '{input_file}'. Skipping.", level="WARN")
            continue

        df = add_predictions(df, REENTRANCY_LABELS)
        # Collected here and written once after the loop; writing inside the
        # loop would overwrite OUTPUT_FILE and keep only the last input.
        processed_frames.append(df.assign(source_file=str(input_file)))

        analysis_df = df[df['toolid'].isin(REENTRANCY_LABELS.keys())]

        if analysis_df.empty:
            log("No data found for the tools specified in REENTRANCY_LABELS.", level="WARN")
            continue

        # --- Overall Reporting ---
        log("=" * 50)
        log("Overall Reentrancy Metrics per Tool:")
        log("=" * 50)
        overall_metrics = analysis_df.groupby('toolid').apply(calculate_metrics_for_group, include_groups=False)
        reportable_metrics = overall_metrics[overall_metrics['F1_Score'] > 0].sort_index()

        for tool, data in reportable_metrics.iterrows():
            print(f"Tool: {tool}")
            print(f"  Precision: {data['Precision']:.4f}")
            print(f"  Recall:    {data['Recall']:.4f}")
            print(f"  F1 Score:  {data['F1_Score']:.4f}")
            if data['Fails'] > 0:
                print(f"  Fails:     {int(data['Fails'])}")
            print("-" * 30)

        # --- Per-Input-File Reporting ---
        calculate_and_display_metrics(analysis_df)

        # --- Hierarchical Reporting ---
        calculate_and_display_hierarchical_metrics(analysis_df, HIERARCHICAL_MAX_DEPTH)

    if not processed_frames:
        log("No dataframes were loaded. Exiting.", level="ERROR")
        return

    pd.concat(processed_frames, ignore_index=True).to_csv(OUTPUT_FILE, index=False)
    log(f"Processed data with predictions saved to '{OUTPUT_FILE}'.")

In [24]:
# Run the reporting pipeline defined in the previous cell.
main()




[INFO] [2025-09-04 17:46:26] Processing input file: results_handcrafted_0_8_source_llm.csv
[INFO] [2025-09-04 17:46:26] Loading data...
[INFO] [2025-09-04 17:46:26] --> Reading 'results_handcrafted_0_8_source_llm.csv'
[INFO] [2025-09-04 17:46:26] Processed data with predictions saved to 'reentrancy_metrics_data_combined.csv'.
[INFO] [2025-09-04 17:46:26] Overall Reentrancy Metrics per Tool:
Tool: gpt-5
  Precision: 0.9600
  Recall:    0.8571
  F1 Score:  0.9057
------------------------------
Tool: gpt-5-mini
  Precision: 0.9184
  Recall:    0.8036
  F1 Score:  0.8571
------------------------------
Tool: gpt-5-nano
  Precision: 0.8333
  Recall:    0.8036
  F1 Score:  0.8182
------------------------------
Tool: gpt-oss
  Precision: 0.8958
  Recall:    0.7679
  F1 Score:  0.8269
------------------------------
[INFO] [2025-09-04 17:46:26] Reentrancy Metrics:
  Tool: gpt-5
    Accuracy: 0.92
    Precision: 0.96
    Recall:    0.86
    F1 Score:  0.91
  Tool: gpt-5-mini
    Accuracy: 0.88