In [None]:
import pandas as pd
from io import StringIO
from sklearn.metrics import f1_score


# Read data from the CSV file and select only the required columns.
filename = 'results_all_src.csv'

if 'src' in filename:
    out_csv = 'reentrancy_metrics_data_src.csv'
    latex_file = 'latex_table_src.csv'
else:
    out_csv = 'reentrancy_metrics_data_bins.csv'
    latex_file = 'latex_table_bins.csv'
df = pd.read_csv(filename)
df = df[['filename', 'basename', 'exit_code', 'toolid', 'findings']]

df['exit_code'] = df['exit_code'].fillna(-1).astype(int)
#print(df.shape)
df = df[~((df['toolid'] == 'ethor-2023') & (df['findings'] == '{}'))]
df = df[df['basename'].str.contains('_safe|_ree', na=False)]

#print(df.shape)
# A dictionary mapping each tool to the string(s) it produces for a reentrancy finding.
# You can easily update this dictionary as needed. For tools with multiple labels,
# use a comma-separated string, e.g., 'tool_name': 'label1,label2'.
reentrancy_labels = {
    'ccc': 'Reentrancy_Vulnerability',
    'confuzzius': 'Reentrancy',
    'conkas': 'Reentrancy', #.sol 0.5
    #'manticore-0.3.7': 'Reentrancy', # placeholder
    'mythril-0.24.7': 'State_access_after_external_call_SWC_107',
    'oyente+-2acaf2e': 'Re_Entrancy_Vulnerability',
    'securify': 'DAO', 
    'securify2': 'Reentrancy', # does not work
    'sfuzz': 'Reentrancy', 
    'slither-0.11.3': 'reentrancy_eth,reentrancy_no_eth',
    #'smartcheck': 'Reentrancy', # never finds any occurrence of reentrancy
    'solhint-6.0.0': 'reentrancy',
    #'ethainter': 'Reentrancy', # does not work
    'ethor-2023': 'insecure',
    'oyente+-060ca34':'Re_Entrancy_Vulnerability',
    'vandal': 'ReentrantCall',
    'gpt-oss': 'reentrant',
    'gpt-5-mini': 'reentrant',
    'gpt-5': 'reentrant',
    'gpt-5-nano': 'reentrant'
    }

# 1. Determine the "true" reentrancy label for each file based on its filename.
# 'ree' followed by an optional number indicates a true reentrancy vulnerability.
# df['true_reentrancy'] = df['filename'].str.contains(r'ree\d*\.sol', case=False)
df['true_reentrancy'] = df['basename'].str.contains(r'_ree', case=False)

# 2. Determine the "predicted" reentrancy label based on the 'findings' column.
# This function will check if any of the tool-specific reentrancy labels are present in the findings.
def get_prediction(row):
    tool_id = row['toolid']
    findings = str(row['findings']) # Convert to string to handle potential NaN values

    # Check if the tool is in our labels dictionary.
    if tool_id in reentrancy_labels:
        # Split the tool's finding string into a list of individual labels.
        tool_findings = [f.strip() for f in reentrancy_labels[tool_id].split(',')]
        
        # Check if any of the tool's labels are present in the findings from the data.
        for label in tool_findings:
            if label in findings:
                return True
    return False

df['predicted_reentrancy'] = df.apply(get_prediction, axis=1)
#print(df['exit_code']==1)


# Save the DataFrame to a new CSV file.
df.to_csv(out_csv, index=False)

# 3. Calculate metrics for each unique tool and print only the results.
# Analyze only the tools present in the reentrancy_labels dictionary.
tools_to_analyze = reentrancy_labels.keys()

print("Reentrancy Metrics per Tool:")
print("=" * 30)
versions = ['0_8', '0_5', '0_4']
with open(latex_file, 'w') as latex_file:
    
    for version in versions:
        latex_file.write('Solidity Version: ' + version.replace('_', '.') + '\n')
        latex_file.write('Tool,Accuracy,Precision,Recall,F1 Score,Errors\n')
        version_df = df[df['filename'].str.contains(version)]
        print('*' * 100)
        print('Results for Solidity version:', version.replace('_', '.'))
        print('*' * 100)
        for tool in tools_to_analyze:

            # Filter the DataFrame for the current tool.
            tool_df = version_df[version_df['toolid'] == tool]
            n_results = tool_df.shape[0]
            #if tool == 'vandal':
                #print(tool_df)
            #print(tool, tool_df.shape)
            ERRORS = (tool_df['exit_code'] != 0).sum()
            # if tool == 'vandal':
            #     print(tool_df, tool_df.shape)
            ERRORS2  = tool_df[(tool_df['exit_code'] != 0) & (tool_df['findings'] == '{}')].shape[0]/n_results if n_results >0 else 0
            # Exclude rows where exit_code is not

            

            tool_df = tool_df[(tool_df['exit_code'] == '0') | ((tool_df['exit_code'] != 0) & tool_df['findings']!= '{}')]

            #print( tool_df['findings'], tool_df['findings']!= '{}')
            #tool_df = tool_df[tool_df['exit_code'] == '0']
            # if tool == 'vandal':
            #     print(tool_df, tool_df.shape)

            # Calculate True Positives (TP), False Positives (FP), True Negatives (TN), and False Negatives (FN).
            TP = len(tool_df[(tool_df['true_reentrancy'] == True) & (tool_df['predicted_reentrancy'] == True)])
            FP = len(tool_df[(tool_df['true_reentrancy'] == False) & (tool_df['predicted_reentrancy'] == True)])
            TN = len(tool_df[(tool_df['true_reentrancy'] == False) & (tool_df['predicted_reentrancy'] == False)])
            FN = len(tool_df[(tool_df['true_reentrancy'] == True) & (tool_df['predicted_reentrancy'] == False)])
            # print(TP, FP, TN, FN)
            
            # Calculate Accuracy, Precision, and Recall.
            # Handle cases where the denominator is zero to avoid errors.
            accuracy = (TP + TN) / (TP + FP + TN + FN) if (TP + FP + TN + FN) > 0 else 0
            
            # Precision: Out of all positive predictions, how many were correct?
            precision = TP / (TP + FP) if (TP + FP) > 0 else 0
            
            # Recall: Out of all actual positives, how many were correctly predicted?
            recall = TP / (TP + FN) if (TP + FN) > 0 else 0

            # Calculate the F1 Score
            f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
            f1_score = 2 * TP / (2 * TP + FP + FN) if (2 * TP + FP + FN) > 0 else 0
            #f1_score = f1_score(tool_df['true_reentrancy'], tool_df['predicted_reentrancy'], zero_division=0, average = 'weighted')
            if f1_score > 0:
                latex_file.write(f'{tool},{accuracy*100:.2f},{precision*100:.2f},{recall*100:.2f},{f1_score*100:.2f},{ERRORS2*100:.2f}\n')
                print(f"Tool: {tool}")
                print(f"  Accuracy:  {accuracy*100:.2f}")
                print(f"  Precision: {precision*100:.2f}")
                print(f"  Recall:    {recall*100:.2f}")
                print(f"  F1 Score:  {f1_score*100:.2f}")
                #print(f"  Errors: {ERRORS}")
                print(f"  Errors: {ERRORS2*100:.2f}")
                print("-" * 30)


Reentrancy Metrics per Tool:
****************************************************************************************************
Results for Solidity version: 0.8
****************************************************************************************************
Tool: ccc
  Accuracy:  55.20
  Precision: 58.33
  Recall:    12.07
  F1 Score:  20.00
  Errors: 0.00
------------------------------
Tool: confuzzius
  Accuracy:  57.26
  Precision: 53.97
  Recall:    58.62
  F1 Score:  56.20
  Errors: 16.94
------------------------------
Tool: mythril-0.24.7
  Accuracy:  71.20
  Precision: 63.41
  Recall:    89.66
  F1 Score:  74.29
  Errors: 0.00
------------------------------
Tool: slither-0.11.3
  Accuracy:  69.60
  Precision: 67.86
  Recall:    65.52
  F1 Score:  66.67
  Errors: 0.00
------------------------------
Tool: solhint-6.0.0
  Accuracy:  49.60
  Precision: 27.27
  Recall:    5.17
  F1 Score:  8.70
  Errors: 0.00
------------------------------
Tool: oyente+-060ca34
  Accuracy:  58

In [65]:
from pathlib import Path

# Load your CSV
df = pd.read_csv("reentrancy_metrics_data.csv")

# 1. Keep only rows where basename contains '_ree'
df = df[df['basename'].str.contains('_ree', na=False)]

# 2. Extract folder path from filename (everything except the last component)
df['path'] = df['filename'].apply(lambda x: str(Path(x).parent))

# 3. Compute TP, FP, FN, TN flags
df['TP'] = (df['true_reentrancy'] & df['predicted_reentrancy'])
df['FP'] = (~df['true_reentrancy'] & df['predicted_reentrancy'])
df['FN'] = (df['true_reentrancy'] & ~df['predicted_reentrancy'])
df['TN'] = (~df['true_reentrancy'] & ~df['predicted_reentrancy'])

# 4. Aggregate by path and include total count
agg = (
    df.groupby('path')
      .agg(
          TP=('TP', 'sum'),
          FP=('FP', 'sum'),
          FN=('FN', 'sum'),
          TN=('TN', 'sum'),
          total_rows=('filename', 'count')
      )
      .reset_index()
)

# 5. Save to CSV
agg.to_csv("aggregated_results.csv", index=False)

In [None]:
# Read data from the CSV file and select only the required columns.
df = pd.read_csv('results_all_bins.csv')
df = df[['filename', 'exit_code', 'toolid', 'findings']]

# A dictionary mapping each tool to the string(s) it produces for a reentrancy finding.
# You can easily update this dictionary as needed. For tools with multiple labels,
# use a comma-separated string, e.g., 'tool_name': 'label1,label2'.
reentrancy_labels = {
    'ccc': 'Reentrancy_Vulnerability',
    'confuzzius': 'Reentrancy',
    'conkas': 'Reentrancy', #.sol 0.5
    #'manticore-0.3.7': 'Reentrancy', # placeholder
    'mythril-0.24.7': 'State_access_after_external_call_SWC_107',
    'oyente+-2acaf2e': 'Re_Entrancy_Vulnerability',
    'securify': 'DAO', 
    'securify2': 'Reentrancy', # does not work
    'sfuzz': 'Reentrancy', 
    'slither-0.11.3': 'reentrancy_eth,reentrancy_no_eth',
    #'smartcheck': 'Reentrancy', # never finds any occurrence of reentrancy
    'solhint-6.0.0': 'reentrancy',
    #'ethainter': 'Reentrancy', # does not work
    'ethor-2023': 'insecure',
    'oyente+-060ca34':'Callstack_Depth_Attack_Vulnerability',
    'vandal': 'ReentrantCall',
    'gpt-oss': 'reentrant',
    'gpt-5-mini': 'reentrant',
    'gpt-5': 'reentrant',
    'gpt-5-nano': 'reentrant'
    }

# 1. Determine the "true" reentrancy label for each file based on its filename.
# 'ree' followed by an optional number indicates a true reentrancy vulnerability.
# df['true_reentrancy'] = df['filename'].str.contains(r'ree\d*\.sol', case=False)
df['true_reentrancy'] = df['filename'].str.contains(r'_ree', case=False)

# 2. Determine the "predicted" reentrancy label based on the 'findings' column.
# This function will check if any of the tool-specific reentrancy labels are present in the findings.
def get_prediction(row):
    tool_id = row['toolid']
    findings = str(row['findings']) # Convert to string to handle potential NaN values
    if tool_id == 'vandal':
        print(findings)
    # Check if the tool is in our labels dictionary.
    if tool_id in reentrancy_labels:
        # Split the tool's finding string into a list of individual labels.
        tool_findings = [f.strip() for f in reentrancy_labels[tool_id].split(',')]
        
        # Check if any of the tool's labels are present in the findings from the data.
        for label in tool_findings:
            if label in findings:
                return True
    return False

df['predicted_reentrancy'] = df.apply(get_prediction, axis=1)
#print(df['exit_code']==1)


# Save the DataFrame to a new CSV file.
df.to_csv('reentrancy_metrics_data.csv', index=False)

# 3. Calculate metrics for each unique tool and print only the results.
# Analyze only the tools present in the reentrancy_labels dictionary.
tools_to_analyze = reentrancy_labels.keys()

print("Reentrancy Metrics per Tool:")
print("=" * 30)
versions = ['0_8', '0_5', '0_4']

for version in versions:
    version_df = df[df['filename'].str.contains(version)]
    print('*' * 100)
    print('Results for Solidity version:', version.replace('_', '.'))
    print('*' * 100)
    for tool in tools_to_analyze:
        
        # Filter the DataFrame for the current tool.
        tool_df = version_df[version_df['toolid'] == tool]
        #print(tool, tool_df.shape)
        ERRORS = (tool_df['exit_code'] != 0).sum()
        tool_df = tool_df[tool_df['exit_code'] == 0]

        # Calculate True Positives (TP), False Positives (FP), True Negatives (TN), and False Negatives (FN).
        TP = len(tool_df[(tool_df['true_reentrancy'] == True) & (tool_df['predicted_reentrancy'] == True)])
        FP = len(tool_df[(tool_df['true_reentrancy'] == False) & (tool_df['predicted_reentrancy'] == True)])
        TN = len(tool_df[(tool_df['true_reentrancy'] == False) & (tool_df['predicted_reentrancy'] == False)])
        FN = len(tool_df[(tool_df['true_reentrancy'] == True) & (tool_df['predicted_reentrancy'] == False)])

        
        # Calculate Accuracy, Precision, and Recall.
        # Handle cases where the denominator is zero to avoid errors.
        accuracy = (TP + TN) / (TP + FP + TN + FN) if (TP + FP + TN + FN) > 0 else 0
        
        # Precision: Out of all positive predictions, how many were correct?
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        
        # Recall: Out of all actual positives, how many were correctly predicted?
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0

        # Calculate the F1 Score
        f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        f1_score = 2 * TP / (2 * TP + FP + FN) if (2 * TP + FP + FN) > 0 else 0
        #f1_score = f1_score(tool_df['true_reentrancy'], tool_df['predicted_reentrancy'], zero_division=0, average = 'weighted')
        if f1_score > 0:
            print(f"Tool: {tool}")
            print(f"  Accuracy:  {accuracy*100:.2f}")
            print(f"  Precision: {precision*100:.2f}")
            print(f"  Recall:    {recall*100:.2f}")
            print(f"  F1 Score:  {f1_score*100:.2f}")
            print(f"  Errors: {ERRORS}")
            print("-" * 30)


In [21]:
import pandas as pd
from collections import defaultdict
from io import StringIO

def get_file_type(filename):
    """
    Extracts the file type from the path using a predefined dictionary.
    """
    d = {
        'tests/0_8/cross-contract/create/': 'create',
        'tests/0_8/cross-contract/gmx/': 'gmx',
        'tests/0_8/cross-contract/human/': 'human',
        'tests/0_8/cross-contract/read-only/': 'read-only',
        'tests/0_8/cross-contract/to-target/': 'cross-to-target',
        'tests/0_8/always-safe/underflow/': 'underflow',
        'tests/0_8/always-safe/emit/': 'emit',
        'tests/0_8/always-safe/constructor/': 'safe-constructor',
        'tests/0_8/always-safe/send-transfer/': 'send-transfer',
        'tests/0_8/always-safe/this/': 'safe-this',
        'tests/0_8/cross-function/guard/mutex/mod/': 'cross-function-mod',
        'tests/0_8/cross-function/guard/mutex/no-mod/': 'cross-function-mutex',
        'tests/0_8/single-function/low-level-call/to-sender/': 'LLC-to-sender',
        'tests/0_8/single-function/low-level-call/to-sender/guard/mutex/': 'LLC-to-sender-guard-mutex',
        'tests/0_8/single-function/low-level-call/to-sender/guard/access-control/': 'LLC-to-sender-guard-access-control',
        'tests/0_8/single-function/low-level-call/to-sender/guard/block-number/': 'LLC-to-sender-guard-block-number',
        'tests/0_8/single-function/low-level-call/to-sender/folded': 'LLC-to-sender-folded',
        'tests/0_8/single-function/low-level-call/to-sender/gas': 'LLC-to-sender-gas',
        'tests/0_8/single-function/low-level-call/to-target/': 'LLC-to-target',
        'tests/0_8/single-function/method-invocation/': 'single-cast'
    }
    cat = 'unknown'
    for prefix, category in d.items():
        if prefix in filename:
            cat = category

    return [cat]

# 3. Analyze and print TP, FP, TN, FN for each tool and category.
print("Detailed Reentrancy Analysis per Tool and Category:")
print("=" * 60)

for tool in reentrancy_labels.keys():
    print(f"Tool: {tool}")
    print("-" * 30)

    tool_df = df[df['toolid'] == tool].copy()
    
    # Store metrics for each category
    category_metrics = defaultdict(lambda: defaultdict(int))

    total_contracts = 0
    for _, row in tool_df.iterrows():
        categories = get_file_type(row['filename'])
        true_reentrancy = row['true_reentrancy']
        predicted_reentrancy = row['predicted_reentrancy']

        for category in categories:
            if true_reentrancy and predicted_reentrancy:
                category_metrics[category]['TP'] += 1
            elif not true_reentrancy and predicted_reentrancy:
                category_metrics[category]['FP'] += 1
            elif not true_reentrancy and not predicted_reentrancy:
                category_metrics[category]['TN'] += 1
            elif true_reentrancy and not predicted_reentrancy:
                category_metrics[category]['FN'] += 1
            
            category_metrics[category]['total'] += 1

    for category, metrics in sorted(category_metrics.items()):
        print(f"  Category: {category} (Total files: {metrics['total']})")
        print(f"    TP: {metrics['TP']}")
        print(f"    FP: {metrics['FP']}")
        print(f"    TN: {metrics['TN']}")
        print(f"    FN: {metrics['FN']}")
        print("-" * 30)
    print("=" * 60)
    total_contracts += sum(metrics['total'] for metrics in category_metrics.values())
    print('check if I counted correctly:', total_contracts)


Detailed Reentrancy Analysis per Tool and Category:
Tool: ccc
------------------------------
  Category: unknown (Total files: 365)
    TP: 99
    FP: 76
    TN: 122
    FN: 68
------------------------------
check if I counted correctly: 365
Tool: confuzzius
------------------------------
  Category: unknown (Total files: 317)
    TP: 103
    FP: 93
    TN: 75
    FN: 46
------------------------------
check if I counted correctly: 317
Tool: conkas
------------------------------
  Category: unknown (Total files: 240)
    TP: 91
    FP: 30
    TN: 101
    FN: 18
------------------------------
check if I counted correctly: 240
Tool: mythril-0.24.7
------------------------------
  Category: unknown (Total files: 26)
    TP: 0
    FP: 0
    TN: 16
    FN: 10
------------------------------
check if I counted correctly: 26
Tool: oyente+-2acaf2e
------------------------------
check if I counted correctly: 0
Tool: securify
------------------------------
  Category: unknown (Total files: 362)
  

In [None]:
from collections import defaultdict
def get_file_type(filename):

    d = {
        'tests/0_8/cross-contract/create/': 'create',
        'tests/0_8/cross-contract/gmx/': 'gmx',
        'tests/0_8/cross-contract/human/': 'human',
        'tests/0_8/cross-contract/read-only/': 'read-only',
        'tests/0_8/cross-contract/to-target/': 'cross-to-target',
        'tests/0_8/always-safe/underflow/': 'underflow',
        'tests/0_8/always-safe/emit/': 'emit',
        'tests/0_8/always-safe/constructor/': 'safe-constructor',
        'tests/0_8/always-safe/send-transfer/': 'send-transfer',
        'tests/0_8/always-safe/this/': 'safe-this',
        'tests/0_8/cross-function/guard/mutex/mod/': 'cross-function-mod',
        'tests/0_8/cross-function/guard/mutex/no-mod/': 'cross-function-mutex',
        'tests/0_8/single-function/low-level-call/to-sender/': 'LLC-to-sender',
        'tests/0_8/single-function/low-level-call/to-sender/guard': 'LLC-to-sender-guard',
        'tests/0_8/single-function/low-level-call/to-sender/folded': 'LLC-to-sender-folded',
        'tests/0_8/single-function/low-level-call/to-sender/gas': 'LLC-to-sender-gas',
        'tests/0_8/single-function/low-level-call/to-target/': 'LLC-to-target',
        'tests/0_8/single-function/method-invocation/': 'single-cast'
    }
    cat = 'unknown'
    for prefix, category in d.items():
        if prefix in filename:
            cat = category

    return [cat]


for tool in tools_to_analyze:
    tool_df = df[df['toolid'] == tool]

    # Analyze False Positives (FP)
    false_positives = tool_df[(tool_df['true_reentrancy'] == False) & (tool_df['predicted_reentrancy'] == True)]
    fp_categories = defaultdict(int)
    for _, row in false_positives.iterrows():
        categories = get_file_type(row['filename'])
        for cat in categories:
            fp_categories[cat] += 1
    
    # Analyze False Negatives (FN)
    false_negatives = tool_df[(tool_df['true_reentrancy'] == True) & (tool_df['predicted_reentrancy'] == False)]
    fn_categories = defaultdict(int)
    for _, row in false_negatives.iterrows():
        categories = get_file_type(row['filename'])
        for cat in categories:
            fn_categories[cat] += 1

    print(f"Tool: {tool}")
    print("  False Positives:")
    if fp_categories:
        for cat, count in sorted(fp_categories.items()):
            print(f"    - {cat}: {count}")
    else:
        print("    No false positives found.")
    
    print("  False Negatives:")
    if fn_categories:
        for cat, count in sorted(fn_categories.items()):
            print(f"    - {cat}: {count}")
    else:
        print("    No false negatives found.")
    print("-" * 30)


Tool: confuzzius
  False Positives:
    No false positives found.
  False Negatives:
    No false negatives found.
------------------------------
Tool: mythril-0.24.7
  False Positives:
    No false positives found.
  False Negatives:
    No false negatives found.
------------------------------
Tool: oyente+-2acaf2e
  False Positives:
    No false positives found.
  False Negatives:
    No false negatives found.
------------------------------
Tool: slither-0.11.3
  False Positives:
    No false positives found.
  False Negatives:
    No false negatives found.
------------------------------
Tool: solhint-6.0.0
  False Positives:
    No false positives found.
  False Negatives:
    No false negatives found.
------------------------------
Tool: oyente+-060ca34
  False Positives:
    No false positives found.
  False Negatives:
    No false negatives found.
------------------------------
Tool: vandal
  False Positives:
    No false positives found.
  False Negatives:
    No false negatives