In [2]:
import pandas as pd
from io import StringIO
from sklearn.metrics import f1_score

# Load the aggregated tool results and keep just the three columns used below.
df = pd.read_csv('results_aggregated_new_source.csv')[['filename', 'toolid', 'findings']]

# Tool id -> marker text that the tool writes into 'findings' for reentrancy.
# A tool with several markers lists them in ONE comma-separated string
# (e.g. 'label1,label2'); get_prediction splits that string on ','.
#
# Tools deliberately left out of the map:
#   'manticore-0.3.7'  placeholder
#   'securify2'        does not work
#   'smartcheck'       never finds any occurrence of reentrancy
#   'ethainter'        does not work
reentrancy_labels = {
    'ccc': 'Reentrancy_Vulnerability',
    'confuzzius': 'Reentrancy',
    'conkas': 'Reentrancy',  # .sol 0.5
    'mythril-0.24.7': 'State_access_after_external_call_SWC_107',
    'oyente+-2acaf2e': 'Re_Entrancy_Vulnerability',
    'securify': 'DAO',
    'sfuzz': 'Reentrancy',
    'slither-0.11.3': 'reentrancy_eth,reentrancy_no_eth',
    'solhint-6.0.0': 'reentrancy',
    'ethor-2023': 'insecure',
    'oyente+-060ca34': 'Callstack_Depth_Attack_Vulnerability',
    'vandal': 'ReentrantCall',
    'gpt-oss': 'reentrant',
    'gpt-5-mini': 'reentrant',
    'gpt-5': 'reentrant',
    'gpt-5-nano': 'reentrant',
}

# 1. Ground truth comes from the file name: a case-insensitive '_ree' anywhere
#    in the path marks the file as truly reentrant.
#    (A stricter pattern, r'ree\d*\.sol', was tried earlier and is not used.)
df['true_reentrancy'] = df['filename'].str.contains(r'_ree', case=False)

# 2. Determine the "predicted" reentrancy label based on the 'findings' column.
# This function will check if any of the tool-specific reentrancy labels are present in the findings.
def get_prediction(row):
    """Return True when the row's tool reported one of its reentrancy markers.

    `row` must support `row['toolid']` and `row['findings']`. The findings
    value is passed through `str()` first, so a missing value (NaN) is
    compared as the text 'nan'. Markers are looked up in the module-level
    `reentrancy_labels` map, whose values are comma-separated strings; each
    piece is stripped and matched as a plain substring of the findings text.
    Tools that are not in the map always yield False.
    """
    tool = row['toolid']
    reported = str(row['findings'])

    # Guard clause: nothing to match for tools we have no markers for.
    if tool not in reentrancy_labels:
        return False

    markers = (piece.strip() for piece in reentrancy_labels[tool].split(','))
    return any(marker in reported for marker in markers)

# Classify every (file, tool) row with the tool-specific marker lookup.
df['predicted_reentrancy'] = df.apply(get_prediction, axis=1)

# Save the DataFrame to a new CSV file.
df.to_csv('reentrancy_metrics_data.csv', index=False)

# 3. Calculate metrics for each unique tool and print only the results.
# Analyze only the tools present in the reentrancy_labels dictionary.
tools_to_analyze = reentrancy_labels.keys()

print("Reentrancy Metrics per Tool:")
print("=" * 30)

for tool in tools_to_analyze:

    # Filter the DataFrame for the current tool.
    tool_df = df[df['toolid'] == tool]

    # Confusion-matrix counts. The explicit `== True` / `== False` comparisons
    # mean a row whose label is neither (e.g. NaN) is counted in no cell.
    TP = len(tool_df[(tool_df['true_reentrancy'] == True) & (tool_df['predicted_reentrancy'] == True)])
    FP = len(tool_df[(tool_df['true_reentrancy'] == False) & (tool_df['predicted_reentrancy'] == True)])
    TN = len(tool_df[(tool_df['true_reentrancy'] == False) & (tool_df['predicted_reentrancy'] == False)])
    FN = len(tool_df[(tool_df['true_reentrancy'] == True) & (tool_df['predicted_reentrancy'] == False)])

    # Every ratio falls back to 0 when its denominator is zero.
    accuracy = (TP + TN) / (TP + FP + TN + FN) if (TP + FP + TN + FN) > 0 else 0

    # Precision: Out of all positive predictions, how many were correct?
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0

    # Recall: Out of all actual positives, how many were correctly predicted?
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0

    # F1 computed once, straight from the counts. It is stored as `f1` rather
    # than `f1_score` so the function imported from sklearn.metrics is not
    # overwritten by a float.
    f1 = 2 * TP / (2 * TP + FP + FN) if (2 * TP + FP + FN) > 0 else 0

    # Tools with an F1 of zero (no true positives, or no rows) are not printed.
    if f1 > 0:
        print(f"Tool: {tool}")
        print(f"  Accuracy:  {accuracy:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall:    {recall:.4f}")
        print(f"  F1 Score:  {f1:.4f}")
        print("-" * 30)


Reentrancy Metrics per Tool:


In [11]:
import pandas as pd
from sklearn.metrics import f1_score

# Load results_aggregated_bins.csv, keep the three columns used below, and show the frame.
df = pd.read_csv('results_aggregated_bins.csv')[['filename', 'toolid', 'findings']]
print(df)

# Tool id -> list of marker strings the tool may emit for a reentrancy finding.
# Key order is also the order in which tools are reported further down.
reentrancy_labels = {
    'ccc': [
        'Reentrancy_Vulnerability',
        'Result_of_expression_can_be_over_or_under_flown_by_external_entity',
    ],
    'confuzzius': ['Reentrancy'],
    'conkas': ['Reentrancy'],
    'mythril-0.24.7': ['State_access_after_external_call_SWC_107'],
    'oyente+-2acaf2e': ['Re_Entrancy_Vulnerability'],
    'securify': ['DAO'],
    'sfuzz': ['Reentrancy'],
    'slither-0.11.3': ['reentrancy_eth', 'reentrancy_no_eth'],
    'solhint-6.0.0': ['reentrancy'],
    'ethor-2023': ['insecure'],
    'oyente+-060ca34': ['Callstack_Depth_Attack_Vulnerability'],
    'vandal': ['ReentrantCall'],
    'gpt-oss': ['reentrant'],
    'gpt-5-mini': ['reentrant'],
    'gpt-5': ['reentrant'],
    'gpt-5-nano': ['reentrant'],
}

# 1. Ground truth: a case-insensitive '_ree' anywhere in the file path.
df['true_reentrancy'] = df['filename'].str.contains(r'_ree', case=False)

# 2. Prediction function
def get_prediction(row):
    """Return True when any of the tool's markers occurs in the row's findings.

    `row['findings']` is converted with `str()` (so NaN is matched as the
    text 'nan') and each marker from the module-level `reentrancy_labels`
    list for `row['toolid']` is tested as a substring. Tools without an
    entry in the map yield False.
    """
    tool = row['toolid']
    reported = str(row['findings'])

    if tool not in reentrancy_labels:
        return False
    return any(marker in reported for marker in reentrancy_labels[tool])



# Classify every row, then persist the labelled table.
df['predicted_reentrancy'] = df.apply(get_prediction, axis=1)
df.to_csv('reentrancy_metrics_data.csv', index=False)

# 3. Per-tool confusion counts and the scores derived from them.
tools_to_analyze = reentrancy_labels.keys()

print("Reentrancy Metrics per Tool:")
print("=" * 30)

for tool in tools_to_analyze:
    tool_df = df[df['toolid'] == tool]

    # Explicit comparisons: a label that is neither True nor False lands in no cell.
    actual_yes = tool_df['true_reentrancy'] == True
    actual_no = tool_df['true_reentrancy'] == False
    flagged = tool_df['predicted_reentrancy'] == True
    cleared = tool_df['predicted_reentrancy'] == False

    TP = int((actual_yes & flagged).sum())
    FP = int((actual_no & flagged).sum())
    TN = int((actual_no & cleared).sum())
    FN = int((actual_yes & cleared).sum())

    # Each score is 0 when its denominator is empty.
    n_rows = TP + FP + TN + FN
    n_flagged = TP + FP
    n_actual = TP + FN
    f1_denominator = 2 * TP + FP + FN

    accuracy = (TP + TN) / n_rows if n_rows > 0 else 0
    precision = TP / n_flagged if n_flagged > 0 else 0
    recall = TP / n_actual if n_actual > 0 else 0
    f1 = 2 * TP / f1_denominator if f1_denominator > 0 else 0

    # Only tools that scored something are reported.
    if f1 <= 0:
        continue

    print(f"Tool: {tool}")
    print(f"  Accuracy:  {accuracy:.2f}")
    print(f"  Precision: {precision:.2f}")
    print(f"  Recall:    {recall:.2f}")
    print(f"  F1 Score:  {f1:.2f}")
    print("-" * 30)


                                               filename  toolid  \
0     tests/bins/_safe/0x9091397ebc19163202daa16814f...  conkas   
1     tests/bins/_safe/0xfa63eb1dc0652e7d74e6658a13a...  conkas   
2     tests/bins/_safe/0x900ea0ba0a7cc1c8bec7792a881...  conkas   
3     tests/bins/_safe/0x4cf56da70c06b7183cd59b7b5ed...  conkas   
4     tests/bins/reentrant/39154ab86b53b8e3a39c30b84...  conkas   
...                                                 ...     ...   
8595  tests/bins/_safe/0xb02bc6401abf94d899efcc7c073...  vandal   
8596  tests/bins/reentrant/7d6b9a034771315b63328b230...  vandal   
8597  tests/bins/_safe/0xa52e014b3f5cc48287c2d483a3e...  vandal   
8598  tests/bins/_safe/0x54a3017754bfba73f71f37d893a...  vandal   
8599  tests/bins/reentrant/0x9240c2d6e42db74a5a0553b...  vandal   

                           findings  
0                                {}  
1                                {}  
2                                {}  
3                                {}  
4    

In [14]:
import pandas as pd

# Load results_aggregated_bins.csv and keep the three columns used below.
df = pd.read_csv('results_aggregated_bins.csv')[['filename', 'toolid', 'findings']]

# Tool id -> list of marker strings the tool may emit for a reentrancy finding.
# Key order is also the order in which tools are reported further down.
reentrancy_labels = {
    'ccc': [
        'Reentrancy_Vulnerability',
        'Result_of_expression_can_be_over_or_under_flown_by_external_entity',
    ],
    'confuzzius': ['Reentrancy'],
    'conkas': ['Reentrancy'],
    'mythril-0.24.7': ['State_access_after_external_call_SWC_107'],
    'oyente+-2acaf2e': ['Re_Entrancy_Vulnerability'],
    'securify': ['DAO'],
    'sfuzz': ['Reentrancy'],
    'slither-0.11.3': ['reentrancy_eth', 'reentrancy_no_eth'],
    'solhint-6.0.0': ['reentrancy'],
    'ethor-2023': ['insecure'],
    'oyente+-060ca34': ['Callstack_Depth_Attack_Vulnerability'],
    'vandal': ['ReentrantCall'],
    'gpt-oss': ['reentrant'],
    'gpt-5-mini': ['reentrant'],
    'gpt-5': ['reentrant'],
    'gpt-5-nano': ['reentrant'],
}

# 1. Prediction function
def get_prediction(row):
    """Classify one result row as reentrant (True), not reentrant (False) or None.

    `row['findings']` is converted with `str()` and each marker listed for
    `row['toolid']` in the module-level `reentrancy_labels` map is tested as
    a substring of that text. Tools without an entry yield False. For tool
    'ethor-2023' an empty findings string yields None instead.
    """
    tool = row['toolid']
    reported = str(row['findings'])

    # NOTE(review): the comparison is made after str(), so a missing value
    # (NaN) is seen as 'nan', not ''. If empty CSV fields are loaded as NaN
    # (the pandas read_csv default), this branch never fires; confirm intent
    # before relying on None results downstream.
    if reported == '' and tool == 'ethor-2023':
        return None

    if tool not in reentrancy_labels:
        return False
    return any(marker in reported for marker in reentrancy_labels[tool])

# File-level verdict for every (file, tool) row.
df['predicted_reentrancy'] = df.apply(get_prediction, axis=1)

# 2. Folder key: 'tests/bins/' plus the next two path components of the filename.
df['folder'] = df['filename'].str.extract(r'(tests/bins/[^/]+/[^/]+)')

# Flag the folder kind by a case-insensitive substring test; rows whose
# folder could not be extracted get False for both flags.
for flag_column, marker in (
    ('is_reentrant_folder', 'reentrant'),
    ('is_safe_folder', '_safe'),
):
    df[flag_column] = df['folder'].str.contains(marker, case=False, na=False)

# 3. True labels
def true_label(row):
    """Return the ground-truth label for a row: True only for reentrant folders.

    Rows from a safe folder and rows from neither kind of folder are both
    labelled False.
    """
    if not row['is_reentrant_folder']:
        # Safe folder, or a folder that is neither reentrant nor safe.
        return False
    return True

# Ground-truth label for every row, derived from the folder flags by true_label.
df['true_reentrancy'] = df.apply(true_label, axis=1)

# 4. Folder-level aggregation:
# One row per (toolid, folder) holding the maximum of 'predicted_reentrancy'
# within that group, renamed to 'folder_predicted'. Assuming the predictions
# are plain booleans, the maximum is True as soon as one file in the folder
# was predicted True.
# NOTE(review): if get_prediction ever returns None, this max() runs on mixed
# values - confirm how that should be handled.
folder_predictions = (
    df.groupby(['toolid', 'folder'])['predicted_reentrancy']
    .max()
    .reset_index()
    .rename(columns={'predicted_reentrancy': 'folder_predicted'})
)

# Attach the folder-level verdict to every file row; the left join keeps all
# rows of df even when no matching (toolid, folder) group exists.
df = df.merge(folder_predictions, on=['toolid', 'folder'], how='left')

# 5. Final prediction:
# - use folder_predicted for reentrant folders
# - keep file-level predicted for safe folders (and for any other row)
df['final_pred'] = df.apply(
    lambda r: r['folder_predicted'] if r['is_reentrant_folder'] else r['predicted_reentrancy'],
    axis=1
)

# Save updated DataFrame (same output file name as the earlier cells use).
df.to_csv('reentrancy_metrics_data.csv', index=False)

# 6. Per-tool scores, computed from the folder-adjusted 'final_pred' column.
print("Reentrancy Metrics per Tool:")
print("=" * 30)

for tool in reentrancy_labels:
    tool_df = df[df['toolid'] == tool]

    # Boolean views of ground truth and final verdict for this tool.
    actual = tool_df['true_reentrancy']
    verdict = tool_df['final_pred']

    TP = len(tool_df[actual & verdict])
    FP = len(tool_df[~actual & verdict])
    TN = len(tool_df[~actual & ~verdict])
    FN = len(tool_df[actual & ~verdict])

    # Each score is 0 when its denominator is empty.
    n_rows = TP + FP + TN + FN
    n_flagged = TP + FP
    n_actual = TP + FN

    accuracy = (TP + TN) / n_rows if n_rows > 0 else 0
    precision = TP / n_flagged if n_flagged > 0 else 0
    recall = TP / n_actual if n_actual > 0 else 0
    f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Only tools that scored something are reported.
    if f1 <= 0:
        continue

    print(f"Tool: {tool}")
    print(f"  Accuracy:  {accuracy:.2f}")
    print(f"  Precision: {precision:.2f}")
    print(f"  Recall:    {recall:.2f}")
    print(f"  F1 Score:  {f1:.2f}")
    print("-" * 30)


Reentrancy Metrics per Tool:
Tool: ethor-2023
  Accuracy:  0.67
  Precision: 0.85
  Recall:    0.06
  F1 Score:  0.11
------------------------------
Tool: oyente+-060ca34
  Accuracy:  0.79
  Precision: 0.93
  Recall:    0.44
  F1 Score:  0.59
------------------------------
Tool: vandal
  Accuracy:  0.90
  Precision: 0.78
  Recall:    0.98
  F1 Score:  0.87
------------------------------


In [4]:
import pandas as pd
from collections import defaultdict
from io import StringIO

def get_file_type(filename):
    """Map a test-file path to its benchmark category.

    The path is checked against a fixed table of directory fragments. When
    several fragments occur in the path (a directory and one of its
    sub-directories), the longest fragment - i.e. the most specific
    directory - decides the category, independent of the order in which the
    table is written.

    Args:
        filename: path of the test file as a string.

    Returns:
        A one-element list holding the category name, or ['unknown'] when no
        fragment occurs in the path.
    """
    category_by_fragment = {
        'tests/0_8/cross-contract/create/': 'create',
        'tests/0_8/cross-contract/gmx/': 'gmx',
        'tests/0_8/cross-contract/human/': 'human',
        'tests/0_8/cross-contract/read-only/': 'read-only',
        'tests/0_8/cross-contract/to-target/': 'cross-to-target',
        'tests/0_8/always-safe/underflow/': 'underflow',
        'tests/0_8/always-safe/emit/': 'emit',
        'tests/0_8/always-safe/constructor/': 'safe-constructor',
        'tests/0_8/always-safe/send-transfer/': 'send-transfer',
        'tests/0_8/always-safe/this/': 'safe-this',
        'tests/0_8/cross-function/guard/mutex/mod/': 'cross-function-mod',
        'tests/0_8/cross-function/guard/mutex/no-mod/': 'cross-function-mutex',
        'tests/0_8/single-function/low-level-call/to-sender/': 'LLC-to-sender',
        'tests/0_8/single-function/low-level-call/to-sender/guard/mutex/': 'LLC-to-sender-guard-mutex',
        'tests/0_8/single-function/low-level-call/to-sender/guard/access-control/': 'LLC-to-sender-guard-access-control',
        'tests/0_8/single-function/low-level-call/to-sender/guard/block-number/': 'LLC-to-sender-guard-block-number',
        'tests/0_8/single-function/low-level-call/to-sender/folded': 'LLC-to-sender-folded',
        'tests/0_8/single-function/low-level-call/to-sender/gas': 'LLC-to-sender-gas',
        'tests/0_8/single-function/low-level-call/to-target/': 'LLC-to-target',
        'tests/0_8/single-function/method-invocation/': 'single-cast'
    }

    matching = [fragment for fragment in category_by_fragment if fragment in filename]
    if not matching:
        return ['unknown']

    # The longest fragment is the deepest directory, e.g. '.../to-sender/gas'
    # takes precedence over its parent '.../to-sender/'.
    return [category_by_fragment[max(matching, key=len)]]

# 3. Per tool and per category, count how the file-level predictions fall
#    into TP / FP / TN / FN, and print the tallies.
print("Detailed Reentrancy Analysis per Tool and Category:")
print("=" * 60)

# (truthiness of ground truth, truthiness of prediction) -> confusion-matrix cell
outcome_names = {
    (True, True): 'TP',
    (False, True): 'FP',
    (False, False): 'TN',
    (True, False): 'FN',
}

for tool in reentrancy_labels.keys():
    print(f"Tool: {tool}")
    print("-" * 30)

    tool_df = df[df['toolid'] == tool].copy()

    # category -> {'TP' | 'FP' | 'TN' | 'FN' | 'total': count}
    category_metrics = defaultdict(lambda: defaultdict(int))

    labelled_files = zip(
        tool_df['filename'],
        tool_df['true_reentrancy'],
        tool_df['predicted_reentrancy'],
    )
    for filename, actual, predicted in labelled_files:
        outcome = outcome_names[(bool(actual), bool(predicted))]
        for category in get_file_type(filename):
            category_metrics[category][outcome] += 1
            category_metrics[category]['total'] += 1

    for category in sorted(category_metrics):
        metrics = category_metrics[category]
        print(f"  Category: {category} (Total files: {metrics['total']})")
        print(f"    TP: {metrics['TP']}")
        print(f"    FP: {metrics['FP']}")
        print(f"    TN: {metrics['TN']}")
        print(f"    FN: {metrics['FN']}")
        print("-" * 30)
    print("=" * 60)

    # Sanity check: the per-category totals add up to this tool's row count.
    total_contracts = sum(tally['total'] for tally in category_metrics.values())
    print('check if I counted correctly:', total_contracts)


Detailed Reentrancy Analysis per Tool and Category:
Tool: confuzzius
------------------------------
  Category: LLC-to-sender (Total files: 2)
    TP: 0
    FP: 0
    TN: 1
    FN: 1
------------------------------
  Category: LLC-to-sender-folded (Total files: 6)
    TP: 3
    FP: 1
    TN: 2
    FN: 0
------------------------------
  Category: LLC-to-sender-gas (Total files: 3)
    TP: 1
    FP: 0
    TN: 2
    FN: 0
------------------------------
  Category: LLC-to-sender-guard-access-control (Total files: 3)
    TP: 0
    FP: 0
    TN: 2
    FN: 1
------------------------------
  Category: LLC-to-sender-guard-block-number (Total files: 2)
    TP: 0
    FP: 0
    TN: 2
    FN: 0
------------------------------
  Category: LLC-to-sender-guard-mutex (Total files: 32)
    TP: 15
    FP: 7
    TN: 5
    FN: 5
------------------------------
  Category: LLC-to-target (Total files: 6)
    TP: 1
    FP: 1
    TN: 2
    FN: 2
------------------------------
  Category: create (Total files: 6)
 

In [None]:
from collections import defaultdict
def get_file_type(filename):
    """Map a test-file path to its (coarse) benchmark category.

    The path is checked against a fixed table of directory fragments. When
    several fragments occur in the path (a directory and one of its
    sub-directories), the longest fragment - i.e. the most specific
    directory - decides the category, independent of the order in which the
    table is written. All guarded to-sender variants share the single
    category 'LLC-to-sender-guard' here.

    Args:
        filename: path of the test file as a string.

    Returns:
        A one-element list holding the category name, or ['unknown'] when no
        fragment occurs in the path.
    """
    category_by_fragment = {
        'tests/0_8/cross-contract/create/': 'create',
        'tests/0_8/cross-contract/gmx/': 'gmx',
        'tests/0_8/cross-contract/human/': 'human',
        'tests/0_8/cross-contract/read-only/': 'read-only',
        'tests/0_8/cross-contract/to-target/': 'cross-to-target',
        'tests/0_8/always-safe/underflow/': 'underflow',
        'tests/0_8/always-safe/emit/': 'emit',
        'tests/0_8/always-safe/constructor/': 'safe-constructor',
        'tests/0_8/always-safe/send-transfer/': 'send-transfer',
        'tests/0_8/always-safe/this/': 'safe-this',
        'tests/0_8/cross-function/guard/mutex/mod/': 'cross-function-mod',
        'tests/0_8/cross-function/guard/mutex/no-mod/': 'cross-function-mutex',
        'tests/0_8/single-function/low-level-call/to-sender/': 'LLC-to-sender',
        'tests/0_8/single-function/low-level-call/to-sender/guard': 'LLC-to-sender-guard',
        'tests/0_8/single-function/low-level-call/to-sender/folded': 'LLC-to-sender-folded',
        'tests/0_8/single-function/low-level-call/to-sender/gas': 'LLC-to-sender-gas',
        'tests/0_8/single-function/low-level-call/to-target/': 'LLC-to-target',
        'tests/0_8/single-function/method-invocation/': 'single-cast'
    }

    matching = [fragment for fragment in category_by_fragment if fragment in filename]
    if not matching:
        return ['unknown']

    # The longest fragment is the deepest directory, e.g. '.../to-sender/guard'
    # takes precedence over its parent '.../to-sender/'.
    return [category_by_fragment[max(matching, key=len)]]


# For every tool, break its false positives and false negatives down by file category.
for tool in tools_to_analyze:
    tool_df = df[df['toolid'] == tool]

    # Misclassified rows: predicted True on a False label, and the reverse.
    false_positives = tool_df[(tool_df['true_reentrancy'] == False) & (tool_df['predicted_reentrancy'] == True)]
    false_negatives = tool_df[(tool_df['true_reentrancy'] == True) & (tool_df['predicted_reentrancy'] == False)]

    # category -> number of misclassified files of that kind
    fp_categories = defaultdict(int)
    fn_categories = defaultdict(int)
    for misclassified, tally in ((false_positives, fp_categories), (false_negatives, fn_categories)):
        for filename in misclassified['filename']:
            for cat in get_file_type(filename):
                tally[cat] += 1

    print(f"Tool: {tool}")
    report_sections = (
        ("  False Positives:", fp_categories, "    No false positives found."),
        ("  False Negatives:", fn_categories, "    No false negatives found."),
    )
    for heading, tally, empty_message in report_sections:
        print(heading)
        if not tally:
            print(empty_message)
            continue
        for cat in sorted(tally):
            print(f"    - {cat}: {tally[cat]}")
    print("-" * 30)


Tool: confuzzius
  False Positives:
    No false positives found.
  False Negatives:
    No false negatives found.
------------------------------
Tool: mythril-0.24.7
  False Positives:
    No false positives found.
  False Negatives:
    No false negatives found.
------------------------------
Tool: oyente+-2acaf2e
  False Positives:
    No false positives found.
  False Negatives:
    No false negatives found.
------------------------------
Tool: slither-0.11.3
  False Positives:
    No false positives found.
  False Negatives:
    No false negatives found.
------------------------------
Tool: solhint-6.0.0
  False Positives:
    No false positives found.
  False Negatives:
    No false negatives found.
------------------------------
Tool: oyente+-060ca34
  False Positives:
    No false positives found.
  False Negatives:
    No false negatives found.
------------------------------
Tool: vandal
  False Positives:
    No false positives found.
  False Negatives:
    No false negatives