# Preparing Each Label From CRPWarner Groundtruth + Large Sample

## Set up

In [1]:
import os
from pathlib import Path
import pandas as pd
import shutil

In [2]:
# Project root, taken as two directory levels above the current working directory.
PATH = Path.cwd().parents[1]

# Dataset root and its three top-level folders (two inputs, one merged output).
DATA_PATH = os.path.join(PATH, 'data/external/crpwarner')
GROUNDTRUTH_PATH = os.path.join(DATA_PATH, 'groundtruth')
LARGE_PATH = os.path.join(DATA_PATH, 'large')
MERGE_PATH = os.path.join(DATA_PATH, 'merged')

# 'hex' and 'sol' sub-folders of each collection.
GROUND_HEX_PATH = os.path.join(GROUNDTRUTH_PATH, 'hex')
GROUND_SOL_PATH = os.path.join(GROUNDTRUTH_PATH, 'sol')
LARGE_HEX_PATH = os.path.join(LARGE_PATH, 'hex')
LARGE_SOL_PATH = os.path.join(LARGE_PATH, 'sol')
SOL_PATH = os.path.join(MERGE_PATH, 'sol')
HEX_PATH = os.path.join(MERGE_PATH, 'hex')

# Create the output folders up front; existing folders are left untouched.
for output_dir in (MERGE_PATH, SOL_PATH, HEX_PATH):
    os.makedirs(output_dir, exist_ok=True)

In [3]:
# Read the ground-truth workbook stored in the ground-truth folder.
groundtruth_xlsx = os.path.join(GROUNDTRUTH_PATH, 'groundTruth.xlsx')
groundtruth_df = pd.read_excel(groundtruth_xlsx)

In [4]:
SAMPLE_PATH = os.path.join(LARGE_PATH, 'sample')

# One workbook per label, read in the order mint, leak, limit.
mint_df, leak_df, limit_df = (
    pd.read_excel(os.path.join(SAMPLE_PATH, workbook))
    for workbook in ('mint.xlsx', 'leak.xlsx', 'limit.xlsx')
)

## Sampling

In [5]:
# Preview the first five ground-truth rows.
groundtruth_df.head(5)

Unnamed: 0,address,Mint,Leak,Limit
0,0x93023F1D3525E273F291B6f76d2F5027A39BF302,1,0,1
1,0x2753dcE37A7eDB052a77832039bcc9aA49Ad8b25,0,0,1
2,0x94b7D24552933F50A5A5705C446528806dCeA381,0,0,0
3,0xE0b9d4146AaD6936cBfcBE4dAE47e34aAb96b093,0,0,0
4,0x10f6f2b97F3aB29583D9D38BaBF2994dF7220C21,1,0,1


In [6]:
# Preview the first five rows of the mint sample sheet.
mint_df.head(5)

Unnamed: 0,Address,TP?
0,0x0fef20d2c4ee011fa0389e69e9fa92a2291b63c8,Yes
1,0xd7cc0deb9dd11be95068bf2d7a3d082b8ba9bf04,Yes
2,0xa1b756be589441519b1a08e16bc4f60ab177d916,Yes
3,0x514bc174df04a4b04ae2be81ee8c788c3796b06b,Yes
4,0x1354c8c1a66c2573ce9cc3e92e98d17869501a46,Yes


In [7]:
# Preview the first five rows of the leak sample sheet.
leak_df.head(5)

Unnamed: 0,Address,TP?
0,0x0290ea3c728981725689187763f6c63a68e192b8,Yes
1,0x054ad3cd4a66f14bf5c0de2548a53be66995a4f6,Yes
2,0x0566c17dc2a9efcaa2f63e04cf06a69e8fc77f60,No
3,0x10c8324b20b7266c445944f043f53f6a77ea0bd4,Yes
4,0x10cc060f6f9b2e5dcdb23f1361e4b368a7daec73,Yes


In [8]:
# Preview the first five rows of the limit sample sheet.
limit_df.head(5)

Unnamed: 0,Address,TP?
0,0xe5f3c6d2b47cbe2cf936b9521466bac2422ebef8,Yes
1,0xa623b5a542c0d7daadef321042a04c600b03a8cb,Yes
2,0xe412189da2dfa188a1a61633114b8732bbbfba19,Yes
3,0x9fcf7acdc11fd904c4b73a009909c7f00efc4844,Yes
4,0x921a5dce3dfed5cccfbb2e593f2978533bc66110,Yes


## Copy Sol and Hex from groundtruth

In [9]:
# Copy every ground-truth .sol file into the merged sol folder.
for sol_file in Path(GROUND_SOL_PATH).glob('*.sol'):
    shutil.copy(sol_file, SOL_PATH)

In [10]:
# Copy every ground-truth .hex file into the merged hex folder.
for hex_file in Path(GROUND_HEX_PATH).glob('*.hex'):
    shutil.copy(hex_file, HEX_PATH)

## Mint

In [11]:
# Ground-truth side: keep only the Mint label, exposed under the shared 'TP?' column name.
new_mint_df = (
    groundtruth_df[['address', 'Mint']]
    .rename(columns={'Mint': 'TP?'})
    .set_index('address')
)

# Large-sample side: align the column name and convert 'Yes'/'No' to 1/0.
mint_df_subset = mint_df[['Address', 'TP?']].rename(columns={'Address': 'address'})
mint_df_subset['TP?'] = mint_df_subset['TP?'].map({'Yes': 1, 'No': 0})
# Series.map yields NaN for any value missing from the mapping; stop here rather
# than silently writing empty labels to mint.csv.
assert mint_df_subset['TP?'].notna().all(), "mint.xlsx: 'TP?' contains values other than 'Yes'/'No'"
mint_df_subset = mint_df_subset.set_index('address')

# Stack the ground-truth rows first, followed by the large-sample rows.
new_mint_df = pd.concat([new_mint_df, mint_df_subset])

new_mint_df.head()

Unnamed: 0_level_0,TP?
address,Unnamed: 1_level_1
0x93023F1D3525E273F291B6f76d2F5027A39BF302,1
0x2753dcE37A7eDB052a77832039bcc9aA49Ad8b25,0
0x94b7D24552933F50A5A5705C446528806dCeA381,0
0xE0b9d4146AaD6936cBfcBE4dAE47e34aAb96b093,0
0x10f6f2b97F3aB29583D9D38BaBF2994dF7220C21,1


In [12]:
# Check for duplicate addresses in new_mint_df
# Flag repeated addresses while ignoring letter case: the ground-truth sheet stores
# mixed-case addresses and the sample sheet stores lowercase ones, so an exact string
# comparison would miss the same address written in both forms.
# keep='first' leaves the earliest occurrence unflagged.
duplicate_addresses = new_mint_df.index.str.lower().duplicated(keep='first')
new_mint_df[duplicate_addresses]

Unnamed: 0_level_0,TP?
address,Unnamed: 1_level_1


### Save DataFrame and Copy SOL + HEX

In [13]:
# Write the merged Mint labels into the merged folder.
mint_csv_path = os.path.join(MERGE_PATH, 'mint.csv')
new_mint_df.to_csv(mint_csv_path)

In [14]:
# Copy only the large-sample .hex files whose file stem is an address in the Mint table.
mint_hex_files = [
    hex_file
    for hex_file in Path(LARGE_HEX_PATH).glob('*.hex')
    if hex_file.stem in new_mint_df.index
]
for hex_file in mint_hex_files:
    shutil.copy(hex_file, HEX_PATH)

In [15]:
# Copy only the large-sample .sol files whose file stem is an address in the Mint table.
mint_sol_files = [
    sol_file
    for sol_file in Path(LARGE_SOL_PATH).glob('*.sol')
    if sol_file.stem in new_mint_df.index
]
for sol_file in mint_sol_files:
    shutil.copy(sol_file, SOL_PATH)

## Leak

In [16]:
# Ground-truth side: keep only the Leak label, exposed under the shared 'TP?' column name.
new_leak_df = (
    groundtruth_df[['address', 'Leak']]
    .rename(columns={'Leak': 'TP?'})
    .set_index('address')
)

# Large-sample side: align the column name and convert 'Yes'/'No' to 1/0.
leak_df_subset = leak_df[['Address', 'TP?']].rename(columns={'Address': 'address'})
leak_df_subset['TP?'] = leak_df_subset['TP?'].map({'Yes': 1, 'No': 0})
# Series.map yields NaN for any value missing from the mapping; stop here rather
# than silently writing empty labels to leak.csv.
assert leak_df_subset['TP?'].notna().all(), "leak.xlsx: 'TP?' contains values other than 'Yes'/'No'"
leak_df_subset = leak_df_subset.set_index('address')

# Stack the ground-truth rows first, followed by the large-sample rows.
new_leak_df = pd.concat([new_leak_df, leak_df_subset])

new_leak_df.head()

Unnamed: 0_level_0,TP?
address,Unnamed: 1_level_1
0x93023F1D3525E273F291B6f76d2F5027A39BF302,0
0x2753dcE37A7eDB052a77832039bcc9aA49Ad8b25,0
0x94b7D24552933F50A5A5705C446528806dCeA381,0
0xE0b9d4146AaD6936cBfcBE4dAE47e34aAb96b093,0
0x10f6f2b97F3aB29583D9D38BaBF2994dF7220C21,0


In [17]:
# Check for duplicate addresses in new_leak_df
# Flag repeated addresses while ignoring letter case: the ground-truth sheet stores
# mixed-case addresses and the sample sheet stores lowercase ones, so an exact string
# comparison would miss the same address written in both forms.
# keep='first' leaves the earliest occurrence unflagged.
duplicate_addresses = new_leak_df.index.str.lower().duplicated(keep='first')
new_leak_df[duplicate_addresses]

Unnamed: 0_level_0,TP?
address,Unnamed: 1_level_1
0x99d3dcf7febd2bb968b3fe7baa1a9a36546d9293,0


In [18]:
# Drop the rows flagged as duplicates, keeping every unflagged row in its original order.
rows_to_keep = ~duplicate_addresses
new_leak_df = new_leak_df.loc[rows_to_keep]
new_leak_df.head()

Unnamed: 0_level_0,TP?
address,Unnamed: 1_level_1
0x93023F1D3525E273F291B6f76d2F5027A39BF302,0
0x2753dcE37A7eDB052a77832039bcc9aA49Ad8b25,0
0x94b7D24552933F50A5A5705C446528806dCeA381,0
0xE0b9d4146AaD6936cBfcBE4dAE47e34aAb96b093,0
0x10f6f2b97F3aB29583D9D38BaBF2994dF7220C21,0


### Save DataFrame and Copy SOL + HEX

In [19]:
# Write the merged Leak labels into the merged folder.
leak_csv_path = os.path.join(MERGE_PATH, 'leak.csv')
new_leak_df.to_csv(leak_csv_path)

In [20]:
# Copy only the large-sample .hex files whose file stem is an address in the Leak table.
leak_hex_files = [
    hex_file
    for hex_file in Path(LARGE_HEX_PATH).glob('*.hex')
    if hex_file.stem in new_leak_df.index
]
for hex_file in leak_hex_files:
    shutil.copy(hex_file, HEX_PATH)

In [21]:
# Copy only the large-sample .sol files whose file stem is an address in the Leak table.
leak_sol_files = [
    sol_file
    for sol_file in Path(LARGE_SOL_PATH).glob('*.sol')
    if sol_file.stem in new_leak_df.index
]
for sol_file in leak_sol_files:
    shutil.copy(sol_file, SOL_PATH)

## Limit

In [22]:
# Ground-truth side: keep only the Limit label, exposed under the shared 'TP?' column name.
new_limit_df = (
    groundtruth_df[['address', 'Limit']]
    .rename(columns={'Limit': 'TP?'})
    .set_index('address')
)

# Large-sample side (limit_df): align the column name and convert 'Yes'/'No' to 1/0.
limit_df_subset = limit_df[['Address', 'TP?']].rename(columns={'Address': 'address'})
limit_df_subset['TP?'] = limit_df_subset['TP?'].map({'Yes': 1, 'No': 0})
# Series.map yields NaN for any value missing from the mapping; stop here rather
# than silently writing empty labels to limit.csv.
assert limit_df_subset['TP?'].notna().all(), "limit.xlsx: 'TP?' contains values other than 'Yes'/'No'"
limit_df_subset = limit_df_subset.set_index('address')

# Stack the ground-truth rows first, followed by the large-sample rows.
new_limit_df = pd.concat([new_limit_df, limit_df_subset])

new_limit_df.head()

Unnamed: 0_level_0,TP?
address,Unnamed: 1_level_1
0x93023F1D3525E273F291B6f76d2F5027A39BF302,1
0x2753dcE37A7eDB052a77832039bcc9aA49Ad8b25,1
0x94b7D24552933F50A5A5705C446528806dCeA381,0
0xE0b9d4146AaD6936cBfcBE4dAE47e34aAb96b093,0
0x10f6f2b97F3aB29583D9D38BaBF2994dF7220C21,1


In [23]:
# Check for duplicate addresses in new_limit_df
# Flag repeated addresses while ignoring letter case: the ground-truth sheet stores
# mixed-case addresses and the sample sheet stores lowercase ones, so an exact string
# comparison would miss the same address written in both forms.
# keep='first' leaves the earliest occurrence unflagged.
duplicate_addresses = new_limit_df.index.str.lower().duplicated(keep='first')
new_limit_df[duplicate_addresses]

Unnamed: 0_level_0,TP?
address,Unnamed: 1_level_1
0x61c3ee9ce25e2c88dc9a8dd98f0510b95a82397a,1


In [24]:
# Drop the rows flagged as duplicates, keeping every unflagged row in its original order.
rows_to_keep = ~duplicate_addresses
new_limit_df = new_limit_df.loc[rows_to_keep]
new_limit_df.head()

Unnamed: 0_level_0,TP?
address,Unnamed: 1_level_1
0x93023F1D3525E273F291B6f76d2F5027A39BF302,1
0x2753dcE37A7eDB052a77832039bcc9aA49Ad8b25,1
0x94b7D24552933F50A5A5705C446528806dCeA381,0
0xE0b9d4146AaD6936cBfcBE4dAE47e34aAb96b093,0
0x10f6f2b97F3aB29583D9D38BaBF2994dF7220C21,1


### Save DataFrame and Copy SOL + HEX

In [25]:
# Write the merged Limit labels into the merged folder.
limit_csv_path = os.path.join(MERGE_PATH, 'limit.csv')
new_limit_df.to_csv(limit_csv_path)

In [26]:
# Copy only the large-sample .hex files whose file stem is an address in the Limit table.
limit_hex_files = [
    hex_file
    for hex_file in Path(LARGE_HEX_PATH).glob('*.hex')
    if hex_file.stem in new_limit_df.index
]
for hex_file in limit_hex_files:
    shutil.copy(hex_file, HEX_PATH)

In [27]:
# Copy only the large-sample .sol files whose file stem is an address in the Limit table.
limit_sol_files = [
    sol_file
    for sol_file in Path(LARGE_SOL_PATH).glob('*.sol')
    if sol_file.stem in new_limit_df.index
]
for sol_file in limit_sol_files:
    shutil.copy(sol_file, SOL_PATH)