# Partial Labeled Data Preparing

## Set up

In [1]:
import os
from pathlib import Path
import shutil
import pandas as pd
from tqdm import tqdm
import time

import sys
PATH = Path.cwd().parents[1]
sys.path.append(str(PATH))

from backend.utils.data_loader import (
  get_info_by_contract_addr,
  save_bytecode_by_contract_addr,
  get_bytecode_by_contract_addr,
  save_transactions_by_contract_addr,
  get_source_code_by_contract_addr,
  save_sol_by_contract_addr
)

In [2]:
name = "crp_large_sample.csv" # change this
DATA_PATH = os.path.join(PATH, 'data')
SOURCE_PATH = os.path.join(DATA_PATH, 'external/crpwarner/dataset/large/hex') # change this
DATASET_PATH = os.path.join(DATA_PATH, 'interim')
UNLABELED_PATH = os.path.join(DATA_PATH, 'unlabeled')

## Move Dataset to Unlabeled

In [3]:
source = os.path.join(DATASET_PATH, name)
target = os.path.join(UNLABELED_PATH, name)
if os.path.exists(source):
    shutil.move(source, UNLABELED_PATH)
    print("File moved successfully.")
else:
    print("File does not exist.", DATASET_PATH)

File does not exist. /Users/napatcholthaipanich/Dev/master/dissertation/workspace/data/interim


In [4]:
df = pd.read_csv(target, index_col=0)
df.head()

Unnamed: 0_level_0,Mint,Leak,Limit
Address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0x0290ea3c728981725689187763f6c63a68e192b8,-1,1,-1
0x02d3aea48b443a0026ed9cbc91b97d7335aba323,-1,-1,1
0x03260e1b0f53e1a1f93cf126a7ca42a1c71648d6,1,-1,-1
0x054ad3cd4a66f14bf5c0de2548a53be66995a4f6,-1,1,-1
0x0566c17dc2a9efcaa2f63e04cf06a69e8fc77f60,-1,0,-1


In [5]:
target_df = df.loc[df['Leak']!=-1] # change this
target_df.to_csv(os.path.join(UNLABELED_PATH, 'groundtruth.csv'))

In [6]:
HEX_PATH = os.path.join(UNLABELED_PATH, 'hex')
SOL_PATH = os.path.join(UNLABELED_PATH, 'sol')
TXN_PATH = os.path.join(UNLABELED_PATH, 'txn')
TMP_PATH = os.path.join(DATASET_PATH, 'hex')

In [7]:
for src_file in tqdm(list(Path(TMP_PATH).glob('*.hex'))):
    address = src_file.stem.lower()
    if address in target_df.index:
        shutil.move(src_file, os.path.join(HEX_PATH, f'{address}.hex'))

for src_file in tqdm(list(Path(SOURCE_PATH).glob('*.hex'))):
    address = src_file.stem.lower()
    if address in target_df.index:
        shutil.copy(src_file, os.path.join(HEX_PATH, f'{address}.hex'))
    else:
        shutil.copy(src_file, os.path.join(TMP_PATH, f'{address}.hex'))

100%|██████████| 13396/13396 [00:00<00:00, 366570.96it/s]
100%|██████████| 13483/13483 [00:15<00:00, 884.06it/s] 


In [8]:
len(target_df), len(os.listdir(HEX_PATH))

(87, 87)

In [9]:
chains = [1, 97, 56, 57054, 146]
# 3. Collect and extract
for addr in tqdm(target_df.index):
    print(f"Searching {addr}...")
    for chain in chains:
        print(f"Searching in {chain} chain...")
        if addr not in [filename.lower().split('.')[0] for filename in os.listdir(TXN_PATH)]:
            info = get_info_by_contract_addr(addr, chain)
            save_transactions_by_contract_addr(TXN_PATH, addr, info)
            if 'creationBytecode' in info.get('creator'):
                save_bytecode_by_contract_addr(HEX_PATH, addr, info['creator']['creationBytecode'])
                time.sleep(0.5)
        if addr not in [filename.lower().split('.')[0] for filename in os.listdir(HEX_PATH)]:
            bytecode = get_bytecode_by_contract_addr(addr, chain)
            save_bytecode_by_contract_addr(HEX_PATH, addr, bytecode)
            time.sleep(0.5)
        if addr not in [filename.lower().split('.')[0] for filename in os.listdir(SOL_PATH)]:
            source = get_source_code_by_contract_addr(addr, chain)
            if 'SourceCode' in source:
                save_sol_by_contract_addr(SOL_PATH, addr, source['SourceCode'])
                time.sleep(0.5)


 13%|█▎        | 11/87 [00:00<00:00, 109.15it/s]

Searching 0x0290ea3c728981725689187763f6c63a68e192b8...
Searching in 1 chain...
Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0x054ad3cd4a66f14bf5c0de2548a53be66995a4f6...
Searching in 1 chain...
Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0x0566c17dc2a9efcaa2f63e04cf06a69e8fc77f60...
Searching in 1 chain...
Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0x10c8324b20b7266c445944f043f53f6a77ea0bd4...
Searching in 1 chain...
Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0x10cc060f6f9b2e5dcdb23f1361e4b368a7daec73...
Searching in 1 chain...
Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0x12e29d7d6fd63f0f12040fcd1bf9021919b0e4fa...
Searching in 1 chain...


100%|██████████| 87/87 [00:00<00:00, 308.86it/s]

Searching in 57054 chain...
Searching in 146 chain...
Searching 0x9241b4c67b6cdf9b99f5f50de21283d0441eff75...
Searching in 1 chain...
Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0x964bc602ee118f13090c69a670032e506d66f457...
Searching in 1 chain...
Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0x97fe19dfab95b1709bb0994af18ba7f793e28cba...
Searching in 1 chain...
Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0x99d3dcf7febd2bb968b3fe7baa1a9a36546d9293...
Searching in 1 chain...
Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0x9e3c595e72294eea8a100c63d7c8bf4c4430ee97...
Searching in 1 chain...
Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0xa5dd8cde486436




In [10]:
len(target_df), len(os.listdir(HEX_PATH)), len(os.listdir(TXN_PATH)), len(os.listdir(SOL_PATH))

(87, 87, 87, 87)

In [11]:
if os.path.exists(target):
    shutil.move(target, DATASET_PATH)
    print("File moved successfully.")
else:
    print("File does not exist.", UNLABELED_PATH)

File moved successfully.
