# Partially Labeled Data Preparation

## Set up

In [1]:
import os
from pathlib import Path
import shutil
import pandas as pd
from tqdm import tqdm
import time

import sys
# Grandparent of the current working directory.
# NOTE(review): presumably the project root, which assumes the notebook is run
# from a folder two levels below it — confirm.
PATH = Path.cwd().parents[1]
# Make that directory importable; presumably needed for the `notebooks.utils`
# import that follows — confirm.
sys.path.append(str(PATH))

from notebooks.utils.data_loader import (
  get_info_by_contract_addr,
  save_bytecode_by_contract_addr,
  get_bytecode_by_contract_addr,
  save_transactions_by_contract_addr,
  get_source_code_by_contract_addr,
  save_sol_by_contract_addr
)

In [2]:
# Notebook configuration: the label file to process and the data locations.
name = "crp_large_sample.csv" # change this
DATA_PATH = os.path.join(PATH, 'data')
# Directory that is scanned for '*.hex' files to copy.
SOURCE_PATH = os.path.join(DATA_PATH, 'external/crpwarner/dataset/large/hex') # change this
# The original spelling `HAX_PATH` was a typo that no visible cell reads;
# define the intended name and keep the old one as an alias.
HEX_PATH = os.path.join(DATA_PATH, 'hex')
HAX_PATH = HEX_PATH
# NOTE(review): DATASET_PATH and UNLABELED_PATH are read by the file-move cells
# but are not assigned in any visible cell — define them here before running
# from a fresh kernel.

## Move Dataset to Unlabeled

In [None]:
# Move the label file from DATASET_PATH into UNLABELED_PATH.
source = os.path.join(DATASET_PATH, name)
target = os.path.join(UNLABELED_PATH, name)
if os.path.exists(source):
    # shutil.move() only moves *into* dst when dst is an existing directory;
    # otherwise it renames the file to dst. Make sure the directory exists.
    os.makedirs(UNLABELED_PATH, exist_ok=True)
    shutil.move(source, UNLABELED_PATH)
    print("File moved successfully.")
else:
    # Report the path that was actually checked rather than its directory.
    print("File does not exist.", source)

File does not exist. /Users/napatcholthaipanich/Dev/master/dissertation/workspace/data/interim


In [3]:
# Load the label table; column 0 of the CSV becomes the row index.
target = os.path.join(DATA_PATH, name)
df = pd.read_csv(filepath_or_buffer=target, index_col=0)
df.head(n=5)

Unnamed: 0_level_0,Mint,Leak,Limit
Address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0x0290ea3c728981725689187763f6c63a68e192b8,-1,1,-1
0x02d3aea48b443a0026ed9cbc91b97d7335aba323,-1,-1,1
0x03260e1b0f53e1a1f93cf126a7ca42a1c71648d6,1,-1,-1
0x054ad3cd4a66f14bf5c0de2548a53be66995a4f6,-1,1,-1
0x0566c17dc2a9efcaa2f63e04cf06a69e8fc77f60,-1,0,-1


In [None]:
# Keep the rows whose 'Leak' value differs from -1
# (presumably -1 marks "no label" — confirm).
target_df = df.loc[df['Leak'].ne(-1)] # change this
# target_df.to_csv(os.path.join(UNLABELED_PATH, 'groundtruth.csv'))

In [4]:
# Output directories used by the copy and collection cells.
HEX_PATH = os.path.join(DATA_PATH, 'hex')
SOL_PATH = os.path.join(DATA_PATH, 'sol')
TXN_PATH = os.path.join(DATA_PATH, 'txn')
# Later cells call os.listdir() and shutil.copy() on these paths, which raise
# if the directory is missing, so create them up front.
for _out_dir in (HEX_PATH, SOL_PATH, TXN_PATH):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
# Copy every '*.hex' file under SOURCE_PATH whose lowercased stem is one of the
# labelled addresses into HEX_PATH, naming the copy '<address>.hex'.
hex_candidates = list(Path(SOURCE_PATH).glob('*.hex'))
for src_file in tqdm(hex_candidates):
    address = src_file.stem.lower()
    if address not in df.index:
        continue
    destination = os.path.join(HEX_PATH, f'{address}.hex')
    shutil.copy(src_file, destination)


100%|██████████| 13483/13483 [00:00<00:00, 17442.40it/s]


In [8]:
# Number of labelled rows vs. number of entries currently in HEX_PATH.
df.shape[0], len(os.listdir(HEX_PATH))

(265, 13557)

In [None]:
def _has_file_for(directory, addr):
    """Return True if `directory` contains an entry whose lowercased name,
    cut at the first '.', equals `addr`."""
    return any(entry.lower().split('.')[0] == addr for entry in os.listdir(directory))


# Chain ids that are queried, in this order, for every address.
chains = [1, 97, 56, 57054, 146]
# 3. Collect and extract
for addr in tqdm(df.index):
    print(f"Searching {addr}...")
    for chain in chains:
        print(f"Searching in {chain} chain...")
        # Each artefact is fetched only while no file for this address exists
        # yet, so once something was saved the remaining chains skip that step.
        if not _has_file_for(TXN_PATH, addr):
            info = get_info_by_contract_addr(addr, chain)
            save_transactions_by_contract_addr(TXN_PATH, addr, info)
            # 'creator' may be missing or None; fall back to an empty dict so
            # the membership test below cannot raise TypeError.
            creator = info.get('creator') or {}
            if 'creationBytecode' in creator and not _has_file_for(HEX_PATH, addr):
                save_bytecode_by_contract_addr(HEX_PATH, addr, creator['creationBytecode'])
                time.sleep(0.5)  # presumably throttles requests to the remote API — confirm
        if not _has_file_for(HEX_PATH, addr):
            bytecode = get_bytecode_by_contract_addr(addr, chain)
            save_bytecode_by_contract_addr(HEX_PATH, addr, bytecode)
            time.sleep(0.5)
        if not _has_file_for(SOL_PATH, addr):
            source = get_source_code_by_contract_addr(addr, chain)
            # Guard against an empty/None response before the membership test.
            if source and 'SourceCode' in source:
                save_sol_by_contract_addr(SOL_PATH, addr, source['SourceCode'])
                time.sleep(0.5)


  0%|          | 0/265 [00:00<?, ?it/s]

Searching 0x0290ea3c728981725689187763f6c63a68e192b8...
Searching in 1 chain...
error from get_internal_transactions_by_contract_addr: No transactions found
Saved 0x0290ea3c728981725689187763f6c63a68e192b8.json
Saved 0x0290ea3c728981725689187763f6c63a68e192b8.hex


  1%|          | 2/265 [00:05<10:53,  2.49s/it]

Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0x02d3aea48b443a0026ed9cbc91b97d7335aba323...
Searching in 1 chain...
Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0x03260e1b0f53e1a1f93cf126a7ca42a1c71648d6...
Searching in 1 chain...
error from get_internal_transactions_by_contract_addr: No transactions found
Saved 0x03260e1b0f53e1a1f93cf126a7ca42a1c71648d6.json
Saved 0x03260e1b0f53e1a1f93cf126a7ca42a1c71648d6.hex


  1%|          | 3/265 [00:10<15:34,  3.57s/it]

Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0x054ad3cd4a66f14bf5c0de2548a53be66995a4f6...
Searching in 1 chain...
error from get_internal_transactions_by_contract_addr: No transactions found
Saved 0x054ad3cd4a66f14bf5c0de2548a53be66995a4f6.json
Saved 0x054ad3cd4a66f14bf5c0de2548a53be66995a4f6.hex


  2%|▏         | 4/265 [00:16<18:22,  4.22s/it]

Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0x0566c17dc2a9efcaa2f63e04cf06a69e8fc77f60...
Searching in 1 chain...
error from get_internal_transactions_by_contract_addr: No transactions found
Saved 0x0566c17dc2a9efcaa2f63e04cf06a69e8fc77f60.json
Saved 0x0566c17dc2a9efcaa2f63e04cf06a69e8fc77f60.hex


  2%|▏         | 5/265 [00:20<19:11,  4.43s/it]

Searching in 97 chain...
Searching in 56 chain...
Searching in 57054 chain...
Searching in 146 chain...
Searching 0x08769a9b479a4b20e796194d960cc407fc66359a...
Searching in 1 chain...
error from get_internal_transactions_by_contract_addr: No transactions found


  2%|▏         | 5/265 [00:25<21:41,  5.01s/it]


KeyboardInterrupt: 

In [12]:
# Filtered-row count followed by the entry counts of the hex, txn and sol directories.
(len(target_df), *(len(os.listdir(folder)) for folder in (HEX_PATH, TXN_PATH, SOL_PATH)))

(87, 89, 88, 88)

In [13]:
# Move the processed label file into DATASET_PATH.
if os.path.exists(target):
    # shutil.move() only moves *into* dst when dst is an existing directory;
    # otherwise it renames the file to dst. Make sure the directory exists.
    os.makedirs(DATASET_PATH, exist_ok=True)
    shutil.move(target, DATASET_PATH)
    print("File moved successfully.")
else:
    # Report the path that was actually checked.
    print("File does not exist.", target)

File moved successfully.
