# Collect Transactions

## Set up

In [1]:

import os
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import shutil

import sys
# PATH is the parent of the current working directory (presumably the project
# root when the notebook is launched from its own sub-folder — TODO confirm).
PATH = Path.cwd().parents[0]
# Put PATH on the import search path so the `backend` package imported below
# can be resolved from this notebook.
sys.path.append(str(PATH))

from backend.utils.data_loader import get_info_by_contract_addr, save_bytecode_by_contract_addr, get_bytecode_by_contract_addr, save_transactions_by_contract_addr

In [2]:
# All data folders are derived from PATH/data.
DATA_PATH = os.path.join(PATH, 'data')

# Inputs: externally provided ground-truth bytecode (*.hex) files.
GROUND_PATH = os.path.join(DATA_PATH, 'external/crpwarner/dataset/groundtruth/hex')

# Outputs: the labeled dataset folder (also holds groundtruth.csv) and its
# per-contract transaction / bytecode sub-folders.
LABELED_PATH = os.path.join(DATA_PATH, 'labeled')
HEX_PATH = os.path.join(LABELED_PATH, 'hex')
TXN_PATH = os.path.join(LABELED_PATH, 'txn')

# Holding area for bytecode files whose address is not in groundtruth.csv.
TMP_PATH = os.path.join(DATA_PATH, 'interim/hex')


## Load Dataset

In [3]:
# Read the ground-truth table stored alongside the labeled data.
groundtruth_csv = Path(LABELED_PATH) / 'groundtruth.csv'
df = pd.read_csv(groundtruth_csv)

In [4]:
# Distinct contract addresses, kept in order of first appearance in the CSV.
contract_addresses = df['Address'].drop_duplicates().tolist()

## Copy Ground-Truth Bytecode

In [5]:
# Sort the ground-truth bytecode files: contracts listed in groundtruth.csv are
# copied to HEX_PATH, every other file is parked in TMP_PATH.

# File stems are lower-cased below, so the CSV addresses must be lower-cased as
# well; otherwise a mixed-case (checksummed) address would never match and its
# bytecode would be routed to TMP_PATH. A set also makes each lookup O(1).
labeled_addresses = {str(known).lower() for known in contract_addresses}

# shutil.copy does not create missing destination folders.
for out_dir in (HEX_PATH, TMP_PATH):
    os.makedirs(out_dir, exist_ok=True)

for src_file in tqdm(list(Path(GROUND_PATH).glob('*.hex'))):
    address = src_file.stem.lower()
    dest_dir = HEX_PATH if address in labeled_addresses else TMP_PATH
    shutil.copy(src_file, os.path.join(dest_dir, f'{address}.hex'))

100%|██████████| 70/70 [00:00<00:00, 1594.94it/s]


In [6]:
# 3. Collect and extract
# For every labeled contract, fetch and store its transactions (and creation
# bytecode when the fetched info provides it) unless a file for that address
# is already present, then make sure a bytecode file exists in HEX_PATH.

def _stored_addresses(directory):
    """Return the lower-cased names, cut at the first '.', of the entries in `directory`."""
    return {filename.lower().split('.')[0] for filename in os.listdir(directory)}


# os.listdir raises if the output folders are missing, so create them up front.
for out_dir in (TXN_PATH, HEX_PATH):
    os.makedirs(out_dir, exist_ok=True)

# List TXN_PATH once instead of on every iteration; addresses handled in this
# run are added to the set as they are processed.
txn_done = _stored_addresses(TXN_PATH)

for address in tqdm(contract_addresses):
    addr = address.lower()
    if addr not in txn_done:
        info = get_info_by_contract_addr(address)
        save_transactions_by_contract_addr(TXN_PATH, address, info)
        txn_done.add(addr)
        # 'creator' may be absent or None; `in None` would raise TypeError.
        creator = info.get('creator') or {}
        if 'creationBytecode' in creator:
            save_bytecode_by_contract_addr(HEX_PATH, address, creator['creationBytecode'])

    # Re-read HEX_PATH from disk here: whether the save above actually produced
    # a file for this address is not visible from this notebook, so the
    # on-disk state stays the source of truth for the fallback fetch.
    if addr not in _stored_addresses(HEX_PATH):
        bytecode = get_bytecode_by_contract_addr(addr)
        save_bytecode_by_contract_addr(HEX_PATH, address, bytecode)

100%|██████████| 69/69 [00:00<00:00, 1682.10it/s]


In [7]:
# Sanity check: number of entries in the transaction and bytecode folders.
tuple(len(os.listdir(folder)) for folder in (TXN_PATH, HEX_PATH))

(69, 69)