# Feature Extraction -- CRPWarner Ground Truth

## Import and set up variable

In [1]:
import pandas as pd
import os
from pathlib import Path
from tqdm import tqdm

from utils.main import extract_opcodes, get_opcode_freq, build_feature_df

In [2]:
PATH = Path.cwd().parents[1]
NAME = 'crpwarner'
LEVEL = 'groundtruth'
DATA_PATH = os.path.join(PATH, 'data')
IN_PATH = os.path.join(DATA_PATH, f'interim/{NAME}')
EXT_PATH = os.path.join(DATA_PATH, f'external/{NAME}/{LEVEL}')
HEX_PATH = os.path.join(EXT_PATH, 'hex')
SOL_PATH = os.path.join(EXT_PATH, 'sol')

## Load Data

In [3]:
df = pd.read_csv(os.path.join(IN_PATH, 'dataset-modified.csv'))
hex_files = [f for f in os.listdir(HEX_PATH)]

len(df), len(hex_files)

(69, 72)

## Bytecode

### Opcode Frequency

#### Extract Opcode Frequency

In [4]:
opcode_counters = []
addresses = []

for filename in tqdm(hex_files):
    with open(os.path.join(HEX_PATH, filename)) as f:
        hex_code = f.read().strip()
    opcodes = extract_opcodes(hex_code)
    counter = get_opcode_freq(opcodes)
    opcode_counters.append(counter)
    addresses.append(filename.replace(".hex", ""))

df_opcode = build_feature_df(opcode_counters, addresses)

  0%|          | 0/72 [00:00<?, ?it/s]invalid instruction: PUSH30
  4%|▍         | 3/72 [00:00<00:02, 29.48it/s]invalid instruction: PUSH16
  8%|▊         | 6/72 [00:00<00:02, 22.87it/s]invalid instruction: PUSH30
invalid instruction: PUSH24
invalid instruction: PUSH30
 12%|█▎        | 9/72 [00:00<00:03, 19.16it/s]invalid instruction: PUSH16
invalid instruction: PUSH15
 19%|█▉        | 14/72 [00:00<00:02, 27.20it/s]invalid instruction: PUSH13
invalid instruction: PUSH31
 24%|██▎       | 17/72 [00:00<00:02, 26.72it/s]invalid instruction: PUSH25
 32%|███▏      | 23/72 [00:00<00:02, 24.34it/s]invalid instruction: PUSH16
invalid instruction: PUSH13
invalid instruction: PUSH25
 36%|███▌      | 26/72 [00:01<00:02, 21.29it/s]invalid instruction: PUSH21
invalid instruction: PUSH16
 40%|████      | 29/72 [00:01<00:01, 22.34it/s]invalid instruction: PUSH15
 54%|█████▍    | 39/72 [00:01<00:01, 26.86it/s]invalid instruction: PUSH19
invalid instruction: PUSH29
 60%|█████▉    | 43/72 [00:01<00:01, 2

#### Convert to DataFrame

In [5]:
df = build_feature_df(opcode_counters, addresses)

In [6]:
df.head()

Unnamed: 0,ADD,ADDMOD,ADDRESS,AND,BALANCE,BASEFEE,BLOCKHASH,BYTE,CALL,CALLCODE,...,UNKNOWN_0xf8,UNKNOWN_0xf9,UNKNOWN_0xfb,UNKNOWN_0xfc,UNKNOWN_0xfe,UNOFFICIAL_DUP,UNOFFICIAL_PUSH,UNOFFICIAL_SWAP,XOR,address
0,153,0,1,150,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0x8275eBF521Dc217aa79C88132017A5BCEf001dd9
1,98,1,0,47,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0x292E89d5D5BDab3aF2f5838C194c1983f0140b43
2,153,1,0,76,0,0,0,4,0,0,...,0,0,0,1,1,0,0,0,0,0x2753dcE37A7eDB052a77832039bcc9aA49Ad8b25
3,78,0,4,40,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0xE1A0CE8B94c6A5E4791401086763d7bD0a6C18f5
4,206,0,7,143,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,0,0x9dB8a10C7FE60d84397860b3aF2E686D4F90C2b7


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Columns: 266 entries, ADD to address
dtypes: int64(265), object(1)
memory usage: 149.8+ KB


In [8]:
invalid_cols = [col for col in df.columns if col.startswith("INVALID")]
df[invalid_cols].sum().sort_values(ascending=False).head(10)

INVALID_0x6c    5
INVALID_0x6f    5
INVALID_0x7d    3
INVALID_0x7c    3
INVALID_0x77    2
INVALID_0x73    2
INVALID_0x6e    2
INVALID_0x78    2
INVALID_0x74    1
INVALID_0x70    1
dtype: int64

In [None]:
df.columns

Index(['ADD', 'ADDMOD', 'ADDRESS', 'AND', 'BALANCE', 'BASEFEE', 'BLOCKHASH',
       'BYTE', 'CALL', 'CALLCODE',
       ...
       'UNKNOWN_0xf8', 'UNKNOWN_0xf9', 'UNKNOWN_0xfb', 'UNKNOWN_0xfc',
       'UNKNOWN_0xfe', 'UNOFFICIAL_DUP', 'UNOFFICIAL_PUSH', 'UNOFFICIAL_SWAP',
       'XOR', 'address'],
      dtype='object', length=266)


#### Save as CSV