# Feature Extraction -- CRPWarner sample

## Import and set up variable

In [1]:
import pandas as pd
import os
from pathlib import Path
from tqdm import tqdm

from utils.main import extract_opcodes, get_opcode_freq, build_feature_df

In [2]:
PATH = Path.cwd().parents[1]
NAME = 'crpwarner'
LEVEL = 'large'
DATA_PATH = os.path.join(PATH, 'data')
IN_PATH = os.path.join(DATA_PATH, f'interim/{NAME}')
EXT_PATH = os.path.join(DATA_PATH, f'external/{NAME}/{LEVEL}')
HEX_PATH = os.path.join(EXT_PATH, 'hex')
SOL_PATH = os.path.join(EXT_PATH, 'sol')

## Load Data

In [3]:
df = pd.read_csv(os.path.join(IN_PATH, 'sample_dataset-modified.csv'))
hex_files = [f for f in os.listdir(HEX_PATH)]

len(df), len(hex_files)

(267, 13483)

## Bytecode

### Opcode Frequency

#### Extract Opcode Frequency

In [None]:
opcode_counters = []
addresses = []

for filename in tqdm(hex_files):
    with open(os.path.join(HEX_PATH, filename)) as f:
        hex_code = f.read().strip()
    opcodes = extract_opcodes(hex_code)
    counter = get_opcode_freq(opcodes)
    opcode_counters.append(counter)
    addresses.append(filename.replace(".hex", ""))

df_opcode = build_feature_df(opcode_counters, addresses)

  0%|          | 2/13483 [00:00<17:13, 13.04it/s]invalid instruction: PUSH28
  0%|          | 5/13483 [00:00<11:11, 20.07it/s]invalid instruction: PUSH30
  0%|          | 8/13483 [00:00<10:18, 21.80it/s]invalid instruction: PUSH22
invalid instruction: PUSH24
invalid instruction: PUSH27
  0%|          | 13/13483 [00:00<07:36, 29.50it/s]invalid instruction: PUSH17
invalid instruction: PUSH21
invalid instruction: PUSH17
  0%|          | 20/13483 [00:00<05:33, 40.40it/s]invalid instruction: PUSH19
  0%|          | 25/13483 [00:00<05:50, 38.43it/s]invalid instruction: PUSH32
invalid instruction: PUSH13
  0%|          | 30/13483 [00:00<05:39, 39.60it/s]invalid instruction: PUSH24
invalid instruction: PUSH28
invalid instruction: PUSH16
invalid instruction: PUSH24
invalid instruction: PUSH11
invalid instruction: PUSH29
  0%|          | 37/13483 [00:01<05:07, 43.74it/s]invalid instruction: PUSH23
invalid instruction: PUSH31
invalid instruction: PUSH29
  0%|          | 43/13483 [00:01<05:16, 42.

#### Convert to DataFrame

In [5]:
df = build_feature_df(opcode_counters, addresses)

In [6]:
df.head()

Unnamed: 0,ADD,ADDMOD,ADDRESS,AND,BALANCE,BASEFEE,BLOCKHASH,BYTE,CALL,CALLCODE,...,UNKNOWN_0xf8,UNKNOWN_0xf9,UNKNOWN_0xfb,UNKNOWN_0xfc,UNKNOWN_0xfe,UNOFFICIAL_DUP,UNOFFICIAL_PUSH,UNOFFICIAL_SWAP,XOR,address
0,162,0,6,118,1,0,0,0,2,0,...,0,0,0,0,1,0,0,0,0,0x71a982a028c9d4b0566041a78df12b810462e155
1,102,0,0,91,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0x9b11efcaaa1890f6ee52c6bb7cf8153ac5d74139
2,168,0,6,123,1,0,0,0,2,0,...,0,0,0,0,2,0,0,0,0,0x9ec8d44af808d7cca2ec23c0dc0d1f49a3386ea4
3,128,0,0,110,0,0,0,0,0,0,...,1,0,0,0,7,0,0,0,0,0x662abcad0b7f345ab7ffb1b1fbb9df7894f18e66
4,270,0,2,246,0,0,0,0,2,0,...,0,0,0,0,9,0,0,0,0,0x21f15966e07a10554c364b988e91dab01d32794a


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13483 entries, 0 to 13482
Columns: 287 entries, ADD to address
dtypes: int64(286), object(1)
memory usage: 29.5+ MB


In [8]:
invalid_cols = [col for col in df.columns if col.startswith("INVALID")]
df[invalid_cols].sum().sort_values(ascending=False).head(10)

INVALID_0x7f    542
INVALID_0x7e    508
INVALID_0x7c    481
INVALID_0x7d    479
INVALID_0x7a    435
INVALID_0x7b    402
INVALID_0x79    393
INVALID_0x77    348
INVALID_0x78    344
INVALID_0x76    336
dtype: int64

In [9]:
df.columns

Index(['ADD', 'ADDMOD', 'ADDRESS', 'AND', 'BALANCE', 'BASEFEE', 'BLOCKHASH',
       'BYTE', 'CALL', 'CALLCODE',
       ...
       'UNKNOWN_0xf8', 'UNKNOWN_0xf9', 'UNKNOWN_0xfb', 'UNKNOWN_0xfc',
       'UNKNOWN_0xfe', 'UNOFFICIAL_DUP', 'UNOFFICIAL_PUSH', 'UNOFFICIAL_SWAP',
       'XOR', 'address'],
      dtype='object', length=287)

#### Save as CSV