# Feature Extraction -- RPHunter

## Import and set up variable

In [8]:
import pandas as pd
import os
from pathlib import Path
from tqdm import tqdm

from utils.main import extract_opcodes, get_opcode_freq, build_feature_df

In [None]:
PATH = Path.cwd().parents[1]
NAME = 'rphunter'
DATA_PATH = os.path.join(PATH, 'data')
IN_PATH = os.path.join(DATA_PATH, f'interim/{NAME}')
EXT_PATH = os.path.join(DATA_PATH, f'external/{NAME}')
HEX_NOR_PATH = os.path.join(EXT_PATH, 'Normal-Bytecode')
HEX_RUG_PATH = os.path.join(EXT_PATH, 'Rug-Bytecode')

## Load Data

In [10]:
df = pd.read_csv(os.path.join(IN_PATH, 'dataset-modified.csv'))
hex_files = [("normal", f) for f in os.listdir(HEX_NOR_PATH)] + \
             [("rug", f) for f in os.listdir(HEX_RUG_PATH)]

len(df), len(hex_files)

(643, 2327)

## Bytecode

### Opcode Frequency

#### Extract Opcode Frequency

In [None]:
opcode_counters = []
addresses = []

for label, filename in tqdm(hex_files):
    path = HEX_NOR_PATH if label == "normal" else HEX_RUG_PATH
    with open(os.path.join(path, filename)) as f:
        hex_code = f.read().strip()
    opcodes = extract_opcodes(hex_code)
    counter = get_opcode_freq(opcodes)
    opcode_counters.append(counter)
    addresses.append(filename.replace(".txt", ""))

df_opcode = build_feature_df(opcode_counters, addresses)

  0%|          | 0/2327 [00:00<?, ?it/s]invalid instruction: PUSH20
invalid instruction: PUSH30
  0%|          | 3/2327 [00:00<01:24, 27.58it/s]invalid instruction: PUSH16
invalid instruction: PUSH13
invalid instruction: PUSH26
  0%|          | 7/2327 [00:00<01:09, 33.34it/s]invalid instruction: PUSH17
invalid instruction: PUSH15
  1%|          | 14/2327 [00:00<01:32, 25.01it/s]invalid instruction: PUSH29
invalid instruction: PUSH32
invalid instruction: PUSH16
invalid instruction: PUSH7
invalid instruction: PUSH31
  1%|          | 21/2327 [00:00<01:01, 37.60it/s]invalid instruction: PUSH19
invalid instruction: PUSH21
invalid instruction: PUSH24
invalid instruction: PUSH27
invalid instruction: PUSH24
  1%|▏         | 31/2327 [00:00<00:42, 54.09it/s]invalid instruction: PUSH32
invalid instruction: PUSH16
invalid instruction: PUSH27
invalid instruction: PUSH26
invalid instruction: PUSH12
invalid instruction: PUSH16
  2%|▏         | 37/2327 [00:00<00:46, 49.02it/s]invalid instruction: PUSH

#### Convert to DataFrame

In [12]:
df = build_feature_df(opcode_counters, addresses)

In [13]:
df.head()

Unnamed: 0,ADD,ADDMOD,ADDRESS,AND,BALANCE,BASEFEE,BLOCKHASH,BYTE,CALL,CALLCODE,...,UNKNOWN_0xf8,UNKNOWN_0xf9,UNKNOWN_0xfb,UNKNOWN_0xfc,UNKNOWN_0xfe,UNOFFICIAL_DUP,UNOFFICIAL_PUSH,UNOFFICIAL_SWAP,XOR,address
0,138,0,1,83,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0x6B466B0232640382950c45440Ea5b630744eCa99.txt
1,136,0,0,197,0,0,0,0,4,2,...,0,0,0,0,24,0,0,0,1,0x4E15361FD6b4BB609Fa63C81A2be19d873717870.txt
2,153,0,1,105,0,0,0,0,2,0,...,0,0,1,0,0,0,1,0,0,0xa95c4f2e0d6455637f67F655Da4AFAe5d50d859B.txt
3,137,0,0,84,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0x35dd2ebf20746C6e658fac75cd80D4722fae62f6.txt
4,150,0,1,111,0,0,0,0,1,0,...,0,0,0,0,3,0,0,0,0,0x264Dc2DedCdcbb897561A57CBa5085CA416fb7b4.txt


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2327 entries, 0 to 2326
Columns: 287 entries, ADD to address
dtypes: int64(286), object(1)
memory usage: 5.1+ MB


In [15]:
invalid_cols = [col for col in df.columns if col.startswith("INVALID")]
df[invalid_cols].sum().sort_values(ascending=False).head(10)

INVALID_0x6f    171
INVALID_0x7d     80
INVALID_0x7f     78
INVALID_0x7b     68
INVALID_0x73     67
INVALID_0x7a     64
INVALID_0x7e     63
INVALID_0x79     59
INVALID_0x7c     54
INVALID_0x75     50
dtype: int64

In [16]:
df.columns

Index(['ADD', 'ADDMOD', 'ADDRESS', 'AND', 'BALANCE', 'BASEFEE', 'BLOCKHASH',
       'BYTE', 'CALL', 'CALLCODE',
       ...
       'UNKNOWN_0xf8', 'UNKNOWN_0xf9', 'UNKNOWN_0xfb', 'UNKNOWN_0xfc',
       'UNKNOWN_0xfe', 'UNOFFICIAL_DUP', 'UNOFFICIAL_PUSH', 'UNOFFICIAL_SWAP',
       'XOR', 'address'],
      dtype='object', length=287)

#### Save as CSV