# Feature Extraction -- Trapdoor

## Imports and variable setup

In [1]:
import pandas as pd
import os
from pathlib import Path
from tqdm import tqdm

from utils.main import extract_opcodes, get_opcode_freq, build_feature_df

In [2]:
# Project root: two directory levels above the current working directory.
PATH = Path.cwd().parents[1]

# All derived locations are built with os.path.join and are therefore plain strings.
DATA_PATH = os.path.join(PATH, 'data')
IN_PATH = os.path.join(DATA_PATH, 'interim/trapdoor')          # input table(s) read by this notebook
EXT_PATH = os.path.join(DATA_PATH, 'external/trapdoordata')    # external data root
HEX_PATH = os.path.join(EXT_PATH, 'hex')                       # directory listed and read in the extraction step
SOL_PATH = os.path.join(EXT_PATH, 'sol')                       # presumably Solidity sources — TODO confirm

## Load Data

In [3]:
# Table stored as dataset-modified.csv in the interim trapdoor directory.
dataset_csv = os.path.join(IN_PATH, 'dataset-modified.csv')
df = pd.read_csv(dataset_csv)

# Every entry name found in the hex directory. os.listdir returns names in
# arbitrary order, so the order of this list is not guaranteed across machines.
hex_files = os.listdir(HEX_PATH)

# Show the table's row count next to the number of directory entries.
(len(df), len(hex_files))

(11943, 11943)

## Bytecode

### Opcode Frequency

#### Extract Opcode Frequency

In [4]:
# Parallel lists: opcode_counters[i] is the opcode-frequency result for the
# file whose name (minus its ".hex" extension) is stored in addresses[i].
opcode_counters = []
addresses = []

for filename in tqdm(hex_files):
    # Read the file's text, trimmed of surrounding whitespace.
    with open(os.path.join(HEX_PATH, filename)) as f:
        hex_code = f.read().strip()
    opcodes = extract_opcodes(hex_code)
    counter = get_opcode_freq(opcodes)
    opcode_counters.append(counter)
    # Strip only a trailing ".hex": str.replace would also delete the substring
    # anywhere else in the name. Names without the suffix are kept unchanged.
    address = filename[:-len(".hex")] if filename.endswith(".hex") else filename
    addresses.append(address)

# Combine the per-file results and their identifiers into one feature frame.
df_opcode = build_feature_df(opcode_counters, addresses)

  0%|          | 0/11943 [00:00<?, ?it/s]invalid instruction: PUSH20
invalid instruction: PUSH32
  0%|          | 4/11943 [00:00<06:59, 28.48it/s]invalid instruction: PUSH20
  0%|          | 7/11943 [00:00<09:21, 21.24it/s]invalid instruction: PUSH28
  0%|          | 16/11943 [00:00<07:07, 27.91it/s]invalid instruction: PUSH16
invalid instruction: PUSH22
invalid instruction: PUSH29
  0%|          | 22/11943 [00:00<06:36, 30.07it/s]invalid instruction: PUSH24
  0%|          | 26/11943 [00:00<07:35, 26.14it/s]invalid instruction: PUSH31
invalid instruction: PUSH21
  0%|          | 30/11943 [00:01<07:19, 27.13it/s]invalid instruction: PUSH29
invalid instruction: PUSH20
  0%|          | 34/11943 [00:01<06:57, 28.55it/s]invalid instruction: PUSH30
  0%|          | 40/11943 [00:01<07:27, 26.57it/s]invalid instruction: PUSH20
invalid instruction: PUSH32
invalid instruction: PUSH29
  0%|          | 45/11943 [00:01<06:26, 30.80it/s]invalid instruction: PUSH13
  0%|          | 54/11943 [00:01<07

#### Convert to DataFrame

In [5]:
# `df_opcode` was already built from these same inputs at the end of the
# extraction step, so reuse it instead of calling build_feature_df a second
# time (assumes build_feature_df is deterministic — confirm in utils.main).
# The copy keeps `df` independent of `df_opcode`.
# NOTE: this rebinds `df`, which until now held the table read from
# dataset-modified.csv.
df = df_opcode.copy()

In [6]:
# Preview the first rows of the feature frame (pandas' default is five rows).
df.head()

Unnamed: 0,ADD,ADDMOD,ADDRESS,AND,BALANCE,BASEFEE,BLOCKHASH,BYTE,CALL,CALLCODE,...,UNKNOWN_0xf8,UNKNOWN_0xf9,UNKNOWN_0xfb,UNKNOWN_0xfc,UNKNOWN_0xfe,UNOFFICIAL_DUP,UNOFFICIAL_PUSH,UNOFFICIAL_SWAP,XOR,address
0,166,0,11,93,0,0,0,0,5,0,...,0,0,0,0,1,0,0,0,0,0x0983118867e36dee0f993bbfbe7f67639e2db3c9
1,145,0,1,138,0,0,0,0,2,0,...,0,0,0,0,2,0,0,0,0,0x32e1c61e6213a101d474404a74dca27c25f8f630
2,146,0,13,122,0,0,0,0,6,0,...,0,0,0,0,1,0,0,0,0,0x74d6a669e27ac5cfa6bc380f983e5d82aef66f5a
3,439,0,14,199,0,0,0,0,7,0,...,0,0,0,0,1,0,0,0,0,0xe5ffd337213d7f565721bb3abbf539efb8756031
4,235,1,10,167,0,0,0,0,6,0,...,0,0,2,0,1,0,0,0,0,0x7f0c3fb8755c3ee81f2c6f20f152bc27b0662348


In [7]:
# Print a summary of the frame: index range, column count, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11943 entries, 0 to 11942
Columns: 279 entries, ADD to address
dtypes: int64(278), object(1)
memory usage: 25.4+ MB


In [8]:
# Select every column whose label starts with "INVALID", total each of those
# columns over all rows, and show the ten largest totals.
# NOTE(review): the suffixes that surface here (0x6c-0x7f) fall inside the EVM
# PUSH1-PUSH32 opcode range (0x60-0x7f), which lines up with the
# "invalid instruction: PUSHn" messages printed during extraction — confirm
# against the disassembler used in utils.main.
invalid_cols = list(filter(lambda label: label.startswith("INVALID"), df.columns))
invalid_totals = df[invalid_cols].sum()
invalid_totals.sort_values(ascending=False).head(10)

INVALID_0x6f    699
INVALID_0x73    650
INVALID_0x6c    554
INVALID_0x7c    270
INVALID_0x7f    248
INVALID_0x7e    230
INVALID_0x7d    222
INVALID_0x78    216
INVALID_0x7b    211
INVALID_0x79    208
dtype: int64

In [9]:
# Display the column labels of the feature frame.
df.columns

Index(['ADD', 'ADDMOD', 'ADDRESS', 'AND', 'BALANCE', 'BASEFEE', 'BLOCKHASH',
       'BYTE', 'CALL', 'CALLCODE',
       ...
       'UNKNOWN_0xf8', 'UNKNOWN_0xf9', 'UNKNOWN_0xfb', 'UNKNOWN_0xfc',
       'UNKNOWN_0xfe', 'UNOFFICIAL_DUP', 'UNOFFICIAL_PUSH', 'UNOFFICIAL_SWAP',
       'XOR', 'address'],
      dtype='object', length=279)

#### Save as CSV