# Opcode Frequency

## Import and Const

In [1]:
import pandas as pd
import os
from pathlib import Path
from tqdm import tqdm

In [2]:
PATH = Path.cwd().parents[1]
DATA_PATH = os.path.join(PATH, 'data')
EXT_PATH = os.path.join(DATA_PATH, 'external')
INT_PATH = os.path.join(DATA_PATH, 'interim')

In [3]:
from evmdasm import EvmBytecode
from collections import Counter

def extract_opcode_frequency(hex_code):
    try:
        evm = EvmBytecode(bytecode=hex_code)
        instructions = evm.disassemble()
        # opcodes = [instr.name for instr in instructions if not (instr.name.startswith('INVALID') or instr.name.startswith('UNKNOWN'))]
        opcodes = [instr.name for instr in instructions]
        return Counter(opcodes)
    except Exception:
        return []

def extract_from_folder(folder_path):
    rows = []
    for filename in tqdm(os.listdir(folder_path)):
        if filename.endswith(".hex"):
            address = filename.replace(".hex", "")
        else:
            address = filename.replace(".txt", "")
        with open(os.path.join(folder_path, filename), "r") as f:
            bytecode = f.read().strip().lower().replace("0x", "")
        freq = extract_opcode_frequency(bytecode)
        freq["address"] = address.lower()
        rows.append(freq)

    df = pd.DataFrame(rows).fillna(0).astype({k: int for k in rows[0] if k != 'address'})
    cols = ['address'] + [col for col in df.columns if col != 'address']
    df = df[cols]
    return df

In [4]:
data = []

## CRPWarner

In [5]:
CRPWARNER_PATH = os.path.join(EXT_PATH, 'crpwarner')

### Ground Truth

In [6]:
GROUND_PATH = os.path.join(CRPWARNER_PATH, 'groundtruth/hex')

In [7]:
df = extract_from_folder(GROUND_PATH)

  0%|          | 0/72 [00:00<?, ?it/s]

invalid instruction: PUSH30
invalid instruction: PUSH16
  6%|▌         | 4/72 [00:00<00:02, 27.51it/s]invalid instruction: PUSH30
 10%|▉         | 7/72 [00:00<00:02, 21.74it/s]invalid instruction: PUSH24
invalid instruction: PUSH30
invalid instruction: PUSH16
invalid instruction: PUSH15
 19%|█▉        | 14/72 [00:00<00:01, 37.86it/s]invalid instruction: PUSH13
invalid instruction: PUSH31
invalid instruction: PUSH25
 26%|██▋       | 19/72 [00:00<00:01, 32.98it/s]invalid instruction: PUSH16
 33%|███▎      | 24/72 [00:00<00:01, 35.95it/s]invalid instruction: PUSH13
invalid instruction: PUSH25
invalid instruction: PUSH21
invalid instruction: PUSH16
 39%|███▉      | 28/72 [00:00<00:01, 31.82it/s]invalid instruction: PUSH15
 56%|█████▌    | 40/72 [00:01<00:00, 34.73it/s]invalid instruction: PUSH19
invalid instruction: PUSH29
invalid instruction: PUSH29
invalid instruction: PUSH26
 64%|██████▍   | 46/72 [00:01<00:00, 36.65it/s]invalid instruction: PUSH20
invalid instruction: PUSH17
invalid in

In [8]:
df.head()

Unnamed: 0,address,PUSH1,MSTORE,CALLDATASIZE,LT,PUSH2,JUMPI,CALLDATALOAD,PUSH29,SWAP1,...,UNKNOWN_0xc6,UNKNOWN_0xe1,INVALID_0x70,PUSH30,DUP16,UNKNOWN_0x2b,UNKNOWN_0xd8,INVALID_0x7a,UNKNOWN_0xf9,INVALID_0x7f
0,0x8275ebf521dc217aa79c88132017a5bcef001dd9,386,112,10,7,126,53,17,1,209,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0x292e89d5d5bdab3af2f5838c194c1983f0140b43,413,115,10,8,188,63,7,0,89,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0x2753dce37a7edb052a77832039bcc9aa49ad8b25,660,176,23,13,378,101,7,0,157,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0xe1a0ce8b94c6a5e4791401086763d7bd0a6c18f5,264,77,8,13,116,36,15,0,103,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0x9db8a10c7fe60d84397860b3af2e686d4f90c2b7,801,248,20,29,312,100,35,0,310,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Columns: 266 entries, address to INVALID_0x7f
dtypes: float64(199), int64(66), object(1)
memory usage: 149.8+ KB


In [10]:
data.append(df)

In [11]:
OUT_PATH = os.path.join(INT_PATH, 'crpwarner')

In [12]:
df.to_csv(os.path.join(OUT_PATH, 'groundtruth-feature-opcode-frequency.csv'), index=False)

### Large

In [13]:
LARGE_PATH = os.path.join(CRPWARNER_PATH, 'large/hex')

In [14]:
df = extract_from_folder(LARGE_PATH)

  0%|          | 0/13483 [00:00<?, ?it/s]invalid instruction: PUSH28
  0%|          | 4/13483 [00:00<05:53, 38.09it/s]invalid instruction: PUSH30
invalid instruction: PUSH22
invalid instruction: PUSH24
  0%|          | 10/13483 [00:00<07:38, 29.37it/s]invalid instruction: PUSH27
invalid instruction: PUSH17
  0%|          | 15/13483 [00:00<07:07, 31.48it/s]invalid instruction: PUSH21
invalid instruction: PUSH17
  0%|          | 23/13483 [00:00<05:09, 43.54it/s]invalid instruction: PUSH19
invalid instruction: PUSH32
  0%|          | 28/13483 [00:00<05:37, 39.86it/s]invalid instruction: PUSH13
invalid instruction: PUSH24
invalid instruction: PUSH28
invalid instruction: PUSH16
invalid instruction: PUSH24
invalid instruction: PUSH11
  0%|          | 36/13483 [00:00<04:29, 49.99it/s]invalid instruction: PUSH29
invalid instruction: PUSH23
invalid instruction: PUSH31
invalid instruction: PUSH29
  0%|          | 43/13483 [00:01<04:53, 45.74it/s]invalid instruction: PUSH28
invalid instruction: P

In [15]:
df.head()

Unnamed: 0,address,PUSH1,MSTORE,CALLDATASIZE,LT,PUSH2,JUMPI,CALLDATALOAD,PUSH29,SWAP1,...,UNKNOWN_0xe6,INVALID_0x68,INVALID_0x6d,INVALID_0x67,INVALID_0x69,INVALID_0x64,INVALID_0x66,INVALID_0x73,INVALID_0x62,INVALID_0x65
0,0x71a982a028c9d4b0566041a78df12b810462e155,417,103,15,10,156,75,27,2,227,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0x9b11efcaaa1890f6ee52c6bb7cf8153ac5d74139,304,76,1,7,135,63,14,1,163,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0x9ec8d44af808d7cca2ec23c0dc0d1f49a3386ea4,462,106,15,25,173,92,27,2,232,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0x662abcad0b7f345ab7ffb1b1fbb9df7894f18e66,334,97,1,9,121,56,17,1,191,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0x21f15966e07a10554c364b988e91dab01d32794a,711,172,16,19,212,107,35,1,375,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13483 entries, 0 to 13482
Columns: 287 entries, address to INVALID_0x65
dtypes: float64(216), int64(70), object(1)
memory usage: 29.5+ MB


In [17]:
data.append(df)

In [18]:
df.to_csv(os.path.join(OUT_PATH, 'large-feature-opcode-frequency.csv'), index=False)

## RPHunter

In [19]:
RP_PATH = os.path.join(EXT_PATH, 'rphunter')

### Normal

In [20]:
NOR_PATH = os.path.join(RP_PATH, 'Normal-Bytecode')

In [21]:
df = extract_from_folder(NOR_PATH)

  0%|          | 0/1675 [00:00<?, ?it/s]

invalid instruction: PUSH20
invalid instruction: PUSH30
  0%|          | 2/1675 [00:00<02:09, 12.93it/s]invalid instruction: PUSH16
invalid instruction: PUSH13
invalid instruction: PUSH26
invalid instruction: PUSH17
  1%|          | 9/1675 [00:00<00:42, 39.53it/s]invalid instruction: PUSH15
  1%|          | 14/1675 [00:00<00:41, 39.96it/s]invalid instruction: PUSH29
invalid instruction: PUSH32
invalid instruction: PUSH16
invalid instruction: PUSH7
invalid instruction: PUSH31
  1%|▏         | 23/1675 [00:00<00:28, 57.00it/s]invalid instruction: PUSH19
invalid instruction: PUSH21
invalid instruction: PUSH24
invalid instruction: PUSH27
invalid instruction: PUSH24
invalid instruction: PUSH32
  2%|▏         | 32/1675 [00:00<00:29, 56.40it/s]invalid instruction: PUSH16
invalid instruction: PUSH27
invalid instruction: PUSH26
invalid instruction: PUSH12
invalid instruction: PUSH16
invalid instruction: PUSH32
invalid instruction: PUSH9
  2%|▏         | 41/1675 [00:00<00:24, 65.66it/s]invalid in

In [22]:
df.head()

Unnamed: 0,address,PUSH1,MSTORE,CALLDATASIZE,LT,PUSH2,JUMPI,CALLDATALOAD,PUSH29,SWAP1,...,INVALID_0x76,UNKNOWN_0xe1,PUSH30,INVALID_0x64,INVALID_0x62,INVALID_0x7b,INVALID_0x6d,INVALID_0x63,PUSH31,INVALID_0x67
0,0x6b466b0232640382950c45440ea5b630744eca99,293,84,7,11,92,47,15,3,155,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0x4e15361fd6b4bb609fa63c81a2be19d873717870,1039,205,4,47,524,254,67,1,358,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0xa95c4f2e0d6455637f67f655da4afae5d50d859b,350,98,7,12,112,54,15,3,188,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0x35dd2ebf20746c6e658fac75cd80d4722fae62f6,316,98,9,14,115,52,16,1,162,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0x264dc2dedcdcbb897561a57cba5085ca416fb7b4,379,104,1,10,120,58,20,2,181,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1675 entries, 0 to 1674
Columns: 287 entries, address to INVALID_0x67
dtypes: float64(216), int64(70), object(1)
memory usage: 3.7+ MB


In [24]:
data.append(df)

In [25]:
OUT_PATH = os.path.join(INT_PATH, 'rphunter')

In [26]:
df.to_csv(os.path.join(OUT_PATH, 'normal-feature-opcode-frequency.csv'), index=False)

### Rug 

In [27]:
RUG_PATH = os.path.join(RP_PATH, 'Rug-Bytecode')

In [28]:
df = extract_from_folder(RUG_PATH)

  0%|          | 0/652 [00:00<?, ?it/s]invalid instruction: PUSH16
  1%|          | 4/652 [00:00<00:19, 33.59it/s]invalid instruction: PUSH16
  1%|          | 8/652 [00:00<00:24, 25.83it/s]invalid instruction: PUSH11
  2%|▏         | 13/652 [00:00<00:19, 32.50it/s]invalid instruction: PUSH25
invalid instruction: PUSH32
  3%|▎         | 17/652 [00:00<00:22, 28.79it/s]invalid instruction: PUSH25
invalid instruction: PUSH32
  3%|▎         | 21/652 [00:00<00:21, 29.84it/s]invalid instruction: PUSH18
invalid instruction: PUSH28
invalid instruction: PUSH22
  4%|▍         | 25/652 [00:00<00:22, 27.65it/s]invalid instruction: PUSH30
  5%|▌         | 33/652 [00:01<00:26, 23.77it/s]invalid instruction: PUSH22
invalid instruction: PUSH13
  6%|▌         | 36/652 [00:01<00:30, 20.34it/s]invalid instruction: PUSH28
invalid instruction: PUSH16
invalid instruction: PUSH27
invalid instruction: PUSH24
  7%|▋         | 43/652 [00:01<00:21, 28.49it/s]invalid instruction: PUSH15
invalid instruction: PUSH32

In [29]:
df.head()

Unnamed: 0,address,PUSH1,MSTORE,CALLVALUE,DUP1,ISZERO,PUSH2,JUMPI,REVERT,JUMPDEST,...,UNKNOWN_0xcb,INVALID_0x74,UNKNOWN_0xae,UNKNOWN_0xe4,INVALID_0x7e,INVALID_0x6d,INVALID_0x72,INVALID_0x69,INVALID_0x68,INVALID_0x76
0,0x6d86f0a41c3966cef8ea139648db707e912563c9,589,175,1,114,48,306,89,27,195,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0xd248537d601f8e062dd36c8e7d26302d51cc653a,851,255,1,347,67,316,101,38,224,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0xa4dbc813f7e1bf5827859e278594b1e0ec1f710f,729,180,28,202,71,382,130,61,246,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0xc606b9ac3dc335aba427011547a4211c298ca5e4,1980,451,60,681,238,840,315,181,538,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0xc5a25e92e691635bdd6df2e904633dc3152360cd,949,175,31,298,144,489,183,93,318,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 652 entries, 0 to 651
Columns: 281 entries, address to INVALID_0x76
dtypes: float64(199), int64(81), object(1)
memory usage: 1.4+ MB


In [31]:
data.append(df)

In [32]:
df.to_csv(os.path.join(OUT_PATH, 'rug-feature-opcode-frequency.csv'), index=False)

## Trapdoor

In [33]:
TRAP_PATH = os.path.join(EXT_PATH, 'trapdoordata/hex')

In [34]:
df = extract_from_folder(TRAP_PATH)

  0%|          | 0/11943 [00:00<?, ?it/s]invalid instruction: PUSH20
invalid instruction: PUSH32
  0%|          | 4/11943 [00:00<08:23, 23.71it/s]invalid instruction: PUSH20
  0%|          | 7/11943 [00:00<07:43, 25.77it/s]invalid instruction: PUSH28
  0%|          | 15/11943 [00:00<06:48, 29.23it/s]invalid instruction: PUSH16
invalid instruction: PUSH22
invalid instruction: PUSH29
  0%|          | 20/11943 [00:00<06:50, 29.04it/s]invalid instruction: PUSH24
  0%|          | 24/11943 [00:00<06:42, 29.63it/s]invalid instruction: PUSH31
  0%|          | 28/11943 [00:01<07:20, 27.04it/s]invalid instruction: PUSH21
invalid instruction: PUSH29
invalid instruction: PUSH20
  0%|          | 32/11943 [00:01<07:47, 25.48it/s]invalid instruction: PUSH30
  0%|          | 40/11943 [00:01<07:24, 26.80it/s]invalid instruction: PUSH20
invalid instruction: PUSH32
  0%|          | 43/11943 [00:01<08:28, 23.42it/s]invalid instruction: PUSH29
invalid instruction: PUSH13
  0%|          | 55/11943 [00:02<07

In [35]:
df.head()

Unnamed: 0,address,PUSH1,MSTORE,CALLDATASIZE,LT,PUSH2,JUMPI,CALLDATALOAD,SHR,DUP1,...,INVALID_0x71,INVALID_0x79,INVALID_0x72,INVALID_0x76,PUSH30,INVALID_0x78,INVALID_0x6e,INVALID_0x6d,INVALID_0x6b,INVALID_0x6a
0,0x0983118867e36dee0f993bbfbe7f67639e2db3c9,776,168,11,14,381,125,13,1,208,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0x32e1c61e6213a101d474404a74dca27c25f8f630,380,93,9,15,158,74,17,1,213,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0x74d6a669e27ac5cfa6bc380f983e5d82aef66f5a,918,194,10,9,394,133,11,1,213,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0xe5ffd337213d7f565721bb3abbf539efb8756031,1911,508,32,38,927,329,25,2,448,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0x7f0c3fb8755c3ee81f2c6f20f152bc27b0662348,1336,288,28,19,723,271,17,2,367,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11943 entries, 0 to 11942
Columns: 279 entries, address to INVALID_0x6a
dtypes: float64(187), int64(91), object(1)
memory usage: 25.4+ MB


In [37]:
data.append(df)

In [38]:
OUT_PATH = os.path.join(INT_PATH, 'trapdoor')

In [39]:
df.to_csv(os.path.join(OUT_PATH, 'feature.csv'), index=False)

## Show Check Feature

In [40]:
# Convert column sets
col_sets = [set(df.columns) for df in data]

# 1. Common columns
common_cols = set.intersection(*col_sets)

# 2. Number of columns in each
col_counts = {i: len(cols) for i, cols in enumerate(col_sets)}

# 3. Differences (unique to each)
diffs = {
    i: cols - common_cols
    for i, cols in enumerate(col_sets)
}

# Display results
print(f"✅ Common Columns ({len(common_cols)}):\n{sorted(common_cols)}\n")
print("📊 Column Counts:")
for name, count in col_counts.items():
    print(f"  {name}: {count}")

print("\n❌ Differences (unique columns):")
for name, unique in diffs.items():
    print(f"  {name} ({len(unique)} unique): {sorted(unique)}")

✅ Common Columns (266):
['ADD', 'ADDMOD', 'ADDRESS', 'AND', 'BALANCE', 'BASEFEE', 'BLOCKHASH', 'BYTE', 'CALL', 'CALLCODE', 'CALLDATACOPY', 'CALLDATALOAD', 'CALLDATASIZE', 'CALLER', 'CALLVALUE', 'CHAINID', 'CODECOPY', 'CODESIZE', 'COINBASE', 'CREATE', 'CREATE2', 'DELEGATECALL', 'DIFFICULTY', 'DIV', 'DUP1', 'DUP10', 'DUP11', 'DUP12', 'DUP13', 'DUP14', 'DUP15', 'DUP16', 'DUP2', 'DUP3', 'DUP4', 'DUP5', 'DUP6', 'DUP7', 'DUP8', 'DUP9', 'EQ', 'EXP', 'EXTCODECOPY', 'EXTCODEHASH', 'EXTCODESIZE', 'GAS', 'GASLIMIT', 'GASPRICE', 'GT', 'INVALID_0x6c', 'INVALID_0x6e', 'INVALID_0x6f', 'INVALID_0x70', 'INVALID_0x72', 'INVALID_0x73', 'INVALID_0x74', 'INVALID_0x77', 'INVALID_0x78', 'INVALID_0x79', 'INVALID_0x7a', 'INVALID_0x7c', 'INVALID_0x7d', 'INVALID_0x7e', 'INVALID_0x7f', 'ISZERO', 'JUMP', 'JUMPDEST', 'JUMPI', 'LOG0', 'LOG1', 'LOG2', 'LOG3', 'LOG4', 'LT', 'MLOAD', 'MOD', 'MSIZE', 'MSTORE', 'MSTORE8', 'MUL', 'MULMOD', 'NOT', 'NUMBER', 'OR', 'ORIGIN', 'PC', 'POP', 'PUSH1', 'PUSH10', 'PUSH11', 'PUSH12'