# Extract Opcode N-grams from Bytecode

In [1]:
import pandas as pd
import os
from pathlib import Path

from evmdasm import EvmBytecode
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
PATH = Path.cwd().parents[1]
DATA_PATH = os.path.join(PATH, 'data')
EXT_PATH = os.path.join(DATA_PATH, 'external')
INT_PATH = os.path.join(DATA_PATH, 'interim')

In [3]:
def gen_corpus(file_path):
    corpus = []
    addresses = []
    for filename in os.listdir(file_path):
        if filename.endswith('.hex'):
            address = filename.replace('.hex', '')
        else:
            address = filename.replace('.txt', '')

        with open(os.path.join(file_path, filename), 'r') as f:
            bytecode = f.read().strip()
            evm_bytecode = EvmBytecode(bytecode)
            instructions = evm_bytecode.disassemble()
            opcodes = [inst.name for inst in instructions]
            corpus.append(' '.join(opcodes))
            addresses.append(address.lower())

    return corpus, addresses

def extract_n_grams(file_path, n=3):
    vectorizer = CountVectorizer(ngram_range=(1, n)) # Extract t-grams to n-grams
    corpus, addresses = gen_corpus(file_path)
    X = vectorizer.fit_transform(corpus)

    # See feature names
    ngrams = vectorizer.get_feature_names_out()

    # Convert to DataFrame
    df = pd.DataFrame(X.toarray(), columns=ngrams, index=addresses)
    df.index.name = 'address'
    return df


## CRPWarner

In [4]:
CRPWARNER_PATH = os.path.join(EXT_PATH, 'crpwarner')

### Ground Truth

In [5]:
GROUND_PATH = os.path.join(CRPWARNER_PATH, 'groundtruth/hex')

In [6]:
df = extract_n_grams(GROUND_PATH)
df.head()

invalid instruction: PUSH30
invalid instruction: PUSH16
invalid instruction: PUSH30
invalid instruction: PUSH24
invalid instruction: PUSH30
invalid instruction: PUSH16
invalid instruction: PUSH15
invalid instruction: PUSH13
invalid instruction: PUSH31
invalid instruction: PUSH25
invalid instruction: PUSH16
invalid instruction: PUSH13
invalid instruction: PUSH25
invalid instruction: PUSH21
invalid instruction: PUSH16
invalid instruction: PUSH15
invalid instruction: PUSH19
invalid instruction: PUSH29
invalid instruction: PUSH29
invalid instruction: PUSH26
invalid instruction: PUSH20
invalid instruction: PUSH17
invalid instruction: PUSH16
invalid instruction: PUSH20
invalid instruction: PUSH13
invalid instruction: PUSH13
invalid instruction: PUSH13
invalid instruction: PUSH24
invalid instruction: PUSH27
invalid instruction: PUSH32
invalid instruction: PUSH29


Unnamed: 0_level_0,add,add add,add add dup2,add add gt,add add mload,add add mstore,add add swap1,add add swap2,add add swap3,add and,...,xor gasprice push1,xor invalid_0x72,xor mul,xor mul extcodehash,xor push32,xor push32 push9,xor sload,xor sload smod,xor unknown_0xe3,xor unknown_0xe3 push5
address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0x8275ebf521dc217aa79c88132017a5bcef001dd9,153,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0x292e89d5d5bdab3af2f5838c194c1983f0140b43,98,2,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0x2753dce37a7edb052a77832039bcc9aa49ad8b25,153,3,0,0,0,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0xe1a0ce8b94c6a5e4791401086763d7bd0a6c18f5,78,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0x9db8a10c7fe60d84397860b3af2e686d4f90c2b7,206,0,0,0,0,0,0,0,0,4,...,0,0,0,0,0,0,0,0,0,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 72 entries, 0x8275ebf521dc217aa79c88132017a5bcef001dd9 to 0x17e65e6b9b166fb8e7c59432f0db126711246bc0
Columns: 13747 entries, add to xor unknown_0xe3 push5
dtypes: int64(13747)
memory usage: 7.6+ MB


In [8]:
OUT_PATH = os.path.join(INT_PATH, 'crpwarner')

In [9]:
df.to_csv(os.path.join(OUT_PATH, 'groundtruth-feature-opcode-n-gram.csv'))