# TF-IDF (Term Frequency-Inverse Document Frequency) Extraction

## Setup

In [1]:
import os
import pandas as pd
from pathlib import Path
import numpy as np
import json
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from scipy.sparse import save_npz
import joblib
from evmdasm import EvmBytecode

In [2]:
# Base directory: two levels above the current working directory
# (presumably the repository root when run from the notebook folder — confirm).
PATH = Path.cwd().parents[1]
NAME = 'crpwarner'
DATA_PATH = os.path.join(PATH, 'data')

# Inputs: ground-truth bytecode (.hex) and Solidity (.sol) files of the dataset.
IN_PATH = os.path.join(DATA_PATH, f'external/{NAME}/groundtruth')
HEX_PATH = os.path.join(IN_PATH, 'hex')
SOL_PATH = os.path.join(IN_PATH, 'sol')

# Outputs: interim dataset folder and the destination of the TF-IDF artifacts.
OUT_PATH = os.path.join(DATA_PATH, f'interim/{NAME}')
PRO_PATH = os.path.join(DATA_PATH, 'processed/tf_idf')

# Label table, indexed by the 'address' column so rows can be looked up by address.
dataset_csv = os.path.join(OUT_PATH, 'dataset-modified.csv')
df = pd.read_csv(dataset_csv).set_index('address')

## Load Bytecode

In [3]:
def load_bytecode(file_path):
    """Read a text file and return its content without surrounding whitespace.

    Args:
        file_path: Path of the file to read (opened in text mode).

    Returns:
        The file content as a string, with leading and trailing whitespace
        (including newlines) stripped.
    """
    with open(file_path, 'r') as handle:
        raw = handle.read()
    return raw.strip()


## Disassemble to Opcode List

In [4]:
def get_opcodes(bytecode):
    """Disassemble bytecode and return its instruction names without suffixes.

    Each instruction name is reduced to its leading run of ASCII letters, so
    a name such as ``PUSH1`` becomes ``PUSH``. A name that does not start
    with a letter is kept unchanged.

    Args:
        bytecode: Bytecode passed as-is to ``EvmBytecode``.

    Returns:
        A list of opcode-group strings, in disassembly order.
    """
    opcodes = []
    for instr in EvmBytecode(bytecode).disassemble():
        name = instr.name
        # Keep only the leading letters; digits and anything after are dropped.
        leading_letters = re.match(r'^[a-zA-Z]+', name)
        opcodes.append(leading_letters.group() if leading_letters else name)
    return opcodes


In [5]:
def get_opcode_seq_from_file(hex_file):
    """Return the opcode sequence of a bytecode file as a single string.

    Args:
        hex_file: Path of the file holding the bytecode.

    Returns:
        The opcode groups produced by ``get_opcodes``, joined by single
        spaces (an empty string when there are none).
    """
    return " ".join(get_opcodes(load_bytecode(hex_file)))


## Compute TF-IDF

In [6]:
# Collect every bytecode (.hex) and Solidity (.sol) file in the input folders.
hex_all_files = [*Path(HEX_PATH).glob('*.hex')]
sol_all_files = [*Path(SOL_PATH).glob('*.sol')]

In [7]:
# Solidity source text and matching address for every file listed in df.
documents_sol = []
addresses_sol = []

for sol_file in sol_all_files:
    # The lower-cased file stem is matched against df's 'address' index.
    address = sol_file.stem.lower()
    if address not in df.index:
        continue
    # Undecodable bytes are dropped rather than raising.
    with open(sol_file, 'r', encoding='utf-8', errors='ignore') as src:
        documents_sol.append(src.read())
    addresses_sol.append(address)

In [8]:
# Opcode-sequence document and matching address for every file listed in df.
documents_hex = []
addresses_hex = []

for hex_file in hex_all_files:
    # The lower-cased file stem is matched against df's 'address' index.
    address = hex_file.stem.lower()
    if address not in df.index:
        continue
    documents_hex.append(get_opcode_seq_from_file(hex_file))
    addresses_hex.append(address)

invalid instruction: PUSH30
invalid instruction: PUSH16
invalid instruction: PUSH30
invalid instruction: PUSH24
invalid instruction: PUSH30
invalid instruction: PUSH16
invalid instruction: PUSH15
invalid instruction: PUSH13
invalid instruction: PUSH25
invalid instruction: PUSH16
invalid instruction: PUSH13
invalid instruction: PUSH25
invalid instruction: PUSH21
invalid instruction: PUSH16
invalid instruction: PUSH15
invalid instruction: PUSH19
invalid instruction: PUSH29
invalid instruction: PUSH26
invalid instruction: PUSH20
invalid instruction: PUSH17
invalid instruction: PUSH16
invalid instruction: PUSH20
invalid instruction: PUSH13
invalid instruction: PUSH13
invalid instruction: PUSH13
invalid instruction: PUSH24
invalid instruction: PUSH27
invalid instruction: PUSH32
invalid instruction: PUSH29


## Vectorize

In [9]:
# Word-level TF-IDF over the Solidity sources. The token pattern matches any
# run of word characters, so single-character tokens are kept as well.
vectorizer_sol = TfidfVectorizer(
    analyzer='word',
    lowercase=True,
    token_pattern=r'\b\w+\b',
    max_features=10000,  # tune as needed
)
X_sol = vectorizer_sol.fit_transform(documents_sol)

# Label rows are selected in the same order as the documents; missing labels become 0.
labels_sol = df.loc[addresses_sol][['mint', 'leak', 'limit']]
y_sol = labels_sol.fillna(0).astype(int).values

y_sol[:5]

array([[0, 0, 0],
       [0, 0, 0],
       [0, 1, 1],
       [0, 0, 0],
       [0, 1, 0]])

In [10]:
# Word-level TF-IDF over the opcode sequences. Tokens are lower-cased, and the
# pattern matches any run of word characters (single-character tokens included).
vectorizer_hex = TfidfVectorizer(
    analyzer='word',
    lowercase=True,
    token_pattern=r'\b\w+\b',
    max_features=10000,  # tune as needed
)
X_hex = vectorizer_hex.fit_transform(documents_hex)

# Label rows are selected in the same order as the documents; missing labels become 0.
labels_hex = df.loc[addresses_hex][['mint', 'leak', 'limit']]
y_hex = labels_hex.fillna(0).astype(int).values

y_hex[:5]

array([[0, 0, 0],
       [0, 0, 1],
       [0, 0, 1],
       [0, 0, 0],
       [1, 1, 1]])

## Save

In [11]:
# Persist the Solidity TF-IDF matrix, its label array and the feature names.
# Create the output directory first: the writes below raise FileNotFoundError
# if it does not exist yet.
os.makedirs(PRO_PATH, exist_ok=True)

save_npz(os.path.join(PRO_PATH, "tfidf_vector_sol.npz"), X_sol)
np.save(os.path.join(PRO_PATH, "labels_sol.npy"), y_sol)

# Feature names are stored in column order of X_sol.
with open(os.path.join(PRO_PATH, "feature_sol.json"), "w") as f:
    json.dump(vectorizer_sol.get_feature_names_out().tolist(), f)

In [12]:
# Persist the bytecode (opcode) TF-IDF matrix, its label array and the feature names.
# Create the output directory first: the writes below raise FileNotFoundError
# if it does not exist yet.
os.makedirs(PRO_PATH, exist_ok=True)

# Save the *hex* matrix and labels. Writing X_sol / y_sol here would store the
# Solidity data under the bytecode file names and leave it inconsistent with
# feature_hex.json.
save_npz(os.path.join(PRO_PATH, "tfidf_vector_hex.npz"), X_hex)
np.save(os.path.join(PRO_PATH, "labels_hex.npy"), y_hex)

# Feature names are stored in column order of X_hex.
with open(os.path.join(PRO_PATH, "feature_hex.json"), "w") as f:
    json.dump(vectorizer_hex.get_feature_names_out().tolist(), f)