In [21]:
import os
import re
import pandas as pd
from collections import Counter
from tqdm.notebook import tqdm

# Common x86 opcodes (expandable list)
OPCODES = [
    'mov', 'push', 'pop', 'call', 'jmp', 'lea', 'add', 'sub', 'mul', 'div',
    'inc', 'dec', 'and', 'or', 'xor', 'nop', 'cmp', 'test', 'shl', 'shr',
    'ret', 'jne', 'je', 'jz', 'jnz', 'loop', 'int', 'neg', 'not', 'sar',
    'sal', 'xchg', 'stos', 'lods', 'scas', 'in', 'out', 'bsf', 'bt', 'cli',
    'sti', 'cbw', 'cwd', 'cdq', 'cmpsb', 'cmpsw', 'cmpsd', 'insb', 'outsb',
    'imul', 'idiv', 'setne', 'sete', 'setnz', 'movzx', 'movsx', 'fadd',
    'fsub', 'fst', 'fld', 'fdiv', 'fsin', 'fcos', 'fabs', 'fchs'
]


In [22]:
def extract_opcodes(file_path, opcode_list):
    op_counter = Counter()
    opcode_set = set(opcode_list)

    with open(file_path, 'r', errors='ignore') as f:
        for line in f:
            # Extract all word tokens from line (assembly + comments)
            tokens = re.findall(r'\b[a-zA-Z]+\b', line)
            for token in tokens:
                token = token.lower()
                if token in opcode_set:
                    op_counter[token] += 1
                    break  # count only the first opcode per line
    return [op_counter.get(op, 0) for op in opcode_list]


In [23]:
opcode_features = []
file_ids = []

for filename in tqdm(os.listdir('.')):
    if filename.endswith('.asm'):
        file_id = filename.replace('.asm', '')
        file_path = os.path.join('.', filename)
        try:
            features = extract_opcodes(file_path, OPCODES)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            features = [0] * len(OPCODES)
        opcode_features.append(features)
        file_ids.append(file_id)


  0%|          | 0/13 [00:00<?, ?it/s]

In [24]:
# Create DataFrame
df_opcodes = pd.DataFrame(opcode_features, columns=[f'op_{op}' for op in OPCODES])
df_opcodes.insert(0, 'Id', file_ids)

# Save
os.makedirs('features', exist_ok=True)
df_opcodes.to_csv('features/opcode_frequency.csv', index=False)

# Preview
df_opcodes.head()


Unnamed: 0,Id,op_mov,op_push,op_pop,op_call,op_jmp,op_lea,op_add,op_sub,op_mul,...,op_movsx,op_fadd,op_fsub,op_fst,op_fld,op_fdiv,op_fsin,op_fcos,op_fabs,op_fchs
0,0A32eTdBKayjCWhZqDOQ,1923,971,471,412,361,367,195,283,5,...,3,0,0,0,3,0,0,0,0,0
1,0ACDbR5M3ZhBJajygTuf,818,25,14,9,85,70,9,1,0,...,0,0,0,0,0,0,0,0,0,0
