In [1]:
import os
import re
import pandas as pd
from collections import Counter
from tqdm.notebook import tqdm

# Register names whose occurrences are counted, in output-column order:
# the eight 32-bit general-purpose registers, the 16-bit ax/bx/cx/dx,
# and the 8-bit low/high halves of those four.
REGISTERS = (
    ['eax', 'ebx', 'ecx', 'edx', 'esi', 'edi', 'esp', 'ebp']   # 32-bit
    + ['ax', 'bx', 'cx', 'dx']                                 # 16-bit
    + ['al', 'ah', 'bl', 'bh', 'cl', 'ch', 'dl', 'dh']         # 8-bit
)


In [2]:
def extract_register_usage(file_path, registers):
    """Count how often each register name appears as a whole token in a file.

    The file is read line by line as text (undecodable bytes are dropped),
    lower-cased, and split into maximal runs of word characters
    (letters, digits, underscore). A token is counted only when it is
    exactly equal to one of the requested register names, so ``eax_1`` or
    ``meax`` do not count as ``eax``.

    Parameters
    ----------
    file_path : str
        Path of the text file to scan.
    registers : sequence of str
        Register names to count. Matching is case-insensitive, and names
        of any length or containing digits (e.g. ``'r8'``, ``'xmm0'``)
        are supported.

    Returns
    -------
    list of int
        One count per entry of ``registers``, in the same order.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # Normalise the requested names once; the text is lower-cased below,
    # so an upper-case name would otherwise never match.
    wanted = [name.lower() for name in registers]
    wanted_set = set(wanted)

    # Maximal word-character runs. For purely alphabetic 2-3 letter names
    # this selects exactly the tokens that r'\b[a-zA-Z]{2,3}\b' did, but
    # it does not silently exclude longer names or names with digits.
    token_re = re.compile(r'\w+')

    counter = Counter()
    with open(file_path, 'r', errors='ignore') as f:
        for line in f:
            counter.update(
                token for token in token_re.findall(line.lower())
                if token in wanted_set
            )

    return [counter.get(name, 0) for name in wanted]


In [3]:
# Per-file register counts (one list per file) and the matching file ids,
# kept in step so row i of reg_features belongs to file_ids[i].
reg_features = []
file_ids = []

ASM_SUFFIX = '.asm'

# Filter before iterating so the progress bar total counts only .asm files,
# and sort because os.listdir returns entries in arbitrary order, which
# would make the row order of the resulting table non-reproducible.
asm_filenames = sorted(
    name for name in os.listdir('.') if name.endswith(ASM_SUFFIX)
)

for filename in tqdm(asm_filenames):
    # Strip only the trailing suffix; str.replace would also remove any
    # '.asm' that happens to occur in the middle of the name.
    file_id = filename[:-len(ASM_SUFFIX)]
    file_path = os.path.join('.', filename)
    try:
        features = extract_register_usage(file_path, REGISTERS)
    except Exception as e:
        # Best-effort: report the failure and keep an all-zero row so the
        # file still appears in the output table.
        print(f"Error processing {file_path}: {e}")
        features = [0] * len(REGISTERS)
    reg_features.append(features)
    file_ids.append(file_id)


  0%|          | 0/16 [00:00<?, ?it/s]

In [4]:
# Build the feature table: one row per scanned file, one 'reg_<name>'
# column per tracked register, with the file id as the leading column.
feature_columns = [f'reg_{r}' for r in REGISTERS]
df_reg = pd.DataFrame(data=reg_features, columns=feature_columns)
df_reg.insert(loc=0, column='Id', value=file_ids)

# Persist the table, creating the output directory on first run.
os.makedirs('features', exist_ok=True)
df_reg.to_csv('features/register_usage.csv', index=False)

# Show the first rows as the cell output
df_reg.head()


Unnamed: 0,Id,reg_eax,reg_ebx,reg_ecx,reg_edx,reg_esi,reg_edi,reg_esp,reg_ebp,reg_ax,...,reg_cx,reg_dx,reg_al,reg_ah,reg_bl,reg_bh,reg_cl,reg_ch,reg_dl,reg_dh
0,0A32eTdBKayjCWhZqDOQ,3285,447,1431,960,1829,755,347,1352,8,...,24,7,201,0,28,0,138,0,24,0
1,0ACDbR5M3ZhBJajygTuf,339,313,326,332,298,279,10,623,0,...,0,0,1,1,0,0,0,0,0,0
