In [1]:
import os
import re
import pandas as pd
from collections import Counter
from tqdm.notebook import tqdm

# The 15 lowercase "misc" keywords counted in each .asm file; the feature
# columns follow this order. Extend the list as needed.
MISC_KEYWORDS = [
    # registry key names
    'hkey_local_machine',
    'hkey_current_user',
    # DLL names
    'kernel32.dll',
    'user32.dll',
    'shell32.dll',
    'ntdll.dll',
    'wsock32.dll',
    'advapi32.dll',
    # generic substrings
    'dll',
    'debug',
    # API function names
    'virtualalloc',
    'loadlibrarya',
    'exitprocess',
    'createprocessa',
    # dashed lines in IDA
    '------------',
]


In [2]:
def extract_misc_features(file_path, keywords, encoding='utf-8'):
    """Count, per keyword, how many lines of a text file contain it.

    Matching is a case-insensitive substring test. A line is counted at most
    once per keyword, no matter how many times the keyword occurs on it.

    Args:
        file_path: Path of the text file to scan.
        keywords: Sequence of keyword strings. Entries that are equal after
            lowercasing share one count and each receive that same value.
        encoding: Text encoding used to read the file. Set explicitly so the
            result does not depend on the platform's default locale encoding.
            Bytes that cannot be decoded are skipped.

    Returns:
        A list of ints aligned with ``keywords``: element ``i`` is the number
        of lines containing ``keywords[i]``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    keywords_lower = [k.lower() for k in keywords]
    # Unique keys in first-seen order. Iterating the raw list would bump a
    # duplicated keyword once per duplicate and inflate its count.
    line_hits = dict.fromkeys(keywords_lower, 0)

    with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
        for line in f:
            line = line.lower()
            for keyword in line_hits:
                if keyword in line:
                    line_hits[keyword] += 1

    return [line_hits[k] for k in keywords_lower]


In [3]:
# Build one feature row per '.asm' file in the current working directory.
# The names are filtered first so the progress bar counts only '.asm' files,
# and sorted because os.listdir returns entries in arbitrary order.
asm_filenames = sorted(name for name in os.listdir('.') if name.endswith('.asm'))

misc_features = []
file_ids = []

for filename in tqdm(asm_filenames):
    # Strip only the trailing extension; str.replace would also delete any
    # '.asm' that appears earlier in the file name.
    file_id = filename[:-len('.asm')]
    file_path = os.path.join('.', filename)
    try:
        features = extract_misc_features(file_path, MISC_KEYWORDS)
    except Exception as e:
        # Best-effort: report the failure and keep an all-zero row so that
        # file_ids and misc_features stay aligned.
        print(f"Error in {file_path}: {e}")
        features = [0] * len(MISC_KEYWORDS)
    misc_features.append(features)
    file_ids.append(file_id)


  0%|          | 0/20 [00:00<?, ?it/s]

In [4]:
# One 'misc_<keyword>' column per keyword, in MISC_KEYWORDS order.
misc_column_names = [f'misc_{k}' for k in MISC_KEYWORDS]
df_misc = pd.DataFrame(misc_features, columns=misc_column_names)

# Put the file identifier in front as the first column.
df_misc.insert(0, 'Id', file_ids)

# Write the table to disk without the positional index, creating the
# output directory if it does not exist yet.
os.makedirs('features', exist_ok=True)
df_misc.to_csv('features/misc_features.csv', index=False)

# Show the first rows as the cell output.
df_misc.head()


Unnamed: 0,Id,misc_hkey_local_machine,misc_hkey_current_user,misc_kernel32.dll,misc_user32.dll,misc_shell32.dll,misc_ntdll.dll,misc_wsock32.dll,misc_advapi32.dll,misc_dll,misc_debug,misc_virtualalloc,misc_loadlibrarya,misc_exitprocess,misc_createprocessa,misc_------------
0,0A32eTdBKayjCWhZqDOQ,0,0,1,3,0,0,0,2,14,3,8,5,11,0,650
1,0ACDbR5M3ZhBJajygTuf,0,0,2,0,0,0,0,0,18,0,6,0,0,0,93
