In [3]:
import os
import re
import pandas as pd
from collections import Counter
from tqdm.notebook import tqdm

# Example subset of common Windows API calls (from Kaggle + research papers)
TOP_APIS = [
    'LoadLibraryA', 'GetProcAddress', 'VirtualAlloc', 'VirtualFree',
    'CreateFileA', 'ReadFile', 'WriteFile', 'CloseHandle',
    'GetModuleHandleA', 'GetProcAddress', 'WinExec', 'ShellExecuteA',
    'CreateProcessA', 'RegOpenKeyExA', 'RegSetValueExA', 'RegCreateKeyExA',
    'InternetOpenA', 'InternetOpenUrlA', 'HttpSendRequestA', 'URLDownloadToFileA'
]


In [4]:
def extract_api_calls(file_path, api_list):
    api_counter = Counter()
    api_set = set(api.lower() for api in api_list)

    with open(file_path, 'r', errors='ignore') as f:
        for line in f:
            # Lowercase the line for case-insensitive matching
            line_lower = line.lower()
            for api in api_set:
                if api in line_lower:
                    api_counter[api] += 1
    return [api_counter.get(api.lower(), 0) for api in api_list]


In [5]:
api_features = []
file_ids = []

for filename in tqdm(os.listdir('.')):
    if filename.endswith('.asm'):
        file_id = filename.replace('.asm', '')
        file_path = os.path.join('.', filename)
        try:
            features = extract_api_calls(file_path, TOP_APIS)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            features = [0] * len(TOP_APIS)
        api_features.append(features)
        file_ids.append(file_id)


  0%|          | 0/14 [00:00<?, ?it/s]

In [6]:
df_api = pd.DataFrame(api_features, columns=[f'api_{api}' for api in TOP_APIS])
df_api.insert(0, 'Id', file_ids)

os.makedirs('features', exist_ok=True)
df_api.to_csv('features/api_frequency.csv', index=False)

# Preview
df_api.head()


Unnamed: 0,Id,api_LoadLibraryA,api_GetProcAddress,api_VirtualAlloc,api_VirtualFree,api_CreateFileA,api_ReadFile,api_WriteFile,api_CloseHandle,api_GetModuleHandleA,...,api_WinExec,api_ShellExecuteA,api_CreateProcessA,api_RegOpenKeyExA,api_RegSetValueExA,api_RegCreateKeyExA,api_InternetOpenA,api_InternetOpenUrlA,api_HttpSendRequestA,api_URLDownloadToFileA
0,0A32eTdBKayjCWhZqDOQ,5,10,8,5,3,0,3,0,3,...,0,0,0,0,0,0,0,0,0,0
1,0ACDbR5M3ZhBJajygTuf,0,0,6,0,0,0,4,0,0,...,0,0,0,0,0,0,0,0,0,0
