In [1]:
import os
import re
import pandas as pd
from collections import Counter
from tqdm.notebook import tqdm

# DP patterns to track (names will match paper's format)
DP_FEATURES = [
    'db_count', 'dd_count', 'dw_count', 'dc_count',
    'db_0_count', 'db_non0_count',
    'dd_4param', 'dd_5param', 'dd_6param'
]


In [2]:
def extract_data_definition(file_path):
    db_count = 0
    dd_count = 0
    dw_count = 0
    db_0 = 0
    db_non0 = 0
    dd_4 = dd_5 = dd_6 = 0

    with open(file_path, 'r', errors='ignore') as f:
        for line in f:
            tokens = line.strip().split()
            if len(tokens) < 2:
                continue
            mnemonic = tokens[1].lower()
            operands = tokens[2:] if len(tokens) > 2 else []

            if mnemonic == 'db':
                db_count += 1
                if operands:
                    if all(op == '0' or op == '00h' for op in operands):
                        db_0 += 1
                    else:
                        db_non0 += 1
            elif mnemonic == 'dd':
                dd_count += 1
                if 4 <= len(operands) <= 6:
                    if len(operands) == 4:
                        dd_4 += 1
                    elif len(operands) == 5:
                        dd_5 += 1
                    elif len(operands) == 6:
                        dd_6 += 1
            elif mnemonic == 'dw':
                dw_count += 1

    dc_count = db_count + dd_count + dw_count
    return [
        db_count, dd_count, dw_count, dc_count,
        db_0, db_non0,
        dd_4, dd_5, dd_6
    ]


In [3]:
dp_features = []
file_ids = []

for filename in tqdm(os.listdir('.')):
    if filename.endswith('.asm'):
        file_id = filename.replace('.asm', '')
        file_path = os.path.join('.', filename)
        try:
            features = extract_data_definition(file_path)
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            features = [0] * len(DP_FEATURES)
        dp_features.append(features)
        file_ids.append(file_id)


  0%|          | 0/17 [00:00<?, ?it/s]

In [4]:
df_dp = pd.DataFrame(dp_features, columns=DP_FEATURES)
df_dp.insert(0, 'Id', file_ids)

os.makedirs('features', exist_ok=True)
df_dp.to_csv('features/data_definition_patterns.csv', index=False)

# Preview
df_dp.head()


Unnamed: 0,Id,db_count,dd_count,dw_count,dc_count,db_0_count,db_non0_count,dd_4param,dd_5param,dd_6param
0,0A32eTdBKayjCWhZqDOQ,1184,1439,0,2623,0,1184,1160,0,4
1,0ACDbR5M3ZhBJajygTuf,0,0,0,0,0,0,0,0,0
