In [1]:
import os
import re
import pandas as pd
from collections import defaultdict
from tqdm.notebook import tqdm


In [2]:
# Section names that get their own per-section feature columns; any other
# section name found in a file is counted as "unknown".
# Names are lowercase because parsed section names are lowercased before lookup.
KNOWN_SECTIONS = [
    '.text',
    '.data',
    '.bss',
    '.rdata',
    '.edata',
    '.idata',
    '.rsrc',
    '.tls',
    '.reloc',
]


In [3]:
# Matches the `<section>:<hex address>` prefix of a line, e.g. `.text:00401000`.
_SECTION_PREFIX_RE = re.compile(r'^([.\w]+):[0-9A-Fa-f]+')


def extract_section_features(file_path, known_sections=None):
    """Compute per-section line-count features for one disassembly file.

    Every line that starts with ``<section>:<hex address>`` is attributed to
    that section (name lowercased). Lines without such a prefix still count
    towards the file's total line count.

    Parameters
    ----------
    file_path : str
        Path of the text file to scan.
    known_sections : sequence of str, optional
        Lowercase section names that get dedicated feature slots. Defaults to
        the module-level ``KNOWN_SECTIONS``.

    Returns
    -------
    list
        With ``k = len(known_sections)`` the layout is:

        * ``k`` ints   - line count of each known section, in order
        * 4 ints       - distinct sections, distinct known sections,
                         distinct unknown sections, lines in unknown sections
        * 3 floats     - known-section ratio, unknown-section ratio,
                         unknown-section line ratio (relative to all lines)
        * ``k`` floats - line count of each known section / total lines

        Ratios are ``0.0`` when their denominator is zero (e.g. empty file).

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    if known_sections is None:
        known_sections = KNOWN_SECTIONS
    known_sections = list(known_sections)
    known_lookup = set(known_sections)  # O(1) membership tests

    section_line_counts = defaultdict(int)
    total_lines = 0

    # Explicit encoding keeps the result independent of the platform locale;
    # undecodable bytes are dropped rather than raising.
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            total_lines += 1
            match = _SECTION_PREFIX_RE.match(line)
            if match:
                section_line_counts[match.group(1).lower()] += 1

    def safe_div(x, y):
        # Always return a float so ratio features have a single dtype.
        return x / y if y > 0 else 0.0

    # .get() avoids inserting missing keys into the defaultdict.
    known_counts = [section_line_counts.get(sec, 0) for sec in known_sections]

    unknown_lines = sum(
        count for sec, count in section_line_counts.items()
        if sec not in known_lookup
    )

    total_sections = len(section_line_counts)
    known_section_total = sum(1 for sec in section_line_counts if sec in known_lookup)
    unknown_section_total = total_sections - known_section_total

    features = list(known_counts)
    features += [
        total_sections,
        known_section_total,
        unknown_section_total,
        unknown_lines,
        safe_div(known_section_total, total_sections),
        safe_div(unknown_section_total, total_sections),
        safe_div(unknown_lines, total_lines),
    ]
    # Proportion of each known section relative to the whole file.
    features += [safe_div(count, total_lines) for count in known_counts]

    return features


In [4]:
# Collect one feature row per `.asm` file in the current directory.
ASM_SUFFIX = '.asm'
# Number of aggregate features that extract_section_features emits between
# the per-section counts and the per-section ratios.
N_AGGREGATE_FEATURES = 7

sec_features = []
file_ids = []

# Filter first so the progress bar total reflects only the files processed,
# and sort because os.listdir returns entries in arbitrary order.
asm_files = sorted(name for name in os.listdir('.') if name.endswith(ASM_SUFFIX))

for filename in tqdm(asm_files):
    # Strip only the trailing extension; str.replace would also remove any
    # '.asm' occurring in the middle of the name.
    file_id = filename[:-len(ASM_SUFFIX)]
    file_path = os.path.join('.', filename)
    try:
        features = extract_section_features(file_path)
    except Exception as e:
        # Best effort: report the failure and keep the file as an all-zero row
        # so the row count still matches the number of input files.
        print(f"Error in {file_path}: {e}")
        features = [0] * (2 * len(KNOWN_SECTIONS) + N_AGGREGATE_FEATURES)
    sec_features.append(features)
    file_ids.append(file_id)


  0%|          | 0/19 [00:00<?, ?it/s]

In [5]:
# Column names mirror the feature order: per-section line counts, the seven
# aggregate features, then per-section line ratios.
line_count_cols = [f'{sec}_lines' for sec in KNOWN_SECTIONS]
aggregate_cols = [
    'total_sections',
    'known_sections',
    'unknown_sections',
    'unknown_section_lines',
    'known_section_ratio',
    'unknown_section_ratio',
    'unknown_section_line_ratio',
]
line_ratio_cols = [f'{sec}_line_ratio' for sec in KNOWN_SECTIONS]
columns = [*line_count_cols, *aggregate_cols, *line_ratio_cols]

# One row per file, with the file id as the leading column.
df_sec = pd.DataFrame(sec_features, columns=columns)
df_sec.insert(0, 'Id', file_ids)

# Persist the feature table, creating the output directory if needed.
os.makedirs('features', exist_ok=True)
df_sec.to_csv('features/section_features.csv', index=False)

# Preview
df_sec.head()


Unnamed: 0,Id,.text_lines,.data_lines,.bss_lines,.rdata_lines,.edata_lines,.idata_lines,.rsrc_lines,.tls_lines,.reloc_lines,...,unknown_section_line_ratio,.text_line_ratio,.data_line_ratio,.bss_line_ratio,.rdata_line_ratio,.edata_line_ratio,.idata_line_ratio,.rsrc_line_ratio,.tls_line_ratio,.reloc_line_ratio
0,0A32eTdBKayjCWhZqDOQ,13801,842632,0,39622,0,455,0,0,0,...,0.0,0.015394,0.939903,0.0,0.044196,0.0,0.000508,0.0,0.0,0.0
1,0ACDbR5M3ZhBJajygTuf,23917,417,0,250376,0,241,3,0,3,...,6.2e-05,0.086979,0.001517,0.0,0.910544,0.0,0.000876,1.1e-05,0.0,1.1e-05
