In [2]:
import os
import re
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm


In [4]:
def extract_ascii_strings(file_path, min_length=4):
    """Return the lengths of printable-ASCII runs found in a hex-dump file.

    Each line of the file is split on whitespace; the first token is skipped
    (treated as the address column) and every remaining token is parsed as a
    base-16 byte value.  Tokens equal to ``'??'`` are dropped, which means the
    bytes on either side of such a placeholder end up adjacent in the
    reconstructed byte stream.

    Note that, despite the name, the function returns the *lengths* of the
    strings it finds, not the strings themselves.

    Parameters
    ----------
    file_path : str
        Path to the text file containing the hex dump.
    min_length : int, default 4
        Minimum number of consecutive printable bytes (0x20-0x7E) for a run
        to be reported.

    Returns
    -------
    list of int
        Length of every matching run, in order of appearance.  An empty list
        is returned if any token cannot be converted to a byte value
        (non-hex text or a value outside 0-255).
    """
    # Accumulate directly into a bytearray instead of first collecting every
    # hex token in a Python list: dumps can hold millions of tokens and the
    # intermediate list of str objects is far larger than the bytes themselves.
    raw = bytearray()
    with open(file_path, 'r') as f:
        for line in f:
            tokens = line.split()[1:]  # skip address
            try:
                raw.extend(int(tok, 16) for tok in tokens if tok != '??')
            except ValueError:
                return []  # Skip the whole file if conversion fails

    # Runs of printable ASCII (space through tilde) of length >= min_length.
    pattern = re.compile(rb'[ -~]{%d,}' % min_length)
    # Only the lengths are needed, so measure match spans rather than
    # materialising each matched byte string.
    return [m.end() - m.start() for m in pattern.finditer(raw)]


In [5]:
def str_length_histogram(lengths, bins=(0,10,20,30,40,50,60,70,80,90,100,1000),
                         overflow_to_last=True):
    """Bucket string lengths into a fixed-edge histogram.

    Parameters
    ----------
    lengths : sequence of numbers
        String lengths to bucket.  May be empty.
    bins : sequence of numbers, int, or str
        Passed to ``np.histogram``.  With explicit edges, every bin is
        half-open ``[lo, hi)`` except the last, which also includes its
        right edge.
    overflow_to_last : bool, default True
        When ``bins`` is a sequence of edges, count values larger than the
        last edge in the last bin instead of dropping them.  ``np.histogram``
        silently ignores values outside the edges, so without this any length
        above the final edge (1000 by default) would vanish from the feature
        vector.  Pass ``False`` to get the plain ``np.histogram`` behaviour.

    Returns
    -------
    list of int
        One count per bin.
    """
    values = np.asarray(lengths)
    # Only clamp when explicit edges were given; an int or str ``bins`` lets
    # numpy derive the range from the data, so nothing can overflow.
    if overflow_to_last and np.ndim(bins) == 1 and len(bins) > 0:
        values = np.minimum(values, bins[-1])
    hist, _ = np.histogram(values, bins=bins)
    return hist.tolist()


In [6]:
# Build one string-length histogram per '.bytes' file in the working directory.
BYTES_SUFFIX = '.bytes'

# os.listdir() returns entries in arbitrary order; sort so the row order of the
# resulting feature table is reproducible across runs and machines.  Filtering
# up front also makes the progress-bar total reflect only the files processed.
bytes_files = sorted(name for name in os.listdir('.') if name.endswith(BYTES_SUFFIX))

str_features = []
file_ids = []

for filename in tqdm(bytes_files):
    # Strip only the trailing suffix; str.replace would also remove any
    # '.bytes' occurring earlier in the name.
    file_id = filename[:-len(BYTES_SUFFIX)]
    file_path = os.path.join('.', filename)
    lengths = extract_ascii_strings(file_path)
    hist = str_length_histogram(lengths)
    str_features.append(hist)
    file_ids.append(file_id)


  0%|          | 0/10 [00:00<?, ?it/s]

In [7]:
# Column names for the eleven histogram buckets, in bin order.
# NOTE: the labels are nominal. str_length_histogram uses np.histogram with
# edges 0,10,20,...; those bins are half-open [lo, hi), so for example a
# length of exactly 10 is counted under 'len_11_20', not 'len_0_10'.
bin_labels = [
    'len_0_10',
    'len_11_20',
    'len_21_30',
    'len_31_40',
    'len_41_50',
    'len_51_60',
    'len_61_70',
    'len_71_80',
    'len_81_90',
    'len_91_100',
    'len_100+',
]

# One row per file: the file id first, followed by its bucket counts.
df_str = (
    pd.DataFrame(str_features, columns=bin_labels)
    .assign(Id=file_ids)
    [['Id'] + bin_labels]
)

# Persist the feature table, creating the output directory if needed.
out_dir = 'features'
out_csv = 'features/str_length_distribution.csv'
os.makedirs(out_dir, exist_ok=True)
df_str.to_csv(out_csv, index=False)

# Show the first rows as the cell output.
df_str.head()


Unnamed: 0,Id,len_0_10,len_11_20,len_21_30,len_31_40,len_41_50,len_51_60,len_61_70,len_71_80,len_81_90,len_91_100,len_100+
0,0A32eTdBKayjCWhZqDOQ,14790,717,47,19,11,0,2,3,0,3,1
1,0ACDbR5M3ZhBJajygTuf,32,50,21,6,2,2,1,1,1,1,0
