In [3]:
import os
import pandas as pd
from collections import Counter
from tqdm.notebook import tqdm  # better progress bar in Jupyter

# Two-character uppercase hex labels, one per possible byte value,
# in ascending order ("00", "01", ..., "FF").
BYTE_VALUES = [format(byte, "02X") for byte in range(256)]


In [4]:
def extract_1gram(file_path):
    """Count how often each byte value occurs in a hex-dump text file.

    Each line is split on whitespace; the first token is skipped (it is
    treated as the address) and every remaining token is expected to be a
    two-digit hexadecimal byte. Tokens that are not valid two-digit hex
    (for example the '??' placeholder) are ignored. Hex digits are accepted
    in either case, so 'ff' and 'FF' are counted as the same byte.

    Parameters
    ----------
    file_path : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of int
        A fixed-length list of 256 counts; element ``i`` is the number of
        occurrences of byte value ``i`` (i.e. the order "00", "01", ..., "FF").

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    counts = [0] * 256
    # latin-1 maps every possible byte to a character, so decoding never
    # fails and the result does not depend on the platform's default encoding.
    with open(file_path, 'r', encoding='latin-1') as f:
        for line in f:
            # split() already discards surrounding whitespace; blank lines
            # yield an empty list, so the [1:] slice is safe.
            for token in line.split()[1:]:
                if len(token) != 2:
                    continue
                try:
                    value = bytes.fromhex(token)[0]
                except ValueError:
                    # Not a hex byte (e.g. '??'): skip it.
                    continue
                counts[value] += 1
    return counts


In [5]:
# Build one 1-gram feature vector per .bytes file in the current folder.
# `data` holds the vectors and `file_ids` the matching file names without
# the '.bytes' suffix, in the same order.
data = []
file_ids = []

suffix = '.bytes'
# os.listdir returns entries in arbitrary order, so sort for a reproducible
# row order. Filtering first also makes the progress-bar total reflect only
# the files that are actually processed.
bytes_files = sorted(name for name in os.listdir('.') if name.endswith(suffix))

for filename in tqdm(bytes_files):
    file_path = os.path.join('.', filename)
    data.append(extract_1gram(file_path))
    # Strip only the trailing suffix; str.replace would also remove a
    # '.bytes' that happens to appear in the middle of the name.
    file_ids.append(filename[:-len(suffix)])


  0%|          | 0/7 [00:00<?, ?it/s]

In [6]:
# Assemble the feature table: one row per file, one 'byte_XX' column per
# byte value, with the file identifier placed in front as 'Id'.
feature_columns = [f'byte_{b}' for b in BYTE_VALUES]
df = pd.DataFrame(data, columns=feature_columns)
df.insert(loc=0, column='Id', value=file_ids)

# Persist the table as CSV (the output folder is created when missing).
os.makedirs('features', exist_ok=True)
df.to_csv('features/1gram_byte_frequency.csv', index=False)

# Show the first rows as the cell output.
df.head()


Unnamed: 0,Id,byte_00,byte_01,byte_02,byte_03,byte_04,byte_05,byte_06,byte_07,byte_08,...,byte_F6,byte_F7,byte_F8,byte_F9,byte_FA,byte_FB,byte_FC,byte_FD,byte_FE,byte_FF
0,0A32eTdBKayjCWhZqDOQ,69048,26559,25602,26567,4180,2573,2726,5327,4734,...,2149,1964,2388,1738,1248,1446,2240,1464,2276,24714
1,0ACDbR5M3ZhBJajygTuf,138046,124440,7938,126,6984,23,7459,17,6836,...,77,11,86,64,66,60,24,8,78,653
