In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from tqdm.notebook import tqdm
from math import log2


In [2]:
def shannon_entropy(byte_window):
    total = len(byte_window)
    if total == 0:
        return 0.0
    counts = Counter(byte_window)
    probs = [count / total for count in counts.values()]
    return -sum(p * log2(p) for p in probs)

def compute_entropy_features(file_path, window_size=10000):
    with open(file_path, 'r') as f:
        all_bytes = []
        for line in f:
            parts = line.strip().split()[1:]  # skip the address
            valid_bytes = [b for b in parts if b != '??']
            all_bytes.extend(valid_bytes)

    # Convert hex to integer for numeric entropy calculation
    byte_vals = [int(b, 16) for b in all_bytes if b != '??']

    if len(byte_vals) == 0:
        return [0.0] * 5  # Edge case: empty file

    # Compute entropy over sliding windows
    entropy_list = []
    for i in range(0, len(byte_vals), window_size):
        window = byte_vals[i:i+window_size]
        entropy_list.append(shannon_entropy(window))

    # Extract statistical features from entropy list
    ent_mean = np.mean(entropy_list)
    ent_std = np.std(entropy_list)
    ent_min = np.min(entropy_list)
    ent_max = np.max(entropy_list)
    ent_median = np.median(entropy_list)

    return [ent_mean, ent_std, ent_min, ent_max, ent_median]


In [3]:
entropy_data = []
file_ids = []

for filename in tqdm(os.listdir('.')):
    if filename.endswith('.bytes'):
        file_id = filename.replace('.bytes', '')
        file_path = os.path.join('.', filename)
        entropy_vector = compute_entropy_features(file_path)
        entropy_data.append(entropy_vector)
        file_ids.append(file_id)


  0%|          | 0/8 [00:00<?, ?it/s]

In [4]:
df_entropy = pd.DataFrame(entropy_data, columns=[
    'ent_mean', 'ent_std', 'ent_min', 'ent_max', 'ent_median'
])
df_entropy.insert(0, 'Id', file_ids)

# Save to CSV
os.makedirs('features', exist_ok=True)
df_entropy.to_csv('features/entropy_features.csv', index=False)

# Or preview in notebook
df_entropy.head()


Unnamed: 0,Id,ent_mean,ent_std,ent_min,ent_max,ent_median
0,0A32eTdBKayjCWhZqDOQ,5.237044,2.623987,-0.0,7.549752,6.698176
1,0ACDbR5M3ZhBJajygTuf,4.9583,0.181507,4.721774,6.540587,4.935115
