In [1]:
import os
import pandas as pd
from collections import Counter
from tqdm.notebook import tqdm

# Single-character symbols to tally in each file. The order here is the
# order of the resulting count columns.
SYMBOLS = list('@?[]-+*')


In [2]:
def extract_symbol_frequency(file_path, symbols):
    """Count how often each symbol occurs in a text file.

    Args:
        file_path: Path of the file to scan.
        symbols: Iterable of strings to count. It is materialised once, so
            one-shot iterables (e.g. generators) are supported.

    Returns:
        A list of non-overlapping occurrence counts, one per entry of
        ``symbols`` and in the same order. A symbol listed more than once
        gets the same (true) count in every position.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    symbols = list(symbols)
    # Tally each distinct symbol only once per line; iterating over the raw
    # list would add a repeated symbol's count several times and inflate it.
    distinct_symbols = list(dict.fromkeys(symbols))

    counter = Counter()
    # No encoding is passed, so open() uses the platform default; bytes that
    # cannot be decoded are dropped rather than raising (errors='ignore').
    with open(file_path, 'r', errors='ignore') as f:
        # Counting is done line by line, so a symbol containing a newline
        # is never matched.
        for line in f:
            for symbol in distinct_symbols:
                counter[symbol] += line.count(symbol)
    return [counter[s] for s in symbols]


In [3]:
# Parallel lists: symbol_features[i] holds the symbol counts for file_ids[i].
symbol_features = []
file_ids = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the resulting table reproducible across runs and machines.
for filename in tqdm(sorted(os.listdir('.'))):
    if filename.endswith('.asm'):
        # Strip only the trailing extension. str.replace would also remove
        # any earlier '.asm' inside the name and corrupt the id.
        file_id = filename[:-len('.asm')]
        file_path = os.path.join('.', filename)
        try:
            features = extract_symbol_frequency(file_path, SYMBOLS)
        except Exception as e:
            # Best effort: report the failure and keep the file in the
            # table with an all-zero row so ids and features stay aligned.
            print(f"Error processing {file_path}: {e}")
            features = [0] * len(SYMBOLS)
        symbol_features.append(features)
        file_ids.append(file_id)


  0%|          | 0/15 [00:00<?, ?it/s]

In [4]:
# One row per processed file, one 'sym_<symbol>' column per entry of SYMBOLS.
df_sym = pd.DataFrame(
    data=symbol_features,
    columns=[f'sym_{s}' for s in SYMBOLS],
)
# Put the file identifier in the leftmost column.
df_sym.insert(loc=0, column='Id', value=file_ids)

# Make sure the output directory exists, then write the table without the
# DataFrame index.
os.makedirs('features', exist_ok=True)
df_sym.to_csv('features/symbol_frequency.csv', index=False)

# Show the first rows as the cell output.
df_sym.head()


Unnamed: 0,Id,sym_@,sym_?,sym_[,sym_],sym_-,sym_+,sym_*
0,0A32eTdBKayjCWhZqDOQ,3310,3801,4415,4185,51845,6773,2279
1,0ACDbR5M3ZhBJajygTuf,1996,977,698,698,7250,21624,4217
