In [31]:
def count_bytes(stream):
    block = stream.read(512)
    if not block: return

    counts = {}
    
    for byte in block:
        if byte in counts:
            counts[byte] += 1
        else:
            counts[byte] = 1
        
    return counts

In [32]:
def write_output(filename, counts):
    with open(filename, "w") as file:
        for key, val in counts.items():
            file.write(f"{key}:{val}\n")
            

In [33]:
def process_file(in_dir, out_dir, filename):
    print(f"{filename} -> ", end="")
    file_index = 0
    
    with open(join(in_dir, filename), "rb") as file:
        while True:
            counts = count_bytes(file)
            if not counts: break

            out_filename = f"{filename}_{file_index}.txt"
            write_output(join(out_dir, out_filename), counts)
            
            file_index += 1

    print(f"{filename}_[0-{file_index-1}].txt")

In [34]:
from os import listdir, mkdir
from os.path import join, exists

data_dir = "dataset/test_train"
preprocessed_dir = "preprocessed_data"

for data_type in listdir(data_dir):
    print(f"Processing '{data_type}':")
    
    in_dir = join(data_dir, data_type)
    
    out_dir = join(preprocessed_dir, data_type)
    if not exists(out_dir): mkdir(out_dir)
    
    for file in listdir(in_dir):
        process_file(in_dir, out_dir, file)
                
    print()

Processing 'image':
couple.jpg -> couple.jpg_[0-47].txt
tree.jpg -> tree.jpg_[0-78].txt
sailboat.tiff -> sailboat.tiff_[0-1536].txt
jellybeans.png -> jellybeans.png_[0-139].txt
house.jpg -> house.jpg_[0-50].txt
female.png -> female.png_[0-185].txt
baboon.tiff -> baboon.tiff_[0-1536].txt

Processing 'text':
TestActorSystem.java -> TestActorSystem.java_[0-0].txt
example2.js -> example2.js_[0-0].txt
Instance3.java -> Instance3.java_[0-20].txt
view.py -> view.py_[0-76].txt
0002_auto__add_field_address_longitude.py -> 0002_auto__add_field_address_longitude.py_[0-2].txt
0004_auto__del_field_address_longtitude.py -> 0004_auto__del_field_address_longtitude.py_[0-2].txt
mode-scala.js -> mode-scala.js_[0-93].txt
jsPlumb-connection-1.4.0-RC1.js -> jsPlumb-connection-1.4.0-RC1.js_[0-50].txt
jquery.email-autocomplete.js -> jquery.email-autocomplete.js_[0-10].txt
ZoneHelper.java -> ZoneHelper.java_[0-26].txt
TweetVO.java -> TweetVO.java_[0-1].txt
mode-csharp.js -> mode-csharp.js_[0-57].txt
BlockDoor

906_838.wav.wav -> 906_838.wav.wav_[0-614].txt
5803_2518.wav.wav -> 5803_2518.wav.wav_[0-344].txt
659_586.wav.wav -> 659_586.wav.wav_[0-918].txt

Processing 'executable':
whoami.exe -> whoami.exe_[0-60].txt
gcc.exe -> gcc.exe_[0-2045].txt
mv.exe -> mv.exe_[0-268].txt
g++.exe -> g++.exe_[0-2045].txt
ls.exe -> ls.exe_[0-261].txt
rm.exe -> rm.exe_[0-124].txt

