In [1]:
from magic_values import *

In [2]:
import struct
import numpy as np

def _plausible_fraction(window):
    """Return the fraction of values in ``window`` whose magnitude is below
    ``FLOAT_DETECT_ABS_HIGH_LIMIT``. ``window`` must be non-empty."""
    return (np.abs(np.array(window)) < FLOAT_DETECT_ABS_HIGH_LIMIT).mean()


def classify_float_chunks(raw_data):
    """Locate byte ranges of ``raw_data`` that look like runs of floats.

    The input is decoded with ``struct`` format ``"f"`` at every byte
    alignment ``0 .. FLOAT_WIDTH - 1``. For each alignment a look-ahead
    window of up to ``FLOAT_DETECT_CHAIN_LENGTH`` consecutive values is kept.
    A chunk is opened at the first alignment whose window has more than
    ``FLOAT_DETECT_THRESHOLD`` of its values below
    ``FLOAT_DETECT_ABS_HIGH_LIMIT`` in magnitude, and closed once that
    fraction falls below ``FLOAT_UNDETECT_THRESHOLD`` for the same alignment,
    or when the input is exhausted.

    Progress and chunk boundaries are reported with ``print``.

    Args:
        raw_data: Bytes-like buffer to scan.

    Returns:
        List of ``(start, end)`` byte offsets in the order the chunks were
        found. ``start`` and ``end`` share the same alignment, so every chunk
        length is a multiple of ``FLOAT_WIDTH``. Empty if ``raw_data`` is
        shorter than ``FLOAT_WIDTH``.
    """
    offsets = []
    if len(raw_data) < FLOAT_WIDTH:
        return offsets

    # One decoder per byte alignment; each slice is trimmed to a whole number
    # of floats because iter_unpack rejects a trailing partial item.
    byte_shift_unpackers = [
        struct.iter_unpack("f", raw_data[i:(len(raw_data) - i) // FLOAT_WIDTH * FLOAT_WIDTH + i])
        for i in range(FLOAT_WIDTH)
    ]

    # Start offset of the chunk currently open, or None when no chunk is open.
    current_chunk_offset = None

    byte_shift_buffers = [[] for _ in byte_shift_unpackers]

    float_offset = 0
    while True:
        # Fill byte shift buffers
        for i, (unpacker, buffer) in enumerate(zip(byte_shift_unpackers, byte_shift_buffers)):
            if unpacker is not None:
                try:
                    while len(buffer) < FLOAT_DETECT_CHAIN_LENGTH:
                        buffer.append(next(unpacker)[0])
                except StopIteration:
                    # Decoder exhausted: stop polling it, keep draining its window.
                    byte_shift_unpackers[i] = None

        if (FLOAT_WIDTH * float_offset) % 1000000 == 0:
            print(f"{FLOAT_WIDTH * float_offset / 1000} kBytes")

        if current_chunk_offset is None:
            for i, buffer in enumerate(byte_shift_buffers):
                if buffer and _plausible_fraction(buffer) > FLOAT_DETECT_THRESHOLD:
                    current_chunk_offset = FLOAT_WIDTH * float_offset + i
                    break
            # Compare against None: a chunk may legitimately start at offset 0,
            # which a plain truthiness test would silently skip.
            if current_chunk_offset is not None:
                print(f"Found chunk starting at: {current_chunk_offset}")
        if current_chunk_offset is not None:
            shift = current_chunk_offset % FLOAT_WIDTH
            buffer = byte_shift_buffers[shift]
            # An exhausted (empty) window never closes the chunk here; the
            # end-of-input branch below does. Checking for emptiness first
            # avoids NumPy's "mean of empty slice" warning and NaN result.
            if buffer and _plausible_fraction(buffer) < FLOAT_UNDETECT_THRESHOLD:
                chunk_end = FLOAT_WIDTH * float_offset + shift
                offsets.append((current_chunk_offset, chunk_end))
                print(f"Chunk ended: {chunk_end}")
                current_chunk_offset = None

        # Consume one float from each buffer
        for buffer in byte_shift_buffers:
            if buffer:
                buffer.pop(0)
        if not any(byte_shift_buffers):
            if current_chunk_offset is not None:
                # NOTE(review): this end offset is the start of the float
                # position just consumed. If this alignment's window still
                # held a value in this iteration, that final float is left
                # out of the chunk (and a chunk opened on the very last float
                # is zero-length). Confirm whether that is intended.
                chunk_end = FLOAT_WIDTH * float_offset + current_chunk_offset % FLOAT_WIDTH
                offsets.append((current_chunk_offset, chunk_end))
                print(f"Chunk ended: {chunk_end}")
                current_chunk_offset = None
            break

        float_offset += 1

    return offsets

# Read the whole input file into memory as raw bytes.
with open("resnet.npy", "rb") as npy_file:
    data = npy_file.read()

# Scan the bytes for float-like regions and show the (start, end) offsets.
chunks = classify_float_chunks(data)
print(chunks)

0.0 kBytes
Found chunk starting at: 1
Chunk ended: 137
Found chunk starting at: 241




1000.0 kBytes
Chunk ended: 1329065
Found chunk starting at: 1329070
2000.0 kBytes
3000.0 kBytes
Chunk ended: 3537130
Found chunk starting at: 3537132
Chunk ended: 3734256
Found chunk starting at: 3734261
Chunk ended: 3964969
Found chunk starting at: 3964972
4000.0 kBytes
Chunk ended: 4172328
Found chunk starting at: 4172334
Chunk ended: 4753358
Found chunk starting at: 4753360
Chunk ended: 4834764
Found chunk starting at: 4834769
5000.0 kBytes
Chunk ended: 5066093
Found chunk starting at: 5066096
Chunk ended: 5091928
Found chunk starting at: 5091933
Chunk ended: 5276373
Found chunk starting at: 5276378
Chunk ended: 5394694
Found chunk starting at: 5394697
Chunk ended: 5486717
Found chunk starting at: 5486722
6000.0 kBytes
7000.0 kBytes
Chunk ended: 7993802
Found chunk starting at: 7993807
8000.0 kBytes
Chunk ended: 8309839
Found chunk starting at: 8309841
9000.0 kBytes
10000.0 kBytes
11000.0 kBytes
12000.0 kBytes
Chunk ended: 12303321
Found chunk starting at: 12303324
Chunk ended: 1235

In [3]:
def process_chunk(data, chunk):
    """Re-encode one chunk of ``data`` with exponent and mantissa bytes split.

    ``data[chunk[0]:chunk[1]]`` is read as a sequence of ``struct`` format
    ``"I"`` words. Each word contributes:

    * one exponent byte: bits 23-30 of the word;
    * three mantissa bytes (most significant first): bits 0-22 of the word
      with the sign bit (bit 31) moved into bit 23.

    All exponent bytes are stored together, followed by all mantissa bytes.
    A one-line size summary and the resulting stream lengths are printed.

    Args:
        data: Bytes buffer containing the chunk.
        chunk: ``(start, end)`` byte offsets of the chunk within ``data``.

    Returns:
        ``data[:start] + MAGIC_MARKER + count + exponents + mantissas +
        data[end:]``, where ``count`` is the number of words packed with
        ``struct`` format ``"I"``.

    Raises:
        struct.error: If the chunk length is not a multiple of the ``"I"``
            item size.
    """
    before = data[:chunk[0]]
    after = data[chunk[1]:]
    payload = data[chunk[0]:chunk[1]]
    print(f"Len: {chunk}   {len(before)} + {len(payload)} + {len(after)} = {len(before) + len(payload) + len(after)}")

    # Accumulate straight into byte buffers instead of building lists of ints
    # and splatting them into struct.pack as millions of positional arguments.
    exponent_bytes = bytearray()
    mantissa_bytes = bytearray()
    for (word,) in struct.iter_unpack("I", payload):
        exponent_bytes.append((word >> 23) & 0xFF)

        # 23 mantissa bits plus the sign bit relocated to bit 23: fits 3 bytes.
        sign_and_mantissa = (word & 0x007FFFFF) | ((word & 0x80000000) >> 8)
        mantissa_bytes += sign_and_mantissa.to_bytes(3, "big")

    exponents = bytes(exponent_bytes)
    mantissas = bytes(mantissa_bytes)
    print(f" -> {len(exponents)} {len(mantissas)}")

    float_count = struct.pack("I", len(exponents))

    return before + MAGIC_MARKER + float_count + exponents + mantissas + after

# Rewrite the chunks from last to first: each rewrite changes the length of
# the buffer, so working backwards keeps the offsets of the chunks that are
# still to be processed valid.
new_data = bytes(data)
for chunk in chunks[::-1]:
    new_data = process_chunk(new_data, chunk)

# Save the re-encoded stream.
with open("test_file", "wb") as out_file:
    out_file.write(new_data)

Len: (102453358, 102453358)   102453358 + 0 + 4 = 102453362
 -> 0 0
Len: (101421309, 102453353)   101421309 + 1032044 + 22 = 102453375
 -> 258011 774033
Len: (100788024, 101421304)   100788024 + 633280 + 1032084 = 102453388
 -> 158320 474960
Len: (98755206, 100788022)   98755206 + 2032816 + 1665379 = 102453401
 -> 508204 1524612
Len: (98450089, 98755201)   98450089 + 305112 + 3698213 = 102453414
 -> 76278 228834
Len: (98182688, 98450084)   98182688 + 267396 + 4003343 = 102453427
 -> 66849 200547
Len: (97192666, 98182686)   97192666 + 990020 + 4270754 = 102453440
 -> 247505 742515
Len: (96547873, 97192661)   96547873 + 644788 + 5260792 = 102453453
 -> 161197 483591
Len: (96513768, 96547868)   96513768 + 34100 + 5905598 = 102453466
 -> 8525 25575
Len: (91155639, 96513767)   91155639 + 5358128 + 5939712 = 102453479
 -> 1339532 4018596
Len: (90895060, 91155632)   90895060 + 260572 + 11297860 = 102453492
 -> 65143 195429
Len: (90873713, 90895057)   90873713 + 21344 + 11558448 = 102453505
 -

In [None]:
# Scratch arithmetic left in an unexecuted cell.
# NOTE(review): presumably a byte length converted to a count of 4-byte
# floats; confirm, or delete this cell before sharing the notebook.
1328824 / 4

In [4]:
# Relative reduction from 94395924 to 88536992 (about 6.2 %).
# NOTE(review): presumably two file sizes in bytes (after vs. before);
# the source of these numbers is not shown in the notebook, so confirm.
1 - 88536992 / 94395924

0.06206763758147016