In [1]:
from struct import pack

data_filename = 'NAhbonds.dat'

# Load every value in the data file into one flat list
data = []
with open(data_filename, 'r') as file:
    # The first line only holds the column headers, so discard it
    next(file)
    for line in file:
        # The first column only holds the row number, so drop it
        columns = line.split()
        data.extend([ int(column) for column in columns[1:] ])

In [9]:
# Show the number of values, the distinct values and the first 12 values
print(len(data), set(data), data[:12], sep='\n')

10000000
{0, 1, 2, 3}
[3, 3, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]


In [10]:
def store_bits (data : list, bit_size : int, filepath : str):
    """Pack a list of small non-negative integers into a binary file.

    Every value takes exactly ``bit_size`` bits, most significant bit first,
    and values are written back to back with no alignment between them.
    When the total number of bits is not a multiple of 8, the last byte is
    padded on the right with 0s.

    All values are validated and encoded in memory before the output file is
    opened, so invalid input never leaves a partially written file behind.

    Args:
        data: List of non-negative integers, each lower than 2 ** bit_size.
        bit_size: Number of bits each value occupies. Must be greater than 0
            and not a multiple of 8.
        filepath: Path of the binary file to write. It is overwritten.

    Raises:
        ValueError: If bit_size is not valid, or if a value is negative or
            does not fit in bit_size bits.
    """
    # Check bit size to make sense
    if bit_size <= 0:
        raise ValueError('Bit size must a number greater than 0')
    if bit_size % 8 == 0:
        raise ValueError('Bit size is multiple of 8 so bytes must be used instead of bits')
    packed = bytearray()
    # Bits waiting to complete a byte, kept as a string of '0' and '1'
    pending_bits = ''
    for value in data:
        bits = format(value, 'b')
        # Negative numbers are rendered with a leading '-', which could still
        # match the expected length after padding, so reject the sign explicitly
        if bits.startswith('-') or len(bits) > bit_size:
            raise ValueError('Value ' + str(value) + ' cannot be stored in ' + str(bit_size) + ' bits')
        pending_bits += bits.zfill(bit_size)
        # Emit every byte which is already complete
        while len(pending_bits) >= 8:
            packed.append(int(pending_bits[:8], 2))
            pending_bits = pending_bits[8:]
    # If the last byte is truncated then fill it with 0s
    if pending_bits:
        packed.append(int(pending_bits.ljust(8, '0'), 2))
    # Write the output file only once all values have been validated
    with open(filepath, 'wb') as file:
        file.write(packed)

In [11]:
# data only holds values from 0 to 3 (see the set printed above),
# so 2 bits per value are enough
store_bits(data, bit_size=2, filepath='test.bin')

In [27]:
parsed_filename = 'parsed_NAhbonds.txt'

# Read the whole parsed file as text
with open(parsed_filename, 'r', encoding='utf_8') as file:
    content = file.read()

# Turn every character into a number:
# NUL characters are mapped to 0 and any other character is converted with int()
parsed_data = [ 0 if char == '\x00' else int(char) for char in content ]

In [28]:
# Compare the first 10 values of the source data and the parsed data
print(data[:10], parsed_data[:10], sep='\n')

[3, 3, 2, 2, 2, 2, 2, 3, 3, 3]
[3, 3, 2, 2, 2, 2, 1, 3, 3, 3]


In [29]:
# Compare values 20 to 29 of the source data against values 10 to 19 of the parsed data
print(data[20:30], parsed_data[10:20], sep='\n')

[3, 3, 2, 2, 2, 2, 2, 3, 3, 3]
[3, 3, 2, 2, 2, 2, 2, 3, 3, 3]


In [30]:
# Compare the number of values in the source data and in the parsed data
print(len(data), len(parsed_data), sep='\n')

10000000
5000000


In [32]:
# Compare the last 20 values of the source data against the last 10 values of the parsed data
print(data[-20:], parsed_data[-10:], sep='\n')

[3, 3, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 2, 3, 2, 2, 2, 3, 3, 3]
[3, 3, 2, 2, 2, 2, 2, 3, 3, 3]
