In [161]:
import sys
import pickle

In [162]:
hc_data = [
    'NL', 'BE', 'US', 'US', 'AF', 'GE', 'US', 'PO',
    'FN', 'LT', 'RU', 'NL', 'DE', 'DA', 'ZW', 'AF'
]

In [163]:
lc_data = [
    'NL', 'NL', 'NL', 'US', 'US', 'NL', 'NL', 'NL',
    'BE', 'BE', 'NL', 'NL', 'BE', 'US', 'US', 'US'
]

That is 150 MB for every 1 GB. That is 150 GB for every 1 TB. That is 150 TB for every 1 PB.

In [164]:
def get_dictionaries(data):
    encode_dictionary = { x: i for i, x in enumerate(set(data)) }
    decode_dictionary = { i: x for i, x in enumerate(set(data)) }
    
    return encode_dictionary, decode_dictionary

In [165]:
lc_encode_dictionary, lc_decode_dictionary = get_dictionaries(lc_data)

print(lc_encode_dictionary)
print(lc_decode_dictionary)

{'BE': 0, 'US': 1, 'NL': 2}
{0: 'BE', 1: 'US', 2: 'NL'}


In [167]:
lc_encoded_data = [lc_encode_dictionary.get(item) for item in lc_data]
print(lc_encoded_data)

[2, 2, 2, 1, 1, 2, 2, 2, 0, 0, 2, 2, 0, 1, 1, 1]


In [251]:
def run_length_encode(data):
    result = []
    
    count = 1
    last_item = None
    length = len(data)

    for i, item in enumerate(data):    
        if last_item is item:
            count += 1
            if i == (length - 1):
                result.append((last_item, count))
            
        elif last_item != None:
            result.append((last_item, count))
            count = 1
            
        last_item = item
            
    return result

In [252]:
run_length_encode(lc_encoded_data)

[(2, 3), (1, 2), (2, 3), (0, 2), (2, 2), (0, 1), (1, 3)]

In [253]:
def describe(data):
    encode_dict, _ = get_dictionaries(data)
    print('Encoding dictionary:\n', encode_dict)

    encoded_data = [encode_dict.get(item) for item in data]
    print('\nData:\n', data)
    print('\nDictionairy encoded:\n', encoded_data)

    sorted_encoded_data = sorted(encoded_data)
    print('\nSorted encoded data:\n', sorted_encoded_data)
    
    full_encoded_data = run_length_encode(sorted_encoded_data)
    print('\nDictionairy + Run-length Encoded:\n', full_encoded_data)
    
    original_size = sys.getsizeof(pickle.dumps(data))
    final_size = sys.getsizeof(pickle.dumps(full_encoded_data))

    print(
        f'\nOriginal: {original_size}\n' \
        f'Encoded: {sys.getsizeof(pickle.dumps(encoded_data))}\n' \
        f'Encoded: {final_size}\n' \
        f'That is a {100 - (final_size * 100 // original_size)}% reduction.'
    )

In [254]:
print('Low-Cardinality\n--------------\n')
describe(lc_data)

Low-Cardinality
--------------

Encoding dictionary:
 {'BE': 0, 'US': 1, 'NL': 2}

Data:
 ['NL', 'NL', 'NL', 'US', 'US', 'NL', 'NL', 'NL', 'BE', 'BE', 'NL', 'NL', 'BE', 'US', 'US', 'US']

Dictionairy encoded:
 [2, 2, 2, 1, 1, 2, 2, 2, 0, 0, 2, 2, 0, 1, 1, 1]

Sorted encoded data:
 [0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2]

Dictionairy + Run-length Encoded:
 [(0, 3), (1, 5), (2, 8)]

Original: 94
Encoded: 73
Encoded: 62
That is a 35% reduction.


In [255]:
print('High-Cardinality\n--------------\n')
describe(hc_data)

High-Cardinality
--------------

Encoding dictionary:
 {'DE': 0, 'PO': 1, 'US': 2, 'BE': 3, 'GE': 4, 'ZW': 5, 'LT': 6, 'RU': 7, 'AF': 8, 'DA': 9, 'NL': 10, 'FN': 11}

Data:
 ['NL', 'BE', 'US', 'US', 'AF', 'GE', 'US', 'PO', 'FN', 'LT', 'RU', 'NL', 'DE', 'DA', 'ZW', 'AF']

Dictionairy encoded:
 [10, 3, 2, 2, 8, 4, 2, 1, 11, 6, 7, 10, 0, 9, 5, 8]

Sorted encoded data:
 [0, 1, 2, 2, 2, 3, 4, 5, 6, 7, 8, 8, 9, 10, 10, 11]

Dictionairy + Run-length Encoded:
 [(0, 1), (1, 1), (2, 3), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 2), (9, 1), (10, 2)]

Original: 157
Encoded: 73
Encoded: 118
That is a 25% reduction.
