In [21]:
import sys
import csv
import pickle

We have a CSV file with one column taken from a database. This column contains the type of an item, such as a document or news item. Lets take a look at a small sample:

In [29]:
with open('itemtypes.csv', newline='') as csvfile:
    data = [i[0] for i in list(csv.reader(csvfile))][1:]

print(f'{len(data)} items in CSV.\n\nSample of 10 items:\n\n', data[:10])

500 items in CSV.

Sample of 10 items:

 ['agenda-item', 'news-item', 'vacancy', 'vacancy', 'vacancy', 'vacancy', 'news-item', 'vacancy', 'forum-topic', 'vacancy']


We can encode all the different types in a dictionary so we can represent it by an integer instead of the full string.

In [30]:
def get_dictionaries(data):
    encode_dictionary = { x: i for i, x in enumerate(set(data)) }
    decode_dictionary = { i: x for i, x in enumerate(set(data)) }
    
    return encode_dictionary, decode_dictionary

In [31]:
encode_dictionary, decode_dictionary = get_dictionaries(data)

print(encode_dictionary)
print(decode_dictionary)

{'forum-topic': 0, 'expert-panel-question': 1, 'dossier': 2, 'document': 3, 'page': 4, 'generic-item': 5, 'bulletin-item': 6, 'venue': 7, 'vacancy': 8, 'news-item': 9, 'agenda-item': 10}
{0: 'forum-topic', 1: 'expert-panel-question', 2: 'dossier', 3: 'document', 4: 'page', 5: 'generic-item', 6: 'bulletin-item', 7: 'venue', 8: 'vacancy', 9: 'news-item', 10: 'agenda-item'}


In [47]:
encoded_data = [encode_dictionary.get(item) for item in data]
print(encoded_data[:100])

[10, 9, 8, 8, 8, 8, 9, 8, 0, 8, 8, 9, 9, 8, 8, 8, 8, 0, 0, 3, 0, 9, 8, 9, 0, 9, 5, 3, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 8, 8, 8, 8, 9, 8, 8, 8, 8, 3, 8, 8, 8, 8, 3, 6, 8, 8, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 9, 9, 1, 8, 9, 3, 0, 9, 7, 8, 8, 8, 3, 0, 8, 9, 8, 3, 8, 8, 8, 8, 8, 9, 9, 8, 8, 8, 8, 8, 3, 3]


As you can see, there are many repetitions of values, such as 8, 8, 8, 8. We can use _run-length encoding_ to save more space. We could encode 8, 8, 8, 8 as (8, 4), leaving us with just 2 integers: one for the actual value, and one for the run length. 

What if the rows are ordered such that there are no long runs of the same value? In case of a real database, where each row has multiple columns you'll have to find the most compact order. Computing the best ordering is an NP-hard problem. Most database systems therefore use a set of heuristics to give a good compaction but runs in a short amount of time.

In this case, for demonstration purposes, we'll just sort the array and use run-length encoding. Lets take a look at the result:

In [48]:
def run_length_encode(data):
    result = []
    
    count = 1
    last_item = None
    length = len(data)

    for i, item in enumerate(data):    
        if last_item is item:
            count += 1
            if i == (length - 1):
                result.append((last_item, count))
            
        elif last_item != None:
            result.append((last_item, count))
            count = 1
            
        last_item = item
            
    return result

In [49]:
run_length_encode(sorted(encoded_data))

[(0, 34),
 (1, 1),
 (2, 3),
 (3, 101),
 (4, 1),
 (5, 3),
 (6, 2),
 (7, 17),
 (8, 167),
 (9, 136),
 (10, 35)]

We went from 500 items to just 11 items in the list!

In [50]:
def describe(data):
    encode_dict, _ = get_dictionaries(data)
    print('Encoding dictionary:\n', encode_dict)

    encoded_data = [encode_dict.get(item) for item in data]
    print('\nData:\n', data[:5])
    print('\nDictionairy encoded:\n', encoded_data[:5])

    sorted_encoded_data = sorted(encoded_data)
    print('\nSorted encoded data:\n', sorted_encoded_data[:5])
    
    full_encoded_data = run_length_encode(sorted_encoded_data)
    print('\nDictionairy + Run-length Encoded:\n', full_encoded_data[:5])
    
    original_size = sys.getsizeof(pickle.dumps(data))
    final_size = sys.getsizeof(pickle.dumps(full_encoded_data))

    print(
        f'\nOriginal: {original_size}\n' \
        f'Encoded: {sys.getsizeof(pickle.dumps(encoded_data))}\n' \
        f'Encoded: {final_size}\n' \
        f'That is a {100 - (final_size * 100 // original_size)}% reduction in size.'
    )

In [51]:
describe(data)

Encoding dictionary:
 {'forum-topic': 0, 'expert-panel-question': 1, 'dossier': 2, 'document': 3, 'page': 4, 'generic-item': 5, 'bulletin-item': 6, 'venue': 7, 'vacancy': 8, 'news-item': 9, 'agenda-item': 10}

Data:
 ['agenda-item', 'news-item', 'vacancy', 'vacancy', 'vacancy']

Dictionairy encoded:
 [10, 9, 8, 8, 8]

Sorted encoded data:
 [0, 0, 0, 0, 0]

Dictionairy + Run-length Encoded:
 [(0, 34), (1, 1), (2, 3), (3, 101), (4, 1)]

Original: 8429
Encoded: 1041
Encoded: 118
That is a 99% reduction in size.


This is a 99% reduction in size! Obviously this is not what you'll get when there are multiple columns, but it does demonstrate the huge amount of compaction you can achieve with dictionary and run-length encoding.

All this makes indexing a whole lot easier as well. When scanning all of the rows, you can just index into the lookup table.
