# Best way to jsonify a numpy array


In [1]:
# import textwrap
import json
import numpy as np    

# utility function for this notebook to keep printed output short
def print_short(obj, max_len=80):
    """Print a preview of `obj`, cut to `max_len` items, marking truncation.

    Lists and numpy arrays are converted with `str()` first. Any other value
    (e.g. `str` or `bytes`) is sliced as-is, so it must support `len()` and
    slicing; a `bytes` value is therefore printed in its `b'...'` repr form.

    Args:
        obj: value to preview.
        max_len: maximum number of characters (or bytes) to show.
    """
    if isinstance(obj, (list, np.ndarray)):
        obj = str(obj)

    # The ellipsis threshold has to match the slice length; otherwise a value
    # that was cut would be printed without any sign of truncation.
    print(obj[:max_len], '...' if len(obj) > max_len else '')


### Performance of the standard json serializer is poor

In [2]:
# Length of the sample vector. 1536 is the dimensionality of OpenAI
# `text-embedding-ada-002` embeddings, presumably what "GPT-4 size" meant.
ARRAY_SIZE = 1536  # embedding dimension (see note above)

# float32 sample vector reused by the cells below.
# NOTE: no random seed is set, so the printed values differ between runs.
array = np.random.rand(ARRAY_SIZE).astype(np.float32)

print_short(array)

# Baseline 1: cost of converting the array to a plain Python list.
print('* numpy => list:')
%timeit array.tolist()

# Baseline 2: list conversion plus stdlib JSON encoding.
print('* numpy => list => json:')
%timeit json.dumps(array.tolist())


[0.09692152 0.00438261 0.15677369 ... 0.99155533 0.06084263 0.50901854] 
* numpy => list:
15.6 µs ± 98.6 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
* numpy => list => json:
534 µs ± 3.88 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### It also can't serialize numpy arrays directly, which causes trouble with precision

In [3]:
# tolist() produces Python floats, so the JSON text shows each float32 value
# widened to double precision (many more digits than the array prints below).
print('* Converting to list will enforce using of double precision: ')
print_short(json.dumps(array.tolist()))
# For comparison: the same values as numpy prints them at float32 precision.
print('* Original precision is float32:')
print_short(array)


* Converting to list will enforce using of double precision: 
[0.09692151844501495, 0.004382609855383635, 0.15677368640899658, 0.4102273881435 ...
* Original precision is float32:
[0.09692152 0.00438261 0.15677369 ... 0.99155533 0.06084263 0.50901854] 


### Let's compare with orjson library

In [4]:
import orjson

# OPT_SERIALIZE_NUMPY lets orjson take the ndarray directly, without tolist().
print('* ~20x faster serialization:')
%timeit orjson.dumps(array, option=orjson.OPT_SERIALIZE_NUMPY,)

# The output is bytes; the values keep their short float32 representation.
print('* Correctly handles dtypes, e.g. float32: ')
print_short(orjson.dumps(array, option=orjson.OPT_SERIALIZE_NUMPY,))



* ~20x faster serialization:
26.1 µs ± 182 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
* Correctly handles dtypes, e.g. float32: 
b'[0.09692152,0.00438261,0.15677369,0.4102274,0.5338793,0.7979747,0.4922122,0.3227' ...


### One caveat - orjson returns bytes instead of str, which is probably the right thing to do

In any case, if a `str` is required for compatibility, the bytes are easy to decode.

In [5]:
print('* Convert to str from bytes: ')

# orjson.dumps() returns bytes; decode them as UTF-8 to obtain a str.
s = orjson.dumps(array, option=orjson.OPT_SERIALIZE_NUMPY,).decode('UTF-8')
print(type(s))
print_short(s)


* Convert to str from bytes: 
<class 'str'>
[0.09692152,0.00438261,0.15677369,0.4102274,0.5338793,0.7979747,0.4922122,0.3227 ...


### Use with the standard json module via a custom encoder

The standard json module makes it easy to override the encoder: pass a `json.JSONEncoder` subclass via the `cls` argument.

In [6]:
# extend the json.JSONEncoder class
from typing import Any
import json
import orjson

class FastJSONEncoder(json.JSONEncoder):
    """`json.JSONEncoder` replacement that delegates serialization to orjson.

    Works with `json.dumps(..., cls=FastJSONEncoder)`,
    `json.dump(..., cls=FastJSONEncoder)` and any API that accepts an encoder
    instance. numpy arrays are serialized natively via
    `orjson.OPT_SERIALIZE_NUMPY`.

    Of the standard encoder settings only `default` and `sort_keys` are
    honoured; the others (`indent`, `separators`, `ensure_ascii`, ...) are
    ignored and orjson's compact UTF-8 output is produced instead.
    """

    def encode(self, o: Any) -> str:
        """Serialize `o` to a JSON `str`.

        Args:
            o: object to serialize; may contain numpy arrays.

        Returns:
            The JSON document as `str` (orjson's bytes decoded as UTF-8).

        Raises:
            TypeError: if `o` contains a value that neither orjson nor
                `self.default` can serialize (orjson's `JSONEncodeError`
                is a `TypeError` subclass).
        """
        option = orjson.OPT_SERIALIZE_NUMPY
        if self.sort_keys:
            option |= orjson.OPT_SORT_KEYS
        # self.default is the user-supplied fallback (or the base method that
        # raises TypeError), so custom types keep working as with stdlib json.
        return orjson.dumps(o, default=self.default, option=option).decode('UTF-8')

    def iterencode(self, o: Any, _one_shot: bool = False):
        """Yield the whole document as a single chunk.

        `json.dump()` writes through `iterencode()` rather than `encode()`;
        without this override it would fall back to the stdlib encoder and
        fail on numpy arrays.
        """
        yield self.encode(o)


# Sample document mixing a plain string with the float32 numpy vector.
dict_to_jsonify = {
    'content': 'Some content',
    'embdeddings': array
}

# json.dumps() instantiates the class passed as `cls` and calls its encode().
%timeit json.dumps(dict_to_jsonify, cls=FastJSONEncoder)
print_short(json.dumps(dict_to_jsonify, cls=FastJSONEncoder))

28.2 µs ± 196 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
{"content":"Some content","embdeddings":[0.09692152,0.00438261,0.15677369,0.4102 ...


### Example: storing JSON into Redis 

In [7]:
# Initialize redis client
import redis
# NOTE(review): port 5379 differs from the Redis default (6379) — confirm it
# matches the local server before running this cell.
redis_c = redis.Redis(host='localhost', port=5379, db=0, protocol=3, decode_responses=True)
# Fail early if the server is unreachable.
redis_c.ping()

True

### In this case, you need to use the standard json interface (i.e., our FastJSONEncoder)

In [11]:
import random
# Baseline document: the vector is turned into a Python list up front, so the
# client's default JSON encoder can serialize it.
# NOTE: list(ndarray) yields numpy float64 scalars, not plain Python floats
# (ndarray.tolist() would give the latter).
doc = {
    'embdeddings': list(np.random.rand(ARRAY_SIZE).astype(np.float64))
}
print('Naive conversion: ndarray => list => standard json:')

# The timed statement covers key generation, JSON encoding and the Redis call.
%timeit redis_c.json().set(f'key#{random.randint(0,10_000)}', "$", doc)

Naive conversion: ndarray => list => standard json:
4.63 ms ± 235 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
# Same kind of document, but the ndarray is stored as-is; serialization is
# left to FastJSONEncoder.
doc = {
    'content': 'Some content',
    'embdeddings': np.random.rand(ARRAY_SIZE).astype(np.float64)
}

print('* Fast orjson encoder:')
# A new FastJSONEncoder instance is created inside the timed statement.
%timeit redis_c.json(encoder=FastJSONEncoder()).set(f'key#{random.randint(0,10_000)}', "$", doc)

* Fast orjson encoder:
951 µs ± 41.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
