# Best way to JSON-serialize a NumPy array


### Performance of the standard json serializer is poor

In [27]:
import json
import numpy as np

# Number of elements in the benchmark vector.
# NOTE(review): originally annotated "GPT-4 size"; 1536 looks like an embedding
# dimensionality rather than a model size — confirm.
ARRAY_SIZE = 1536

# Random float32 vector reused by the serialization cells that follow.
array = np.asarray(np.random.rand(ARRAY_SIZE), dtype=np.float32)

# Baseline 1: time only the ndarray -> Python list conversion.
print('* numpy => list:')
%timeit array.tolist()

# Baseline 2: time the list conversion plus encoding with the standard json module.
print('* numpy => list => json:')
%timeit json.dumps(array.tolist())



* numpy => list:
14.4 µs ± 47.2 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)
* numpy => list => json:
505 µs ± 3.03 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


### It also can't serialize NumPy arrays directly, which causes trouble with precision

In [32]:
# Going through a Python list: every value is written out with the long digit
# expansion of a Python float instead of the short float32 form.
print('* Converting to list will enforce using of double precision: ', json.dumps(array.tolist()), sep='\n')
# For comparison, the same array as NumPy itself displays it.
print('* Original precision float32:', array, sep='\n')


* Converting to list will enforce using of double precision: 
[0.21563313901424408, 0.7994503378868103, 0.34577760100364685, 0.31217435002326965, 0.9884631037712097, 0.5132694244384766, 0.5698449015617371, 0.6472328901290894, 0.9778401255607605, 0.07043750584125519, 0.8072468638420105, 0.1123056709766388, 0.013279295526444912, 0.1911858469247818, 0.08677066117525101, 0.9475278854370117, 0.38051748275756836, 0.49188441038131714, 0.5319479703903198, 0.9956527352333069, 0.40636420249938965, 0.6081914901733398, 0.39233049750328064, 0.4615840017795563, 0.7747907042503357, 0.7054621577262878, 0.4227094352245331, 0.3671291172504425, 0.6187775135040283, 0.012714186683297157, 0.9926808476448059, 0.5629149079322815, 0.009374645538628101, 0.5822248458862305, 0.1486317664384842, 0.3553924560546875, 0.6965367197990417, 0.4886581003665924, 0.20402583479881287, 0.7184921503067017, 0.17298877239227295, 0.7067568302154541, 0.9785791635513306, 0.8015710115432739, 0.17290595173835754, 0.7433208227157593,

### Let's compare with the orjson library

In [29]:
import orjson

print('* ~10x faster serialization:')
%timeit orjson.dumps(array, option=orjson.OPT_SERIALIZE_NUMPY,)

print('* Correctly handles dtypes, e.g. float32: ')
print(orjson.dumps(array, option=orjson.OPT_SERIALIZE_NUMPY,))



* ~10x faster serialization:
27 µs ± 280 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
* Correctly handles dtypes, e.g. float32: 
b'[0.21563314,0.79945034,0.3457776,0.31217435,0.9884631,0.5132694,0.5698449,0.6472329,0.9778401,0.070437506,0.80724686,0.11230567,0.0132792955,0.19118585,0.08677066,0.9475279,0.38051748,0.4918844,0.531948,0.99565274,0.4063642,0.6081915,0.3923305,0.461584,0.7747907,0.70546216,0.42270944,0.36712912,0.6187775,0.012714187,0.99268085,0.5629149,0.009374646,0.58222485,0.14863177,0.35539246,0.6965367,0.4886581,0.20402583,0.71849215,0.17298877,0.70675683,0.97857916,0.801571,0.17290595,0.7433208,0.6333287,0.5981384,0.43991965,0.05781845,0.79285794,0.51093894,0.41190213,0.80024016,0.07764193,0.031173496,0.5328294,0.8053367,0.39277837,0.7248377,0.32222635,0.9006709,0.6525522,0.52944624,0.12021926,0.02660124,0.2779233,0.88576233,0.2628897,0.0031420814,0.10652092,0.9543638,0.69205076,0.75561446,0.5077892,0.26992366,0.21348819,0.6784543,0.025277402,0.08942119

### One caveat: orjson returns bytes instead of str, which is probably the right thing to do

In any case, it's easy to convert the result to str if bytes cause compatibility issues.

In [35]:
print('* Convert to str from bytes: ')

# orjson.dumps hands back bytes; turn them into text by decoding as UTF-8.
s = str(orjson.dumps(array, option=orjson.OPT_SERIALIZE_NUMPY), 'UTF-8')
print(type(s), s)


* Convert to str from bytes: 
<class 'str'> [0.21563314,0.79945034,0.3457776,0.31217435,0.9884631,0.5132694,0.5698449,0.6472329,0.9778401,0.070437506,0.80724686,0.11230567,0.0132792955,0.19118585,0.08677066,0.9475279,0.38051748,0.4918844,0.531948,0.99565274,0.4063642,0.6081915,0.3923305,0.461584,0.7747907,0.70546216,0.42270944,0.36712912,0.6187775,0.012714187,0.99268085,0.5629149,0.009374646,0.58222485,0.14863177,0.35539246,0.6965367,0.4886581,0.20402583,0.71849215,0.17298877,0.70675683,0.97857916,0.801571,0.17290595,0.7433208,0.6333287,0.5981384,0.43991965,0.05781845,0.79285794,0.51093894,0.41190213,0.80024016,0.07764193,0.031173496,0.5328294,0.8053367,0.39277837,0.7248377,0.32222635,0.9006709,0.6525522,0.52944624,0.12021926,0.02660124,0.2779233,0.88576233,0.2628897,0.0031420814,0.10652092,0.9543638,0.69205076,0.75561446,0.5077892,0.26992366,0.21348819,0.6784543,0.025277402,0.08942119,0.03723109,0.91562843,0.18506947,0.47438467,0.7388024,0.020431451,0.45738587,0.798711,0.5894037,0.504

### Use with the standard json module via a custom encoder

The standard json module makes it easy to override the encoder: pass a `JSONEncoder` subclass through the `cls` argument.

In [39]:
# extend the json.JSONEncoder class
from typing import Any
import json
import orjson

class FastJSONEncoder(json.JSONEncoder):
    """``json.JSONEncoder`` subclass that delegates all serialization to orjson.

    Pass it as ``cls=FastJSONEncoder`` to ``json.dumps`` / ``json.dump``, or hand
    an instance to any API that expects an object with a ``json.JSONEncoder``
    style ``encode`` method.

    Notes:
        * NumPy arrays are serialized natively via ``orjson.OPT_SERIALIZE_NUMPY``.
        * Constructor options inherited from ``json.JSONEncoder`` (``indent``,
          ``sort_keys``, ``separators``, ``default``, ...) are accepted but are
          not forwarded to orjson, so they have no effect on the output.
    """

    def encode(self, o: Any) -> str:
        """Serialize ``o`` with orjson and return the result as ``str``.

        orjson produces ``bytes``; they are decoded as UTF-8 so the return type
        matches what ``json.JSONEncoder.encode`` callers expect.
        """
        return orjson.dumps(o, option=orjson.OPT_SERIALIZE_NUMPY).decode('UTF-8')

    def iterencode(self, o: Any, _one_shot: bool = False):
        """Yield the complete JSON document for ``o`` as a single chunk.

        ``json.dump`` (writing to a file object) calls ``iterencode`` rather
        than ``encode``. Without this override it would use the inherited
        pure-``json`` implementation, bypassing orjson and raising ``TypeError``
        for NumPy arrays. ``_one_shot`` exists only for signature
        compatibility with the base class and is ignored.
        """
        yield self.encode(o)


dict_to_jsonify = {
    'content': 'Some content',
    'embdeddaings': array
}

%timeit json.dumps(dict_to_jsonify, cls=FastJSONEncoder)
json.dumps(dict_to_jsonify, cls=FastJSONEncoder)

28.1 µs ± 120 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


'{"content":"Some content","embdeddaings":[0.21563314,0.79945034,0.3457776,0.31217435,0.9884631,0.5132694,0.5698449,0.6472329,0.9778401,0.070437506,0.80724686,0.11230567,0.0132792955,0.19118585,0.08677066,0.9475279,0.38051748,0.4918844,0.531948,0.99565274,0.4063642,0.6081915,0.3923305,0.461584,0.7747907,0.70546216,0.42270944,0.36712912,0.6187775,0.012714187,0.99268085,0.5629149,0.009374646,0.58222485,0.14863177,0.35539246,0.6965367,0.4886581,0.20402583,0.71849215,0.17298877,0.70675683,0.97857916,0.801571,0.17290595,0.7433208,0.6333287,0.5981384,0.43991965,0.05781845,0.79285794,0.51093894,0.41190213,0.80024016,0.07764193,0.031173496,0.5328294,0.8053367,0.39277837,0.7248377,0.32222635,0.9006709,0.6525522,0.52944624,0.12021926,0.02660124,0.2779233,0.88576233,0.2628897,0.0031420814,0.10652092,0.9543638,0.69205076,0.75561446,0.5077892,0.26992366,0.21348819,0.6784543,0.025277402,0.08942119,0.03723109,0.91562843,0.18506947,0.47438467,0.7388024,0.020431451,0.45738587,0.798711,0.5894037,0.50457

### Example: storing JSON into Redis 

In [41]:
# Initialize redis client
import redis
# Client for the local Redis instance used by the benchmarks below.
r = redis.Redis(
    host='localhost',
    port=5379,  # NOTE(review): Redis' usual default port is 6379 — confirm 5379 is intentional
    db=0,
    protocol=3,
    decode_responses=True,
)
# Check that the server is reachable before running any benchmark.
r.ping()

True

### In this case, you need to go through the standard json encoder interface (i.e., our FastJSONEncoder)

In [44]:
doc = {
    'embdeddings': list(np.random.rand(ARRAY_SIZE).astype(np.float64))
}
print('Naive ndarray => list => standard json:')
%timeit r.json().set(f'id#{random.randint(0,10_000)}', "$", doc)

Naive ndarray => list => standard json:
1.55 ms ± 208 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [46]:
doc = {
    'content': 'Some content',
    'embdeddings': np.random.rand(ARRAY_SIZE).astype(np.float64)
}

print('* Fast orjson encoder:')
%timeit r.json(encoder=FastJSONEncoder()).set(f'id#{random.randint(0,10_000)}', "$", doc)

* Fast orjson encoder:
907 µs ± 68.5 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
