In [86]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [84]:
!docker rm redis-stack
!docker run -d --cpus=1 --name redis-stack -p 5379:6379 -p 8001:8001 redis/redis-stack:latest


redis-stack
7ae53e686a08734ec1c9988d5527fcb764aeee18416a995cad257efe435dce73


In [85]:
import redis
# Notebook-wide client for the container started above (host port 5379).
# protocol=3 and decode_responses=True are relied on later, where replies are
# indexed by string keys.
r = redis.Redis(
    host='localhost',
    port=5379,
    db=0,
    protocol=3,
    decode_responses=True,
)
# Last expression of the cell: shows whether the server answers.
r.ping()

True

In [166]:
def print_redis_stats(client=None):
    """Print a short memory summary and return the full ``MEMORY STATS`` reply.

    Args:
        client: Object exposing ``execute_command``. When omitted, the
            notebook-level connection ``r`` is used, so existing calls without
            arguments behave as before.

    Returns:
        The reply of ``MEMORY STATS`` as returned by the client. It is indexed
        by string keys here, which assumes the reply is a mapping (the
        notebook connects with ``protocol=3``).
    """
    if client is None:
        client = r
    stats = client.execute_command("MEMORY STATS")
    # 2**30 bytes per GB; ' .3' keeps three significant digits with a leading space.
    print(f'Total Allocated: {stats["total.allocated"]/2**30: .3} GB')
    print('Number of keys stored: ', stats['keys.count'])
    return stats
_ = print_redis_stats()

Total Allocated:  0.00287 GB
Number of keys stored:  0


# Prepare data

In [112]:
# Dataset dimensions used throughout the notebook.
EMBEDDINGS_LEN = 1536  # vector length of OpenAI embeddings
NUM_DOCS = 10**5

# Observed Redis footprint in MB per 1000 stored records.
redis_100 = 55

print(f'Number of documents: {NUM_DOCS:,}')
# 4 bytes per float32 component; this counts the raw vectors only.
print(f'More than {4*NUM_DOCS*EMBEDDINGS_LEN / 10**9: .2} GB  (size of embeddings only) will be stored in the table')

# The GB estimate uses floor division, so it is rounded down to a whole number.
print(f'In fact, Redis requires ~{redis_100} MB per 1000 records. Which means, Redis will use ~{redis_100*NUM_DOCS//10**6} GB of RAM.')
print('Also we need to reserve some memory for index')

Number of documents: 100,000
More than  0.61 GB  (size of embeddings only) will be stored in the table
In fact, Redis requires ~55 MB per 1000 records. Which means, Redis will use ~5 GB of RAM.
Also we need to reserve some memory for index


In [94]:
import numpy as np

NUM_CLUSTERS = 10**2
# Integer division keeps the size an int without a float round-trip.
CLUSTER_SIZE = NUM_DOCS // NUM_CLUSTERS

# Seed the generator so the synthetic dataset (and therefore the search
# results further down) is the same on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# One random center per cluster, components uniform in [0, 1).
centers = np.random.rand(NUM_CLUSTERS, EMBEDDINGS_LEN).astype(np.float32)
print('Num. clusters:', centers.shape[0])
# A single set of small Gaussian offsets (scaled by 1/20); gen_data() adds it
# to every center, so all clusters share the same internal layout.
errors = np.random.randn(CLUSTER_SIZE, EMBEDDINGS_LEN).astype(np.float32) / 20
print('Cluster size:', errors.shape[0])


Num. clusters: 100
Cluster size: 1000


In [99]:
from utils.batched import batched
# Show how the imported `batched` helper chunks an iterable; 95 items with
# batch_size=10 makes the shorter final batch visible.
print('Utility function, example of usage:')
for b in batched(range(95), batch_size=10):
    print('next batch:', b)

Utility function, example of usage:
next batch: (0, 1, 2, 3, 4, 5, 6, 7, 8, 9)
next batch: (10, 11, 12, 13, 14, 15, 16, 17, 18, 19)
next batch: (20, 21, 22, 23, 24, 25, 26, 27, 28, 29)
next batch: (30, 31, 32, 33, 34, 35, 36, 37, 38, 39)
next batch: (40, 41, 42, 43, 44, 45, 46, 47, 48, 49)
next batch: (50, 51, 52, 53, 54, 55, 56, 57, 58, 59)
next batch: (60, 61, 62, 63, 64, 65, 66, 67, 68, 69)
next batch: (70, 71, 72, 73, 74, 75, 76, 77, 78, 79)
next batch: (80, 81, 82, 83, 84, 85, 86, 87, 88, 89)
next batch: (90, 91, 92, 93, 94)


In [102]:

def gen_data():
    """Lazily yield every synthetic document, cluster by cluster.

    Each document is a dict with:
      * ``id``: consecutive integer, ``cluster * CLUSTER_SIZE + position``;
      * ``content``: the string ``'some unique content #<id>'``;
      * ``embedding``: one row of ``centers[cluster] + errors`` (a NumPy array).

    Reads the notebook-level ``NUM_CLUSTERS``, ``CLUSTER_SIZE``, ``centers``
    and ``errors``.
    """
    for cluster_no in range(NUM_CLUSTERS):
        # Shift the shared offsets by this cluster's center (broadcast add).
        cluster_vectors = centers[cluster_no] + errors
        first_id = cluster_no * CLUSTER_SIZE

        for position in range(CLUSTER_SIZE):
            doc_id = first_id + position
            yield {
                'id': doc_id,
                'content': f'some unique content #{doc_id}',
                'embedding': cluster_vectors[position],
            }

# Baseline: exhaust the generator without storing anything, to see how much of
# the later load time is spent just producing the documents.
print('Time for iterating over all docs:')
%time for _ in gen_data(): pass

Time for iterating over all docs:
CPU times: user 61.1 ms, sys: 22.2 ms, total: 83.3 ms
Wall time: 82.9 ms


# Save to redis

In [165]:
# Clean db: flush the server so the load below starts from zero stored keys.
r.flushall()
# Called as the last expression (not assigned to `_`), so the returned stats
# mapping is displayed in full below the two printed summary lines.
print_redis_stats()

Total Allocated:  0.00287 GB
Number of keys stored:  0


{'peak.allocated': 6182764464,
 'total.allocated': 3081800,
 'startup.allocated': 1313200,
 'replication.backlog': 0,
 'clients.slaves': 0,
 'clients.normal': 22760,
 'cluster.links': 0,
 'aof.buffer': 0,
 'lua.caches': 0,
 'functions.caches': 216,
 'overhead.total': 1336176,
 'keys.count': 0,
 'keys.bytes-per-key': 0,
 'dataset.bytes': 1745624,
 'dataset.percentage': 98.70088958740234,
 'peak.percentage': 0.04984501749277115,
 'allocator.allocated': 3065328,
 'allocator.active': 6691263488,
 'allocator.resident': 6691263488,
 'allocator-fragmentation.ratio': 2182.88671875,
 'allocator-fragmentation.bytes': 6688198160,
 'allocator-rss.ratio': 1.0,
 'allocator-rss.bytes': 0,
 'rss-overhead.ratio': 1.000004768371582,
 'rss-overhead.bytes': 31744,
 'fragmentation': 2182.89697265625,
 'fragmentation.bytes': 6688229904}

In [125]:
%%time
# from tqdm.notebook import trange, tqdm
from tqdm import tqdm
from itertools import islice
from utils.numpy_to_json import FastJSONEncoder

# Number of JSON.SET commands sent per pipeline round trip.
batch_size = 100

# Set to an integer (e.g. 10_000) for a quick partial load; None loads everything.
limit = None

# Build the encoder once instead of once per document.
# NOTE(review): assumes FastJSONEncoder keeps no per-document state — confirm.
encoder = FastJSONEncoder()

doc_list = islice(gen_data(), limit)

progress = tqdm(
    doc_list,
    # islice never yields more than the generator holds, and `limit=0` means
    # zero documents, so do not fall back to NUM_DOCS via `or`.
    total=NUM_DOCS if limit is None else min(limit, NUM_DOCS),
    desc='Write docs to db',
)

for batch in batched(progress, batch_size):
    # Queue one JSON.SET per document and send the whole batch at once.
    pipeline = r.pipeline()
    for doc in batch:
        redis_key = f"docs:{doc['id']}"
        pipeline.json(encoder=encoder).set(redis_key, "$", doc)
    res = pipeline.execute()
    # Every queued command must report success.
    assert all(res), 'at least one JSON.SET in this batch did not succeed'


Write docs to db: 100%|██████████| 100000/100000 [00:29<00:00, 3342.96it/s]

CPU times: user 5.38 s, sys: 971 ms, total: 6.35 s
Wall time: 29.9 s





## Memory footprint of Redis

In [131]:
_ = print_redis_stats()

Total Allocated:  5.63 GB
Number of keys stored:  100000


# Create index

In [143]:
INDEX_NAME = "idx:docs_vss"


# Drop index if exists.
# info() is used as the existence probe. Only the server's error reply is
# treated as "index missing"; connection problems and KeyboardInterrupt now
# propagate instead of being silently reported as "no index".
try:
    r.ft(INDEX_NAME).info()
except redis.ResponseError:
    print('No index exist yet')
else:  # no error happened - therefore the index exists
    print(
        'Drop index:',
        r.ft(INDEX_NAME).dropindex()
    )
    

No index exist yet


In [144]:
# %%time - no reason to measure time here - index creation will be running in background

from redis.commands.search.field import TextField, NumericField, TagField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
# Index layout: JSON paths on the left, the alias used in queries in `as_name`.
schema = (
    TextField("$.content", no_stem=True, as_name="content"),
    NumericField("$.id", as_name="id"),
    # Brute-force (FLAT) float32 vector index compared by cosine distance.
    VectorField(
        name="$.embedding",
        algorithm="FLAT",
        attributes=dict(
            TYPE="FLOAT32",
            DIM=EMBEDDINGS_LEN,
            DISTANCE_METRIC="COSINE",
        ),
        as_name="vector",
    ),
)

# Index only JSON documents whose key starts with "docs:".
definition = IndexDefinition(prefix=["docs:"], index_type=IndexType.JSON)

res = r.ft(INDEX_NAME).create_index(fields=schema, definition=definition)

CPU times: user 259 µs, sys: 664 µs, total: 923 µs
Wall time: 9.56 ms


In [155]:
# Read the index status. The create-index cell notes that indexing runs in the
# background, so these counters may still be changing if this cell is executed
# right after it; re-run until num_docs matches the number of stored documents.
info = r.ft(INDEX_NAME).info()
num_docs = info["num_docs"]
indexing_failures = info["hash_indexing_failures"]
print(f"{num_docs} documents indexed with {indexing_failures} failures")

100000.0 documents indexed with 0.0 failures


## Index memory footprint
>After the index was created, total memory consumption increased to 5.6 GB.

In [163]:
_ = print_redis_stats()

Total Allocated:  5.63 GB
Number of keys stored:  100000


# Basic search (baseline performance)

In [162]:
import random
rid = random.randint(0, NUM_DOCS)

print('## Retrieve by ID performance:')
%timeit r.json().get(f"docs:{rid}")
r.json().get(f"docs:{rid}")

from redis.commands.search.query import Query, NumericFilter


print('## Filter by numeric field performance:')
query = Query("*").add_filter(NumericFilter(field="id", minval=rid, maxval=rid)).return_fields('id', 'content')
%timeit r.ft(INDEX_NAME).search(query)['results']

print('## Result of search')
r.ft(INDEX_NAME).search(query)['results']



## Retrieve by ID performance:
718 µs ± 37.1 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
## Filter by numeric field performance:
533 µs ± 36 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
## Result of search


[{'id': 'docs:86712',
  'extra_attributes': {'id': '86712', 'content': 'some unique content #86712'},
  'values': []}]

# Vector search

In [164]:
# KNN query: the 10 nearest neighbours of $query_vector on the `vector` field,
# with the distance exposed as `vector_score` and used for sorting.
query = (
    Query('(*)=>[KNN 10 @vector $query_vector AS vector_score]')
    .sort_by('vector_score')
    .return_fields('vector_score', 'id', 'content')
    .dialect(2)
)

# randrange excludes NUM_DOCS. With randint(0, NUM_DOCS) the value NUM_DOCS
# could be drawn, and centers[NUM_DOCS // CLUSTER_SIZE] would be out of range.
rid = random.randrange(NUM_DOCS)
print(f'{rid=}')

# Query with the center of the cluster that document `rid` belongs to.
rid_embeddings = np.array(centers[rid//CLUSTER_SIZE], dtype=np.float32)

def search(embeddings: np.ndarray):
    """Run the notebook-level ``query`` against ``INDEX_NAME``.

    Args:
        embeddings: Query vector; its raw bytes (``tobytes()``) are bound to
            the ``$query_vector`` parameter of the query.

    Returns:
        Whatever ``r.ft(INDEX_NAME).search`` returns for the query.
    """
    query_params = {'query_vector': embeddings.tobytes()}
    return r.ft(INDEX_NAME).search(query, query_params)
# Time the KNN query, then run it once more to keep the reply for display.
%timeit search(rid_embeddings)
found = search(rid_embeddings)


# Each hit in found['results'] carries the requested return fields under
# 'extra_attributes'; render them as a plain-text table.
from tabulate import tabulate
print(tabulate([d['extra_attributes'] for d in found['results']], headers='keys')) # type: ignore


rid=12139
156 ms ± 31.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
  vector_score     id  content
--------------  -----  --------------------------
    0.00324905  12386  some unique content #12386
    0.00327367  12703  some unique content #12703
    0.00329417  12131  some unique content #12131
    0.00336385  12485  some unique content #12485
    0.00336987  12468  some unique content #12468
    0.00337225  12694  some unique content #12694
    0.00339061  12483  some unique content #12483
    0.00339288  12721  some unique content #12721
    0.00339317  12036  some unique content #12036
    0.00339681  12776  some unique content #12776
