##  Redis vector search - Py Notebook 🐍📑

    Created by: Domen Žukovec

### Imports and const values for OpenAI 🤖🏗️

In [16]:
import os
import json
import tiktoken
import openai
import numpy as np
import redis
from redis.commands.search.field import TagField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from dotenv import load_dotenv
from openai.embeddings_utils import cosine_similarity
from tenacity import retry, wait_random_exponential, stop_after_attempt


# Load environment variables
load_dotenv()

# Configure Azure OpenAI Service API
openai.api_type = "azure"
openai.api_version = "2022-12-01"
openai.api_base = "https://domzisopenai.openai.azure.com/"
openai.api_key = "3673181a896c4997baba06bc82db5e6f"

# Define embedding model and encoding
EMBEDDING_MODEL = 'text-embedding-ada-002'
EMBEDDING_ENCODING = 'cl100k_base'
EMBEDDING_CHUNK_SIZE = 8000
COMPLETION_MODEL = 'TextDavinci003'

REDIS_INDEX_NAME = 'domtistestindex'
VECTOR_FIELD_IN_REDIS='item_vector'
NUMBER_PRODUCTS_INDEX=1000
CHOSEN_EMB_MODEL = 'TextEmbeddingAda002'
REDIS_ADDR = 'redisdbtriguc3.westeurope.redisenterprise.cache.azure.net'
REDIS_PORT = 10000
REDIS_PASSWORD = 'hB+FWOy6MDlsL21bmaD1plSNwTa6Vd70TnID02P0r+0='


# initialize tiktoken for encoding text
encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)

### Load data into notebook 📒📑

In [3]:
def create_array_from_lines(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()

    # Create an empty list to hold the lines
    array_from_lines = []

    for line in lines:
        # Add each line to the list
        line = line.replace("\n", " ")
        line = line.replace("  ", " ")
        array_from_lines.append(line)

    return array_from_lines

# Use the function and print the result
Questions_array = create_array_from_lines('Questions.txt')
Nivo3A_array = create_array_from_lines('Nivo3A.txt')
Nivo3_array = create_array_from_lines('Nivo3.txt')

In [4]:
# print some stats about the questions
print(f"Loaded {len(Questions_array)} documents")
for doc in Questions_array[:3]:
    num_tokens = len(encoding.encode(doc))
    print(f"Content: {doc[:80]}... \n---> Tokens: {num_tokens}\n")

Loaded 37 documents
Content: Kaj je Triglav komplet? ... 
---> Tokens: 10

Content: Kje lahko dobim dodatne informacije o zavarovanju? ... 
---> Tokens: 19

Content: Kakšne so možnosti dodatnih vplačil v zavarovanje? ... 
---> Tokens: 22

Content: Kje lahko dobim dodatne informacije o zavarovanju? ... 
---> Tokens: 19

Content: Kakšne so možnosti dodatnih vplačil v zavarovanje? ... 
---> Tokens: 22

Content: Kakšen je učinek davčne olajšave? ... 
---> Tokens: 18



### Create embeddings for all the questions ⚙️💡

In [14]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text):
    return openai.Embedding.create(input=text, engine=EMBEDDING_MODEL)["data"][0]["embedding"]

# Create embeddings for all questions
#embeddings = [get_embedding(doc) for doc in Questions_array]

# print some stats about the embeddings (first 3)
#for e in embeddings[:3]:
#    print(e)

### Working with Redis 📲📮

For more info you can visit: https://redis-py.readthedocs.io/en/stable/examples/search_vector_similarity_examples.html

In [2]:
# Connect to Redis - here change the host, port, password and ssl parameters to match your Redis Enterprise setup
r = redis.StrictRedis(host='redisdbtriguc3.westeurope.redisenterprise.cache.azure.net',
    port=10000, db=0, password='hB+FWOy6MDlsL21bmaD1plSNwTa6Vd70TnID02P0r+0=', ssl=True)

INDEX_NAME = "domzis_index"                       # Vector Index Name
DOC_PREFIX = "doc:"                               # RediSearch Key Prefix for the Index

# function to create the index
def create_index(vector_dimensions: int):
    try:
        # check to see if index exists
        r.ft(INDEX_NAME).info()
        print("Index already exists!")
    except:
        # schema
        schema = (
            TagField("tag"),                       # Tag Field Name
            VectorField("vector",                  # Vector Field Name
                "FLAT", {                          # Vector Index Type: FLAT or HNSW
                    "TYPE": "FLOAT32",             # FLOAT32 or FLOAT64
                    "DIM": vector_dimensions,      # Number of Vector Dimensions
                    "DISTANCE_METRIC": "COSINE",   # Vector Search Distance Metric
                }
            ),
        )

        # index Definition
        definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.HASH)

        # create Index
        r.ft(INDEX_NAME).create_index(fields=schema, definition=definition)


In [3]:
# Test the Redis connection
r.ping()

True

#### Index set-up ⚙️

In [8]:
# Drop the index
#r.ft(INDEX_NAME).dropindex(delete_documents=True)

b'OK'

In [9]:
# define vector dimensions
VECTOR_DIMENSIONS = 1536

# Create a new index
#create_index(vector_dimensions=VECTOR_DIMENSIONS)

#### Write data to Redis 📝

In [10]:
np_embedding = np.array(embeddings, dtype=np.float32)

# Write to Redis
pipe = r.pipeline()
ct = 0
for i, embedding in enumerate(np_embedding):
    by_em = embedding.tobytes()
    pipe.hset(f"doc:{i}", mapping = {
        "vector": by_em,
        "content": Questions_array[i],
        "nivo3": Nivo3_array[i],
        "nivo3A": Nivo3A_array[i],
        "tag": "openai"
    })
    res = pipe.execute()

#### Now you can query the database by embedding a question and findig the closest one in the Redis DB 🛢️📮

In [31]:
# expected found question: Kaj pomeni odprta zavarovalna doba?

q_em = get_embedding("Dobil sem mail, da potrebujem čas odprte zavarovalne dobe. Kaj to pomeni?")
q_em = np.array(q_em, dtype=np.float32)

In [32]:
INDEX_NAME = "triglav_c3kw"

In [43]:
query = (
    Query("(@tag:{ openai })=>[KNN 2 @c3kw_vector $vec as score]")
     .sort_by("score")
     .return_fields("c3kw_l4", "c3kw_tiid", "nivo3A")
     .paging(0, 1)
     .dialect(2)
)

query_params = {"vec": q_em.tobytes()}
temp_json = r.ft(INDEX_NAME).search(query, query_params).docs

In [44]:
print(temp_json)

[]


In [27]:
for doc in temp_json:
    print("Taxonomy:",  doc['c3kw_l4'], "\nID:", doc['c3kw_tiid'], doc['score'])

### This is the end of the notebook 😊

In [49]:

onekey = r.execute_command("GET triglav_c3kw")
print(onekey)

None


In [39]:
keys = r.keys("triglav_c3kw*")

print(keys)

[b'triglav_c3kw:234', b'triglav_c3kw:51', b'triglav_c3kw:280', b'triglav_c3kw:131', b'triglav_c3kw:459', b'triglav_c3kw:455', b'triglav_c3kw:212', b'triglav_c3kw:552', b'triglav_c3kw:410', b'triglav_c3kw:556', b'triglav_c3kw:184', b'triglav_c3kw:592', b'triglav_c3kw:216', b'triglav_c3kw:342', b'triglav_c3kw:217', b'triglav_c3kw:127', b'triglav_c3kw:162', b'triglav_c3kw:451', b'triglav_c3kw:252', b'triglav_c3kw:517', b'triglav_c3kw:429', b'triglav_c3kw:68', b'triglav_c3kw:58', b'triglav_c3kw:223', b'triglav_c3kw:65', b'triglav_c3kw:566', b'triglav_c3kw:310', b'triglav_c3kw:253', b'triglav_c3kw:25', b'triglav_c3kw:181', b'triglav_c3kw:544', b'triglav_c3kw:28', b'triglav_c3kw:231', b'triglav_c3kw:15', b'triglav_c3kw:447', b'triglav_c3kw:411', b'triglav_c3kw:274', b'triglav_c3kw:24', b'triglav_c3kw:574', b'triglav_c3kw:240', b'triglav_c3kw:483', b'triglav_c3kw:494', b'triglav_c3kw:328', b'triglav_c3kw:126', b'triglav_c3kw:205', b'triglav_c3kw:557', b'triglav_c3kw:399', b'triglav_c3kw:387',

In [36]:
keys = r.execute_command("KEYS *")
print("Keys count", len(keys))
keys

ResponseError: unknown command 'c3kw'

In [19]:
indexes = r.execute_command("FT._List")

# Print the keys
for index in indexes:
    print(index)

b'triglav_c3kw'
b'acs_emb_index'
b'domzis_index_simple'
b'triglav_c3en'
b'triglav_c3'
b'domzis_index_complex'


In [None]:
def redis_query_embedding_index_kw(index_name, redis_conn, query_emb, t_id, topK=5):

    try:
        print("Embeding query size:" + str(len(query_emb)))
        query_vector = np.array(query_emb).astype(np.float32).tobytes()
        q = Query(f'*=>[KNN {topK} @{"c3kw_vector"} $vec_param AS vector_score]').sort_by('vector_score').paging(0,topK).return_fields('vector_score','c3kw_l4','c3kw_tiid').dialect(2)
        params_dict = {"vec_param": query_vector}
        #logging.info(params_dict)
        results = redis_conn.ft(index_name).search(q, query_params = params_dict)
        print(results)
        #logging.info("Query results: %s", results)
        result_for_return = [{
                'id':match.id,
                'cpxq':match['c3kw_l4'],
                'tiid':match['c3kw_tiid'],
                'value':match.vector_score,
                'rank':''
            } for match in results.docs if match.id != t_id]
    except Exception as err:
        print("Upsss exception!! :(", err, err.args)
        #logging.info("err: %s, err.args: %s", err, err.args)