In [1]:
import pandas as pd
import os
import json
import tiktoken
import openai
import numpy as np
import redis
from redis.commands.search.field import TagField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from dotenv import load_dotenv
from openai.embeddings_utils import cosine_similarity
from tenacity import retry, wait_random_exponential, stop_after_attempt
from langchain.llms import AzureOpenAI

# Load environment variables. load_dotenv() reads a local .env file (if one
# exists) into os.environ, so secrets can be supplied without living in the
# notebook source.
load_dotenv()

# Configure Azure OpenAI Service API.
# The API key is a secret and is read from the environment; a missing variable
# raises KeyError here rather than failing later with an authentication error.
# The endpoint is not a secret, so the previous value is kept as a fallback.
# NOTE(review): the key and Redis password that used to be hardcoded in this
# cell have been exposed in the notebook history and should be rotated.
openai.api_type = "azure"
openai.api_version = "2022-12-01"
openai.api_base = os.getenv("OPENAI_API_BASE", "https://domzisopenai.openai.azure.com/")
openai.api_key = os.environ["OPENAI_API_KEY"]

# Define embedding model and encoding
EMBEDDING_MODEL = 'text-embedding-ada-002'
EMBEDDING_ENCODING = 'cl100k_base'
EMBEDDING_CHUNK_SIZE = 8000
COMPLETION_MODEL = 'text-davinci-003'

# Redis settings. Host and port fall back to the previous values; the password
# must be provided through the environment.
REDIS_INDEX_NAME = 'domtistestindex'
VECTOR_FIELD_IN_REDIS = 'item_vector'
NUMBER_PRODUCTS_INDEX = 1000
CHOSEN_EMB_MODEL = 'TextEmbeddingAda002'
REDIS_ADDR = os.getenv("REDIS_ADDR", 'redisdbtriguc3.westeurope.redisenterprise.cache.azure.net')
REDIS_PORT = int(os.getenv("REDIS_PORT", "10000"))
REDIS_PASSWORD = os.environ["REDIS_PASSWORD"]


# initialize tiktoken for encoding text
encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)

In [3]:
# Download the knowledge-base workbook and read it into a DataFrame.
data = pd.read_excel(
    'https://domenstorageacc.blob.core.windows.net/randomstuff/Copy%20of%20kb-ada-embeddings-full.xlsx',
    skiprows=0,
)
# Uncomment to preview the frame: data.head()

# Pull the two question columns out as arrays: the complex wording and the
# simple wording (column 'L4').
Questions_array = data['COMPLEX_QUESTION'].values
simple_questions = data['L4'].values


In [4]:
# Sanity check: show the first three entries of the 'L4' column loaded above.
print(simple_questions[:3])

['Kako uredim spremembo imena/priimka?'
 'Kako uredim spremembo kontaktnega podatka?'
 'Kako uredim spremembo telefonske številke?']


In [5]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to ``openai.Embedding.create`` using ``EMBEDDING_MODEL`` as
    the engine and returns the ``"embedding"`` entry of the first item in the
    response's ``"data"`` list.

    The ``retry`` decorator re-invokes the call on failure, stopping after 6
    attempts and waiting a randomized exponential interval (min=1, max=20)
    between attempts.
    """
    response = openai.Embedding.create(engine=EMBEDDING_MODEL, input=text)
    first_result = response["data"][0]
    return first_result["embedding"]

# Create embeddings for the first three simple questions only (one API call
# per question, in order).
embeddings = list(map(get_embedding, simple_questions[:3]))

In [8]:
# Print the raw embedding vectors computed above (one list of floats per
# question). The output is very long because every component is printed.
print(embeddings)

[[-0.006034877151250839, 0.014568888582289219, -0.023387888446450233, -0.007478612475097179, -0.0015290756709873676, 0.023813806474208832, -0.004500321112573147, 0.01351662166416645, -0.005605827551335096, -0.007910793647170067, 0.0163226667791605, -0.012144915759563446, 0.0016363379545509815, 0.010222023352980614, -0.004240385722368956, 0.00012125734065193683, 0.0313425287604332, -0.00839934591203928, 0.004531638231128454, -0.036428485065698624, -0.012445563450455666, 0.026356784626841545, -0.0004556692438200116, 0.018063917756080627, -0.02953864075243473, -0.013792214915156364, 0.017901068553328514, -0.00585010414943099, 0.004115115851163864, -0.01691143587231636, 0.0010162519756704569, -0.004519111476838589, -0.009251181967556477, -0.016535626724362373, -0.03226952627301216, 0.001543951453641057, -0.00464438134804368, -0.00935139786452055, 0.020218560472130775, -0.015145130455493927, 0.0158967487514019, -0.007234336342662573, 0.005991032812744379, -0.016961542889475822, -0.026807757

In [2]:
# Connect to Redis. Host, port and password come from the REDIS_ADDR /
# REDIS_PORT / REDIS_PASSWORD settings defined in the configuration cell, so
# the connection details (and the secret) are maintained in one place only.
# Change those settings, and the ssl flag below, to match your Redis
# Enterprise setup.
r = redis.StrictRedis(
    host=REDIS_ADDR,
    port=REDIS_PORT,
    db=0,
    password=REDIS_PASSWORD,
    ssl=True,
)

INDEX_NAME = "domzis_index_simple"                       # Vector Index Name
DOC_PREFIX = "domzis_doc_simple:"                               # RediSearch Key Prefix for the Index

# function to create the index
def create_index(vector_dimensions: int) -> None:
    """Create the RediSearch vector index ``INDEX_NAME`` unless it already exists.

    The index covers HASH keys starting with ``DOC_PREFIX`` and has two fields:
    a tag field named ``"tag"`` and a FLAT / FLOAT32 / COSINE vector field
    named ``"vector"``.

    Args:
        vector_dimensions: Number of dimensions of the stored vectors (passed
            as the vector field's ``DIM``).

    Only ``redis.exceptions.ResponseError`` from the existence check is treated
    as "index does not exist". Other failures (connection, authentication,
    timeouts) propagate to the caller instead of being mistaken for a missing
    index.
    """
    try:
        # check to see if index exists
        r.ft(INDEX_NAME).info()
    except redis.exceptions.ResponseError:
        # The server rejected the info request for this index name; fall
        # through and create the index.
        pass
    else:
        print("Index already exists!")
        return

    # schema
    schema = (
        TagField("tag"),                       # Tag Field Name
        VectorField("vector",                  # Vector Field Name
            "FLAT", {                          # Vector Index Type: FLAT or HNSW
                "TYPE": "FLOAT32",             # FLOAT32 or FLOAT64
                "DIM": vector_dimensions,      # Number of Vector Dimensions
                "DISTANCE_METRIC": "COSINE",   # Vector Search Distance Metric
            }
        ),
    )

    # index Definition
    definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.HASH)

    # create Index
    r.ft(INDEX_NAME).create_index(fields=schema, definition=definition)

In [3]:
# Test the Redis connection by sending a PING through the client `r`
# created in the connection cell.
r.ping()

True

In [11]:
# Drop the index
#r.ft(INDEX_NAME).dropindex(delete_documents=True)

In [16]:
# define vector dimensions
#VECTOR_DIMENSIONS = 1536

# Create a new index
#create_index(vector_dimensions=VECTOR_DIMENSIONS)

In [10]:
# Convert the embeddings to a float32 array; each row is later stored as raw
# bytes in the hash's "vector" field.
np_embedding = np.array(embeddings, dtype=np.float32)

# Write to Redis.
# Queue one HSET per document on a pipeline and send the whole batch in a
# single execute() after the loop. Calling execute() inside the loop (as this
# cell did before) costs one network round trip per document and defeats the
# purpose of the pipeline.
# transaction=False keeps each HSET an independent command rather than wrapping
# the whole batch in one MULTI/EXEC, which matches the previous per-document
# behaviour.
pipe = r.pipeline(transaction=False)
for i, embedding in enumerate(np_embedding):
    # Keys are built from DOC_PREFIX so they match the prefix the index is
    # defined on.
    pipe.hset(f"{DOC_PREFIX}{i}", mapping={
        "vector": embedding.tobytes(),
        "content": simple_questions[i],
        "tag": "openai",
    })
res = pipe.execute()