In [37]:
import pandas as pd
import os
import json
import tiktoken
import openai
import numpy as np
import redis
from redis.commands.search.field import TagField, VectorField
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from dotenv import load_dotenv
from openai.embeddings_utils import cosine_similarity
from tenacity import retry, wait_random_exponential, stop_after_attempt

# Load environment variables (python-dotenv reads a local .env file if one exists)
load_dotenv()

# Configure Azure OpenAI Service API
# Secrets are read from the environment rather than stored in the notebook.
# Required variables: OPENAI_API_KEY, REDIS_PASSWORD (a missing one raises
# KeyError immediately instead of failing later with an auth error).
# NOTE(review): the credentials that used to be hardcoded in this cell should
# be treated as leaked and rotated.
openai.api_type = "azure"
openai.api_version = "2022-12-01"
openai.api_base = "https://domzisopenai.openai.azure.com/"
openai.api_key = os.environ["OPENAI_API_KEY"]

# Define embedding model and encoding
EMBEDDING_MODEL = 'text-embedding-ada-002'
EMBEDDING_ENCODING = 'cl100k_base'
EMBEDDING_CHUNK_SIZE = 8000
COMPLETION_MODEL = 'text-davinci-003'

# Redis connection / index settings
REDIS_INDEX_NAME = 'domtistestindex'
VECTOR_FIELD_IN_REDIS='item_vector'
NUMBER_PRODUCTS_INDEX=1000
CHOSEN_EMB_MODEL = 'TextEmbeddingAda002'
REDIS_ADDR = 'redisdbtriguc3.westeurope.redisenterprise.cache.azure.net'
REDIS_PORT = 10000
REDIS_PASSWORD = os.environ["REDIS_PASSWORD"]


# initialize tiktoken for encoding text
encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)

In [38]:
# Read the category workbook (first two rows skipped), keep only the three
# columns used downstream, and drop every row where any of them is missing.
data = (
    pd.read_excel(
        'https://domenstorageacc.blob.core.windows.net/randomstuff/Kategorije_delovna%20verzija_BS_20092021.xlsx',
        skiprows=2,
    )
    .loc[:, [
        'VSEBINA VPRAŠANJA',
        'Unnamed: 3',
        'STRANKINA VPRAŠANJA Kaj stranke vprašajo pri določeni temi; Kaj jih zanima?',
    ]]
    .dropna()
)
data.head()

Unnamed: 0,VSEBINA VPRAŠANJA,Unnamed: 3,STRANKINA VPRAŠANJA Kaj stranke vprašajo pri določeni temi; Kaj jih zanima?
33,Triglav komplet,KATEGORIZACIJA - NE MOREM SPREMENITI,Kaj je Triglav komplet?
57,Pokojninska zavarovanja,Individualno prostovoljno pokojninsko zavarovanje,Kje lahko dobim dodatne informacije o zavarova...
58,Pokojninska zavarovanja,Individualno prostovoljno pokojninsko zavarovanje,Kakšne so možnosti dodatnih vplačil v zavarova...
59,Pokojninska zavarovanja,Individualno prostovoljno dodatno pokojninsko ...,Kje lahko dobim dodatne informacije o zavarova...
60,Pokojninska zavarovanja,Individualno prostovoljno dodatno pokojninsko ...,Kakšne so možnosti dodatnih vplačil v zavarova...


In [39]:
# Pull each kept column out of the frame via .values, in this order:
#   'VSEBINA VPRAŠANJA' -> Nivo3_array
#   'Unnamed: 3'        -> Nivo3A_array
#   'STRANKINA VPRAŠANJA ...' (the customer questions) -> Questions_array
Nivo3_array, Nivo3A_array, Questions_array = (
    data[column_name].values
    for column_name in (
        'VSEBINA VPRAŠANJA',
        'Unnamed: 3',
        'STRANKINA VPRAŠANJA Kaj stranke vprašajo pri določeni temi; Kaj jih zanima?',
    )
)

In [40]:
# Sanity check: show the first entry of the customer-question column.
print(Questions_array[0])

Kaj je Triglav komplet?


In [41]:
# Number of rows that survived dropna() (the 'VSEBINA VPRAŠANJA' column).
len(Nivo3_array)

325

In [42]:
# print some stats about the questions: the total count, then a preview and
# the token count (under the EMBEDDING_ENCODING tokenizer) of the first three
print(f"Loaded {len(Questions_array)} documents")
for doc in Questions_array[:3]:
    # the length of the encoded sequence is the token count for this question
    num_tokens = len(encoding.encode(doc))
    # doc[:80] limits the preview to at most 80 characters
    print(f"Content: {doc[:80]}... \n---> Tokens: {num_tokens}\n")

Loaded 325 documents
Content: Kaj je Triglav komplet?... 
---> Tokens: 9

Content: Kje lahko dobim dodatne informacije o zavarovanju?... 
---> Tokens: 18

Content: Kakšne so možnosti dodatnih vplačil v zavarovanje?... 
---> Tokens: 21



In [43]:
# Number of questions that will be embedded (one embedding request each).
len(Questions_array)

325

In [44]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def get_embedding(text):
    """Return the embedding of ``text`` produced by EMBEDDING_MODEL.

    The call goes through ``openai.Embedding.create`` and is wrapped in a
    tenacity retry: randomised exponential back-off (min=1, max=20) and at
    most 6 attempts before giving up.

    Args:
        text: The input to embed; passed unchanged as ``input``.

    Returns:
        The ``"embedding"`` entry of the first item in the response's
        ``"data"`` list.
    """
    response = openai.Embedding.create(input=text, engine=EMBEDDING_MODEL)
    first_item = response["data"][0]
    return first_item["embedding"]

In [45]:
# get_embedding() is defined in the previous cell; the identical copy that
# used to be repeated here only shadowed it and has been removed.

# Create embeddings for all questions (one request per question, in order,
# so embeddings[i] corresponds to Questions_array[i])
embeddings = [get_embedding(doc) for doc in Questions_array]

# show the raw embedding vectors of the first 3 questions
for e in embeddings[:3]:
    print(e)

[0.014151286333799362, 0.005806498695164919, 0.004165817517787218, -0.015597978606820107, -0.009173347614705563, 0.015242881141602993, -0.03361587971448898, -0.00862754974514246, -0.004557082429528236, -0.015795255079865456, 0.028486697003245354, 0.028539303690195084, -0.009311441332101822, -0.007568833883851767, -0.00015514958067797124, -0.001984270755201578, 0.02668490633368492, -0.025054089725017548, 0.0059018488973379135, -0.013848795555531979, 0.01118556596338749, 0.0056289504282176495, -0.005635526031255722, -0.009521869011223316, 0.0018922084709629416, 0.003922510426491499, 0.017833776772022247, -0.018307240679860115, -0.004977938253432512, -0.015663737431168556, -0.0014458708465099335, 0.00033043778967112303, -0.02738194912672043, -0.013020235113799572, 0.013756733387708664, -0.0013381908647716045, 0.0004927797126583755, 0.018991131335496902, 0.003439183346927166, 0.022949809208512306, 0.0344838947057724, 0.017925839871168137, -0.0048924521543085575, 0.005369203165173531, -0.03

In [11]:
# Connect to Redis - host, port and password come from the REDIS_* settings in
# the configuration cell, so the credential is not repeated in this cell.
# Change those settings (and ssl) to match your Redis Enterprise setup.
r = redis.StrictRedis(
    host=REDIS_ADDR,
    port=REDIS_PORT,
    db=0,
    password=REDIS_PASSWORD,
    ssl=True,
)

INDEX_NAME = "domzis_index"                       # Vector Index Name
DOC_PREFIX = "doc:"                               # RediSearch Key Prefix for the Index

# function to create the index
def create_index(vector_dimensions: int):
    """Create the vector search index INDEX_NAME unless it already exists.

    The index covers hashes whose key starts with DOC_PREFIX and has two
    fields: a tag field ``tag`` and a FLAT / FLOAT32 / COSINE vector field
    ``vector``.

    Args:
        vector_dimensions: Number of dimensions of the stored vectors (the
            ``DIM`` attribute of the vector field).

    Raises:
        redis.exceptions.RedisError: Connection, authentication or other
            failures now propagate. The previous bare ``except`` treated any
            error as "index missing" and went on to try to create the index.
    """
    try:
        # check to see if index exists
        r.ft(INDEX_NAME).info()
    except redis.exceptions.ResponseError:
        # The server rejected FT.INFO, which is how a missing index is
        # reported; build the index.
        # schema
        schema = (
            TagField("tag"),                       # Tag Field Name
            VectorField("vector",                  # Vector Field Name
                "FLAT", {                          # Vector Index Type: FLAT or HNSW
                    "TYPE": "FLOAT32",             # FLOAT32 or FLOAT64
                    "DIM": vector_dimensions,      # Number of Vector Dimensions
                    "DISTANCE_METRIC": "COSINE",   # Vector Search Distance Metric
                }
            ),
        )

        # index Definition
        definition = IndexDefinition(prefix=[DOC_PREFIX], index_type=IndexType.HASH)

        # create Index
        r.ft(INDEX_NAME).create_index(fields=schema, definition=definition)
    else:
        print("Index already exists!")


In [33]:
# Test the Redis connection: sends PING using the client configured above;
# the recorded output for a working connection is True.
r.ping()

True

In [34]:
# Drop the index; delete_documents=True asks Redis to remove the indexed
# documents as well, so the data has to be written again afterwards.
# NOTE(review): presumably raises if INDEX_NAME does not exist yet - confirm
# before relying on Restart & Run All against a fresh database.
r.ft(INDEX_NAME).dropindex(delete_documents=True)

b'OK'

In [27]:
# define vector dimensions
# This becomes the DIM attribute of the index's vector field, so it has to
# equal the length of the vectors written to Redis.
# NOTE(review): presumably 1536 is the output size of EMBEDDING_MODEL - confirm
# (e.g. against len(embeddings[0])).
VECTOR_DIMENSIONS = 1536

# Create a new index
create_index(vector_dimensions=VECTOR_DIMENSIONS)

In [28]:
# Convert to float32 so the raw bytes match the index's FLOAT32 vector type.
np_embedding = np.array(embeddings, dtype=np.float32)

# Write to Redis: queue one HSET per question on the pipeline, then send the
# whole batch with a single execute(). Calling execute() inside the loop made
# one round trip per row, which defeats the purpose of a pipeline.
pipe = r.pipeline()
for i, embedding in enumerate(np_embedding):
    # Keys use DOC_PREFIX so they fall under the prefix the index is defined on.
    pipe.hset(f"{DOC_PREFIX}{i}", mapping = {
        "vector": embedding.tobytes(),
        "content": Questions_array[i],
        "nivo3": Nivo3_array[i],
        "nivo3A": Nivo3A_array[i],
        "tag": "openai"
    })
# One result per queued HSET, in insertion order.
res = pipe.execute()

In [14]:
# expected found question: Kaj pomeni odprta zavarovalna doba?

# Embed the free-text customer message and store it as a float32 array, the
# same dtype used for the vectors written to Redis.
q_em = np.array(
    get_embedding("Pozdravleni, dobil sem mail, da je moja zavarovalna doba odprta. Kaj to pomeni?"),
    dtype=np.float32,
)

In [15]:
# Restrict to documents tagged "openai", take the 3 nearest neighbours of
# $vec on the "vector" field, and expose the distance as "score".
knn_expression = "(@tag:{ openai })=>[KNN 3 @vector $vec as score]"

query = Query(knn_expression)
query = query.sort_by("score")
query = query.return_fields("content", "nivo3", "nivo3A", "tag", "score")
query = query.paging(0, 3)
query = query.dialect(2)

# $vec is bound to the raw bytes of the query embedding.
query_params = dict(vec=q_em.tobytes())
search_result = r.ft(INDEX_NAME).search(query, query_params)
temp_json = search_result.docs

In [16]:
# Show each hit: matched question, its two category columns and the score
# (results were sorted by score, so the closest match is printed first).
for doc in temp_json:
    print(
        "Taxonomy:", doc['content'],
        "\nNivo3:", doc['nivo3'],
        "\nNivo3A:", doc['nivo3A'],
        "\nScore:", doc['score'],
        "\n",
    )

Taxonomy: Kaj pomeni odprta zavarovalna doba? 
Nivo3: Naložbena in investicijska zavarovanja 
Nivo3A: Naložbeno življenjsko zavarovanje 
Score: 0.0818277001381 

Taxonomy: Zakaj sem prejel/a dokumente za obnovo zavarovanja? 
Nivo3: Nezgodna zavarovanja 
Nivo3A: Nezgodno zavarovanje Pazi name 
Score: 0.130839824677 

Taxonomy: Kako določiti zavarovalno dobo? 
Nivo3: Naložbena in investicijska zavarovanja 
Nivo3A: Naložbeno življenjsko zavarovanje 
Score: 0.145883858204 



In [35]:
# Ask the server for its search indexes with the raw FT._List command
# (the recorded output shows the names come back as bytes).
indexes = r.execute_command("FT._List")
# Print each index name
for index in indexes:
    print(index)

b'test'
b'acs_emb_index'
b'triglav_c3'


In [36]:
# Fetch every key name in the database, report how many there are, and
# display the list as the cell output.
# NOTE(review): KEYS * presumably scans the whole keyspace; fine for a small
# test database, but consider SCAN for anything larger - confirm.
keys = r.execute_command("KEYS *")
print("Keys count", len(keys))
keys

Keys count 5


[b'aHR0cHM6Ly9zdG9yYWdldHJpZ3VjMy5ibG9iLmNvcmUud2luZG93cy5uZXQva21vYWlkZW1vL2dlb19yZWZfbGp1YmxqYW5hXzAzLnBkZg2_S_0',
 b'aHR0cHM6Ly9zdG9yYWdldHJpZ3VjMy5ibG9iLmNvcmUud2luZG93cy5uZXQva21vYWlkZW1vL2dlb19yZWZfbGp1YmxqYW5hXzAzLnBkZg2_S_1',
 b'test:1',
 b'aHR0cHM6Ly9zdG9yYWdldHJpZ3VjMy5ibG9iLmNvcmUud2luZG93cy5uZXQva21vYWlkZW1vL2dlb19yZWZfbGp1YmxqYW5hXzAzLnBkZg2_S_2',
 b'test:2']