## Design Prompt Chains and Retrieval Mechanisms

Load configs for database, Azure OpenAI, and other resources as environment variables.

In [2]:
from dotenv import dotenv_values

# specify the name of the .env file name 
env_name = "llm_pgvector.env" # change to your own .env file name
config = dotenv_values(env_name)

In [None]:
# we will use the same filter_id1_name and filter_id2_name in previous notebooks. 
filter_id1_name = ""  
filter_id2_name = ""

# Connect to Azure Database for PostgreSQL Flexible Server (PG) for retrieval

In [None]:
import psycopg2
from psycopg2 import pool
from psycopg2 import Error

# Connection settings come from the .env values loaded into `config`.
host, dbname, user = config["HOST"], config["DBNAME"], config["USER"]
password, sslmode = config["PASSWORD"], config["SSLMODE"]

# Key/value connection string; later cells reuse it to open new connections.
conn_string = f"host={host} user={user} dbname={dbname} password={password} sslmode={sslmode}"

# Pool created with a minimum of 1 and a maximum of 20 connections.
postgreSQL_pool = pool.SimpleConnectionPool(1, 20, conn_string)
if postgreSQL_pool:
    print("Connection pool created successfully")

# Borrow one connection from the pool and open a cursor on it.
connection = postgreSQL_pool.getconn()
cursor = connection.cursor()

In order to use pgvector, we need to first create the vector extension as described in this [link](https://learn.microsoft.com/en-us/azure/postgresql/flexible-server/how-to-use-pgvector) and shown below.

In [None]:
# Make sure the pgvector extension is installed in the target database.
# This is only needed when the extension was not already enabled from the portal.
#
# psycopg2 opens a transaction implicitly on the first statement, so the
# transaction is finished through the connection object (commit/rollback)
# rather than by sending BEGIN/COMMIT/ROLLBACK as SQL text.
cursor = connection.cursor()

try:
    # IF NOT EXISTS turns the statement into a no-op when the extension is
    # already present, so no separate pg_extension lookup is required.
    cursor.execute("CREATE EXTENSION IF NOT EXISTS vector;")
    connection.commit()
except Exception:
    # Undo the failed transaction so the connection stays usable, then
    # re-raise the original error with its traceback intact.
    connection.rollback()
    raise
finally:
    cursor.close()


Examine the total number of items based on filter_id1_name and Chunk Embedding

In [None]:
# Sanity-check the data: count the rows of the filter table and peek at a few
# stored embeddings. Requires the `connection` opened in an earlier cell.

# Discard any transaction left open (or aborted) by a previous cell.
connection.rollback()
cursor = connection.cursor()

table_name1 = filter_id1_name
table_name2 = 'ChunksEmbedding'

try:
    cursor.execute(f"SELECT count(Id) FROM {table_name1};")
    rows = cursor.fetchall()
    for row in rows:
        print(f"Number of items: {row}")
except (Exception, Error) as e:
    print(f"Error executing SELECT statement: {e}")
    # A failed statement leaves the transaction aborted; without a rollback
    # every following statement on this connection would fail as well.
    connection.rollback()

try:
    cursor.execute(f"SELECT embedding FROM {table_name2} limit 5;")
    rows = cursor.fetchall()
    for row in rows:
        print(f"Items ID: {row}")
except (Exception, Error) as e:
    print(f"Error executing SELECT statement: {e}")
    # Leave the connection in a clean state for the next cell.
    connection.rollback()

Prepare for question embedding

In [None]:
import openai
import pandas as pd
import pandas as pd
import numpy as np


openai.api_type = config["OPENAI_API_TYPE"] 
openai.api_key = config['OPENAI_API_KEY']
openai.api_base = config['OPENAI_API_BASE'] 
openai.api_version = config['OPENAI_API_VERSION'] 


def createEmbeddings(text):
    """Return the embedding vector for ``text``.

    Sends ``text`` to the deployment named by
    ``config["OPENAI_DEPLOYMENT_EMBEDDING"]`` through
    ``openai.Embedding.create`` and returns the ``embedding`` field of the
    first entry in the response's ``data`` list.
    """
    first_item = openai.Embedding.create(
        input=text,
        engine=config["OPENAI_DEPLOYMENT_EMBEDDING"],
    )['data'][0]
    return first_item['embedding']

## User Asks a Question 
In this step, the code converts the user's question to an embedding and then retrieves the top K document chunks that are closest to the question. The queries below rank chunks by L2 distance (pgvector's `<->` operator); other similarity metrics can also be used.

In [None]:
userQuestion = ""
filter_id1_val = ""
retrieve_k = 3 # for retrieving the top k reviews from the database

In [None]:
# Generate embeddings for the question and retrieve the top k document chunks
questionEmbedding = createEmbeddings(userQuestion)

## Hybrid Search
In this case, we first look up the document id that matches the filter value, and then run the similarity search only over the chunks of that document.

In [None]:
from pgvector.psycopg2 import register_vector

# Open a dedicated connection for the search and register the pgvector
# adapter on it so a numpy array can be passed as a query parameter for the
# 'embedding' column.
connection = psycopg2.connect(conn_string)
register_vector(connection)
cursor = connection.cursor()

filter_id1_name = 'abcd'  # placeholder: used below as both table and column name
table_name1 = filter_id1_name
table_name2 = 'ChunksEmbedding'

# Step 1 (filter): resolve the filter value to a document id. The column name
# is interpolated from `filter_id1_name`; the value is passed as a query
# parameter so quoting is handled by the driver.
select_docid_query = f"SELECT DocId FROM {table_name1} WHERE {filter_id1_name} = %s"
cursor.execute(select_docid_query, (filter_id1_val,))
docid_row = cursor.fetchone()
if docid_row is None:
    raise ValueError(
        f"No document found in {table_name1} for {filter_id1_name} = {filter_id1_val!r}"
    )
doc_id = docid_row[0]

# Step 2 (similarity): rank that document's chunks by distance between the
# stored embedding and the question embedding, keeping the closest retrieve_k.
select_query = f"SELECT Id FROM {table_name2} WHERE DocId = %s ORDER BY embedding <-> %s LIMIT %s"
cursor.execute(select_query, (doc_id, np.array(questionEmbedding), retrieve_k))
results = cursor.fetchall()

In [None]:
# Use the top k ids to retrieve the actual text from the database 
# Flatten the single-column result rows into a list of integer chunk ids.
top_ids = [int(row[0]) for row in results]

print(top_ids)

#### Retrieve text from database

In [None]:
# Fetch the page number, line number and text of the selected chunks.
# Requires the `connection`, `cursor`, `table_name2` and `top_ids` from the
# previous cells.

# Discard any transaction left open by the previous cell.
connection.rollback()

# One %s placeholder per id; the ids themselves are passed as parameters.
format_ids = ', '.join(['%s'] * len(top_ids))

sql = f"SELECT CONCAT('PageNumber: ', PageNumber, ' ', 'LineNumber: ', LineNumber, ' ', 'Text: ', Chunk) AS concat FROM {table_name2} WHERE id IN ({format_ids})"

# Defined up front so the next cell still works when nothing was retrieved
# or the query fails.
top_rows = []

if top_ids:
    try:
        cursor.execute(sql, top_ids)
        top_rows = cursor.fetchall()
        for row in top_rows:
            print(row)
    except (Exception, Error) as e:
        print(f"Error executing SELECT statement: {e}")
        # Clear the aborted transaction so the connection remains usable.
        connection.rollback()
else:
    # "IN ()" is not valid SQL, so skip the query when there are no ids.
    print("No chunk ids to look up; skipping the text query.")

In [None]:
# Assemble the prompt context: the first column of every retrieved row,
# each followed by a newline.
context = "".join(row[0] + "\n" for row in top_rows)

print(context)

# Provide answer to a user's question
We use [langchain](https://python.langchain.com/en/latest/index.html) to construct chains and add prompt template.

In [None]:
from langchain.prompts import PromptTemplate
# Prompt layout: instructions, then the retrieved context, then the question,
# then the fallback instruction for unanswerable questions.
question_prompt_template = (
    "Use the following portion of the context document to find relevant text and answer the question in details. "
    "Extract PageNumber and LineNumber and show it in the answer. \n"
    "{context}\n"
    "Question: {question}\n"
    "If the answer is not found, say that answer is not available in the documentation."
)

# The template exposes two placeholders that are filled in at query time.
QUESTION_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=question_prompt_template,
)

### Define llm model

In [None]:
import os
# Export the Azure OpenAI settings as environment variables.
# The values are the ones already loaded from the .env file (the `openai`
# module attributes were set from `config` in an earlier cell), so the key,
# endpoint and API version always refer to the same Azure resource instead of
# a hard-coded one.
os.environ["OPENAI_API_TYPE"] = openai.api_type
os.environ["OPENAI_API_KEY"] = openai.api_key
os.environ["OPENAI_API_BASE"] = openai.api_base
os.environ["OPENAI_API_VERSION"] = openai.api_version
# Same config key that is passed as `deployment_name` when the LLM is built.
os.environ["OPENAI_DEPLOYMENT_NAME"] = config["OPENAI_MODEL_COMPLETION"]

In [None]:
from langchain.llms import AzureOpenAI
# Completion model used to answer questions; temperature 0 is requested.
# NOTE(review): model_name is read from the *embedding* config key although
# this object is the completion LLM — confirm this is intended.
llm = AzureOpenAI(
    deployment_name=config["OPENAI_MODEL_COMPLETION"],
    model_name=config["OPENAI_MODEL_EMBEDDING"],
    temperature=0,
)

In [None]:
from typing import List, Optional

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class TextFormatter(BaseLoader):
    """Expose an in-memory string through the loader interface."""

    def __init__(self, text: str):
        """Keep ``text`` as the content that :meth:`load` will return."""
        self.text = text

    def load(self) -> List[Document]:
        """Return a one-element list wrapping the stored text in a ``Document``.

        The document's metadata holds a single ``"source"`` key set to an
        empty string.
        """
        document = Document(page_content=self.text, metadata={"source": ""})
        return [document]

In [None]:
from langchain.chains.question_answering import load_qa_chain

# Wrap the retrieved context as a document, build the QA chain with the
# custom QUESTION_PROMPT, and ask the user's question against that document.
loader = TextFormatter(context)
chain = load_qa_chain(llm, chain_type="stuff", prompt=QUESTION_PROMPT)
ans = chain(
    {"input_documents": loader.load(), "question": userQuestion},
    return_only_outputs=True,
)

In [None]:
ans['output_text']

In [None]:
import os
import pandas as pd

# The validation set lives in a sibling folder of the working directory:
#   <cwd>/../ValidationSetOfQA/QnAValidationSet.csv
current_directory = os.getcwd()
data_directory = os.path.abspath(
    os.path.join(current_directory, os.pardir, 'ValidationSetOfQA')
)
csv_file_path = os.path.join(data_directory, 'QnAValidationSet.csv')

# Read the question/answer validation set into a DataFrame.
df = pd.read_csv(csv_file_path)



In [None]:
df.columns = [filter_id1_name, filter_id2_name, 'Question', 'Answer', 'ReferenceText', 'PageNumber']
df.head(10)

In [None]:
df_eval = df.copy()
#df_eval.dropna(subset=["question"] ,inplace=True)
#df_eval.reset_index(drop=True, inplace=True)
df_eval.head(3)

In [None]:
# Interleave the columns as [question 1, answer 1, question 2, answer 2, ...].
values = []
for question_text, answer_text in zip(df_eval['Question'], df_eval['Answer']):
    values.extend((question_text, answer_text))

# Matching labels '1q', '1a', '2q', '2a', ...; the list holds one label more
# than there are values, and that trailing label is never used.
keys = [f"{i // 2 + 1}q" if i % 2 else f"{i // 2}a" for i in range(1, len(values) + 2)]

# zip stops at the shorter sequence, which drops the surplus label.
userQuestions = dict(zip(keys, values))

# Each filter value appears twice so it lines up index-wise with `values`.
filter_id1_vals = [val for val in df_eval[filter_id1_name] for _ in (0, 1)]

In [None]:
userQuestions

In [None]:
filter_id1_name

In [None]:
# GPT based question answering with type checking
from langchain import PromptTemplate
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

Using a retrieve_k_chunk function to facilitate multiple query evaluations

In [None]:
def retrieve_k_chunk(retrieve_k, questionEmbedding, filter_id1_val):
    """Return the formatted text of the chunks closest to a question.

    The lookup has three steps: resolve ``filter_id1_val`` to a document id in
    ``table_name1``, rank that document's rows in ``table_name2`` by distance
    between the stored embedding and ``questionEmbedding``, and fetch the
    page number, line number and text of the closest ``retrieve_k`` rows.

    Args:
        retrieve_k: Maximum number of chunks to retrieve.
        questionEmbedding: Embedding of the question, as a sequence of floats.
        filter_id1_val: Value matched against the ``filter_id1_name`` column
            to select the document.

    Returns:
        A list of one-element rows, each holding a string of the form
        ``"PageNumber: ... LineNumber: ... Text: ..."``. The list is empty when
        no document or chunk matches, or when the final text query fails.

    Raises:
        psycopg2.Error: If connecting or one of the first two queries fails.
    """
    from contextlib import closing

    top_rows = []
    # psycopg2's own connection context manager only ends the transaction, so
    # closing() is used to actually release the connection on every exit path.
    with closing(psycopg2.connect(conn_string)) as connection:
        # Register the pgvector adapter so a numpy array can be passed as a
        # parameter for the 'embedding' column.
        register_vector(connection)
        with closing(connection.cursor()) as cursor:
            print("filter_id1_name:", filter_id1_name)
            # Identifiers are interpolated; values go through query parameters
            # so quoting is handled by the driver.
            select_docid_query = f"SELECT DocId FROM {table_name1} WHERE {filter_id1_name} = %s"
            cursor.execute(select_docid_query, (filter_id1_val,))
            docid_row = cursor.fetchone()
            if docid_row is None:
                print(f"No document found for {filter_id1_name} = {filter_id1_val!r}")
                return top_rows
            doc_id = docid_row[0]
            print('docid:', doc_id)

            select_query = f"SELECT Id FROM {table_name2} WHERE DocId = %s ORDER BY embedding <-> %s LIMIT %s"
            cursor.execute(select_query, (doc_id, np.array(questionEmbedding), retrieve_k))
            top_ids = [int(row[0]) for row in cursor.fetchall()]
            if not top_ids:
                # "IN ()" is not valid SQL; nothing to fetch anyway.
                return top_rows

            format_ids = ', '.join(['%s'] * len(top_ids))
            sql = f"SELECT CONCAT('PageNumber: ', PageNumber, ' ', 'LineNumber: ', LineNumber, ' ', 'Text: ', Chunk) AS concat FROM {table_name2} WHERE id IN ({format_ids})"

            try:
                cursor.execute(sql, top_ids)
                top_rows = cursor.fetchall()
            except (Exception, Error) as e:
                # Best effort: report the problem and return what we have
                # (an empty list) instead of failing on an unbound name.
                print(f"Error executing SELECT statement: {e}")
    return top_rows

In [None]:
userQuestions.items()

In [None]:
def get_user_questions_answers():
    """Answer every validation question and collect the results.

    Walks ``userQuestions`` in order. Entries whose key contains ``"q"`` are
    questions: each one is embedded, its closest chunks are retrieved with
    ``retrieve_k_chunk``, and the QA chain is asked to answer it from those
    chunks. All other entries are the known reference answers.

    Returns:
        A tuple ``(Q, A, Agpt)`` of three lists: the questions, the reference
        answers, and the generated answers.
    """
    Q = []
    A = []
    Agpt = []

    # The chain does not depend on the question, so build it once.
    chain = load_qa_chain(llm, chain_type="stuff", prompt=QUESTION_PROMPT)

    # Index into filter_id1_vals, which holds two entries per question
    # (one for the question, one for its answer), hence the step of 2.
    i = 0
    for key, value in userQuestions.items():
        if "q" not in key:
            A.append(value)
            continue

        Q.append(value)
        questionEmbedding = createEmbeddings(value)
        # Use the rows retrieved for THIS question; reading a module-level
        # `top_rows` here would reuse one stale context for every question.
        retrieved_rows = retrieve_k_chunk(retrieve_k, questionEmbedding, filter_id1_vals[i])
        context = "".join(row[0] + "\n" for row in retrieved_rows)

        loader = TextFormatter(context)
        ans = chain({"input_documents": loader.load(), "question": value}, return_only_outputs=True)
        Agpt.append(ans['output_text'])
        print(ans['output_text'])
        i += 2

    return Q, A, Agpt

In [None]:
Q, A, Agpt = get_user_questions_answers()

In [None]:
def get_cosine_similarities(QA_results):
    """Score each text in ``QA_results[0]`` against its partner in ``QA_results[1]``.

    Both texts of a pair are embedded with ``createEmbeddings``; the score is
    the cosine similarity of the two embeddings, rounded to two decimals.

    Args:
        QA_results: Indexable holding two parallel sequences of texts.

    Returns:
        A list with one rounded similarity per entry of ``QA_results[0]``.
    """
    scores = []
    for idx, first_text in enumerate(QA_results[0]):
        first_vec = createEmbeddings(first_text)
        second_vec = createEmbeddings(QA_results[1][idx])
        # cosine_similarity works on 2-D inputs, so each embedding becomes a
        # single-row matrix and the lone cell of the result is the score.
        similarity = cosine_similarity(
            np.array(first_vec).reshape(1, -1),
            np.array(second_vec).reshape(1, -1),
        )[0][0]
        scores.append(np.round(similarity, 2))
    return scores

In [None]:
QAres = [A, Agpt]
scores = get_cosine_similarities(QAres)

For elaborate experimentation and additional evaluations for page number, see Notebook Step_5_mlflow_experimentation.ipynb.