## Experimentation and Evaluation

Load the settings for the database, Azure OpenAI, and other resources from a `.env` file.

In [None]:
from dotenv import dotenv_values

# Name of the .env file that holds the database and Azure OpenAI settings.
env_name = "llm_pgvector.env" # change to your own .env file name
# Parse the file into a key/value mapping that the following cells index by key.
config = dotenv_values(env_name)

### Connect to Flex Postgres (PG) for retrievals

In [None]:
import psycopg2
from psycopg2 import pool
from psycopg2 import Error

# Pull the connection settings out of the loaded .env values.
host, dbname, user, password, sslmode = (
    config[key] for key in ("HOST", "DBNAME", "USER", "PASSWORD", "SSLMODE")
)

# Keyword/value connection string; retrieve_k_chunk reuses it to open its own connection.
conn_string = f"host={host} user={user} dbname={dbname} password={password} sslmode={sslmode}"

# Pool that keeps between 1 and 20 connections.
postgreSQL_pool = pool.SimpleConnectionPool(1, 20, conn_string)
if postgreSQL_pool:
    print("Connection pool created successfully")

# Check one connection out of the pool and open a cursor on it.
connection = postgreSQL_pool.getconn()
cursor = connection.cursor()

Prepare for question embedding

In [None]:
import openai
import pandas as pd
import pandas as pd
import numpy as np


# Configure the openai client from the values loaded out of the .env file.
openai.api_type = config["OPENAI_API_TYPE"] 
openai.api_key = config['OPENAI_API_KEY']
openai.api_base = config['OPENAI_API_BASE'] 
openai.api_version = config['OPENAI_API_VERSION']  


def createEmbeddings(text):
    """Return the embedding vector of ``text``.

    The request goes to the embedding deployment named by the
    OPENAI_DEPLOYMENT_EMBEDDING setting; only the first embedding of the
    response is returned.
    """
    deployment = config["OPENAI_DEPLOYMENT_EMBEDDING"]
    result = openai.Embedding.create(input=text, engine=deployment)
    return result['data'][0]['embedding']

### Define the LLM

In [None]:
import os
# Mirror the Azure OpenAI settings that were loaded from the .env file into the process
# environment. Previously the endpoint, API version and deployment were hard-coded to one
# specific resource, so the key from the .env file could be sent to an endpoint it does not
# belong to.
os.environ["OPENAI_API_TYPE"] = openai.api_type
os.environ["OPENAI_API_KEY"] = openai.api_key
os.environ["OPENAI_API_BASE"] = openai.api_base
os.environ["OPENAI_API_VERSION"] = openai.api_version
# Same setting the AzureOpenAI LLM below uses as its deployment; the former hard-coded
# value remains as the fallback when the setting is absent or empty.
os.environ["OPENAI_DEPLOYMENT_NAME"] = config.get("OPENAI_MODEL_COMPLETION") or "text-davinci-003"

In [None]:
from langchain.llms import AzureOpenAI
# Deterministic (temperature 0) completion model used to answer the questions.
# NOTE(review): model_name is read from the OPENAI_MODEL_EMBEDDING setting although this
# object is the completion LLM - confirm that this is intended.
llm = AzureOpenAI(
    deployment_name=config["OPENAI_MODEL_COMPLETION"],
    model_name=config["OPENAI_MODEL_EMBEDDING"],
    temperature=0,
)

In [None]:
from typing import List, Optional

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.chains.question_answering import load_qa_chain

class TextFormatter(BaseLoader):
    """Wrap an in-memory string so it can be passed around as a document list."""

    def __init__(self, text: str):
        """Remember the text that ``load`` will wrap."""
        self.text = text

    def load(self) -> List[Document]:
        """Return a one-element list holding the text as a ``Document`` with an empty source."""
        return [Document(page_content=self.text, metadata={"source": ""})]

In [None]:
import os
import pandas as pd

# Start from the directory the notebook is running in.
current_directory = os.getcwd()

# The validation set lives in the 'ValidationSetOfQA' folder one level above it.
data_directory = os.path.abspath(
    os.path.join(current_directory, os.pardir, 'ValidationSetOfQA')
)

# Full path of the question/answer validation file.
csv_file_path = os.path.join(data_directory, 'QnAValidationSet.csv')

# Read the validation set into a DataFrame.
df = pd.read_csv(csv_file_path)


In [None]:
# Placeholders: set these to the names of the first two columns of the validation CSV.
# filter_id1_name is also used later as the lookup column (and table name) when a DocId is
# resolved. Use two distinct, non-empty names, otherwise the DataFrame ends up with
# duplicate column labels.
filter_id1_name = ""
filter_id2_name = ""

In [None]:
# Rename the CSV columns by position: the two filter columns first, then the Q&A fields.
df.columns = [
    filter_id1_name,
    filter_id2_name,
    'Question',
    'Answer',
    'ReferenceText',
    'PageNumber',
]
df.head(10)

In [None]:
# Use a copy of the loaded validation set for the evaluation steps below.
df_eval = df.copy()
df_eval.head(3)


In [None]:
# Interleave questions and answers: [q1, a1, q2, a2, ...].
values = [text for qa in zip(df_eval['Question'], df_eval['Answer']) for text in qa]
# Keys in the same order: '1q', '1a', '2q', '2a', ... (one surplus trailing key is generated).
keys = [f"{n // 2}a" if n % 2 == 0 else f"{n // 2 + 1}q" for n in range(1, len(values) + 2)]

# Pair every value with its key; the surplus last key is left out.
userQuestions = dict(zip(keys[:-1], values))
# Each filter value is listed twice so that its index lines up with the q/a entries.
filter_id1_vals = [val for val in df_eval[filter_id1_name] for _ in range(2)]

In [None]:
# GPT based question answering with type checking
from langchain import PromptTemplate
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

In [None]:
def get_cosine_similarities(QA_results, verbose = False):
    """Score pairs of texts by the cosine similarity of their embeddings.

    ``QA_results[0]`` and ``QA_results[1]`` are parallel sequences of texts.
    Both texts of each pair are embedded with ``createEmbeddings`` and compared.

    Returns:
        A list with one similarity per pair, rounded to two decimals.
    """
    scores = []
    for idx, first_text in enumerate(QA_results[0]):
        second_text = QA_results[1][idx]
        if verbose:
            print('calculating cosine similarity for: \n', first_text, '\n', second_text)
            print(30*'-')
        # cosine_similarity expects 2-D inputs, hence the single-row reshape.
        vec_a = np.array(createEmbeddings(first_text)).reshape(1, -1)
        vec_b = np.array(createEmbeddings(second_text)).reshape(1, -1)
        similarity = cosine_similarity(vec_a, vec_b)[0][0]
        scores.append(np.round(similarity, 2))
    return scores

In [None]:
# let's also evaluate the reference page number

import re

def extract_page_number(text):
    """Return the first page number quoted in ``text`` as ``PageNumber: <digits>``.

    Whitespace between the colon and the digits is optional, so both
    "PageNumber: 12" and "PageNumber:12" are recognised.

    Args:
        text: Text to scan, typically a generated answer.

    Returns:
        The page number as an int, or 0 when ``text`` is not a string or
        contains no page number.
    """
    # Anything that is not text cannot contain a page reference.
    if not isinstance(text, str):
        return 0

    # \s* (rather than \s+) tolerates answers that omit the space after the colon.
    match = re.search(r'PageNumber:\s*(\d+)', text)
    return int(match.group(1)) if match else 0

def get_all_page_numbers(Agpt):
    """Extract the page number from every generated answer and score it.

    Each answer is scanned with ``extract_page_number`` and compared, by
    position, with the ``PageNumber`` column of ``df_eval``. The detected
    pages, the expected pages and the scores are printed.

    Args:
        Agpt: Generated answers, in the same order as the rows of ``df_eval``.

    Returns:
        ``(page_numbers, page_number_score)``: the detected page numbers and a
        parallel list holding 1 where the detected page equals the expected
        page and 0 otherwise.
    """
    page_numbers = [extract_page_number(answer) for answer in Agpt]
    # Build the expected pages once instead of once per compared answer.
    expected_pages = df_eval["PageNumber"].tolist()
    print(page_numbers)
    print(expected_pages)
    page_number_score = [
        1 if detected == expected_pages[idx] else 0
        for idx, detected in enumerate(page_numbers)
    ]
    print(page_number_score)
    return page_numbers, page_number_score



In [None]:
from pgvector.psycopg2 import register_vector

def retrieve_k_chunk(retrieve_k, questionEmbedding,filter_id1_val, similarity_method, verbose=False):
    """Fetch the chunks of one document that are closest to a question embedding.

    The document is resolved by reading ``DocId`` from ``table_name1`` where the
    ``filter_id1_name`` column equals ``filter_id1_val``. Its chunks in
    ``table_name2`` are then ranked with the pgvector distance operator chosen
    by ``similarity_method`` and the best ``retrieve_k`` are returned.

    Args:
        retrieve_k: Maximum number of chunks to return.
        questionEmbedding: Embedding vector of the question.
        filter_id1_val: Value that identifies the document in ``table_name1``.
        similarity_method: 'NN' or 'knn' (L2 distance, ``<->``), 'cosine'
            (cosine distance, ``<=>``) or 'inner' (negative inner product, ``<#>``).
        verbose: When True, print the resolved DocId and the selected chunk ids.

    Returns:
        ``(top_rows, retrieved_pages)``. ``top_rows`` is a list of 1-tuples, each
        holding the "PageNumber: ... LineNumber: ... Text: ..." string of a chunk;
        ``retrieved_pages`` is the list of the chunks' page numbers. Both are
        ordered from most to least similar and are empty when nothing matched.

    Raises:
        ValueError: If ``similarity_method`` is unknown or no document matches
            ``filter_id1_val``.
    """
    distance_operators = {'NN': '<->', 'knn': '<->', 'cosine': '<=>', 'inner': '<#>'}
    if similarity_method not in distance_operators:
        raise ValueError(
            f"Unknown similarity_method {similarity_method!r}; "
            f"expected one of {sorted(distance_operators)}"
        )
    sign = distance_operators[similarity_method]

    connection = psycopg2.connect(conn_string)
    try:
        # Register the 'pgvector' type so the embedding can be passed as a parameter.
        register_vector(connection)
        with connection.cursor() as cursor:
            # Table and column names come from notebook-level settings and cannot be bound
            # as parameters; every value is bound so that the driver escapes it. Values are
            # sent as strings, matching the quoted literals the query used before.
            cursor.execute(
                f"SELECT DocId FROM {table_name1} WHERE {filter_id1_name} = %s",
                (str(filter_id1_val),),
            )
            doc_row = cursor.fetchone()
            if doc_row is None:
                raise ValueError(
                    f"No document found in {table_name1} where "
                    f"{filter_id1_name} = {filter_id1_val!r}"
                )
            doc_id = doc_row[0]
            if verbose:
                print('filter_id1_name:', filter_id1_name)
                print('DocId:', doc_id)

            # One ranked query returns ids, context strings and pages together, so the
            # three stay aligned and keep their similarity order.
            cursor.execute(
                "SELECT Id, "
                "CONCAT('PageNumber: ', PageNumber, ' ', 'LineNumber: ', LineNumber, ' ', 'Text: ', Chunk) AS concat, "
                "PageNumber "
                f"FROM {table_name2} WHERE DocId = %s "
                f"ORDER BY embedding {sign} %s LIMIT %s",
                (str(doc_id), np.array(questionEmbedding), int(retrieve_k)),
            )
            ranked_rows = cursor.fetchall()
    finally:
        # Always release the connection, including when a query fails.
        connection.close()

    if verbose:
        print('top_ids:', [int(row[0]) for row in ranked_rows])

    top_rows = [(row[1],) for row in ranked_rows]
    retrieved_pages = [int(row[2]) for row in ranked_rows]
    return top_rows, retrieved_pages

In [None]:
from importlib.metadata import packages_distributions


def get_user_questions_answers(retrieve_k, filter_id1_vals,similarity_method, QUESTION_PROMPT, verbose=False):
    """Answer every question in ``userQuestions`` with the LLM over retrieved context.

    Entries of ``userQuestions`` whose key contains "q" are questions; all other
    entries are their reference answers. For each question the closest chunks
    are fetched with ``retrieve_k_chunk``, joined into a context, and passed to a
    "stuff" question-answering chain built from ``llm`` and ``QUESTION_PROMPT``.

    Args:
        retrieve_k: Number of chunks to retrieve per question.
        filter_id1_vals: Filter values with two entries per question/answer pair;
            the entry at index ``2 * n`` is used for the n-th question.
        similarity_method: Similarity option forwarded to ``retrieve_k_chunk``.
        QUESTION_PROMPT: Prompt used by the question-answering chain.
        verbose: When True, print each question, its context and the answer.

    Returns:
        ``(Q, A, Agpt, contexts, pages)``: the questions, the reference answers,
        the generated answers, the context given to the LLM per question, and
        the retrieved page numbers per question.
    """
    Q = []
    A = []
    Agpt = []
    contexts = []
    pages = []
    # The chain depends only on the LLM and the prompt, so build it once for all questions.
    chain = load_qa_chain(llm, chain_type="stuff", prompt=QUESTION_PROMPT)
    # Index into filter_id1_vals; it advances by two because every value is stored twice.
    i = 0
    for key, value in userQuestions.items():
        if "q" not in key:
            A.append(value)
            continue

        Q.append(value)
        questionEmbedding = createEmbeddings(value)
        if verbose:
            print("question: ", value)
        top_rows, retrieved_pages = retrieve_k_chunk(
            retrieve_k, questionEmbedding, filter_id1_vals[i], similarity_method, verbose=verbose
        )
        # One retrieved chunk per line forms the context.
        context = "".join(row[0] + "\n" for row in top_rows)
        if verbose:
            print('context: \n', context)
        loader = TextFormatter(context)
        ans = chain({"input_documents": loader.load(), "question": value}, return_only_outputs=True)
        Agpt.append(ans['output_text'])
        contexts.append(context)
        pages.append(retrieved_pages)
        if verbose:
            print(ans['output_text'])
        i += 2

    return Q, A, Agpt, contexts, pages

In [None]:
def dict_to_text(config_dict):
    """Render a mapping as text with one newline-terminated "key: value" line per entry."""
    return "".join(f"{name}: {setting}\n" for name, setting in config_dict.items())

In [None]:
def run_experiment(ExperimentConfig, QUESTION_PROMPT, verbose=False):
    """Run one retrieval + question-answering evaluation and save the results as CSV.

    Every validation question is answered with ``get_user_questions_answers``.
    The generated answers are scored against the reference answers (cosine
    similarity of embeddings) and against the expected page numbers. The
    per-question results are written to a time-stamped CSV file in ``../DATA``,
    which is created when missing.

    Args:
        ExperimentConfig: Dict with the keys 'retrieve_k' and 'similarity_method'.
        QUESTION_PROMPT: Prompt template used by the question-answering chain.
        verbose: When True, the answering and the scoring steps print their
            intermediate values.

    Returns:
        ``(mean cosine similarity, mean page score, cosine similarity scores,
        page number scores, results DataFrame, path of the written CSV)``.
    """
    retrieve_k = ExperimentConfig['retrieve_k']
    similarity_method = ExperimentConfig['similarity_method']
    config_text = dict_to_text(ExperimentConfig)

    Q, A, Agpt, contexts, pages = get_user_questions_answers(
        retrieve_k=retrieve_k,
        filter_id1_vals=filter_id1_vals,
        similarity_method=similarity_method,
        QUESTION_PROMPT=QUESTION_PROMPT,
        verbose=verbose,
    )
    QAres = [A, Agpt, Q]
    # Honour the caller's verbosity (this was hard-coded to True before).
    cos_sim_scores = get_cosine_similarities(QAres, verbose=verbose)
    page_numbers, page_number_score = get_all_page_numbers(Agpt)

    df_evaluation = pd.DataFrame({
        'Question': Q,
        'Answer': A,
        'Answer_gpt': Agpt,
        'Score': cos_sim_scores,
        'detected_page_number': page_numbers,
        'actual_page_number': df_eval["PageNumber"].tolist(),
        'page_number_score': page_number_score,
        'context': contexts,
        'retrieved_pages': pages,
    })
    # True when the expected page is among the pages of the retrieved chunks.
    df_evaluation["correct_page_in_retrieved"] = df_evaluation.apply(
        lambda row: row['actual_page_number'] in row['retrieved_pages'], axis=1
    )
    df_evaluation["config"] = config_text
    df_evaluation["prompt"] = QUESTION_PROMPT.template

    # The time stamp keeps the result files of repeated runs apart.
    timestamp_str = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
    file_name = (
        'evaluation' + '_retrieve_' + str(retrieve_k)
        + '_similarity_' + similarity_method + '_date_' + timestamp_str + '_.csv'
    )
    # Build the path with os.path so it works on every platform; the former literal used
    # backslashes, which only form a directory path on Windows.
    output_directory = os.path.join('..', 'DATA')
    os.makedirs(output_directory, exist_ok=True)
    file_path = os.path.join(output_directory, file_name)
    df_evaluation.to_csv(file_path, index=False)

    return np.mean(cos_sim_scores), np.mean(page_number_score), cos_sim_scores, page_number_score, df_evaluation, file_path

# Experimentation using the MLflow library

We will run multiple experiments with MLflow to compare the results of different similarity search methods and different numbers of retrieved chunks. Experiments that vary other parameters can be run and tracked with MLflow in the same way.

In [None]:
import mlflow, time
mlflow.set_experiment(experiment_name="RAG_EXP")

#import yaml
#config = yaml.load(open("EvalConfig.yaml", "r"), Loader=yaml.FullLoader)
# NOTE(review): the document lookup table is named after the filter column - confirm.
table_name1 = filter_id1_name
table_name2 = 'ChunksEmbedding'
# similarity_method options: ['cosine', 'NN', 'inner'].
# NOTE(review): 'knn' is used below for the nearest-neighbour option that the list above
# calls 'NN' - TODO confirm that retrieve_k_chunk treats both spellings alike.
ExperimentConfig1 = {'retrieve_k': 5, 'similarity_method': 'knn'}
ExperimentConfig2 = {'retrieve_k': 10, 'similarity_method': 'knn'}
ExperimentConfig3 = {'retrieve_k': 2, 'similarity_method': 'knn'}
ExperimentConfig4 = {'retrieve_k': 5, 'similarity_method': 'cosine'}
ExperimentConfig5 = {'retrieve_k': 5, 'similarity_method': 'inner'}
# Same settings as ExperimentConfig1; not part of RUNConfigs.
ExperimentConfig6 = {'retrieve_k': 5, 'similarity_method': 'knn'}

RUNConfigs = [ExperimentConfig1, ExperimentConfig2, ExperimentConfig3, ExperimentConfig4, ExperimentConfig5]
question_prompt_template = """Use the following portion of the context document to find relevant text and answer the question in details. Extract PageNumber and LineNumber and show it in the answer. 
{context}
Question: {question}
If the answer is not found, say that answer is not available in the documentation."""
QUESTION_PROMPT = PromptTemplate(
    template=question_prompt_template, input_variables=["context", "question"]
)
for ExperimentConfig in RUNConfigs:
    # The time stamp keeps runs with identical settings apart in the MLflow UI.
    current_timestamp = pd.Timestamp.now()
    timestamp_str = current_timestamp.strftime("%Y%m%d_%H%M%S")
    mlflow_run_name = "Retreive_k_" + str(ExperimentConfig['retrieve_k']) + "_similarity_" + ExperimentConfig['similarity_method']+ "_date_" + timestamp_str
    with mlflow.start_run(run_name=mlflow_run_name) as run:
        mean_sim_score, mean_page_score, cos_sim_score, page_number_score, df_evaluation, df_path = run_experiment(ExperimentConfig, QUESTION_PROMPT, verbose=False)
        print("mean_sim_score, mean_page_score:", mean_sim_score, mean_page_score)
        mlflow.log_metric("mean_sim_score", mean_sim_score)
        mlflow.log_metric("mean_page_score", mean_page_score)
        mlflow.log_param("cosine_similarity_score", str(cos_sim_score))
        mlflow.log_param("page_number_score", str(page_number_score))
        mlflow.log_param("Client code table", table_name1)
        mlflow.log_param("Chunk table", table_name2)
        # Log the prompt text itself rather than the PromptTemplate object.
        mlflow.log_param("prompt", QUESTION_PROMPT.template)
        # Log keys AND values; "".join(dict) only concatenated the keys.
        mlflow.log_param("config", str(ExperimentConfig))
        # Attach the per-question results CSV written by run_experiment.
        mlflow.log_artifact(df_path)
        print("config :\n", ExperimentConfig)
        time.sleep(0.5)
mlflow.end_run()

In [None]:
# Display the per-question results of the last experiment run by the loop above.
df_evaluation

The results are saved in the `mlruns` folder inside the `Notebooks` directory. To view the visualizations, run the `mlflow ui` command from the `Notebooks` directory.