## Experimentation and Evaluation

JSON IO: In this notebook, we will experiment with prompt template. We will pass input context as json and extract output answer from gpt as json format as well. 

Load configs for database, Azure OpenAI, and other resources as environment variables.

In [137]:
from dotenv import dotenv_values

# specify the name of the .env file name 
env_name = "llm_pgvector.env" # change to your own .env file name
config = dotenv_values(env_name)

### Connect to Flex Postgres (PG)  for retrievals

In [138]:
import psycopg2
from psycopg2 import pool
from psycopg2 import Error

host = config["HOST"]
dbname = config["DBNAME"] 
user = config["USER"] 
password = config["PASSWORD"] 
sslmode = config["SSLMODE"]  

# Build a connection string from the variables
conn_string = "host={0} user={1} dbname={2} password={3} sslmode={4}".format(host, user, dbname, password, sslmode)

postgreSQL_pool = psycopg2.pool.SimpleConnectionPool(1, 20,conn_string)
if (postgreSQL_pool):
    print("Connection pool created successfully")

# Use getconn() to get a connection from the connection pool
connection = postgreSQL_pool.getconn()
cursor = connection.cursor()

Connection pool created successfully


Prepare for question embedding

In [139]:
import openai
import pandas as pd
import pandas as pd
import numpy as np


openai.api_type = config["OPENAI_API_TYPE"] 
openai.api_key = config['OPENAI_API_KEY']
openai.api_base = config['OPENAI_API_BASE'] 
openai.api_version = config['OPENAI_API_VERSION']  


def createEmbeddings(text):
    response = openai.Embedding.create(input=text , engine=config["OPENAI_DEPLOYMENT_EMBEDDING"])
    embeddings = response['data'][0]['embedding']
    return embeddings

### Define llm model

In [140]:
import os
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_KEY"] = openai.api_key
os.environ["OPENAI_API_BASE"] = "https://synapseml-openai.openai.azure.com/"
os.environ["OPENAI_API_VERSION"] = "2022-12-01"
os.environ["OPENAI_DEPLOYMENT_NAME"] = "text-davinci-003"

In [141]:
from langchain.llms import AzureOpenAI
llm= AzureOpenAI(deployment_name=config["OPENAI_MODEL_COMPLETION"], model_name=config["OPENAI_MODEL_EMBEDDING"], temperature=0)

In [142]:
from typing import List, Optional

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.chains.question_answering import load_qa_chain

class TextFormatter(BaseLoader):
    """Load text files."""

    def __init__(self, text: str):
        """Initialize with file path."""
        self.text = text

    def load(self) -> List[Document]:
        """Load from file path."""
        metadata = {"source": ""}
        return [Document(page_content=self.text, metadata=metadata)]

In [143]:
import os
import pandas as pd

# Get the current working directory
current_directory = os.getcwd()

# Navigate to the directory containing the CSV file (one level above the current directory)
data_directory = os.path.abspath(os.path.join(current_directory, '..', 'ValidationSetOfQA'))

# Construct the file path for your CSV file in the data_directory
csv_file_path = os.path.join(data_directory, 'QnAValidationSet.csv')

# Load the CSV file using pandas
df = pd.read_csv(csv_file_path)


In [144]:
df.columns

Index(['Ticker', 'Quarter', 'Question', 'Answer', 'PageNumber'], dtype='object')

In [145]:
# choose the filters you want to apply 
filter1_name = 'Ticker'
filter2_name = 'Quarter'

In [146]:
df_eval = df.copy()
df_eval.head(3)

Unnamed: 0,Ticker,Quarter,Question,Answer,PageNumber
0,MSFT,1,How did the first quarter financial results co...,"In the first quarter, our revenue reached $50....",18
1,MSFT,1,What were the key highlights and growth trends...,"Within the commercial business segment, we wit...",19


In [147]:
values = [item for pair in zip(list(df_eval['Question']), list(df_eval['Answer'])) for item in pair]
keys = [str(i//2)+'a' if i%2==0 else str(i//2+1)+'q' for i in range(1,len(values)+2)]

userQuestions = {keys[i]:values[i] for i in range(len(keys)-1)}
filter1_vals = [item for item in list(df_eval[filter1_name]) for _ in range(2)]
filter2_vals = [item for item in list(df_eval[filter2_name]) for _ in range(2)]

In [148]:
# GPT based question answering with type checking
from langchain import PromptTemplate
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

In [149]:
def get_cosine_similarities(QA_results, verbose = False):
    # compare cosine similarity between two vectors
    cosine_similarities = []
    for i in range(len(QA_results[0])):
        if verbose:
            print('calculating cosine similarity for: \n', QA_results[0][i], '\n', QA_results[1][i])
            print(30*'-')
        emd1 = createEmbeddings(QA_results[0][i])
        emd2 = createEmbeddings(QA_results[1][i])
        cosine_similarity_val = cosine_similarity(
            np.array(emd1).reshape(1, -1), np.array(emd2).reshape(1, -1)
        )[0][0]
        cosine_similarities.append(np.round(cosine_similarity_val, 2))
    return cosine_similarities

In [150]:
# let's also evaluate the reference page number

import re

def extract_page_number(text):
    # Regular expression pattern to find the PageNumber value
    pattern = r'PageNumber:\s+(\d+)'

    # Search for the pattern in the text
    match = re.search(pattern, text)

    # If a match is found, return the extracted PageNumber value, otherwise return 0
    if match:
        return int(match.group(1))
    else:
        return 0

def get_all_page_numbers(Agpt):
    page_numbers = []
    for answers in Agpt:
        page_numbers.append(extract_page_number(answers))
    print(page_numbers)
    print(df_eval["PageNumber"].tolist())
    page_number_score = [1 if page_numbers[i] == df_eval["PageNumber"].tolist()[i] else 0 for i in range(len(page_numbers))]
    print(page_number_score)
    return page_numbers, page_number_score
    
    
def get_all_page_numbers_from_json(Agpt):
    page_numbers = []
    for answers in Agpt:
        print('page numbers: ', answers["Answer"]["PageNumber"])
        print(type(answers))
        print(type(answers["Answer"]["PageNumber"]))
        #page_numbers.append(int(answers["Answer"]["PageNumber"]))
    print(page_numbers)
    print(df_eval["PageNumber"].tolist())
    page_number_score = [1 if page_numbers[i] == df_eval["PageNumber"].tolist()[i] else 0 for i in range(len(page_numbers))]
    print(page_number_score)
    return page_numbers, page_number_score


In [151]:
from pgvector.psycopg2 import register_vector

def retrieve_k_chunk(retrieve_k, questionEmbedding, filter1_val, filter2_val, similarity_method="knn", verbose=False):
    connection = psycopg2.connect(conn_string)
    # Create a cursor after the connection
    # Register 'pgvector' type for the 'embedding' column
    register_vector(connection)
    cursor = connection.cursor()
    
 
    if similarity_method == 'knn':
        sign = '<->'
    elif similarity_method=='cosine':
        sign = '<=>'
    elif similarity_method=='inner':
        sign = '<#>'
    select_query = f"SELECT Id FROM {table_name} where {filter1_name} = '{filter1_val}' and {filter2_name}='{filter2_val}' ORDER BY embedding {sign} %s LIMIT {retrieve_k}"
    cursor.execute(select_query, (np.array(questionEmbedding),))
    results = cursor.fetchall()
    top_ids = []
    for i in range(len(results)):
        top_ids.append(int(results[i][0]))

    if verbose:
        print('top_ids:', top_ids)
    # Rollback the current transaction
    connection.rollback()

    format_ids = ', '.join(['%s'] * len(top_ids))

    sql = f"SELECT CONCAT('PageNumber: ', PageNumber, ' ', 'LineNumber: ', LineNumber, ' ', 'Text: ', Chunk) AS concat FROM {table_name} WHERE id IN ({format_ids})"

    # Execute the SELECT statement
    try:
        cursor.execute(sql, top_ids)    
        top_rows = cursor.fetchall()
    except (Exception, Error) as e:
        print(f"Error executing SELECT statement: {e}")
    finally:
        pass
    
    sql_lines = f"SELECT LineNumber FROM {table_name} WHERE id IN ({format_ids})"
    # Execute the SELECT statement
    try:
        cursor.execute(sql_lines, top_ids)    
        lines = cursor.fetchall()
    except (Exception, Error) as e:
        print(f"Error executing SELECT statement: {e}")
    finally:
        pass
        # cursor.close()
    #print("top_rows", top_rows)
    # getting teh 
    sql_pages = f"SELECT PageNumber FROM {table_name} WHERE id IN ({format_ids})"
    # Execute the SELECT statement
    try:
        cursor.execute(sql_pages, top_ids)    
        pages = cursor.fetchall()
    except (Exception, Error) as e:
        print(f"Error executing SELECT statement: {e}")
    finally:
        cursor.close()    
    
    retrieved_pages = [int(page[0]) for page in pages]
    retrieved_lines = [int(line[0]) for line in lines]
    return top_rows, retrieved_pages, retrieved_lines 

In [152]:
import json

def create_json(pages, lines, rows):
    if len(pages) != len(lines) or len(pages) != len(rows):
        raise ValueError("The number of pages, lines, and rows must be the same.")

    data = {}
    for page, line, row in zip(pages, lines, rows):
        key = f"{page}, {line}"
        data[key] = [row]

    json_data = json.dumps(data)
    return json_data



#### Use this section only if you wish to format the json output (commented). Otherwise use the next cell for pydantic approach (recommended)

In [153]:
# import json
# def parse_json(text_with_json):
#     '''
#         # Sample JSON text (replace this with your actual JSON text)
#     json_text = """
#     {
#         "PageNumber": "2",
#         "LineNumber": "11",
#         "Answer": "<Enter Ground Truth Answer Here>"
#     }
#     '''

#     # Step 1: Extract the JSON part from the larger text
#     start_index = text_with_json.find("{")
#     end_index = text_with_json.rfind("}")
#     if start_index == -1 or end_index == -1:
#         print("Error: JSON data not found in the text.")
#         exit()

#     json_text = text_with_json[start_index:end_index+1]

#     # Step 2: Convert the JSON text to a Python dictionary
#     try:
#         parsed_data = json.loads(json_text)
#     except json.JSONDecodeError:
#         print("Error: The extracted text does not contain valid JSON data.")
#         exit()

    
#     # Step 3: Access the "PageNumber" field
#     if "PageNumber" in parsed_data:
#         page_number = parsed_data["PageNumber"]
#         print("PageNumber:", page_number)
#     else:
#         print("Error: The 'PageNumber' field was not found in the JSON data.")
        
#     if "LineNumber" in parsed_data:
#         line_number = parsed_data["LineNumber"]
#         print("LineNumber:", line_number)
    
#     if "Answer" in parsed_data:
#         answer = parsed_data["Answer"]
#         print("Answer:", answer)
    
    
#     return page_number, line_number, answer

# def get_user_questions_answers(retrieve_k, filter1_vals, filter2_vals, similarity_method, QUESTION_PROMPT, verbose=False):
#     """
#     Collection of user questions with known answers.
#     """
#     Q = []
#     A = []
#     Agpt = []
#     contexts = []
#     ret_lines = []
#     ret_pages = []
#     pages = []
#     lines =[]
#     i = 0
#     for key, value in userQuestions.items():
#         if "q" in key:
#             Q.append(value)
#             questionEmbedding = createEmbeddings(value)
#             if verbose:
#                 print("question: ", value)
#             top_rows, retreived_pages, retreived_lines = retrieve_k_chunk(retrieve_k, questionEmbedding,filter1_vals[i],filter_id2_vals[i],similarity_method, verbose = verbose)
#             context = create_json(retreived_pages, retreived_lines, top_rows)
#             # create the context from the top_rows
#             # context = ""
#             # for row in top_rows:
#             #     context += row[0]
#             #     context += "\n"
#             if verbose:
#                 print('context: \n', context)
#             loader = TextFormatter(context)
#             chain = load_qa_chain(llm, chain_type="stuff", prompt=QUESTION_PROMPT)
#             ans = chain({"input_documents": loader.load(), "question": value}, return_only_outputs=True)
#             #if verbose:
#             print('gpt_output', ans['output_text'])
#             PageNumber, LineNumber, Answer = parse_json(ans['output_text'])

#             Agpt.append(Answer)
#             contexts.append(context)
#             pages.append(int(PageNumber))
#             lines.append(int(LineNumber))
#             ret_pages.append(retreived_pages)
#             ret_lines.append(retreived_lines)
#             if verbose:
#                 print(ans['output_text'])
#             i+=2
#         else:
#             A.append(value)
        
        
#     return  Q, A, Agpt, contexts, pages, lines, ret_pages, ret_lines

#### Use the following cell to extract json output using pydantic

In [154]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, validator
from typing import List
# Here's another example, but with a compound typed field.
class LLMAnswer(BaseModel):
    PageNumber: int = Field(description="reference page number")
    LineNumber: int = Field(description="reference line number")
    Answer: str= Field(description = "answer found in the text")

parser = PydanticOutputParser(pydantic_object=LLMAnswer)

def get_user_questions_answers(retrieve_k, filter1_vals, filter2_vals, similarity_method, QUESTION_PROMPT, verbose=False):
    """
    Collection of user questions with known answers.
    """
    Q = []
    A = []
    Agpt = []
    contexts = []
    ret_lines = []
    ret_pages = []
    pages = []
    lines =[]
    i = 0
    for key, value in userQuestions.items():
        if "q" in key:
            Q.append(value)
            questionEmbedding = createEmbeddings(value)
            if verbose:
                print("question: ", value)
            top_rows, retreived_pages, retreived_lines = retrieve_k_chunk(retrieve_k, questionEmbedding,filter1_vals[i],filter2_vals[i],similarity_method, verbose = verbose)
            context = create_json(retreived_pages, retreived_lines, top_rows)
            # create the context from the top_rows
            # context = ""
            # for row in top_rows:
            #     context += row[0]
            #     context += "\n"
            if verbose:
                print('context: \n', context)
            loader = TextFormatter(context)
            chain = load_qa_chain(llm, chain_type="stuff", prompt=QUESTION_PROMPT)
            ans = chain({"input_documents": loader.load(), "question": value}, return_only_outputs=True)
            #if verbose:
            print('gpt_output', ans['output_text'])
            PageNumber, LineNumber, Answer = parse_json(ans['output_text'])

            parsed_ans = parser.parse(ans['output_text'])
            Agpt.append(parsed_ans.Answer)
            contexts.append(context)
            pages.append(parsed_ans.PageNumber)
            lines.append(parsed_ans.LineNumber)
            ret_pages.append(retreived_pages)
            ret_lines.append(retreived_lines)

            if verbose:
                print(ans['output_text'])
            i+=2
        else:
            A.append(value)
        
    return  Q, A, Agpt, contexts, pages, lines, ret_pages, ret_lines


In [155]:
def dict_to_text(config_dict):
    config_text = ""
    for key, value in config_dict.items():
        config_text += f"{key}: {value}\n"
    return config_text

In [156]:
def run_experiment(ExperimentConfig, QUESTION_PROMPT, verbose=False):
    config_text = dict_to_text(ExperimentConfig)
    Q, A, Agpt, contexts, page_numbers, line_numbers, ret_pages, ret_lines = get_user_questions_answers(retrieve_k = ExperimentConfig['retrieve_k'] ,filter1_vals = filter1_vals, filter2_vals = filter2_vals, similarity_method = ExperimentConfig['similarity_method'], QUESTION_PROMPT=QUESTION_PROMPT, verbose=verbose)
    print(page_numbers)
    QAres = [A, Agpt, Q]
    cos_sim_scores = get_cosine_similarities(QAres, verbose= True)
    #page_numbers, page_number_score = get_all_page_numbers_from_json(Agpt)
    
    page_number_score = [1 if page_numbers[i] == df_eval["PageNumber"].tolist()[i] else 0 for i in range(len(page_numbers))]
    df_evaluation = pd.DataFrame({'Question': Q, 'Answer': A, 'Answer_gpt': Agpt, 'Score': cos_sim_scores, 'detected_page_number': page_numbers, 'actual_page_number': df_eval["PageNumber"].tolist(), 'page_number_score': page_number_score, 'context': contexts, \
        'retrieved_pages': ret_pages})
    df_evaluation["correct_page_in_retrieved"] = df_evaluation.apply(lambda row: row['actual_page_number'] in row['retrieved_pages'], axis=1)
    df_evaluation["config"] = config_text
    df_evaluation["prompt"] = QUESTION_PROMPT.template
    current_timestamp = pd.Timestamp.now()
    timestamp_str = current_timestamp.strftime("%Y%m%d_%H%M%S")
    file_path ='..\DATA\evaluation'+ '_retrieve_'+ str(ExperimentConfig['retrieve_k']) + '_similarity_'+ ExperimentConfig['similarity_method']+'_date_' +timestamp_str + '_.csv'  
    df_evaluation.to_csv(file_path, index=False)
    return np.mean(cos_sim_scores), np.mean(page_number_score),  cos_sim_scores, page_number_score, df_evaluation, file_path

# Experimentation using MLFLOW library

Next, we would like to experiment with input and out format. We will pass input as json file and extract output as json file. 

In [157]:
import mlflow
mlflow.set_experiment(experiment_name="RAG_EXP")
    
#import yaml
#config = yaml.load(open("EvalConfig.yaml", "r"), Loader=yaml.FullLoader)
table_name = 'EarningsCallChunksEmbedding'

ExperimentConfig1 = {'retrieve_k': 5, 'similarity_method': 'knn'} # similarity = ['cosine', 'NN', 'inner']

RUNConfigs = [ExperimentConfig1]

question_prompt_template = """Use the following portion of the context document to find relevant text and answer the question in details. Extract PageNumber and LineNumber and show it in the answer. 
    {context}
    Question: {question}
    If the answer is not found, please double check as it is most likely in the provided context, and only if you are sure say that answer is not available in the documentation, in this case the value for pagenumber and line number would be NA.
    If you found an answer from the provided text, make sure to provide the answer in a json format with PageNumber, LineNumber, and answer as the json keys. 
    
    """
QUESTION_PROMPT = PromptTemplate(
        template=question_prompt_template, input_variables=["context", "question"]
    )

for ExperimentConfig in RUNConfigs:
    current_timestamp = pd.Timestamp.now()
    timestamp_str = current_timestamp.strftime("%Y%m%d_%H%M%S")
    mlflow_run_name = "Retreive_k_" + str(ExperimentConfig['retrieve_k']) + "_similarity_" + ExperimentConfig['similarity_method']+ "_date_" + timestamp_str
    with mlflow.start_run(run_name=mlflow_run_name) as run:
        mean_sim_score, mean_page_score,cos_sim_score, page_number_score, df_evaluation, df_path = run_experiment(ExperimentConfig, QUESTION_PROMPT, verbose = False)
        print("mean_sim_score, mean_page_score:", mean_sim_score, mean_page_score)
        mlflow.log_metric("mean_sim_score", mean_sim_score)
        mlflow.log_metric("mean_page_score", mean_page_score)
        mlflow.log_param("cosine_similarity_score", str(cos_sim_score))
        mlflow.log_param("page_number_score", str(page_number_score))
        mlflow.log_param("Client code table", table_name)
        #mlflow.log_param("prompt", QUESTION_PROMPT)  
        mlflow.log_param("config", "".join(ExperimentConfig))
        mlflow.log_artifact(df_path)
        print("config :\n", ExperimentConfig)
mlflow.end_run()

gpt_output 
Answer: {
    "PageNumber": 1,
    "LineNumber": 17,
    "Answer": "The first quarter revenue was $50.1 billion, up 11 percent and 16 percent in constant currency. Earnings per share was $2.35 - and increased 4 percent and 11 percent in constant currency, when adjusted for the net tax benefit from the first quarter of fiscal year 22."
}
PageNumber: 1
LineNumber: 17
Answer: The first quarter revenue was $50.1 billion, up 11 percent and 16 percent in constant currency. Earnings per share was $2.35 - and increased 4 percent and 11 percent in constant currency, when adjusted for the net tax benefit from the first quarter of fiscal year 22.
gpt_output 
Answer: {
    "PageNumber": 19, 
    "LineNumber": 9, 
    "Answer": "In the commercial business segment, there was strong overall demand for Microsoft Cloud offerings, with growth of 31 percent in constant currency as well as share gains across many businesses. Commercial bookings declined 3 percent and increased 16 percent in co