## Importing libraries

In [1]:
import pandas as pd
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
import uuid
import os
from langchain_core.documents import Document
from tqdm import tqdm
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser

In [2]:
def read_secret(name, env=None):
    """Return the secret stored under ``name`` in the environment.

    Args:
        name: Name of the environment variable to read.
        env: Mapping to read from; defaults to ``os.environ``.

    Returns:
        The stored string, or ``""`` (after printing a warning) when the
        variable is missing or empty.
    """
    env = os.environ if env is None else env
    value = env.get(name, "")
    if not value:
        print(f"WARNING: environment variable {name} is not set; calls that need it will fail.")
    return value


# Secrets are read from the environment so that no key is ever stored in the notebook.
# Export OPENAI_API_KEY and GROQ_API_KEY before starting the kernel.
# OpenAIEmbeddings() is built later without an explicit key, so it presumably reads
# OPENAI_API_KEY from the environment - only its presence is checked here.
read_secret("OPENAI_API_KEY")
groq_api_key = read_secret("GROQ_API_KEY")

## Implementing RAG with Multi-query

In [3]:
# Load the labelled dataset and wrap every row in a LangChain document.
# The 'statement' column supplies the document text; later cells read
# metadata['reddit_id'] and metadata['Question N'], so the remaining columns
# presumably end up in each document's metadata.
test_dataset = pd.read_csv("random_dataset.csv")
loader = DataFrameLoader(test_dataset, page_content_column='statement')
orig_test_documents = loader.load()

# Sample queries used for both the multi-query and the baseline retrieval
questions = [
    'Do Best Buy employees feel understaffed?',
    'What are the most common reasons for employees to leave Best Buy?',
    'What do Best Buy employees think of the company?',
]

In [4]:
# Split the documents into overlapping chunks.
# Sizes are presumably counted in tiktoken tokens (the splitter is built via from_tiktoken_encoder).
splitter_settings = {"chunk_size": 300, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(**splitter_settings)

# Chunked copies of the loaded documents; these are what gets indexed
orig_test_documents_splits = text_splitter.split_documents(orig_test_documents)

In [None]:
# Index: embed every chunk and store it in a Chroma collection
print("indexing ...")
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    embedding=embedding_model,
    documents=orig_test_documents_splits,
)
retriever = vectorstore.as_retriever()

In [6]:
# Multi Query: different perspectives on the same query
template = """Given the question below, generate five different variants of the same question. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)

llm = ChatGroq(temperature=0, groq_api_key=groq_api_key, model_name='mixtral-8x7b-32768')


def split_into_queries(llm_text):
    """Turn the raw LLM reply into a list of query strings.

    Args:
        llm_text: The model's reply as one string, expected to hold one
            variant per line.

    Returns:
        The lines of ``llm_text`` with surrounding whitespace removed; blank
        lines are dropped so they are never sent to the vector store as
        (empty) queries.
    """
    # NOTE(review): the model numbers its variants ("1. ...", "2. ..."). The prefix is
    # kept on purpose so the retrieved results stay comparable with earlier runs;
    # consider stripping it before embedding.
    return [line.strip() for line in llm_text.splitlines() if line.strip()]


# prompt -> LLM -> plain string -> list of query variants
generate_queries = (
    prompt_perspectives
    | llm
    | StrOutputParser()
    | split_into_queries
)

In [11]:
# One list of generated query variants per original question, in the same order as `questions`
multi_queries = [generate_queries.invoke({"question": question}) for question in questions]

In [12]:
# Show the generated variants: one inner list per entry of `questions`
multi_queries

[['1. Are Best Buy employees experiencing a lack of sufficient staffing?',
  '2. To what extent do Best Buy employees perceive there to be inadequate staffing levels?',
  '3. How often do Best Buy employees feel that there are not enough staff members to handle the workload?',
  '4. Do Best Buy employees report feeling overwhelmed by the workload due to insufficient staffing?',
  '5. Is there a general consensus among Best Buy employees that the company is understaffed?'],
 ['1. What are the primary factors that frequently lead to employee turnover at Best Buy?',
  '2. Can you identify the most prevalent causes of staff departure from Best Buy?',
  '3. What are the top reasons that have been observed for employees leaving Best Buy?',
  '4. What are some of the most recurring motives for employees to resign from Best Buy?',
  '5. Can you enumerate the most frequent reasons for employee attrition at Best Buy?'],
 ['1. What are the thoughts of Best Buy employees regarding the company they

In [13]:
# Retreive
q1_retrive_documents = [vectorstore.similarity_search_with_score(query,k=20) for query in multi_queries[0]]
q2_retrive_documents = [vectorstore.similarity_search_with_score(query,k=20) for query in multi_queries[1]]
q3_retrive_documents = [vectorstore.similarity_search_with_score(query,k=20) for query in multi_queries[2]]

In [15]:
# Flatten and sort the retreived doc by similarity score
q1_retrive_documents_final = sorted([item for sublist in q1_retrive_documents for item in sublist], key=lambda x:x[1])
q2_retrive_documents_final = sorted([item for sublist in q2_retrive_documents for item in sublist], key=lambda x:x[1])
q3_retrive_documents_final = sorted([item for sublist in q3_retrive_documents for item in sublist], key=lambda x:x[1])

In [16]:
# Remove all duplicate docs

def dedupe_by_reddit_id(scored_docs):
    """Keep only the first hit for every distinct ``reddit_id``.

    Args:
        scored_docs: Iterable of ``(document, score)`` pairs whose
            ``document.metadata`` contains a ``"reddit_id"`` entry.

    Returns:
        A list with the retained pairs in their original order. Because the
        input lists are sorted by ascending score, the pair kept for each id
        is the one with the lowest score.
    """
    seen_ids = set()
    unique_docs = []
    for scored_doc in scored_docs:
        reddit_id = scored_doc[0].metadata["reddit_id"]
        if reddit_id not in seen_ids:
            seen_ids.add(reddit_id)
            unique_docs.append(scored_doc)
    return unique_docs


# NOTE(review): the baseline retrieval further down is not de-duplicated by reddit_id;
# confirm that comparing it against these de-duplicated lists is intended.
q1_retrive_documents_final_final = dedupe_by_reddit_id(q1_retrive_documents_final)
q2_retrive_documents_final_final = dedupe_by_reddit_id(q2_retrive_documents_final)
q3_retrive_documents_final_final = dedupe_by_reddit_id(q3_retrive_documents_final)

# Number of unique hits left for question 3 (shown as the cell output)
len(q3_retrive_documents_final_final)

23

In [17]:
# 'Question 1' labels of the 20 best-ranked unique multi-query hits
[hit.metadata['Question 1'] for hit, _score in q1_retrive_documents_final_final[:20]]

[1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0]

In [18]:
# 'Question 2' labels of the 20 best-ranked unique multi-query hits
[hit.metadata['Question 2'] for hit, _score in q2_retrive_documents_final_final[:20]]

[1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0]

In [19]:
# 'Question 3' labels of the 20 best-ranked unique multi-query hits
[hit.metadata['Question 3'] for hit, _score in q3_retrive_documents_final_final[:20]]

[1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0]

In [24]:
# Relevance labels of the top-20 de-duplicated multi-query hits, read straight from the
# retrieval results. They were previously typed in by hand, and the hand-typed MQRAGQ1
# did not match the labels displayed for question 1 (positions 6 and 7 were swapped).
MQRAGQ1 = [doc[0].metadata['Question 1'] for doc in q1_retrive_documents_final_final[:20]]
MQRAGQ2 = [doc[0].metadata['Question 2'] for doc in q2_retrive_documents_final_final[:20]]
MQRAGQ3 = [doc[0].metadata['Question 3'] for doc in q3_retrive_documents_final_final[:20]]

## Baseline: to compare against a plain retriever, the multi-query step is removed below

In [35]:
# Baseline: one scored search per original question (no generated variants), 20 hits each
q1_retrive_documents_simple, q2_retrive_documents_simple, q3_retrive_documents_simple = [
    vectorstore.similarity_search_with_score(original_question, k=20)
    for original_question in questions[:3]
]

In [36]:
# 'Question 1' labels of the baseline hits, in retrieval order
[hit.metadata['Question 1'] for hit, _score in q1_retrive_documents_simple]

[1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1]

In [37]:
# 'Question 2' labels of the baseline hits, in retrieval order
[hit.metadata['Question 2'] for hit, _score in q2_retrive_documents_simple]

[1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1]

In [38]:
# 'Question 3' labels of the baseline hits, in retrieval order
[hit.metadata['Question 3'] for hit, _score in q3_retrive_documents_simple]

[1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]

In [25]:
# Relevance labels of the baseline hits, read straight from the retrieval results instead
# of being copied by hand, so they cannot drift from what the retriever actually returned.
baselineRAGQ1 = [doc[0].metadata['Question 1'] for doc in q1_retrive_documents_simple]
baselineRAGQ2 = [doc[0].metadata['Question 2'] for doc in q2_retrive_documents_simple]
baselineRAGQ3 = [doc[0].metadata['Question 3'] for doc in q3_retrive_documents_simple]

## Final evaluations

In [29]:
def evaluate(lis, k, total_relevant=None):
    """Compute precision@k, recall@k and F1@k for one ranked list of relevance labels.

    Args:
        lis: Ranked binary relevance labels (1 = relevant, 0 = not relevant),
            best hit first.
        k: Cut-off rank; only the first ``k`` labels count as retrieved.
            Must be a positive integer.
        total_relevant: Number of relevant items that exist for the query.
            When omitted, the number of relevant labels inside ``lis`` is used,
            so recall is relative to the retrieved list and reaches 1.0 as soon
            as ``k`` covers the whole list. Pass the true count to get recall
            against the full collection.

    Returns:
        A ``(precision, recall, f1_score)`` tuple. Recall is 0.0 when there is
        no relevant item at all, and F1 is 0.0 when precision and recall are
        both 0, instead of failing with a division by zero.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k}")

    number_of_relevant_item_retrieved_k = sum(lis[:k])
    if total_relevant is None:
        total_relevant = sum(lis)

    # Guard both denominators: a list without relevant items is a legitimate input.
    recall = number_of_relevant_item_retrieved_k / total_relevant if total_relevant else 0.0
    prec = number_of_relevant_item_retrieved_k / k
    f1_score = 2 * prec * recall / (prec + recall) if (prec + recall) else 0.0

    return prec, recall, f1_score
    

In [30]:
# (precision, recall, F1) for question 1 at the cut-offs k = 5, 10, 15, 20
BL_metrics_q1 = [evaluate(baselineRAGQ1, cutoff) for cutoff in (5, 10, 15, 20)]
MQ_metrics_q1 = [evaluate(MQRAGQ1, cutoff) for cutoff in (5, 10, 15, 20)]

In [31]:
# (precision, recall, F1) for question 2 at the cut-offs k = 5, 10, 15, 20
BL_metrics_q2 = [evaluate(baselineRAGQ2, cutoff) for cutoff in (5, 10, 15, 20)]
MQ_metrics_q2 = [evaluate(MQRAGQ2, cutoff) for cutoff in (5, 10, 15, 20)]

In [32]:
# (precision, recall, F1) for question 3 at the cut-offs k = 5, 10, 15, 20
BL_metrics_q3 = [evaluate(baselineRAGQ3, cutoff) for cutoff in (5, 10, 15, 20)]
MQ_metrics_q3 = [evaluate(MQRAGQ3, cutoff) for cutoff in (5, 10, 15, 20)]

In [33]:
import numpy as np

# Baseline F1 averaged over the three questions, one value per cut-off
bl_f1_q1 = np.array([f1 for _prec, _recall, f1 in BL_metrics_q1])
bl_f1_q2 = np.array([f1 for _prec, _recall, f1 in BL_metrics_q2])
bl_f1_q3 = np.array([f1 for _prec, _recall, f1 in BL_metrics_q3])
(bl_f1_q1 + bl_f1_q2 + bl_f1_q3) / 3

array([0.35130719, 0.60549595, 0.74135124, 0.7491854 ])

In [34]:
# Multi-query F1 averaged over the three questions, one value per cut-off
mq_f1_q1 = np.array([f1 for _prec, _recall, f1 in MQ_metrics_q1])
mq_f1_q2 = np.array([f1 for _prec, _recall, f1 in MQ_metrics_q2])
mq_f1_q3 = np.array([f1 for _prec, _recall, f1 in MQ_metrics_q3])
(mq_f1_q1 + mq_f1_q2 + mq_f1_q3) / 3

array([0.3995098 , 0.64646465, 0.77492877, 0.73655914])