# Retriever Evaluation

**Metrics**: Recall, Precision

**Embeddings** methods: OpenAI Ada, Google Embedding-001, MPNet-Multilingual

**Retriever type**: Similarity Search, MMR Search

In [1]:
import ast
import chromadb
from tqdm import tqdm
import pandas as pd
import streamlit as st
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceEmbeddings
import google.generativeai as genai
genai.configure(api_key=st.secrets["google_api_key"])

In [2]:
# Memastikan vector database sudah sama
client = chromadb.PersistentClient("knowledge_database")
client.list_collections()

[Collection(name=hf_ftmm), Collection(name=gemini_ftmm), Collection(name=ftmm)]

In [3]:
# OpenAI
embed_openai = OpenAIEmbeddings(openai_api_key=st.secrets["openai_key"])
docsearch_openai =  Chroma(persist_directory="knowledge_database", collection_name="ftmm", embedding_function=embed_openai)

# Huggingface
embed_hf = HuggingFaceEmbeddings(model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2")
docsearch_hf = Chroma(persist_directory="knowledge_database", collection_name="hf_ftmm", embedding_function=embed_hf)

# Google
def embed_fn(text):
    return genai.embed_content(model="models/embedding-001", content=text, task_type="retrieval_document")["embedding"]
docsearch_google = Chroma(persist_directory="knowledge_database", collection_name="gemini_ftmm")

In [4]:
print("Number of Chunks in the collection")
print(f"OpenAI: {docsearch_openai._collection.count()}")
print(f"Huggingface: {docsearch_hf._collection.count()}")
print(f"Google: {docsearch_google._collection.count()}")

Number of Chunks in the collection
OpenAI: 227
Huggingface: 227
Google: 227


In [6]:
# Data Test untuk Retrieval
corpus = pd.read_csv("corpus.csv")
retrieval_data = pd.read_csv("relevant_docs.csv")
retrieval_data['relevant_docs'] = retrieval_data['relevant_docs'].apply(lambda x: ast.literal_eval(x))
retrieval_data.head()

Unnamed: 0,id,tag,query,relevant_docs,title_1,title_2
0,0,Pengenalan FTMM,FTMM adalah,"[92, 75]",Sejarah FTMM.txt,Prestasi Mahasiswa.txt
1,1,Lokasi FTMM,Halo FTMM ada dimana?,"[92, 32]",Sejarah FTMM.txt,FTMM Overview.txt
2,2,Informasi Kontak,Apakah FTMM mempunyai akun media sosial?,[81],QnA.txt,
3,3,Sejarah FTMM,Berapa jumlah mahasiswa FTMM?,[92],Sejarah FTMM.txt,
4,4,Prodi FTMM,Apakah ada jurusan yang hanya ada satu di Indo...,"[93, 32]",Selayang Pandang FTMM.txt,FTMM Overview.txt


**Retriever:** Similarity Search

In [8]:
# OpenAI
retrieved_docs_openai = list()
for i in tqdm(range(len(retrieval_data))):
    q = retrieval_data["query"].iloc[i]
    retrieved_docs = docsearch_openai.similarity_search(q, k=2)
    docs_title = [d.metadata["source"][11:] for d in retrieved_docs]
    docs_id = [corpus[corpus["file_name"] == docs_title[i]]["id"].values[0] for i in range(len(docs_title))]
    retrieved_docs_openai.append(docs_id)

100%|██████████| 61/61 [00:30<00:00,  2.00it/s]


In [7]:
# HuggingFace
retrieved_docs_hf = list()
for i in tqdm(range(len(retrieval_data))):
    q = retrieval_data["query"].iloc[i]
    retrieved_docs = docsearch_hf.similarity_search(q, k=2)
    docs_title = [d.metadata["source"][11:] for d in retrieved_docs]
    docs_id = [corpus[corpus["file_name"] == docs_title[i]]["id"].values[0] for i in range(len(docs_title))]
    retrieved_docs_hf.append(docs_id)

100%|██████████| 61/61 [00:04<00:00, 14.24it/s]


In [9]:
# Google
retrieved_docs_google = list()
for i in tqdm(range(len(retrieval_data))):
    q = retrieval_data["query"].iloc[i]
    embedding = embed_fn(q)
    retrieved_docs = docsearch_google.similarity_search_by_vector(embedding, k=2)
    docs_title = [d.metadata["source"][11:] for d in retrieved_docs]
    docs_id = [corpus[corpus["file_name"] == docs_title[i]]["id"].values[0] for i in range(len(docs_title))]
    retrieved_docs_google.append(docs_id)

100%|██████████| 61/61 [00:41<00:00,  1.47it/s]


In [10]:
retrieval_data["ss_openai"] = retrieved_docs_openai
retrieval_data["ss_hf"] = retrieved_docs_hf
retrieval_data["ss_google"] = retrieved_docs_google

**Retriever:** MMR Search

In [11]:
# OpenAI
retrieved_docs_openai = list()
for i in tqdm(range(len(retrieval_data))):
    q = retrieval_data["query"].iloc[i]
    retrieved_docs = docsearch_openai.max_marginal_relevance_search(q, k=2)
    docs_title = [d.metadata["source"][11:] for d in retrieved_docs]
    docs_id = [corpus[corpus["file_name"] == docs_title[i]]["id"].values[0] for i in range(len(docs_title))]
    retrieved_docs_openai.append(docs_id)

100%|██████████| 61/61 [00:28<00:00,  2.15it/s]


In [12]:
# HuggingFace
retrieved_docs_hf = list()
for i in tqdm(range(len(retrieval_data))):
    q = retrieval_data["query"].iloc[i]
    retrieved_docs = docsearch_hf.max_marginal_relevance_search(q, k=2)
    docs_title = [d.metadata["source"][11:] for d in retrieved_docs]
    docs_id = [corpus[corpus["file_name"] == docs_title[i]]["id"].values[0] for i in range(len(docs_title))]
    retrieved_docs_hf.append(docs_id)

100%|██████████| 61/61 [00:04<00:00, 15.16it/s]


In [13]:
# Google
retrieved_docs_google = list()
for i in tqdm(range(len(retrieval_data))):
    q = retrieval_data["query"].iloc[i]
    embedding = embed_fn(q)
    retrieved_docs = docsearch_google.max_marginal_relevance_search_by_vector(embedding, k=2)
    docs_title = [d.metadata["source"][11:] for d in retrieved_docs]
    docs_id = [corpus[corpus["file_name"] == docs_title[i]]["id"].values[0] for i in range(len(docs_title))]
    retrieved_docs_google.append(docs_id)

100%|██████████| 61/61 [00:26<00:00,  2.27it/s]


In [14]:
retrieval_data["mmr_openai"] = retrieved_docs_openai
retrieval_data["mmr_hf"] = retrieved_docs_hf
retrieval_data["mmr_google"] = retrieved_docs_google

In [41]:
retrieval_data.to_csv("result_retrieval.csv", index=False)

# Results

In [15]:
def evaluation(true_values, predicted_values):
    true_positives = set(true_values) & set(predicted_values)
    false_positives = set(predicted_values) - set(true_values)
    false_negatives = set(true_values) - set(predicted_values)

    recall = len(true_positives) / (len(true_positives) + len(false_negatives)) if (len(true_positives) + len(false_negatives)) > 0 else 0
    precision = len(true_positives) / (len(true_positives) + len(false_positives)) if (len(true_positives) + len(false_positives)) > 0 else 0

    return recall, precision

In [16]:
experiments = ["ss_openai", "ss_hf", "ss_google", "mmr_openai", "mmr_hf", "mmr_google"]

result = dict()
for e in experiments:
    recall_list = list()
    precision_list = list()
    for i in range(len(retrieval_data)):
        true, pred = retrieval_data[["relevant_docs", e]].iloc[i].values
        r, p = evaluation(true, pred)
        recall_list.append(r)
        precision_list.append(p)
        
    avg_recall = sum(recall_list) / len(recall_list)
    avg_precision = sum(precision_list) / len(precision_list)

    result[e] = {"recall": round(avg_recall, 4) * 100, "precision": round(avg_precision, 4) * 100}

In [46]:
pd.DataFrame(result).to_csv("metrics_retrieval.csv")

In [47]:
pd.DataFrame(result)

Unnamed: 0,ss_openai,ss_hf,ss_google,mmr_openai,mmr_hf,mmr_google
recall,86.07,53.28,22.95,59.02,47.54,22.95
precision,77.05,45.9,19.67,48.36,39.34,18.85


# Measuring the Quality of Generated Text

**Preparation**

In [18]:
import json
import time
import numpy as np
from datasets import load_metric

In [19]:
with open("intentsFTMM.json", "r", encoding="utf-8") as f:
    data = json.load(f)["intents"]

In [20]:
question = list()
answer = list()
for i in range(1, len(data)):
    sample = data[i]
    for j in range(len(sample["patterns"])):
        question.append(sample["patterns"][j])
        answer.append(sample["responses"][0])
QA_pair = pd.DataFrame({
    "question": question,
    "answer": answer
})
QA_pair = QA_pair.reset_index()
QA_pair.head()

Unnamed: 0,index,question,answer
0,0,FTMM adalah,Fakultas Teknologi Maju dan Multidisiplin (FTM...
1,1,Halo FTMM itu apa?,Fakultas Teknologi Maju dan Multidisiplin (FTM...
2,2,Kapan FTMM didirikan?,Fakultas Teknologi Maju dan Multidisiplin (FTM...
3,3,Ada berapa program studi di FTMM?,Fakultas Teknologi Maju dan Multidisiplin (FTM...
4,4,Apa saja program studi yang ada di FTMM?,Fakultas Teknologi Maju dan Multidisiplin (FTM...


In [21]:
bleu_metric = load_metric("sacrebleu", trust_remote_code=True)
rouge_metric = load_metric("rouge", trust_remote_code=True)

  bleu_metric = load_metric("sacrebleu", trust_remote_code=True)


In [22]:
def compute_bleu(prediction, reference):
    bleu_metric.add(prediction=prediction, reference=reference)
    results = bleu_metric.compute(smooth_method="floor", smooth_value=0)
    results["precisions"] = [np.round(p, 2) for p in results["precisions"]]
    return results

def compute_rouge(prediction, reference):
    rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
    rouge_metric.add(prediction=prediction, reference=reference)
    score = rouge_metric.compute()
    results = dict((rn, score[rn].mid.fmeasure) for rn in rouge_names)
    return results

In [23]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    MessagesPlaceholder
)
from langchain.memory import ConversationBufferWindowMemory

In [24]:
# Prompt and Memory
# Template prompt
template = """Jawablah pertanyaan di bawah ini berdasarkan konteks yang diberikan! \
Jika dalam pertanyaan merujuk ke histori chat sebelumnya, maka gunakan konteks dari pertanyaan \
sebelumnya untuk menjawab!
Konteks:
{context}

Pertanyaan:
{question}
"""

template_system = """Namamu adalah FTMMQA, sebuah chatbot Fakultas Teknologi Maju dan \
Multidisiplin (FTMM), Universitas Airlangga. Kamu siap menjawab pertanyaan apapun \
seputar FTMM. Kamu menjawab setiap pertanyaan dengan ceria, sopan, dan asik!
"""

# Prompt
prompt_template = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template(template_system),
        MessagesPlaceholder(variable_name="chat_history"),
        HumanMessagePromptTemplate.from_template(template),
    ]
)

# Memory
memory = ConversationBufferWindowMemory(memory_key="chat_history", return_messages=True, k=1)


In [25]:
embed = OpenAIEmbeddings(openai_api_key=st.secrets["openai_key"])
docsearch = Chroma(persist_directory="knowledge_database", collection_name="ftmm", embedding_function=embed)

**Gemini**

In [26]:
chat_model = ChatGoogleGenerativeAI(model="gemini-pro", 
                                    temperature=0, 
                                    google_api_key=st.secrets["google_api_key"],
                                    convert_system_message_to_human=True)
chain = prompt_template | chat_model

In [27]:
result_gemini = dict()

In [28]:
max_retries = 3

for i in tqdm(range(len(QA_pair))):
    retries = 0
    success = False

    while retries < max_retries and not success:
        try:
            q = QA_pair.iloc[i]["question"]
            contexts = docsearch.similarity_search(q, k=2)
            start = time.time()
            r = chain.invoke({"question": q, "context":contexts, "chat_history":[]})
            result_gemini[str(i)] = {
                "response": r,
                "time": time.time() - start
            }
            time.sleep(1)
            success = True
        except Exception as e:
            print(f"Error: {e}")
            retries += 1
            time.sleep(10)

    if not success:
        print(f"Failed after {max_retries} retries. Moving to the next item.")

 40%|████      | 36/90 [02:55<03:51,  4.29s/it]

Error: index: 0
finish_reason: RECITATION

Error: index: 0
finish_reason: RECITATION

Error: index: 0
finish_reason: RECITATION



 41%|████      | 37/90 [03:39<14:14, 16.12s/it]

Failed after 3 retries. Moving to the next item.


 77%|███████▋  | 69/90 [06:08<01:39,  4.75s/it]

Error: index: 0
finish_reason: RECITATION

Error: index: 0
finish_reason: RECITATION

Error: index: 0
finish_reason: RECITATION



 78%|███████▊  | 70/90 [06:56<05:49, 17.47s/it]

Failed after 3 retries. Moving to the next item.


100%|██████████| 90/90 [08:55<00:00,  5.95s/it]


In [29]:
keys = [int(k) for k in result_gemini.keys()]
for i in range(len(QA_pair)):
    if i not in keys:
        print(i)

36
69


In [36]:
# result to dataframe
df_result_gemini = pd.DataFrame({
    "index": [int(i) for i in result_gemini.keys()],
    "response": [result_gemini[k]['response'].content for k in result_gemini.keys()],
    "running_time": [result_gemini[k]['time'] for k in result_gemini.keys()]
})

# handling errors
df_result_gemini_NA = pd.concat([df_result_gemini, pd.DataFrame({
    "index" : [36, 69],
    "response" : [pd.NA, pd.NA],
    "running_time" : [pd.NA, pd.NA]
})]).reset_index(drop=True)

# fix df
RESULT_GEMINI = QA_pair.copy()
RESULT_GEMINI["response_gemini"] = df_result_gemini_NA["response"].copy()
RESULT_GEMINI["running_time_gemini"] = df_result_gemini_NA["running_time"].copy()

# compute metrics and update fix df
BLEU_GEMINI = list()
ROUGE_GEMINI = list()
for i in tqdm(range(len(QA_pair))):
    p = RESULT_GEMINI["response_gemini"].iloc[i]
    r = RESULT_GEMINI["answer"].iloc[i]
    BLEU_GEMINI.append(compute_bleu(p, [r]))
    ROUGE_GEMINI.append(compute_rouge(p, [r]))

# fixx df
RESULT_GEMINI = pd.concat([RESULT_GEMINI, pd.DataFrame(BLEU_GEMINI), pd.DataFrame(ROUGE_GEMINI)], axis=1)

  df_result_gemini_NA = pd.concat([df_result_gemini, pd.DataFrame({
100%|██████████| 90/90 [00:12<00:00,  7.36it/s]


**OPENAI**

In [30]:
from langchain.chat_models import ChatOpenAI

In [31]:
chat_model = ChatOpenAI(temperature=0, 
                        openai_api_key=st.secrets["openai_key"])
chain = prompt_template | chat_model

In [32]:
result_openai = dict()

In [33]:
max_retries = 3

for i in tqdm(range(len(QA_pair))):
    retries = 0
    success = False

    while retries < max_retries and not success:
        try:
            q = QA_pair.iloc[i]["question"]
            contexts = docsearch.similarity_search(q, k=2)
            start = time.time()
            r = chain.invoke({"question": q, "context":contexts, "chat_history":[]})
            result_openai[str(i)] = {
                "response": r,
                "time": time.time() - start
            }
            time.sleep(1)
            success = True
        except Exception as e:
            print(f"Error: {e}")
            retries += 1
            time.sleep(10)

    if not success:
        print(f"Failed after {max_retries} retries. Moving to the next item.")

100%|██████████| 90/90 [12:25<00:00,  8.28s/it]


In [34]:
# result to dataframe
df_result_openai = pd.DataFrame({
    "index": [int(i) for i in result_openai.keys()],
    "response": [result_openai[k]['response'].content for k in result_openai.keys()],
    "running_time": [result_openai[k]['time'] for k in result_openai.keys()]
})

# handling errors
df_result_openai_NA = pd.concat([df_result_openai, pd.DataFrame({
    "index" : [i for i in range(10, len(QA_pair))],
    "response" : [pd.NA for i in range(10, len(QA_pair))],
    "running_time" : [pd.NA for i in range(10, len(QA_pair))]
})]).reset_index(drop=True)

# fix df
RESULT_OPENAI = QA_pair.copy()
RESULT_OPENAI["response_openai"] = df_result_openai_NA["response"].copy()
RESULT_OPENAI["running_time_openai"] = df_result_openai_NA["running_time"].copy()

# compute metrics and update fix df
BLEU_OPENAI = list()
ROUGE_OPENAI = list()
for i in tqdm(range(len(QA_pair))):
    p = RESULT_OPENAI["response_openai"].iloc[i]
    r = RESULT_OPENAI["answer"].iloc[i]
    BLEU_OPENAI.append(compute_bleu(p, [r]))
    ROUGE_OPENAI.append(compute_rouge(p, [r]))

# fixx df
RESULT_OPENAI = pd.concat([RESULT_OPENAI, pd.DataFrame(BLEU_OPENAI), pd.DataFrame(ROUGE_OPENAI)], axis=1)

  df_result_openai_NA = pd.concat([df_result_openai, pd.DataFrame({
100%|██████████| 90/90 [00:12<00:00,  7.11it/s]


# Result

In [43]:
RESULT_GEMINI.to_csv("result_gemini.csv", index=False)
RESULT_OPENAI.to_csv("result_openai.csv", index=False)

# Analysis

In [64]:
result_retrieval = pd.read_csv("result_retrieval.csv")

columns_list = ["relevant_docs", "ss_openai", "ss_hf", "ss_google", "mmr_openai", "mmr_hf", "mmr_google"]
for c in columns_list:
    result_retrieval[c] = result_retrieval[c].apply(lambda x: ast.literal_eval(x))

If K = 1

In [79]:
label = result_retrieval["relevant_docs"]

label_k1 = label.apply(lambda x: [x[0]])
ss_openai_k1 = result_retrieval["ss_openai"].apply(lambda x: [x[0]])
ss_google_k1 = result_retrieval["ss_google"].apply(lambda x: [x[0]])
ss_hf_k1 = result_retrieval["ss_hf"].apply(lambda x: [x[0]])
mmr_openai_k1 = result_retrieval["mmr_openai"].apply(lambda x: [x[0]])
mmr_google_k1 = result_retrieval["mmr_google"].apply(lambda x: [x[0]])
mmr_hf_k1 = result_retrieval["mmr_hf"].apply(lambda x: [x[0]])

In [92]:
experiments = [ss_openai_k1, ss_google_k1, ss_hf_k1, mmr_openai_k1, mmr_google_k1, mmr_hf_k1]
experiments_name = ["ss_openai_k1", "ss_google_k1", "ss_hf_k1", "mmr_openai_k1", "mmr_google_k1", "mmr_hf_k1"]

result = dict()
for e, name in zip(experiments, experiments_name):
    recall_list = list()
    precision_list = list()
    for i in range(len(result_retrieval)):
        true, pred = label_k1[i], e[i]
        r, p = evaluation(true, pred)
        recall_list.append(r)
        precision_list.append(p)
        
    avg_recall = sum(recall_list) / len(recall_list)
    avg_precision = sum(precision_list) / len(precision_list)

    result[name] = {"recall": round(avg_recall, 4) * 100, "precision": round(avg_precision, 4) * 100}

In [94]:
pd.DataFrame(result)

Unnamed: 0,ss_openai_k1,ss_google_k1,ss_hf_k1,mmr_openai_k1,mmr_google_k1,mmr_hf_k1
recall,85.25,6.56,65.57,85.25,6.56,65.57
precision,85.25,6.56,65.57,85.25,6.56,65.57
