code adapted from https://github.com/svpino/llm/blob/main/evaluation/notebook.ipynb

In [None]:
pip install -q langchain langchain-openai langchain_community docarray pydantic==1.10.8 python-dotenv ruff bs4 ipytest giskard[llm] pypdf chromadb

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/817.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/817.7 kB[0m [31m4.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.2/270.2 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.8/8.8 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.2/594.2 kB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━

In [None]:
# Mount Google Drive into the Colab filesystem; the PDF directory loaded further
# down is read from under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import os
from google.colab import drive
import constants

from operator import itemgetter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.messages import AIMessage, get_buffer_string
from langchain_community.llms import HuggingFaceEndpoint

# Publish the API credentials held in the local `constants` module as environment
# variables for this process.
os.environ.update(
    {
        "OPENAI_API_KEY": constants.APIKEY,
        "HUGGINGFACEHUB_API_TOKEN": constants.HUGGINGFACE_TOKEN,
    }
)

In [None]:
# Directory (on the mounted Google Drive) that holds the source PDFs.
pdf_directory = "/content/gdrive/MyDrive/pdf/."

# Build a loader over that directory and read its documents into memory.
loader = PyPDFDirectoryLoader(pdf_directory)
formatted_data = loader.load()



In [None]:
# Split the loaded pages into overlapping chunks: at most 1000 characters each
# (measured with `len`), with 200 characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

# Split the documents that were already loaded into `formatted_data` instead of
# calling `loader.load_and_split(...)`, which would read and parse every PDF a
# second time before splitting.
documents = text_splitter.split_documents(formatted_data)

# Embed the chunks with OpenAI's ada-002 model, index them in Chroma, and expose
# the index as a retriever for the RAG chain.
vector_store = Chroma.from_documents(
    documents,
    embedding=OpenAIEmbeddings(model="text-embedding-ada-002"),
)
retriever = vector_store.as_retriever()

#documents



Create a Knowledge Base

In [None]:
import pandas as pd
# One row per chunk: a single "text" column holding each chunk's page content.
df = pd.DataFrame({"text": [doc.page_content for doc in documents]})
#df.head(10)

In [None]:
from giskard.rag import KnowledgeBase

# Wrap the chunk DataFrame in a Giskard KnowledgeBase; it is passed to both
# `generate_testset` and `evaluate` further down.
knowledge_base = KnowledgeBase(df)

  validated_func = validate_arguments(func, config={"arbitrary_types_allowed": True})
  validated_func = validate_arguments(func, config={"arbitrary_types_allowed": True})


# Generate Test Set

In [None]:
from giskard.rag import generate_testset

# File the test set is cached in; the following cell loads the test set from this
# same path.
TESTSET_PATH = "test-set.jsonl"

# Generating questions spends LLM calls, and the following cell replaces `testset`
# with whatever is stored in TESTSET_PATH. So only generate when no cached file
# exists yet, and save the result so the load that follows succeeds on a fresh
# environment. Delete the file to force regeneration.
if not os.path.exists(TESTSET_PATH):
    testset = generate_testset(
        knowledge_base,
        num_questions=10,
        agent_description="A chatbot that can answer questions about the COMP2121 Data Mining and Text Analytics module, summarise an article, and generate sample questions",
    )
    testset.save(TESTSET_PATH)

Generating questions:   0%|          | 0/10 [00:00<?, ?it/s]

In [None]:
from giskard.rag import QATestset

# Load the test set from disk. This rebinds `testset`, so the evaluation below runs
# on the questions stored in this file.
testset = QATestset.load("test-set.jsonl")

In [None]:
# Convert the test set to a pandas DataFrame for inspection.
test_set_df = testset.to_pandas()

# Optional preview (disabled): prints the question, reference answer and reference
# context of the first ten rows.
#for index, row in enumerate(test_set_df.head(10).iterrows()):
#    print(f"Question {index + 1}: {row[1]['question']}")
#    print(f"Reference answer: {row[1]['reference_answer']}")
#    print("Reference context:")
#    print(row[1]['reference_context'])
#    print("******************", end="\n\n")

In [None]:
# Chat model under evaluation: GPT-4 Turbo with a sampling temperature of 0.2.
llm = ChatOpenAI(temperature=0.2, model_name="gpt-4-turbo")

In [None]:
def setup_rag_chain(retriever, llm):
    """Assemble the retrieval-augmented answering chain for the COMP2121 assistant.

    The returned chain expects a mapping with a "question" key. The question is sent
    to ``retriever`` to obtain the context and is also passed through unchanged; both
    values fill the prompt template, the prompt goes to ``llm``, and the model's
    reply is parsed into a plain string.

    Args:
        retriever: Runnable that takes the question text and returns the context
            interpolated into the ``{context}`` slot of the prompt.
        llm: Language model runnable that receives the rendered prompt.

    Returns:
        A runnable pipeline: inputs -> prompt -> llm -> string output parser.
    """
    template_text = """You are an intelligent assistant expertise in answering questions and is dedicated to provide detailed, accurate, and relevant
            information about COMP2121 Data Mining and Text Analytics module to university students. Your knowledge is derived exclusively from a specific document/article designated as the official module content.
            Your responses must:

            1. Remain strictly within the boundaries of the Data Mining and Text Analytics module. If a question falls outside of this module, politely inform the user with a standardized response:
            "This question is beyond the scope of this module."

            2. Are based solely on the information contained within the provided document/article. Answer only the specific question posed and avoid including any irrelevant information.

            3. Are clear and directly address the question posed. Avoid providing overly broad or generic information that might detract from the specific focus of the module.

            Answer Template:
            {context}

            Question: {question}

            Your task is to apply critical thinking to interpret the question's intent, and craft a response that is informative, precise, and wholly relevant to the query at hand.
            If a student asks for a definition, provide a clear and concise definition.
            If a student asks for an explanation, provide a detailed explanation.
            If a student asks for a summarisation, provide a detailed and informative summary about the content of the document.
            """

    # Both prompt variables derive from the incoming "question": one copy is routed
    # through the retriever to become the context, the other is forwarded as-is.
    question_of = itemgetter("question")
    prompt_inputs = {
        "context": question_of | retriever,
        "question": question_of,
    }

    answer_prompt = PromptTemplate.from_template(template_text)

    return prompt_inputs | answer_prompt | llm | StrOutputParser()

In [None]:
# Build the chain once; `answer_fn` below invokes this module-level object.
conv_chain = setup_rag_chain(retriever, llm)

# Evaluating the Model on the Test Set

Create a function that invokes the chain with a specific question and returns the answer.

In [None]:
def answer_fn(question, history=None):
    """Answer one test-set question with the module-level ``conv_chain``.

    Previously ``history`` was accepted but ignored, so follow-up questions reached
    the chain without the earlier turns they refer to. When history is supplied, it
    is now rendered as a short transcript and prepended to the question, so both the
    retriever and the model see the conversational context.

    Args:
        question: The question text to answer.
        history: Optional list of earlier turns, each a mapping with ``"role"`` and
            ``"content"`` keys. ``None`` or an empty list means there is no prior
            conversation, and the question is sent unchanged.

    Returns:
        Whatever ``conv_chain.invoke`` returns for the (possibly contextualised)
        question.
    """
    if history:
        # `.get` keeps a malformed turn from raising; a missing role defaults to "user".
        transcript = "\n".join(
            f"{turn.get('role', 'user')}: {turn.get('content', '')}" for turn in history
        )
        question = f"Conversation so far:\n{transcript}\n\nCurrent question: {question}"

    return conv_chain.invoke({"question": question})

Use the evaluate() function to evaluate the model on the test set. This function will compare the answers from the chain with the reference answers in the test set.

In [None]:
from giskard.rag import evaluate

# Run every test-set question through `answer_fn` and score the answers; the
# knowledge base is supplied alongside the test set.
report = evaluate(answer_fn, testset=testset, knowledge_base=knowledge_base)

Asking questions to the agent:   0%|          | 0/200 [00:00<?, ?it/s]

correctness evaluation:   0%|          | 0/200 [00:00<?, ?it/s]

In [None]:
# Render the evaluation report inline in the notebook.
display(report)

INFO:giskard.rag:Finding topics in the knowledge base.
INFO:giskard.rag:Found 24 topics in the knowledge base.


In [None]:
# Export the report as HTML (presumably written to "gpt4.html" in the current
# working directory — confirm against the Giskard API).
report.to_html("gpt4.html")

In [None]:
# Correctness score broken down by question type (simple, complex, conversational, ...).
report.correctness_by_question_type()

Unnamed: 0_level_0,correctness
question_type,Unnamed: 1_level_1
complex,0.852941
conversational,0.060606
distracting element,0.575758
double,0.787879
simple,0.794118
situational,0.848485


In [None]:
# List the questions the agent got wrong, with its answer and the evaluator's reason.
report.get_failures()

Unnamed: 0_level_0,question,reference_answer,reference_context,conversation_history,metadata,agent_answer,correctness,correctness_reason
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
bac4169e-d216-4f6b-b37a-64db35e5f5d8,Considering the need for training and time for...,Current machine translation systems remain imp...,Document 113: language learners can clearly be...,[],"{'question_type': 'distracting element', 'seed...","In educational settings, the constraints of pr...",False,The agent's answer focuses on the integration ...
0af7474a-01d3-4bc4-b35a-32376ed71a73,"Hi, I'm currently studying for my upcoming COM...",The Turkish word 'yürek' means 'heart' in Engl...,Document 523: Fig. 13 Word sketch for Turkish ...,[],"{'question_type': 'situational', 'seed_documen...",This question is beyond the scope of this module.,False,The agent failed to provide the correct transl...
371f3ec4-96f7-41f9-873e-ba0eeca20b74,What are the two central questions that arise ...,The two central questions that arise in the co...,"Document 528: many different parts, two of the...","[{'role': 'user', 'content': 'Let's think abou...","{'question_type': 'conversational', 'seed_docu...",This question is beyond the scope of this module.,False,The agent failed to provide the two central qu...


In [None]:
pip install ragas

Collecting ragas
  Downloading ragas-0.1.7-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.2/81.2 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting pysbd>=0.3.4 (from ragas)
  Downloading pysbd-0.3.4-py3-none-any.whl (71 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.1/71.1 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pysbd, ragas
Successfully installed pysbd-0.3.4 ragas-0.1.7
