# **All Imports**

In [24]:
!pip install llama_index pypdf

import os
import time

import openai
import pandas as pd
import pypdf

from llama_index import (
    ServiceContext,
    SimpleDirectoryReader,
    VectorStoreIndex,
)
from llama_index.evaluation import (
    FaithfulnessEvaluator,
    RelevancyEvaluator,
)
from llama_index.llms import OpenAI



# **OpenAI API Key**

In [16]:
# Read the key from the environment so the secret is never stored in the
# notebook (or its saved outputs). Set it before starting Jupyter, e.g.
# `export OPENAI_API_KEY=...`.
openai.api_key = os.environ.get("OPENAI_API_KEY")
if not openai.api_key:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this notebook."
    )

# **Dataset**

In [17]:
# Create the local `dataset` folder (`-p`: no error if it already exists) and
# download the IRS PDF (p3.pdf) into it as dataset/IRS.pdf.
!mkdir -p 'dataset'
!wget 'https://www.irs.gov/pub/irs-pdf/p3.pdf' -O 'dataset/IRS.pdf'

--2023-10-17 21:10:05--  https://www.irs.gov/pub/irs-pdf/p3.pdf
Resolving www.irs.gov (www.irs.gov)... 104.92.251.161, 2600:1409:9800:987::f50, 2600:1409:9800:991::f50
Connecting to www.irs.gov (www.irs.gov)|104.92.251.161|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1433499 (1.4M) [application/pdf]
Saving to: ‘dataset/IRS.pdf’


2023-10-17 21:10:06 (23.5 MB/s) - ‘dataset/IRS.pdf’ saved [1433499/1433499]



# **Load Dataset**

In [18]:
# Read everything under ./dataset/ and keep only the first 10 loaded documents.
# Per the original note these are the first 10 pages of the PDF, which is the
# portion the 10 evaluation questions are based on; every chunk size is
# evaluated against this same subset.
documents = SimpleDirectoryReader("./dataset/").load_data()[:10]

# **Defining Question Bank**

In [19]:
# Fixed set of evaluation questions; the same list is asked for every chunk size.
questionBank = [
    'What is the purpose of Publication 3 by the Internal Revenue Service?',
    'How can individuals access forms and information related to taxes faster and easier?',
    'What are some examples of income items that are excluded from gross income for servicemembers?',
    'What is the definition of a combat zone and how does it affect the taxation of servicemembers?',
    'How are travel expenses of Armed Forces Reservists treated for tax purposes?',
    'What are some adjustments to income that individuals can make on their tax returns?',
    'How does the Combat Zone Exclusion impact the reporting of combat zone pay?',
    'What are some credits available to taxpayers, specifically related to children and dependents?',
    'How is the Earned Income Credit calculated and who is eligible for it?',
    'What are the requirements for claiming tax forgiveness related to terrorist or military action?',
]

# **Establishing Evaluators**

In [20]:
# Judge model shared by both evaluators (gpt-3.5-turbo at temperature 0).
llm = OpenAI(model="gpt-3.5-turbo", temperature=0)
serviceContextLLM = ServiceContext.from_defaults(llm=llm)

# Both evaluators are built on the same service context; their `.passing`
# verdicts are what the evaluation loop aggregates.
relevancyLLM = RelevancyEvaluator(service_context=serviceContextLLM)
faithfulnessLLM = FaithfulnessEvaluator(service_context=serviceContextLLM)

# **Main Evaluator Method**

In [21]:
def evaluator(chunkSize, questionBank):
    """Measure query latency, faithfulness and relevancy for one chunk size.

    Builds a vector index over the module-level ``documents`` using a service
    context configured with ``chunkSize``, asks every question in
    ``questionBank`` through the index's query engine, and scores each response
    with the module-level ``faithfulnessLLM`` and ``relevancyLLM`` evaluators.

    Args:
        chunkSize: Value passed as ``chunk_size`` to
            ``ServiceContext.from_defaults``.
        questionBank: Non-empty sized collection of question strings.

    Returns:
        Tuple ``(averageResponseTime, averageFaithfulness, averageRelevancy)``:
        the mean seconds spent in ``queryEngine.query`` and the mean of each
        evaluator's ``.passing`` verdict across all questions.

    Raises:
        ValueError: If ``questionBank`` is empty.
    """
    totalQuestions = len(questionBank)
    # Fail before building the index: otherwise an empty bank only surfaces
    # as a ZeroDivisionError after the index has already been constructed.
    if totalQuestions == 0:
        raise ValueError("questionBank must contain at least one question")

    llm = OpenAI(model="gpt-3.5-turbo")
    serviceContext = ServiceContext.from_defaults(llm=llm, chunk_size=chunkSize)
    vectorIndex = VectorStoreIndex.from_documents(
        documents,
        service_context=serviceContext,
    )
    queryEngine = vectorIndex.as_query_engine()

    totalResponseTime = 0.0
    totalFaithfulness = 0
    totalRelevancy = 0

    for question in questionBank:
        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by wall-clock adjustments. Only the query itself is timed; the
        # evaluator calls below are excluded.
        startTime = time.perf_counter()
        responseVector = queryEngine.query(question)
        totalResponseTime += time.perf_counter() - startTime

        # `.passing` verdicts are summed directly (True counts as 1).
        totalFaithfulness += faithfulnessLLM.evaluate_response(
            response=responseVector
        ).passing
        totalRelevancy += relevancyLLM.evaluate_response(
            query=question, response=responseVector
        ).passing

    averageResponseTime = totalResponseTime / totalQuestions
    averageFaithfulness = totalFaithfulness / totalQuestions
    averageRelevancy = totalRelevancy / totalQuestions

    return averageResponseTime, averageFaithfulness, averageRelevancy

# **Result**

In [22]:
# Chunk sizes to compare; each one triggers a full index build and evaluation run.
chunkSizes = [128, 256, 512, 1024]
data = []

for chunkSize in chunkSizes:
    avgResponseTime, avgFaithfulness, avgRelevancy = evaluator(chunkSize, questionBank)
    # One record per chunk size; the key order fixes the DataFrame column order.
    record = {
        'Chunk Size': chunkSize,
        'Average Response Time': avgResponseTime,
        'Average Faithfulness': avgFaithfulness,
        'Average Relevancy': avgRelevancy,
    }
    data.append(record)

# Build the summary table once from the collected records and display it.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Chunk Size,Average Response Time,Average Faithfulness,Average Relevancy
0,128,2.450118,1.0,0.9
1,256,2.420739,0.7,0.6
2,512,3.943271,0.8,0.9
3,1024,2.505998,0.4,0.8
