# **All Imports**

In [58]:
!pip install llama_index pypdf



In [59]:
import openai
import time
import pypdf
import pandas as pd

from llama_index.evaluation import (
    RelevancyEvaluator,
    DatasetGenerator,
    FaithfulnessEvaluator,
)

from llama_index import (
    SimpleDirectoryReader,
    VectorStoreIndex,
    ServiceContext
)

from llama_index.llms import OpenAI

# **OpenAI API Key**

In [60]:
openai.api_key = 'OPENAI_API_KEY'

# **Dataset**

In [61]:
!mkdir -p 'dataset'
!wget 'https://www.irs.gov/pub/irs-pdf/p3.pdf' -O 'dataset/IRS.pdf'

--2023-10-16 22:53:44--  https://www.irs.gov/pub/irs-pdf/p3.pdf
Resolving www.irs.gov (www.irs.gov)... 23.66.67.174, 2600:1407:3c00:d8f::f50, 2600:1407:3c00:d95::f50
Connecting to www.irs.gov (www.irs.gov)|23.66.67.174|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1433499 (1.4M) [application/pdf]
Saving to: ‘dataset/IRS.pdf’


2023-10-16 22:53:44 (16.4 MB/s) - ‘dataset/IRS.pdf’ saved [1433499/1433499]



# **Load Dataset**

In [62]:
documents = SimpleDirectoryReader("./dataset/").load_data()

In [63]:
# To evaluate for each chunk size,
# we will first generate a set of 10 questions from first 10 pages.
eval_documents = documents[0:10]

# **Question Generation**

In [64]:
data_generator = DatasetGenerator.from_documents(documents)
eval_questions = data_generator.generate_questions_from_nodes(num = 10) # Only 10 questions



# **Establishing Evaluators**

In [65]:
# We will use gpt-3.5-turbo for evaluating the responses
llm = OpenAI(temperature=0, model="gpt-3.5-turbo")

# Define service context for gpt-3.5-turbo for evaluation
service_context_llm = ServiceContext.from_defaults(llm=llm)

# Define Faithfulness and Relevancy Evaluators which are based on gpt-3.5-turbo
faithfulness_llm = FaithfulnessEvaluator(service_context=service_context_llm)
relevancy_llm = RelevancyEvaluator(service_context=service_context_llm)

# **Main Evaluator Method**

In [66]:
# Define function to calculate average response time, average faithfulness and average relevancy metrics for given chunk size
# We use GPT-3.5-Turbo to generate response and GPT-4 to evaluate it.
def evaluator(chunk_size, eval_questions):
    total_response_time = 0
    total_faithfulness = 0
    total_relevancy = 0

    # create vector index
    llm = OpenAI(model="gpt-3.5-turbo")
    service_context = ServiceContext.from_defaults(llm=llm, chunk_size=chunk_size)
    vector_index = VectorStoreIndex.from_documents(
        eval_documents, service_context=service_context
    )
    # build query engine
    query_engine = vector_index.as_query_engine()
    num_questions = len(eval_questions)

   # we're using a loop here to specifically measure response time for different chunk sizes.
    for question in eval_questions:
        start_time = time.time()
        response_vector = query_engine.query(question)
        elapsed_time = time.time() - start_time

        faithfulness_result = faithfulness_llm.evaluate_response(
            response=response_vector
        ).passing

        relevancy_result = relevancy_llm.evaluate_response(
            query=question, response=response_vector
        ).passing

        total_response_time += elapsed_time
        total_faithfulness += faithfulness_result
        total_relevancy += relevancy_result

    average_response_time = total_response_time / num_questions
    average_faithfulness = total_faithfulness / num_questions
    average_relevancy = total_relevancy / num_questions

    return average_response_time, average_faithfulness, average_relevancy

# **Result**

In [67]:
chunk_sizes = [128, 256, 512, 1024, 2048]
data = []

for chunk_size in chunk_sizes:
    avg_response_time, avg_faithfulness, avg_relevancy = evaluator(chunk_size, eval_questions)
    data.append({'Chunk Size': chunk_size, 'Average Response Time': avg_response_time, 'Average Faithfulness': avg_faithfulness, 'Average Relevancy': avg_relevancy})

# Creating a DataFrame
df = pd.DataFrame(data)

In [68]:
df.head()

Unnamed: 0,Chunk Size,Average Response Time,Average Faithfulness,Average Relevancy
0,128,1.899488,0.9,0.9
1,256,1.979282,0.8,0.7
2,512,1.851558,0.7,0.8
3,1024,2.048325,0.5,0.9
4,2048,2.240861,0.4,0.8
