<a href="https://colab.research.google.com/github/mattambrogi/llama-index-experiments/blob/main/copy_response_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Note: the `gpt_index` imports used in this notebook are outdated. More up-to-date code can be found in the recency evaluation notebook.

**Download blog posts**

In [None]:
# Code hidden

**LlamaIndex / GPT Setup**

In [None]:
!pip install openai==0.27
!pip install gpt_index
!pip install langchain

from gpt_index.evaluation import DatasetGenerator, QueryResponseEvaluator
from gpt_index import SimpleDirectoryReader, GPTSimpleVectorIndex, ServiceContext, LLMPredictor, Response
from langchain.chat_models import ChatOpenAI

In [None]:
import logging
import sys
import pandas as pd

# Send INFO-and-above log records to stdout so they appear in the cell output.
# NOTE(review): if basicConfig also installs a stdout handler, each record may
# be printed twice — confirm this is intended.
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
stdout_handler = logging.StreamHandler(stream=sys.stdout)
logging.getLogger().addHandler(stdout_handler)

In [None]:
# `os` is not imported anywhere else in the notebook, so import it here to keep
# this cell runnable on a fresh kernel.
import os

# Placeholder value: replace it locally with a real key and never commit the key.
os.environ["OPENAI_API_KEY"] = "<OPENAI_API_KEY"

In [None]:
# Read the downloaded blog posts from disk into `documents`.
blog_posts_dir = '/content/blog_posts'
reader = SimpleDirectoryReader(blog_posts_dir)
documents = reader.load_data()

**Load in pre-generated test questions**


In [None]:
# Load the pre-generated evaluation questions: one list entry per line of the file.
with open('questions.txt', 'r') as questions_file:
    questions_text = questions_file.read()
eval_questions = questions_text.splitlines()

In [None]:
# Preview the first ten questions. Slicing (rather than indexing 0..9) avoids an
# IndexError when fewer than ten questions were loaded.
for question in eval_questions[:10]:
    print(question)

In [None]:
# Notebook helper: show one evaluated query as a styled, single-row table.
def display_eval_df(query: str, response: Response, eval_result: str) -> None:
    """Display a one-row table summarising a single evaluated query.

    Args:
        query: The question that was asked.
        response: The query response; its string form and the text of its
            first source node are shown.
        eval_result: The evaluator's verdict for this query/response pair.

    The "Source" column holds the first 1000 characters of the first source
    node's text followed by "...". The "Response" and "Source" columns are
    styled to wrap within 600px.
    """
    response_text = str(response)
    source_excerpt = response.source_nodes[0].source_text[:1000] + "..."
    row = {
        "Query": query,
        "Response": response_text,
        "Source": source_excerpt,
        "Evaluation Result": eval_result,
    }
    wrap_css = {
        'inline-size': '600px',
        'overflow-wrap': 'break-word',
    }
    styled = pd.DataFrame(row, index=[0]).style.set_properties(
        subset=["Response", "Source"],
        **wrap_css
    )
    display(styled)

**Construct Index**



In [None]:
from gpt_index import PromptHelper

In [None]:
# Sizing values handed to PromptHelper below.
max_input_size = 4096
num_outputs = 256
max_chunk_overlap = 20
chunk_size_limit = 600

# gpt-3.5-turbo chat model at temperature 0, wrapped for use by the index.
chat_llm = ChatOpenAI(temperature=0, model_name="gpt-3.5-turbo")
llm_predictor = LLMPredictor(llm=chat_llm)
prompt_helper = PromptHelper(
    max_input_size,
    num_outputs,
    max_chunk_overlap,
    chunk_size_limit=chunk_size_limit,
)
service_context = ServiceContext.from_defaults(
    llm_predictor=llm_predictor,
    prompt_helper=prompt_helper,
)

# Load the blog posts and build the vector index over them.
documents = SimpleDirectoryReader('/content/blog_posts').load_data()

index = GPTSimpleVectorIndex.from_documents(documents, service_context=service_context)

**Test: evaluate 20 questions**

In [None]:
# Build the response evaluator with the same service_context that the index was built with.
evaluator = QueryResponseEvaluator(service_context=service_context)

In [None]:
# Smoke test: evaluate up to the first 20 questions and report the share the
# evaluator answered "YES" for.
# Capping at the list length avoids an IndexError when fewer than 20 questions exist.
total = min(20, len(eval_questions))
count_valid = 0
for question in eval_questions[:total]:
    response_vector = index.query(question)
    eval_result = evaluator.evaluate(question, response_vector)
    print(eval_result)
    if eval_result == "YES":
        count_valid += 1

# Report 0% instead of raising ZeroDivisionError when there were no questions.
percent_valid = count_valid / total if total else 0.0
percent_string = '{:.0%}'.format(percent_valid)
print(percent_string)

Select 120 random questions to test

In [None]:
import random

# How many questions to draw, and a fixed seed so every run evaluates the same subset.
NUM_RANDOM_QUESTIONS = 120
SAMPLE_SEED = 42

# random.sample picks distinct positions (sampling without replacement). Capping
# the size at the list length avoids the endless retry loop the hand-rolled
# version entered when fewer than 120 questions were available.
sample_size = min(NUM_RANDOM_QUESTIONS, len(eval_questions))
rand_questions = random.Random(SAMPLE_SEED).sample(eval_questions, sample_size)

print(len(rand_questions))

120


In [None]:
# Spot-check a single sampled question.
rand_questions[22]

"What is the pricing structure for Lexicon's software and services?"

**Run evaluation and build dataframe**

In [None]:
def create_eval_df_row(query: str, response: Response, eval_result: str) -> pd.DataFrame:
    """Build a single-row DataFrame describing one evaluated query.

    Args:
        query: The question that was asked.
        response: The query response; its string form and the text of its
            first source node are recorded.
        eval_result: The evaluator's verdict for this query/response pair.

    Returns:
        A one-row DataFrame (index 0) with the columns "Query", "Response",
        "Source" and "Evaluation Result". "Source" holds the first 1000
        characters of the first source node's text followed by "...".
    """
    response_text = str(response)
    source_excerpt = response.source_nodes[0].source_text[:1000] + "..."
    row = {
        "Query": query,
        "Response": response_text,
        "Source": source_excerpt,
        "Evaluation Result": eval_result,
    }
    return pd.DataFrame(row, index=[0])

In [None]:
# Empty results table; the column names match the rows built by create_eval_df_row.
result_columns = ["Query", "Response", "Source", "Evaluation Result"]
final_df = pd.DataFrame(columns=result_columns)

In [None]:
# Evaluate every sampled question, collect one result row per question, and
# report the share the evaluator answered "YES" for.
total = len(rand_questions)
count_valid = 0
eval_rows = []
for i, question in enumerate(rand_questions):
    response_vector = index.query(question)
    eval_result = evaluator.evaluate(question, response_vector)
    print('{}: {}'.format(i, eval_result))
    if eval_result == "YES":
        count_valid += 1
    eval_rows.append(create_eval_df_row(question, response_vector, eval_result))

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# iteration; concatenate the collected rows once instead. Rebuilding final_df
# from this run's rows also makes the cell safe to re-run without duplicating rows.
if eval_rows:
    final_df = pd.concat(eval_rows, ignore_index=True)

# Report 0% instead of raising ZeroDivisionError when there were no questions.
percent_valid = count_valid / total if total else 0.0
percent_string = '{:.0%}'.format(percent_valid)
print(percent_string)

In [None]:
# Preview the first rows of the evaluation results.
final_df.head()

Unnamed: 0,Query,Response,Source,Evaluation Result
0,What legal technology companies has MyCase rec...,MyCase has recently acquired two legal technol...,The law practice management company MyCase has...,NO
1,What is Ron Markezich's vision for Lighthouse'...,Ron Markezich's vision for Lighthouse's future...,"Lighthouse, the Seattle-based company that pro...",NO
2,How has Lexion grown in the past year and whic...,Lexion has raised $11 million in an oversubscr...,Positioning itself as offering a simpler and m...,NO
3,Who led the oversubscribed Series A funding ro...,The oversubscribed Series A funding round for ...,Positioning itself as offering a simpler and m...,NO
4,When did the unauthorized access to the ABA ne...,The unauthorized access to the ABA network occ...,The American Bar Association said last night t...,NO


In [None]:
# Persist the evaluation results to CSV, omitting the index column.
output_csv = 'eval_one_output.csv'
final_df.to_csv(output_csv, index=False)



```
# This is formatted as code
```

**Evaluate Test Set 2**

Non-LlamaIndex generated set

In [None]:
# Load the second question set. The file is split on '?', so each non-empty
# piece is trimmed and given its trailing '?' back.
with open('gpt-generated-questions.txt', 'r') as questions_file:
    raw_text = questions_file.read()
eval_questions = [
    piece + '?'
    for piece in map(str.strip, raw_text.split('?'))
    if piece
]

In [None]:
# Empty results table for the second question set; same columns as the row builder.
result_columns = ["Query", "Response", "Source", "Evaluation Result"]
final_df_gpt = pd.DataFrame(columns=result_columns)

In [None]:
# Re-create the evaluator for the second question set, reusing the existing service_context.
evaluator = QueryResponseEvaluator(service_context=service_context)

In [None]:
# Evaluate every question in the second set, collect one result row per
# question, and report the share the evaluator answered "YES" for.
total = len(eval_questions)
count_valid = 0
eval_rows = []
for i, question in enumerate(eval_questions):
    response_vector = index.query(question)
    eval_result = evaluator.evaluate(question, response_vector)
    print('{}: {}'.format(i, eval_result))
    if eval_result == "YES":
        count_valid += 1
    eval_rows.append(create_eval_df_row(question, response_vector, eval_result))

# DataFrame.append was removed in pandas 2.0 and copied the whole frame on every
# iteration; concatenate the collected rows once instead. Rebuilding final_df_gpt
# from this run's rows also makes the cell safe to re-run without duplicating rows.
if eval_rows:
    final_df_gpt = pd.concat(eval_rows, ignore_index=True)

# Report 0% instead of raising ZeroDivisionError when there were no questions.
percent_valid = count_valid / total if total else 0.0
percent_string = '{:.0%}'.format(percent_valid)
print(percent_string)

In [None]:
# Preview the first rows of the second evaluation's results.
final_df_gpt.head()

Unnamed: 0,Query,Response,Source,Evaluation Result
0,What are the main features of Clio's practice ...,The main features of Clio's practice managemen...,During his keynote address opening his company...,YES
1,How does LexisNexis streamline legal research ...,LexisNexis streamlines legal research for atto...,"in July, Pfeifer told me this tight integratio...",YES
2,What is Relativity's role in the eDiscovery pr...,Relativity's role in the eDiscovery process is...,At its annual RelativityFest user conference i...,YES
3,How does LegalZoom assist with online legal se...,LegalZoom assists with online legal services b...,I just wrote about Intapp's filing of papers w...,YES
4,How do law firms use artificial intelligence f...,The context information does not specifically ...,"Each week, we’re highlighting one of the resou...",YES


In [None]:
# Persist the second evaluation's results to CSV, omitting the index column.
output_csv = 'eval_one_gpt_output.csv'
final_df_gpt.to_csv(output_csv, index=False)