# Demo -- Model Evaluation



In [1]:
import logging
import os

from cjw.knowledgeqa import indexers, bots
from cjw.knowledgeqa.bots import GptBot
from cjw.knowledgeqa.evaluators.ProximityEvaluator import ProximityEvaluator
from cjw.knowledgeqa.evaluators.QAData import QAData


## Set up the Environment

Adjust these variables if necessary to suit the local environment.


In [2]:

# Local environment settings.
# The project directory and the Marqo server URL can be overridden through
# environment variables so the notebook runs unchanged on another machine;
# when the variables are unset, the defaults below are the original values.
HOME = os.path.expanduser("~")
PROJECT_DIR = os.environ.get("KNOWLEDGEQA_PROJECT_DIR", f"{HOME}/IdeaProjects/knowledgeqa")
DATA_FILE = f"{PROJECT_DIR}/data/wikipedia_question_similar_answer.tsv"

MARQO_SERVER = os.environ.get("KNOWLEDGEQA_MARQO_SERVER", 'http://localhost:8882')
TEST_INDEX_NAME = "wiki_test_qa"

RAG_KNOWLEDGE = 5       # How many facts should be pulled out from the index before being fed to LLM
GPT_TEMPERATURE = 0.6   # NOTE(review): not referenced by any cell visible in this notebook -- confirm it reaches the bot

In [3]:
# Turn on INFO-level logging to see the internal work of the evaluator and the bot.

logging.basicConfig(level=logging.INFO)
# Raise each component's class-level logger to INFO, in the same order as before.
for loggedComponent in (ProximityEvaluator, GptBot):
    loggedComponent.logger.setLevel(logging.INFO)


## Load the Data, Index It, and Create the Bot under Evaluation

In [4]:
# Read the golden Q&A data.
# Loads the reference dataset from DATA_FILE through QAData; `data` is later
# converted with `data.to_dict()` for indexing and passed to the evaluator.
# NOTE(review): the file name suggests a TSV of question/answer pairs -- confirm against QAData.
data = QAData(DATA_FILE)

In [5]:
# Open the "marqo" index that will hold the Q&A data.
# NOTE(review): `new=True` presumably asks for a fresh index -- confirm against
# `indexers.index`, since the following cell only adds data when the index is empty.
index = indexers.index(
    "marqo",
    new=True,
    serverUrl=MARQO_SERVER,
    indexName=TEST_INDEX_NAME,
)

In [6]:
# Index the data if it is not yet in there
if index.size() == 0:
    await index.add(data.to_dict(), keyFields=["answer"])

  if index.size() == 0:


In [7]:
# Test indexer's answer
question = "what is singapore's currency"
indexerAnswers = await index.search(question, top=3)
indexerAnswers

[{'question': "what is singapore's currency",
  'answer': 'the singapore dollar or dollar ( sign : $; code : sgd) is the official currency of singapore .',
  '_id': '1377',
  '_highlights': {'answer': 'the singapore dollar or dollar ( sign : $; code : sgd) is the official currency of singapore .'},
  '_score': 0.90729886},
 {'question': 'what is korean money called',
  'answer': 'the won () ( sign : ₩; code : krw) is the currency of south korea .',
  '_id': '1429',
  '_highlights': {'answer': 'the won () ( sign : ₩; code : krw) is the currency of south korea .'},
  '_score': 0.7079103},
 {'question': 'who composed the singapore national anthem',
  'answer': "composed by zubir said in 1958 as a theme song for official functions of the city council of singapore, the song was selected in 1959 as the island's anthem when it attained self-government .",
  '_id': '678',
  '_highlights': {'answer': "composed by zubir said in 1958 as a theme song for official functions of the city council of s

In [8]:
# Build the "gpt4" bot and attach the index as its fact source:
# the "answer" field supplies the content and RAG_KNOWLEDGE facts are used per question.
bot = (
    bots.bot("gpt4")
    .withFacts(index, contentFields=["answer"], top=RAG_KNOWLEDGE)
)

In [9]:
# Test the bot.  (Note the multiple citations.)
question = "what does the president of the usa do"
botAnswer = await bot.ask(question)
print(botAnswer)

print("In the index")
articles = await index.get(botAnswer.citations)
for a in articles:
    print(f"[{a['_id']}] {a['answer']}")

INFO:GptBot:Considering facts:
[409]
the president of the united states of america (potus) is the head of state and head of government of the united states .

[410]
the president leads the executive branch of the federal government and is the commander-in-chief of the united states armed forces .

[411]
the president is further empowered to grant federal pardons and reprieves , and to convene and adjourn either or both houses of congress under extraordinary circumstances.

[813]
george washington ( – , 1799) was the first president of the united states (1789–1797), the commander-in-chief of the continental army during the american revolutionary war , and one of the founding fathers of the united states .

[1156]
george washington ( – , 1799) was the first president of the united states (1789–1797), the commander-in-chief of the continental army during the american revolutionary war , and one of the founding fathers of the united states .
INFO:GptBot:Question:
what does the president of

The President of the United States of America (POTUS) is the head of state and head of government of the United States, leads the executive branch of the federal government, is the commander-in-chief of the United States Armed Forces, and is empowered to grant federal pardons and reprieves, and to convene and adjourn either or both houses of Congress under extraordinary circumstances. [409,410,411]
In the index
[409] the president of the united states of america (potus) is the head of state and head of government of the united states .
[410] the president leads the executive branch of the federal government and is the commander-in-chief of the united states armed forces .
[411] the president is further empowered to grant federal pardons and reprieves , and to convene and adjourn either or both houses of congress under extraordinary circumstances.


## Finding Standard Answers in the Proximity of the Embedding

Make the bot answer a question, embed that answer, and check whether the standard (golden) answer is among the top N items nearest to it in the index.

In [10]:
# Create the Evaluator for the bot
evaluator = await ProximityEvaluator().forBot(bot).withData(data, index)

score = await evaluator.evaluate(sampleSize=10)

print(score)

INFO:ProximityEvaluator:Question: what was the first honda car
INFO:GptBot:Considering facts:
[1058]
the honda s600 is an automobile manufactured by honda .

[933]
triumph engineering co ltd, a defunct british motorcycle manufacturer

[291]
it was first developed by siemens in the mid-1990s and introduced by mercedes-benz under the name "keyless go" in 1998 on the w220 s-class following its design patent filed by daimler-benz on may 17, 1997.

[435]
delaware was one of the 13 colonies participating in the american revolution and on december 7, 1787, became the first state to ratify the constitution of the united states , thereby becoming known as the first state.

[1160]
motorcycle speedway, usually referred to as speedway, is a motorcycle sport involving four and sometimes up to six riders competing over four anti-clockwise laps of an oval circuit.
INFO:GptBot:Question:
what was the first honda car
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completio

0.9


During my tests, the bot stated that it did not know "what states does interstate 70 travel through".  I checked the test data and found that the recorded answer to this question was:

*"interstate 70 (i-70) is an interstate highway in the united states that runs from interstate 15 near cove fort, utah , to a park and ride near baltimore, maryland ."*

This is **NOT** a correct answer to the question: it names only the two endpoints of the highway, not the states it travels through.  Thus, the bot is actually correct in not knowing the answer from the provided facts.

In [11]:
answerUnknownQuestion = "what states does interstate 70 travel through"
answerUnknownId = '777'
print(await index.get(answerUnknownId))

print(await bot.ask(answerUnknownQuestion))

[{'_found': True, 'question': 'what states does interstate 70 travel through', 'answer': 'interstate 70 (i-70) is an interstate highway in the united states that runs from interstate 15 near cove fort, utah , to a park and ride near baltimore, maryland .', '_id': '777'}]


INFO:GptBot:Considering facts:
[777]
interstate 70 (i-70) is an interstate highway in the united states that runs from interstate 15 near cove fort, utah , to a park and ride near baltimore, maryland .

[552]
the states which have shoreline on the east coast are, from north to south, the u.s. states of maine , new hampshire , massachusetts , rhode island , connecticut , new york , new jersey , delaware , maryland , virginia , north carolina , south carolina , georgia , and florida .

[943]
the lower forty-eight contiguous states and the federal district of washington, d.c. are in central north america between canada and mexico.

[944]
the state of alaska is west of canada and east of russia across the bering strait, and the state of hawaii is in the mid-north pacific.

[982]
arizona became the second state to adopt a "state firearm" after utah adopted the browning m1911 .
INFO:GptBot:Question:
what states does interstate 70 travel through
INFO:openai:message='OpenAI API response' path=

I don't know [--]


In [12]:
# This question could not be found in the top-5 picks in the index.

answerNotFoundByIndex = "what age group is generation x"
print(await bot.ask(answerNotFoundByIndex))

INFO:GptBot:Considering facts:
[444]
after a gestation period of about 65 days, two to five young, known as "kits", are born in spring.

[1079]
rock of ages is a rock / jukebox musical , with a book by chris d'arienzo, built around classic rock hits from the 1980s, especially from the famous glam metal bands of the decade.

[292]
as a medical diagnosis, pedophilia or paedophilia is a psychiatric disorder in persons 16 years of age or older typically characterized by a primary or exclusive sexual interest toward prepubescent children (generally age 11 years or younger, though specific diagnosis criteria for the disorder extends the cut-off point for prepubescence to age 13).

[250]
humans are born with over 270 bones, some of which fuse together into a longitudinal axis, the axial skeleton , to which the appendicular skeleton is attached.

[821]
the launch of the playstation 3 into the japanese market on 10 november 2006 marked the second major seventh generation entertainment system to

I don't know [--]
