# Demo -- Model Evaluation



In [1]:
import inspect
import logging
import os

from cjw.knowledgeqa import indexers, bots
from cjw.knowledgeqa.bots import GptBot
from cjw.knowledgeqa.evaluators.ProximityEvaluator import ProximityEvaluator
from cjw.knowledgeqa.evaluators.QAData import QAData


## Set up the Environment

Adjust these variables if necessary to suit the local environment.


In [2]:

# --- Local file locations, all derived from the current user's home directory ---
HOME = os.path.expanduser("~")
PROJECT_DIR = HOME + "/IdeaProjects/knowledgeqa"
DATA_FILE = PROJECT_DIR + "/data/wikipedia_question_similar_answer.tsv"

# --- Marqo server address and the name of the index used for this demo ---
MARQO_SERVER = "http://localhost:8882"
TEST_INDEX_NAME = "wiki_test_qa"

# --- Retrieval / generation settings ---
# Number of facts pulled out of the index and fed to the LLM for each question.
RAG_KNOWLEDGE = 5
GPT_TEMPERATURE = 0.6

In [3]:
# Turn on logging to see the internal work: INFO level for the root logger and
# for the class-level loggers of the evaluator and the bot.
logging.basicConfig(level=logging.INFO)
for _component in (ProximityEvaluator, GptBot):
    _component.logger.setLevel(logging.INFO)


## Load the Data, Index It, and Create the Bot under Evaluation

In [4]:
# Read the golden Q&A data from the TSV file at DATA_FILE.  The resulting object
# is used below both to fill the index (via data.to_dict()) and as the reference
# data handed to the evaluator.
data = QAData(DATA_FILE)

In [5]:
# Open the Marqo-backed index (on the server configured above) that will hold
# the Q&A data.
index = indexers.index(
    "marqo",
    new=True,
    serverUrl=MARQO_SERVER,
    indexName=TEST_INDEX_NAME,
)

In [6]:
# Index the data if it is not yet in there
if index.size() == 0:
    await index.add(data.to_dict(), keyFields=["answer"])

  if index.size() == 0:


In [7]:
# Test the indexer's answer on its own, before any bot is created: search the
# index for a sample question, keep the top 3 hits, and show them as the cell
# output (the bare expression on the last line).
question = "what is singapore's currency"
indexerAnswers = await index.search(question, top=3)
indexerAnswers

[{'question': "what is singapore's currency",
  'answer': 'the singapore dollar or dollar ( sign : $; code : sgd) is the official currency of singapore .',
  '_id': '1377',
  '_highlights': {'answer': 'the singapore dollar or dollar ( sign : $; code : sgd) is the official currency of singapore .'},
  '_score': 0.90729886},
 {'question': 'what is korean money called',
  'answer': 'the won () ( sign : ₩; code : krw) is the currency of south korea .',
  '_id': '1429',
  '_highlights': {'answer': 'the won () ( sign : ₩; code : krw) is the currency of south korea .'},
  '_score': 0.7079103},
 {'question': 'who composed the singapore national anthem',
  'answer': "composed by zubir said in 1958 as a theme song for official functions of the city council of singapore, the song was selected in 1959 as the island's anthem when it attained self-government .",
  '_id': '678',
  '_highlights': {'answer': "composed by zubir said in 1958 as a theme song for official functions of the city council of s

In [8]:
# Build a GPT-4 bot, then attach the index as its fact source: for every
# question the top RAG_KNOWLEDGE entries are retrieved and their "answer"
# field is used as content.
bot = bots.bot("gpt4")
bot = bot.withFacts(index, contentFields=["answer"], top=RAG_KNOWLEDGE)

In [9]:
# Test the bot.  (Note the multiple citations.)
question = "what does the president of the usa do"
botAnswer = await bot.ask(question)
print(botAnswer)

print("In the index")
articles = await index.get(botAnswer.citations)
for a in articles:
    print(f"[{a['_id']}] {a['answer']}")

INFO:GptBot:Considering facts:
[409]
the president of the united states of america (potus) is the head of state and head of government of the united states .

[410]
the president leads the executive branch of the federal government and is the commander-in-chief of the united states armed forces .

[411]
the president is further empowered to grant federal pardons and reprieves , and to convene and adjourn either or both houses of congress under extraordinary circumstances.

[813]
george washington ( – , 1799) was the first president of the united states (1789–1797), the commander-in-chief of the continental army during the american revolutionary war , and one of the founding fathers of the united states .

[1156]
george washington ( – , 1799) was the first president of the united states (1789–1797), the commander-in-chief of the continental army during the american revolutionary war , and one of the founding fathers of the united states .
INFO:GptBot:Question:
what does the president of

The President of the United States (POTUS) is the head of state and head of government of the United States, leading the executive branch of the federal government. They are also the commander-in-chief of the United States Armed Forces. Furthermore, the president has the power to grant federal pardons and reprieves, and to convene and adjourn either or both houses of Congress under extraordinary circumstances. [409,410,411]
In the index
[409] the president of the united states of america (potus) is the head of state and head of government of the united states .
[410] the president leads the executive branch of the federal government and is the commander-in-chief of the united states armed forces .
[411] the president is further empowered to grant federal pardons and reprieves , and to convene and adjourn either or both houses of congress under extraordinary circumstances.


## Finding Standard Answers in the Proximity of the Embedding

Make the bot answer a question, embed the answer, and check whether the standard answer is among the top N items closest to that embedding.

In [14]:
# Create the Evaluator for the bot
evaluator = await ProximityEvaluator().forBot(bot).withData(data, index)

score = await evaluator.evaluate(sampleSize=10, showFailedQuestions=True)

print(score)

INFO:ProximityEvaluator:Question: where is the island new guinea?
INFO:GptBot:Considering facts:
[73]
australia is a continent comprising mainland australia , tasmania , new guinea , seram , possibly timor , and neighbouring islands.

[60]
nassau

[157]
belize , is a country located on the northeastern coast of central america.

[61]
freeport, bahamas

[1157]
located in the southwest pacific ocean , it lies geographically to the east of the malay archipelago , with which it is sometimes included as part of a greater indo-australian archipelago .
INFO:GptBot:Question:
where is the island new guinea?
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions processing_ms=2745 request_id=fc2c0d067d080544caebf1230667826d response_code=200
INFO:GptBot:Answer:
New Guinea is part of the continent of Australia, located in the southwest Pacific Ocean. [73] [1157]
INFO:ProximityEvaluator:Answer: New Guinea is part of the continent of Australia, located in the sout

Failed question: what cards do you need in poker to get a royal flush (I don't know [--])


INFO:GptBot:Considering facts:
[96]
while he achieved fame during his lifetime as an author, philosopher, alchemist and astronomer , composing a scientific treatise on the astrolabe for his ten year-old son lewis, chaucer also maintained an active career in the civil service as a bureaucrat, courtier and diplomat.

[742]
charles john huffam dickens (; 7 february 1812 – 9 june 1870) was an english writer and social critic.

[1026]
postmodern literature is literature characterized by heavy reliance on techniques like fragmentation, paradox, and questionable narrators, and is often (though not exclusively) defined as a style or trend which emerged in the post–world war ii era.

[405]
the humanities are academic disciplines that study human culture, using methods that are primarily analytical , critical , or speculative , and having a significant historical element, as distinguished from the mainly empirical approaches of the natural sciences .

[1117]
jackie keith whitley (july 1, 1954sta

Failed question: what is the title of hobbes main work (I don't know [--])


INFO:GptBot:Considering facts:
[212]
in 2004, jennings won 74 jeopardy!

[127]
he has led teams to eight division championships ( 1995 , 1996 , 1997 , 2002 , 2003 , 2004 , 2007 , 2009 ), five nfc championship games ( 1995 , 1996 , 1997 , 2007 , 2009 ), and two super bowl appearances ( super bowl xxxi , super bowl xxxii ), winning one (super bowl xxxi).

[1184]
unlike the preakness and belmont stakes, which took hiatuses in 1891-1893 and 1911-1912 respectively, the kentucky derby has been run every consecutive year since 1875.

[213]
jennings is a small city in and the parish seat of jefferson davis parish , louisiana , united states , near lake charles .

[1434]
wilt chamberlain holds the all-time records for total points scored (4,029) and points per game (50.4) in a season; both records were achieved in the 1961–62 season .
INFO:GptBot:Question:
how many consecutive games did ken jennings win?
INFO:openai:message='OpenAI API response' path=https://api.openai.com/v1/chat/completions p

0.8


During my tests, the bot stated that it did not know the answer to "what states does interstate 70 travel through".  I checked the test data and found that the recorded answer to this question was:

*"interstate 70 (i-70) is an interstate highway in the united states that runs from interstate 15 near cove fort, utah , to a park and ride near baltimore, maryland ."*

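This is **NOT** a correct answer to the question: it names only the two end points of the highway, not the states it travels through.  Thus, the bot is actually correct in saying that it does not know the answer from the provided facts.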

In [11]:
answerUnknownQuestion = "what states does interstate 70 travel through"
answerUnknownId = '777'
print(await index.get(answerUnknownId))

print(await bot.ask(answerUnknownQuestion))

[{'_found': True, 'question': 'what states does interstate 70 travel through', 'answer': 'interstate 70 (i-70) is an interstate highway in the united states that runs from interstate 15 near cove fort, utah , to a park and ride near baltimore, maryland .', '_id': '777'}]


INFO:GptBot:Considering facts:
[777]
interstate 70 (i-70) is an interstate highway in the united states that runs from interstate 15 near cove fort, utah , to a park and ride near baltimore, maryland .

[552]
the states which have shoreline on the east coast are, from north to south, the u.s. states of maine , new hampshire , massachusetts , rhode island , connecticut , new york , new jersey , delaware , maryland , virginia , north carolina , south carolina , georgia , and florida .

[943]
the lower forty-eight contiguous states and the federal district of washington, d.c. are in central north america between canada and mexico.

[944]
the state of alaska is west of canada and east of russia across the bering strait, and the state of hawaii is in the mid-north pacific.

[982]
arizona became the second state to adopt a "state firearm" after utah adopted the browning m1911 .
INFO:GptBot:Question:
what states does interstate 70 travel through
INFO:openai:message='OpenAI API response' path=

I don't know. [--]


In [12]:
# These questions could not be found in the top-5 picks in the index
# (RAG_KNOWLEDGE = 5 facts are retrieved per question).  Only the first
# question is active; swap in one of the commented-out alternatives to try it.

answerNotFoundByIndex = "what age group is generation x"
# answerNotFoundByIndex = "what cards do you need in poker to get a royal flush"
# answerNotFoundByIndex = "what is the title of hobbes main work"
print(await bot.ask(answerNotFoundByIndex))

INFO:GptBot:Considering facts:
[444]
after a gestation period of about 65 days, two to five young, known as "kits", are born in spring.

[1079]
rock of ages is a rock / jukebox musical , with a book by chris d'arienzo, built around classic rock hits from the 1980s, especially from the famous glam metal bands of the decade.

[292]
as a medical diagnosis, pedophilia or paedophilia is a psychiatric disorder in persons 16 years of age or older typically characterized by a primary or exclusive sexual interest toward prepubescent children (generally age 11 years or younger, though specific diagnosis criteria for the disorder extends the cut-off point for prepubescence to age 13).

[250]
humans are born with over 270 bones, some of which fuse together into a longitudinal axis, the axial skeleton , to which the appendicular skeleton is attached.

[821]
the launch of the playstation 3 into the japanese market on 10 november 2006 marked the second major seventh generation entertainment system to

I don't know [--]


In [18]:
answerUnknownQuestion2 = "what is the title of hobbes main work"
print(await bot.ask(answerUnknownQuestion2))

INFO:GptBot:Considering facts:
[96]
while he achieved fame during his lifetime as an author, philosopher, alchemist and astronomer , composing a scientific treatise on the astrolabe for his ten year-old son lewis, chaucer also maintained an active career in the civil service as a bureaucrat, courtier and diplomat.

[742]
charles john huffam dickens (; 7 february 1812 – 9 june 1870) was an english writer and social critic.

[1026]
postmodern literature is literature characterized by heavy reliance on techniques like fragmentation, paradox, and questionable narrators, and is often (though not exclusively) defined as a style or trend which emerged in the post–world war ii era.

[405]
the humanities are academic disciplines that study human culture, using methods that are primarily analytical , critical , or speculative , and having a significant historical element, as distinguished from the mainly empirical approaches of the natural sciences .

[1117]
jackie keith whitley (july 1, 1954sta

I don't know [--]
