In [1]:
from teacher_query_tools import ANLITeacherQuerier, CQATeacherQuerier, ESNLITeacherQuerier, SVAMPTeacherQuerier

  from .autonotebook import tqdm as notebook_tqdm


## Teacher Querier

In [2]:
# Teacher querier used by the next cells to load prompt templates and to send
# single and batched queries (the loaded templates report dataset 'anli1').
anliTQ = ANLITeacherQuerier()

In [5]:
# Load this querier's prompt templates (presumably from a YAML file, going by
# the method name — confirm in teacher_query_tools).
yml = anliTQ.read_yaml_prompts()

In [6]:
# Display the loaded templates; the recorded output is a dict with a 'dataset'
# entry and a 'templates' mapping keyed by integer template id.
yml

{'dataset': 'anli1',
 'templates': {1: {'id': 1,
   'system_message': '',
   'user_message': '{premise} Based on that information, is the claim: {hypothesis} true, false, or inconclusive? Answer with a one sentence explanation.',
   'label_parse': "r'(True|False|Inconclusive)'",
   'explanation': True},
  2: {'id': 2,
   'system_message': 'You are given a context in the form of a short premise and a hypothesis about the premise. Your task is to label if the hypothesis is a "contradiction" (if the hypothesis contradicts the premise), an "entailment" (if the hypothesis entails the premise), or "neutral" (if the hypothesis does not contradict or entail the premise) to the premise. Also, explain very briefly (one sentence, maximum twenty words) why it is that label, but do not repeat the whole hypothesis in your explanation.',
   'user_message': 'premise: {premise}\\nhypothesis: {hypothesis}',
   'label_parse': "r'(entailment|contradiction|neutral)'",
   'explanation': True},
  3: {'id': 3

In [3]:
# Query the teacher for one train example (idx=2) with prompt template 1.
# The recorded output shows the filled-in prompt followed by the response.
# NOTE(review): `_query` is underscore-prefixed; confirm whether the class
# offers a public entry point that should be used from notebooks instead.
anliTQ._query(split="train", idx=2, prompt_template_id=1)


Alexandra Lendon Bastedo (9 March 1946 – 12 January 2014) was a British actress, best known for her role as secret agent Sharron Macready in the 1968 British espionage/science fiction adventure series "The Champions". She has been cited as a sex symbol of the 1960s and 1970s. Bastedo was a vegetarian and animal welfare advocate. Based on that information, is the claim: Bastedo didn't keep any pets because of her views on animal rights. true, false, or inconclusive? Answer with a one sentence explanation.
RESPONSE:
Inconclusive, as there is no information provided to confirm or deny whether Bastedo kept any pets despite her views on animal rights.


In [4]:
# Batch-query n=25 train examples with prompt template 1.
# In the recorded run 24 of the 25 were skipped as already queried and stored;
# presumably force_query=False is what permits that skip — confirm.
anliTQ._batch_query(split="train", n=25, prompt_template_id=1, force_query=False)

QUERYING EXAMPLE 25/25...
Batch Query completed! (Skipped 24 queries as they were already queried and stored.)
Total Prompt Tokens: 143
Total Completion Tokens: 16
Total Costs: $0.0002465


In [2]:
# SVAMP teacher querier, used by the batch query in the next cell.
svampTQ = SVAMPTeacherQuerier()

In [5]:
# Batch-query n=30 train examples with prompt template 1.
# In the recorded run 10 of the 30 were skipped as already queried and stored.
svampTQ._batch_query(split="train", n=30, prompt_template_id=1, force_query=False)

QUERYING EXAMPLE 30/30...
Batch Query completed! (Skipped 10 queries as they were already queried and stored.)
Total Prompt Tokens: 973
Total Completion Tokens: 1313
Total Costs: $0.0040855


In [7]:
# e-SNLI teacher querier, used by the batch query in the next cell.
# NOTE(review): the other queriers are named anliTQ / svampTQ / cqaTQ; this
# one breaks that convention and could be mistaken for a dataset object.
esnli = ESNLITeacherQuerier()

In [10]:
# Batch-query n=10 train examples with prompt template 3.
# In the recorded run nothing was skipped, so all 10 examples were queried.
esnli._batch_query(split="train", n=10, prompt_template_id=3, force_query=False)

QUERYING EXAMPLE 10/10...
Batch Query completed! (Skipped 0 queries as they were already queried and stored.)
Total Prompt Tokens: 560
Total Completion Tokens: 28
Total Costs: $0.0008960000000000001


In [2]:
# CQA teacher querier, used by the batch query in the next cell.
cqaTQ = CQATeacherQuerier()

In [4]:
# Batch-query n=25 train examples with prompt template 2.
# In the recorded run 20 of the 25 were skipped as already queried and stored.
cqaTQ._batch_query(split="train", n=25, prompt_template_id=2, force_query=False)

QUERYING EXAMPLE 25/25...
Batch Query completed! (Skipped 20 queries as they were already queried and stored.)
Total Prompt Tokens: 326
Total Completion Tokens: 144
Total Costs: $0.000777


## Parser

In [7]:
from teacher_response_parser import SVAMPTeacherResponseParser, ESNLITeacherResponseParser, CQATeacherResponseParser, ANLITeacherResponseParser
import re

In [8]:
# Response parser used by the parsing cells below.
# NOTE(review): several of those cells carry lower execution counts than this
# one, so their recorded outputs may come from a different `parser` instance.
parser = CQATeacherResponseParser()

In [3]:
# Capture a run of digits from the teacher response.
# The pattern is a raw string: in a normal string literal "\d" is an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
# IGNORECASE and DOTALL are kept from the original call but cannot affect this
# pattern, which contains neither letters nor ".".
# NOTE(review): the recorded output ('neutral', None) was produced at execution
# count 3, before the current `parser` assignment (count 8); re-run on a fresh
# kernel to see what the CQA parser actually returns for this input.
pattern = re.compile(r"(\d+)", re.IGNORECASE | re.DOTALL)
parser.parse_response("In each row, there are 30 crayons, so in 7 rows there are 7 * 30 = <<7*30=210>>210 crayons. Answer: \\boxed{210}.", pattern=pattern)

('neutral', None)

In [60]:
# Find the last number in the response: the greedy "(.+\b)" swallows as much
# text as possible, so "(\d+)" is left with the final digit run that starts at
# a word boundary (here the 210 inside \boxed{...}).
# The pattern is a raw string so that \b and \d reach the regex engine as
# written; the original mixed "\\b" with the invalid string escape "\d".
pattern = re.compile(r"(.+\b)(\d+)")
match = pattern.search("In each row, there are 30 crayons, so in 7 rows there are 7 * 30 <<7*30=210>>210 crayons. Answer: \\boxed{210}.")

In [61]:
# Show both captured groups: the greedy prefix and the trailing number.
match.groups()

('In each row, there are 30 crayons, so in 7 rows there are 7 * 30 <<7*30=210>>210 crayons. Answer: \\boxed{',
 '210')

In [9]:
# Parse the stored train responses for prompt template 2.
# The recorded output is a dict keyed by integer index whose values are
# 2-tuples of strings (apparently label and explanation — confirm in
# teacher_response_parser).
parser.parse_response_batch(split="train", prompt_template_id=2)

{0: ('math problem',
  'This is an example of a math problem because it involves counting and subtraction.'),
 1: ('train station',
  'as it is a common location for homeless individuals to seek shelter and it is mentioned that he lives near transportation infrastructure, suggesting a more permanent location than a bus depot or beach.'),
 2: ('upright',
  'A bad person does not prioritize being honest, acting without pretense, or being morally upright in their behavior.'),
 3: ('minnesota', 'because St. Paul is the capital city of Minnesota.'),
 4: ('corvette',
  'A corvette is a smaller, faster, and more maneuverable naval vessel that can still pack a punch with its armament, making it a suitable alternative to a battleship when speed is prioritized.'),
 5: ('canada',
  'because beavers are native to Canada and are known for building dams and taking logs in their habitats.'),
 6: ('city',
  'The grant was given to the city for the renovation, indicating that the fountain was funded by

In [10]:

# Sanity check: str.format ignores keyword arguments that the template does not
# reference, so a dict with extra keys (here "mashall") can be splatted into a
# template without raising.
s = "Tell me a {adjective} joke about {content}."

s.format(**{"adjective": "funny", "content": "chickens", "mashall": "mashall"})

'Tell me a funny joke about chickens.'

## Utils

In [1]:
import utils

In [2]:
# Instantiate utils.Metadata and utils.OtherMeta with no arguments.
# Only c2 is inspected in the cells visible here; c1 is not used again.
c1 = utils.Metadata()
c2 = utils.OtherMeta()

In [3]:
# Read `static_thing` through the instance; the recorded value is
# 'another static' (presumably a class-level attribute — confirm in utils).
c2.static_thing

'another static'

## Evaluator

In [1]:
from teacher_response_evaluator import TeacherResponseEvaluator

  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Evaluator for the "svamp" dataset; the cells below score stored responses.
evaluator = TeacherResponseEvaluator("svamp")

In [21]:
# Score the train responses obtained with prompt template 1.
# In the recorded output, accuracy equals n_correct / (n_correct + n_wrong)
# = 20 / 30, and 2 responses are counted as none-responses.
evaluator.evaluate_responses_split(
    split="train",
    prompt_template_id=1,
)

{'accuracy': 0.6666666666666666,
 'n_correct': 20,
 'n_wrong': 10,
 'n_none_responses': 2,
 'total_reponses': 28,
 'total_length_of_explanations': 5399}

In [18]:
# Compare prompt templates on the train split; the recorded run reports
# template 2 as best for svamp.
# NOTE(review): this cell's execution count (18) is lower than that of the
# cell above (21), so the two outputs come from different points in the
# session.
evaluator.evaluate_train()

Best prompt template for svamp is 2, with accuracy: 0.8333333333333334
