In [1]:
import openai
import random
import tqdm

import disapere_lib

openai.api_key_path = "nnk_openai_api_key.txt"


dataset = disapere_lib.get_dataset('aspect')
label_map = {y: f'{x}-{y[4:]}' for x, y in zip("ABCDEFG", sorted(dataset['train'].keys())[1:-1])}

old_label_list = sorted(label_map.keys())

print("=== Labels ===")
for k, v in label_map.items():
  print(k.ljust(28) + f'{v}')

=== Labels ===
asp_clarity                 A-clarity
asp_meaningful-comparison   B-meaningful-comparison
asp_motivation-impact       C-motivation-impact
asp_originality             D-originality
asp_replicability           E-replicability
asp_soundness-correctness   F-soundness-correctness
asp_substance               G-substance


# Few shot

In [2]:
num_examples_in_prompt = 3
num_examples_to_label = 200


prompt = ""

for old_label, new_label in label_map.items():
  for _, text in dataset['train'][old_label][:num_examples_in_prompt]:
    prompt += f'Sentence: {text}\nLabel: {new_label}\n###\n'
    
print("Number of classes:", len(label_map))
print("Number of examples per class in prompt:", num_examples_in_prompt)
print("Prompt length:", len(prompt))
print()

print("Prompt prefix:")
print("=" * 120)
print(prompt[:300]+"...")
print("=" * 120)



print("\nNumber of examples to label:", num_examples_to_label)
print()

Number of classes: 7
Number of examples per class in prompt: 3
Prompt length: 3382

Prompt prefix:
Sentence: Reward prediction along --> Reward prediction alone
Label: A-clarity
###
Sentence: this limitation in latenby?
Label: A-clarity
###
Sentence: In general, the paper is well written and easy to follow. And the experimental evaluation is extensive and compares with relevant state-of-the-art m...

Number of examples to label: 200



In [3]:
results = []

# Weird way of picking examples but whatever
for i in tqdm.tqdm(range(num_examples_in_prompt, num_examples_in_prompt + num_examples_to_label)):
  label = random.choice(old_label_list)
  if i >= len(dataset['train'][label]):
    continue
    
  _, sentence = dataset['train'][label][i]
  text = f'{prompt}Sentence: {sentence}\nLabel: '
  
  response = openai.Completion.create(
    engine = "text-davinci-003",
    prompt = text,
    temperature = 0.6,
    max_tokens = 150,
  )
  
  results.append((sentence, label_map[label], response['choices'][0].text.strip()))

100%|████████████████████████████████████████| 200/200 [03:28<00:00,  1.04s/it]


In [4]:
valid_answers = 0
correct_answers = 0

for a, b, c in results:
  if c in label_map.values():
    valid_answers += 1
    if b == c:
      correct_answers += 1
      
valid_percent = valid_answers/len(results)
accuracy = correct_answers/len(results)
      
print(f"Valid answers: {valid_percent:.0%}\nAccuracy: {accuracy:.0%}")

Valid answers: 34%
Accuracy: 19%


In [5]:
print("Label".ljust(25), "Predicted".ljust(29), "Sentence\n")
for a, b, c in results[:20]:
  print(b.ljust(25), c.ljust(25), "|||", a)
  print()

Label                     Predicted                     Sentence

D-originality             D-originality             ||| Simply because for continuous variables similar experiments have been reported before

G-substance               H-presentation-clarity    ||| And the experimental evaluations of this part are convincing and compare favourably with other state-of-the-art methods.

F-soundness-correctness   E-replicability           ||| (6) If the authors jointly and simultaneously optimize \theta and \phi, why a regularization term about q_{\phi}(z)  is missing in Eq 12 while a regularization term about \pi_{\theta|z} does appear in Eq 12?

A-clarity                 A-clarity                 ||| Overall, the paper is well organized and logically clear.

A-clarity                 H-presentation            ||| The images are well-presented and well-explained by the captions and the text.

F-soundness-correctness   H-clarity-presentation    ||| The derivation of the algorithm in Sec 3.