In [1]:
import openai
import random
import tqdm

import disapere_lib

def summarize_results(results):
  valid_answers = 0
  correct_answers = 0

  for a, b, c in results:
    if c in label_map.values():
      valid_answers += 1
      if b == c:
        correct_answers += 1

  valid_percent = valid_answers/len(results)
  accuracy = correct_answers/len(results)

  print(f"Valid answers: {valid_percent:.0%}\nAccuracy: {accuracy:.0%}")
  
  print("=" * 80)
  
  print("Label".ljust(25), "Predicted".ljust(29), "Sentence\n")
  for a, b, c in results[:20]:
    print(b.ljust(25), c.ljust(25), "|||", a)
    print()

    
openai.api_key_path = "nnk_openai_api_key.txt"
MODEL_NAME = "text-davinci-003"

dataset = disapere_lib.get_dataset('aspect')
label_map = {y: f'{x}-{y[4:]}' for x, y in zip("ABCDEFG", sorted(dataset['train'].keys())[1:-1])}

old_label_list = sorted(label_map.keys())

print("=== Labels ===")
for k, v in label_map.items():
  print(k.ljust(28) + f'{v}')
  


=== Labels ===
asp_clarity                 A-clarity
asp_meaningful-comparison   B-meaningful-comparison
asp_motivation-impact       C-motivation-impact
asp_originality             D-originality
asp_replicability           E-replicability
asp_soundness-correctness   F-soundness-correctness
asp_substance               G-substance


# Few shot

In [2]:
num_examples_in_prompt = 3
num_examples_to_label = 200


prompt = ""

for old_label, new_label in label_map.items():
  for _, text in dataset['train'][old_label][:num_examples_in_prompt]:
    prompt += f'Sentence: {text}\nLabel: {new_label}\n###\n'
    
print("Model name: ", MODEL_NAME)
print("Number of classes:", len(label_map))
print("Number of examples per class in prompt:", num_examples_in_prompt)
print("Prompt length:", len(prompt))
print()

print("Prompt prefix:")
print("=" * 80)
print(prompt[:300]+"...")
print("=" * 80)


print("\nNumber of examples to label:", num_examples_to_label)
print()

Model name:  text-davinci-003
Number of classes: 7
Number of examples per class in prompt: 3
Prompt length: 3382

Prompt prefix:
Sentence: Reward prediction along --> Reward prediction alone
Label: A-clarity
###
Sentence: this limitation in latenby?
Label: A-clarity
###
Sentence: In general, the paper is well written and easy to follow. And the experimental evaluation is extensive and compares with relevant state-of-the-art m...

Number of examples to label: 200



In [3]:
results = []

# Weird way of picking examples but whatever
for i in tqdm.tqdm(range(num_examples_in_prompt, num_examples_in_prompt + num_examples_to_label)):
  label = random.choice(old_label_list)
  if i >= len(dataset['train'][label]):
    continue
    
  _, sentence = dataset['train'][label][i]
  text = f'{prompt}Sentence: {sentence}\nLabel: '
  
  response = openai.Completion.create(
    engine = MODEL_NAME,
    prompt = text,
    temperature = 0.6,
    max_tokens = 150,
  )
  
  results.append((sentence, label_map[label], response['choices'][0].text.strip()))

100%|████████████████████████████████████████| 200/200 [03:47<00:00,  1.14s/it]


In [4]:
summarize_results(results)

Valid answers: 28%
Accuracy: 17%
Label                     Predicted                     Sentence

B-meaningful-comparison   B-meaningful-comparison   ||| But there are better baselines possible.

E-replicability           E-replicability           ||| The authors should clearly explain how to update \phi when optimizing Eq 12.

C-motivation-impact       A-clarity                 ||| This paper shows some promise when graph network-based controllers augmented with evolutionary algorithms.

C-motivation-impact       A-clarity                 ||| The paper clearly states the objective and provides a nice general description of the method.

A-clarity                 H-presentation
###
Sentence: The figures are clear and easy to understand.
Label: H-presentation ||| The images are well-presented and well-explained by the captions and the text.

E-replicability           H-scope-breadth           ||| The problem of image classification is considered only, while authors claimed the method ca

# Zero shot

Following guidelines from [Ziems et al. 2023](https://arxiv.org/abs/2305.03514) (for Latent Hatred, which is most similar to DISAPERE)

In [5]:
prompt_suffix = """\n\nWhich of the following aspects does the sentence above mention?
A-clarity: Is the paper clear, well-written and well-structured?
B-meaningful-comparison: Are the comparisons to prior work sufficient and fair?
C-motivation-impact: Does the paper address an important problem?
D-originality: Are there new topics, technique, methodology, or insights?
E-replicability: Is it easy to reproduce and verify the correctness of the results?
F-soundness-correctness: Is the approach sound? Are the claims supported?
G-substance: Are there substantial experiments and/or detailed analyses?
""" 

In [None]:
results = []

# Weird way of picking examples but whatever
for i in tqdm.tqdm(range(num_examples_to_label)):
  label = random.choice(old_label_list)
  if i >= len(dataset['train'][label]):
    continue
    
  _, sentence = dataset['train'][label][i]
  text = f'{sentence}{prompt_suffix}'
  
  response = openai.Completion.create(
    engine = MODEL_NAME,
    prompt = text,
    temperature = 0.6,
    max_tokens = 150,
  )
  
  results.append((sentence, label_map[label].strip(), response['choices'][0].text.strip().split(":")[0]))

 76%|██████████████████████████████▏         | 151/200 [04:50<01:27,  1.79s/it]

In [None]:
summarize_results(results)