In [8]:
import os
import openai
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import time
# Load the API key from a local file. The context manager closes the file
# handle, and strip() removes surrounding whitespace (e.g. a trailing newline)
# that would otherwise become part of the key.
with open("openai_api.key") as key_file:
    openai.api_key = key_file.read().strip()

In [2]:
def proc(sent):
    """Normalise a sentence so it ends with a period and starts with a capital.

    A '.' is appended whenever the text does not already end in '.'. This
    includes text ending in other punctuation, so "wow!" becomes "Wow!.".

    Args:
        sent: the sentence to normalise (a str).

    Returns:
        The normalised sentence.
    """
    # Appending the period first guarantees the text is non-empty below.
    text = sent if sent.endswith(".") else sent + '.'
    head, tail = text[0], text[1:]
    return text if head.isupper() else head.upper() + tail


In [3]:
# Directory holding dev.csv. Set the A4_DATA_DIR environment variable to run
# the notebook on another machine; the default is the original location.
DATA_DIR = os.environ.get("A4_DATA_DIR", "/home/cuichenx/Courses/11-711/A4/data")
df = pd.read_csv(os.path.join(DATA_DIR, "dev.csv"))
# Keep only the rows flagged as valid.
df = df[df.valid==1]
df.head()

Unnamed: 0,HITId,AssignmentId,startphrase,ending1,ending2,labels,valid,qid
0,3YCT0L9OMMJ03QGXPN8IR4I5IX0SN8,3LOTDFNYA89K9ZIJ16484885K8ZFWO,The girl had the flightiness of a sparrow,The girl was very fickle.,The girl was very stable.,0,1,1
1,3YCT0L9OMMJ03QGXPN8IR4I5IX0SN8,3LOTDFNYA89K9ZIJ16484885K8ZFWO,The girl had the flightiness of a rock,The girl was very fickle.,The girl was very stable.,1,1,1
2,3YCT0L9OMMJ03QGXPN8IR4I5IX0SN8,3R6P78PK7LLYVIAKHL5FZ91IHTRTGL,It was as peaceful as a church.,It was very peaceful.,"It was full of conflict and danger, not peace.",0,1,3
3,3YCT0L9OMMJ03QGXPN8IR4I5IX0SN8,3R6P78PK7LLYVIAKHL5FZ91IHTRTGL,It was as peaceful as a battlefield.,It was very peaceful.,"It was full of conflict and danger, not peace.",1,1,3
4,3YCT0L9OMMJ03QGXPN8IR4I5IX0SN8,3R6P78PK7LLYVIAKHL5FZ91IHTRTGL,The leaves were as green as grass,The leaves were very green,The leaves were brown and not green at all.,0,1,4


In [4]:
# Canned result used in place of an API call when model_name == 'debug'.
# The two lists are parallel: one log-probability per token. The first token
# has no log-probability and is stored as None.
debug_res = dict(
    token_logprobs=[
        None, -8.384926, -4.6257014, -1.8492408, -1.3289777, -0.03890226,
        -1.9416707, -8.261132, -0.022355538, -12.232727, -0.8997546,
        -1.2286589, -1.8712921, -3.0148795, -4.4984546, -3.436775, -1.384284,
    ],
    tokens=[
        "He", " moves", " at", " the", " speed", " of", " a", " cater",
        "pillar", " pup", "a", ".", " He", " moves", " very", " slow", ".",
    ],
)

In [72]:
# Score "<start phrase> <ending>" for both endings of every row and save the
# per-token log-probabilities to dev_logprobs_<model_name>.json.
model_name = 'curie'
assert model_name in ['debug', 'ada', 'babbage', 'curie', 'davinci']
json_lines = {}
scores = []  # not used below; kept so the notebook-level name still exists after this cell


# Real models are billed, so ask for confirmation before sending any request.
if model_name != 'debug':
    response = input(f"about the spend $$$ on openai API (model {model_name})! conitnue? [y/n]")
    if response.lower() != 'y':
        raise Exception("Not continuing.")
else:
    print('just debugging. this is free.')


for i, line in tqdm(df.iterrows(), total=df.shape[0]):
    start = line['startphrase']
    end1 = line['ending1']
    end2 = line['ending2']
    res_two_endings = []
    # One prompt per candidate ending, both normalised by proc().
    for prompt in (
        proc(start)+' '+proc(end1),
        proc(start)+' '+proc(end2),
    ):
        if model_name == 'debug':
            res = debug_res
        else:
            # echo=True with max_tokens=0 asks the API to return the prompt's
            # own tokens and their log-probabilities instead of generating text.
            completion = openai.Completion.create(engine=model_name, prompt=prompt,
                                                  max_tokens=0,
                                                  temperature=0.0,
                                                  logprobs=0,
                                                  echo=True,
                                                  n=1)
            logprobs = completion['choices'][0]['logprobs']
            res = {k: logprobs[k] for k in ('token_logprobs', 'tokens')}
        res_two_endings.append(res)
    # Key is "<qid>_<label>"; the row index stands in for qid if that column is absent.
    json_lines[f"{line.get('qid', i)}_{line['labels']}"] = res_two_endings

    if model_name != 'debug':
        time.sleep(0.050)  # to prevent RateLimitError

# Mode 'w' already truncates an existing file, so a single write is enough.
fname = f"dev_logprobs_{model_name}.json"
with open(fname, 'w') as f:
    json.dump(json_lines, f, indent=2)

about the spend $$$ on openai API (model curie)! conitnue? [y/n]y


100%|██████████| 1094/1094 [05:46<00:00,  3.16it/s]


## Get accuracy from probabilities

In [29]:
def prob_of_ending(token_logprobs, tokens):
    """Return the mean log-probability of the tokens in the final sentence.

    Tokens are walked from the end. The last token is always included; the walk
    stops at the first earlier token that ends with '.', which is treated as
    the end of the preceding sentence and is not included. If no such token
    exists, the whole sequence is scored.

    Tokens whose log-probability is None (the first token of an echoed prompt
    has none) are skipped and do not count towards the mean.

    Args:
        token_logprobs: list of per-token log-probabilities (float or None).
        tokens: list of token strings, parallel to ``token_logprobs``.

    Returns:
        Sum of the included log-probabilities divided by how many were included.

    Raises:
        ValueError: if no token with a log-probability is found (e.g. empty input).
    """
    logprob_sum = 0.0
    num_scored = 0
    for position, (lp, t) in enumerate(zip(token_logprobs[::-1], tokens[::-1])):
        # position 0 is the final token, normally the ending's own period.
        if position > 0 and t.endswith('.'):
            break
        if lp is None:
            continue
        logprob_sum += lp
        num_scored += 1
    if num_scored == 0:
        raise ValueError("no tokens with a log-probability to score")
    return logprob_sum / num_scored


def calculate_accuracy(fname):
    """Print how often the labelled ending received the higher score.

    Args:
        fname: path to a JSON file mapping "<qid>_<label>" to a pair of
            results, each with 'token_logprobs' and 'tokens' lists.

    The label is the last character of the key. An item is counted as correct
    when the ending selected by the label has the strictly higher
    prob_of_ending score, so ties count as incorrect.
    """
    with open(fname) as f:
        results = json.load(f)

    total = len(results)
    correct = 0
    for key, endings in results.items():
        first, second = endings
        first_score = prob_of_ending(first['token_logprobs'], first['tokens'])
        second_score = prob_of_ending(second['token_logprobs'], second['tokens'])
        gold = int(key[-1])
        first_wins = first_score > second_score
        second_wins = first_score < second_score
        if (gold == 0 and first_wins) or (gold == 1 and second_wins):
            correct += 1

    print(f"correct: {correct}/{total} = {correct/total}")

In [33]:
# Accuracy on the plain-prompt (no in-context examples) log-probabilities saved for ada.
calculate_accuracy("dev_logprobs_ada.json")

correct: 643/1094 = 0.5877513711151737


In [71]:
# Accuracy on the plain-prompt (no in-context examples) log-probabilities saved for babbage.
calculate_accuracy("dev_logprobs_babbage.json")

correct: 690/1094 = 0.6307129798903108


In [73]:
# Accuracy on the plain-prompt (no in-context examples) log-probabilities saved for curie.
calculate_accuracy("dev_logprobs_curie.json")

correct: 719/1094 = 0.6572212065813529


In [35]:
# Accuracy on the plain-prompt (no in-context examples) log-probabilities saved for davinci.
calculate_accuracy("dev_logprobs_davinci.json")

correct: 776/1094 = 0.7093235831809872


# Few-shot performance

In [41]:
# Build the pool of solved examples, one per row of df, in the form
# "<Start phrase>. -> <Labelled ending>."
prompt_bank = []

for i, line in tqdm(df.iterrows(), total=df.shape[0]):
    # labels is 0 or 1 and picks the column 'ending1' or 'ending2'.
    prompt = f"{proc(line['startphrase'])} -> "
    completion = proc(line['ending' + str(line["labels"] + 1)])
    prompt_bank.append(prompt + completion)


100%|██████████| 1094/1094 [00:00<00:00, 21977.59it/s]


In [45]:
# Show three randomly drawn examples from the bank, one per line.
print(*np.random.choice(prompt_bank, 3).tolist(), sep='\n')

Her ego was a brick building. -> Her ego was unshakeable.
The monkey was a 1st grade dropout. -> The monkey was dumb.
The child was as pretty as the dog poop on my shoe. -> The child was ugly.


In [83]:
def gpt3_few_shot(num_shots, model_name, force=False):
    """Score both endings of every row of ``df`` behind a few-shot prompt.

    For each row, ``num_shots`` solved examples are drawn from ``prompt_bank``
    and placed before "<Start phrase>. -> <Ending>.". The per-token
    log-probabilities of each full prompt are collected and written to
    ``dev_logprobs_<num_shots>shot_<model_name>.json``.

    Args:
        num_shots: number of examples drawn from ``prompt_bank`` per prompt.
        model_name: 'debug' (no API call; ``debug_res`` is used and every
            prompt is printed) or one of 'ada', 'babbage', 'curie', 'davinci'.
        force: when True, skip the interactive cost confirmation.

    Raises:
        AssertionError: if ``model_name`` is not one of the names above.
        Exception: if the user does not answer 'y' at the confirmation prompt.
    """
    assert model_name in ['debug', 'ada', 'babbage', 'curie', 'davinci']
    json_lines = {}

    if not force:
        if model_name != 'debug':
            response = input(f"about the spend $$$ on openai API (model {model_name})! conitnue? [y/n]")
            if response.lower() != 'y':
                raise Exception("Not continuing.")
        else:
            print('just debugging. this is free.')

    # `count` is the row's position in the iteration (0, 1, 2, ...), which may
    # differ from the DataFrame index label `i`.
    for count, (i, line) in enumerate(tqdm(df.iterrows(), total=df.shape[0])):
        start = line['startphrase']
        end1 = line['ending1']
        end2 = line['ending2']
        res_two_endings = []

        if not count % 2:
            # Draw new examples only on even positions; the following odd
            # position reuses them, so two consecutive rows share one context.
            # NOTE(review): np.random.choice samples with replacement, and
            # prompt_bank is built from the same df, so the drawn examples may
            # repeat or include the row being scored -- confirm this is intended.
            pre_prompt = '\n'.join(np.random.choice(prompt_bank, num_shots).tolist())
        for ending in (end1, end2):
            sentence = pre_prompt + '\n' + proc(start) + ' -> ' + proc(ending)

            if model_name == 'debug':
                print(i, '=========')
                print(sentence)
                res = debug_res
            else:
                # echo=True with max_tokens=0 asks the API to return the
                # prompt's own tokens and log-probabilities without generating text.
                completion = openai.Completion.create(engine=model_name, prompt=sentence,
                                                      max_tokens=0,
                                                      temperature=0.0,
                                                      logprobs=0,
                                                      echo=True,
                                                      n=1)
                logprobs = completion['choices'][0]['logprobs']
                res = {k: logprobs[k] for k in ('token_logprobs', 'tokens')}
            res_two_endings.append(res)
        # Key is "<qid>_<label>"; the row index stands in for qid if that column is absent.
        json_lines[f"{line.get('qid', i)}_{line['labels']}"] = res_two_endings

        if model_name != 'debug':
            time.sleep(0.05)  # to prevent RateLimitError

    # Mode 'w' already truncates an existing file, so a single write is enough.
    fname = f"dev_logprobs_{num_shots}shot_{model_name}.json"
    with open(fname, 'w') as f:
        json.dump(json_lines, f, indent=2)

In [88]:
# Run the few-shot scoring for every (shot count, model) combination.
# force=True skips the interactive cost confirmation inside gpt3_few_shot.
for num_shots in [3, 5]:
    for model_name in ['debug']:
        print(model_name, num_shots, "shot")
        gpt3_few_shot(num_shots=num_shots, model_name=model_name, force=True)

  0%|          | 0/1094 [00:00<?, ?it/s]

davinci 3 shot


100%|██████████| 1094/1094 [08:03<00:00,  2.26it/s]
  0%|          | 0/1094 [00:00<?, ?it/s]

davinci 5 shot


  0%|          | 1/1094 [00:00<11:08,  1.63it/s]


KeyboardInterrupt: 

In [85]:
# Accuracy on the 3-shot log-probabilities saved for ada.
calculate_accuracy("dev_logprobs_3shot_ada.json")

correct: 639/1094 = 0.5840950639853748


In [86]:
# Accuracy on the 5-shot log-probabilities saved for ada.
calculate_accuracy("dev_logprobs_5shot_ada.json")

correct: 633/1094 = 0.5786106032906764


In [87]:
# Accuracy on the 10-shot log-probabilities saved for ada.
calculate_accuracy("dev_logprobs_10shot_ada.json")

correct: 622/1094 = 0.5685557586837294


In [89]:
# Accuracy on the 3-shot log-probabilities saved for davinci.
calculate_accuracy("dev_logprobs_3shot_davinci.json")

correct: 812/1094 = 0.7422303473491774


# get y probabilities
Probabilities of each ending on its own, i.e. scored without the start phrase.

In [109]:
def gpt3_y_probs(model_name, force=False):
    """Score each ending on its own (no start phrase) and save token log-probs.

    Only rows of ``df`` whose index label is even are processed. For each such
    row, 'ending1' and 'ending2' are sent as prompts exactly as stored, and the
    per-token log-probabilities are written to
    ``dev_logprobs_endings_<model_name>.json``, keyed by qid (or by the row
    index if there is no 'qid' column).

    Args:
        model_name: 'debug' (no API call; ``debug_res`` is used), a base model
            name ('ada', 'babbage', 'curie', 'davinci', passed as ``engine``),
            or any other name starting with 'ada', 'babbage' or 'curie', which
            is treated as a fine-tuned model and passed as ``model``.
        force: when True, skip the interactive cost confirmation.

    Raises:
        NotImplementedError: if ``model_name`` matches none of the cases above.
        Exception: if the user does not answer 'y' at the confirmation prompt.
    """
    if model_name in ['debug', 'ada', 'babbage', 'curie', 'davinci']:
        mode_kwargs = {'engine': model_name}
    elif any(model_name.startswith(n) for n in ('ada', 'babbage', 'curie')):
        print("this is a finetuned model")
        mode_kwargs = {'model': model_name}
    else:
        raise NotImplementedError(f"unsupported model name: {model_name!r}")
    json_lines = {}

    if not force:
        if model_name != 'debug':
            response = input(f"about the spend $$$ on openai API (model {model_name})! conitnue? [y/n]")
            if response.lower() != 'y':
                raise Exception("Not continuing.")
        else:
            print('just debugging. this is free.')

    for i, line in tqdm(df.iterrows(), total=df.shape[0]):
        if i%2:
            # only process even numbered lines
            # NOTE(review): presumably each odd row repeats the endings of the
            # even row before it -- verify against the dataset.
            continue

        end1 = line['ending1']
        end2 = line['ending2']
        res_two_endings = []

        for ending in (end1, end2):
            # NOTE(review): the ending is not passed through proc() here, unlike
            # the prompts built in the other scoring code -- confirm this is intended.
            sentence = ending

            if model_name == 'debug':
                print(i, '=========')
                print(sentence)
                res = debug_res
            else:
                # echo=True with max_tokens=0 asks the API to return the
                # prompt's own tokens and log-probabilities without generating text.
                completion = openai.Completion.create(**mode_kwargs, prompt=sentence,
                                                      max_tokens=0,
                                                      temperature=0.0,
                                                      logprobs=0,
                                                      echo=True,
                                                      n=1)
                logprobs = completion['choices'][0]['logprobs']
                res = {k: logprobs[k] for k in ('token_logprobs', 'tokens')}
            res_two_endings.append(res)
        json_lines[f"{line.get('qid', i)}"] = res_two_endings

        # The pause applies only when model_name is exactly 'ada' or 'babbage'.
        if model_name in ['ada', 'babbage']:
            time.sleep(0.10)  # to prevent RateLimitError

    # Mode 'w' already truncates an existing file, so a single write is enough.
    fname = f"dev_logprobs_endings_{model_name}.json"
    with open(fname, 'w') as f:
        json.dump(json_lines, f, indent=2)


In [108]:
# Base model names, kept for reference:
# models = ['ada', 'babbage', 'curie', 'davinci']
# Identifiers of the fine-tuned models. Each starts with its base model's name,
# which is the prefix gpt3_y_probs checks before passing the name as `model`.
ADA_FINETUNED = 'ada:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-03-10-25'
BABBAGE_FINETUNED = 'babbage:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-04-06-02'
CURIE_FINETUNED = 'curie:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-04-35-14'
models = [ADA_FINETUNED, BABBAGE_FINETUNED, CURIE_FINETUNED]

# Score the stand-alone endings with every fine-tuned model.
# force=True skips the interactive cost confirmation.
for m in models:
    print(m)
    gpt3_y_probs(m, force=True)

  0%|          | 0/1094 [00:00<?, ?it/s]

ada:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-03-10-25
this is a finetuned model


100%|██████████| 1094/1094 [03:31<00:00,  5.16it/s]
  0%|          | 0/1094 [00:00<?, ?it/s]

babbage:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-04-06-02
this is a finetuned model


100%|██████████| 1094/1094 [03:25<00:00,  5.32it/s]
  0%|          | 0/1094 [00:00<?, ?it/s]

curie:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-04-35-14
this is a finetuned model


100%|██████████| 1094/1094 [03:37<00:00,  5.04it/s]
