In [26]:
import os
import openai
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import time
# Load the API key from a local file. The context manager closes the handle, and
# strip() drops the trailing newline most editors add, which would otherwise
# become part of the key.
with open("openai_api.key") as key_file:
    openai.api_key = key_file.read().strip()

In [39]:
def proc(sent):
    """Normalize a sentence for prompting.

    Surrounding whitespace is removed, a period is appended unless the sentence
    already ends with one, and the first character is upper-cased.

    The period is appended even after other punctuation such as "!". That is what
    the original condition (`not sent.endswith(".") or sent.endswith("!")`, which
    parses as `(not ...) or ...`) effectively did, and it is kept deliberately:
    prob_of_ending in this notebook finds the ending by looking for tokens that
    end with ".".

    Args:
        sent: the sentence as a string.

    Returns:
        The normalized sentence; an empty or whitespace-only input yields ".".
    """
    # Without this, "foo. " would become "foo. ." and " foo" would stay uncapitalized.
    sent = sent.strip()
    if not sent.endswith("."):  # finish with period
        sent += '.'
    if not sent[0].isupper():  # start with a capital letter
        sent = sent[0].upper() + sent[1:]
    return sent

def proc_lower(sent):
    """Normalize a sentence that continues another one (lower-case start).

    Surrounding whitespace is removed, a period is appended unless the sentence
    already ends with one, and the first character is lower-cased.

    The period is appended even after other punctuation such as "!". That is what
    the original condition (`not sent.endswith(".") or sent.endswith("!")`, which
    parses as `(not ...) or ...`) effectively did, and it is kept deliberately:
    prob_of_ending in this notebook finds the ending by looking for tokens that
    end with ".".

    Args:
        sent: the sentence as a string.

    Returns:
        The normalized sentence; an empty or whitespace-only input yields ".".
    """
    # Without this, "Foo. " would become "Foo. ." and " Foo" would keep its capital.
    sent = sent.strip()
    if not sent.endswith("."):  # finish with period
        sent += '.'
    if not sent[0].islower():  # start with a lowercase letter
        sent = sent[0].lower() + sent[1:]
    return sent


In [27]:
# Name of the split to score; it is also embedded in the names of the result files.
split = "test"
csv_path = f"/home/cuichenx/Courses/11-711/A4/data/{split}.csv"
df = pd.read_csv(csv_path)
# Keep only the rows whose `valid` column equals 1.
df = df[df["valid"] == 1]
df.head()

Unnamed: 0,HITId,AssignmentId,startphrase,ending1,ending2,labels,valid,qid
0,3YCT0L9OMMJ03QGXPN8IR4I5IX0SN8,3LOTDFNYA89K9ZIJ16484885K8ZFWO,The girl was as down-to-earth as a Michelin-st...,The girl was not down-to-earth at all.,The girl was very down-to-earth.,0,1,0
1,3YCT0L9OMMJ03QGXPN8IR4I5IX0SN8,3LOTDFNYA89K9ZIJ16484885K8ZFWO,The girl was as down-to-earth as eggs and pota...,The girl was not down-to-earth at all.,The girl was very down-to-earth.,1,1,0
2,3YCT0L9OMMJ03QGXPN8IR4I5IX0SN8,3LOTDFNYA89K9ZIJ16484885K8ZFWO,The girl's room was as messy as pig slops,The girl's room was a total mess.,The girl's room was very clean.,0,1,2
3,3YCT0L9OMMJ03QGXPN8IR4I5IX0SN8,3LOTDFNYA89K9ZIJ16484885K8ZFWO,The girl's room was as messy as a housekeeper,The girl's room was a total mess.,The girl's room was very clean.,1,1,2
4,3YCT0L9OMMJ03QGXPN8IR4I5IX0SN8,3R6P78PK7LLYVIAKHL5FZ91IHTRTGL,The view as as clear as day,The view was very clear.,The view as obstructed and not clear.,0,1,5


In [6]:
# Canned result with the same two keys ('token_logprobs', 'tokens') that the scoring
# functions extract from an API response; they substitute it for a real call when
# model_name == 'debug'. The first log-probability is None.
debug_res = dict(
    token_logprobs=[
        None, -8.384926, -4.6257014, -1.8492408, -1.3289777, -0.03890226,
        -1.9416707, -8.261132, -0.022355538, -12.232727, -0.8997546, -1.2286589,
        -1.8712921, -3.0148795, -4.4984546, -3.436775, -1.384284,
    ],
    tokens=[
        "He", " moves", " at", " the", " speed", " of", " a", " cater", "pillar",
        " pup", "a", ".", " He", " moves", " very", " slow", ".",
    ],
)

In [44]:
# Notebook-level default model name. gpt3_zero_shot takes its own `model_name`
# argument, so it does not read this global.
model_name = 'ada'


def gpt3_zero_shot(model_name, suffix_prompt='', force=False):
    """Score both endings of every row of the global `df` with an OpenAI model.

    For each row and each of its two endings, one prompt is built:
      * without a suffix: proc(start) + ' ' + proc(end)
      * with a suffix:    proc(start) + suffix_prompt + ' ' + proc_lower(end)
    The prompt is sent with echo=True and max_tokens=0, and the returned
    'token_logprobs' and 'tokens' of the prompt are kept.

    The results are written to
    f"{split}_logprobs_{model_name}{suffix_prompt}.json" (using the global
    `split`) as a dict mapping "<qid>_<label>" to a two-element list
    [result for ending1, result for ending2]. The row index replaces the qid
    when the row has no 'qid' field.

    Args:
        model_name: 'debug' (no API call, uses `debug_res`) or one of
            'ada', 'babbage', 'curie', 'davinci'.
        suffix_prompt: optional text inserted after the start phrase; must be
            empty or start with a space.
        force: when True, skip the interactive confirmation.

    Raises:
        AssertionError: on an unsupported model name or malformed suffix.
        Exception: when the user does not answer 'y' to the confirmation.
    """
    assert suffix_prompt == '' or suffix_prompt.startswith(' ')
    assert model_name in ['debug', 'ada', 'babbage', 'curie', 'davinci']
    is_debug = model_name == 'debug'

    if not force:
        if not is_debug:
            response = input(f"about the spend $$$ on openai API (model {model_name})! conitnue? [y/n]")
            if response.lower() != 'y':
                raise Exception("Not continuing.")
        else:
            print('just debugging. this is free.')

    json_lines = {}
    for i, line in tqdm(df.iterrows(), total=df.shape[0]):
        start = line['startphrase']
        res_two_endings = []
        for end in (line['ending1'], line['ending2']):
            if is_debug:
                res = debug_res
            else:
                if suffix_prompt:
                    # The ending continues the suffix, so it starts lower-case.
                    prompt = proc(start) + suffix_prompt + ' ' + proc_lower(end)
                else:
                    prompt = proc(start) + ' ' + proc(end)
                completion = openai.Completion.create(engine=model_name, prompt=prompt,
                                                      max_tokens=0,
                                                      temperature=0.0,
                                                      logprobs=0,
                                                      echo=True,
                                                      n=1)
                logprobs = completion['choices'][0]['logprobs']
                res = {k: logprobs[k] for k in ('token_logprobs', 'tokens')}
            res_two_endings.append(res)
        json_lines[f"{line.get('qid', i)}_{line['labels']}"] = res_two_endings

        if not is_debug:
            time.sleep(0.050)  # to prevent RateLimitError

    fname = f"{split}_logprobs_{model_name}{suffix_prompt}.json"
    # Mode 'w' already truncates, so one write replaces the former
    # truncate-then-append sequence.
    with open(fname, 'w') as f:
        json.dump(json_lines, f, indent=2)

In [45]:
# Zero-shot scoring with `ada`, inserting ' That is to say,' between the start
# phrase and the ending. force is left at False, so the function asks for
# confirmation before making API calls.
gpt3_zero_shot('ada', suffix_prompt=' That is to say,')

about the spend $$$ on openai API (model ada)! conitnue? [y/n]y


100%|██████████| 1146/1146 [05:23<00:00,  3.54it/s]


In [70]:
# gpt3_zero_shot('babbage')
# Zero-shot scoring with `davinci` and no suffix prompt. force is left at False, so
# the function asks for confirmation before making API calls.
gpt3_zero_shot('davinci')

about the spend $$$ on openai API (model davinci)! conitnue? [y/n]y


100%|██████████| 1146/1146 [09:36<00:00,  1.99it/s]


## Get accuracy from probabilities

In [13]:
def prob_of_ending(token_logprobs, tokens):
    """Return the mean token log-probability of the final sentence.

    The two lists are walked from the end. The last token is always included;
    the walk stops at the first earlier token that ends with '.', which is taken
    to be the end of the preceding sentence. It also stops at a token whose
    log-probability is None (the first token of the sample response in this
    notebook has none), so a text without an earlier period is averaged over
    its scored tokens instead of raising.

    Despite the name, the value is an average log-probability, not a probability.

    Args:
        token_logprobs: per-token log-probabilities (entries may be None).
        tokens: the token strings, aligned with `token_logprobs`.

    Returns:
        Sum of the included log-probabilities divided by how many were included.

    Raises:
        ValueError: if no token could be included (empty input, or the last
            token has no log-probability).
    """
    logprob_sum = 0
    num_tokens = 0  # explicit count; the loop index is off by one when no break happens
    for lp, t in zip(token_logprobs[::-1], tokens[::-1]):
        if num_tokens > 0 and t.endswith('.'):
            break
        if lp is None:
            break
        logprob_sum += lp
        num_tokens += 1
    if num_tokens == 0:
        raise ValueError("no scored tokens to average")
    return logprob_sum / num_tokens


def calculate_accuracy(fname):
    """Print how often the gold ending scores higher than the other ending.

    `fname` is a JSON file mapping "<qid>_<label>" to a pair of results, each
    with 'token_logprobs' and 'tokens'. The label is read from the last
    character of the key: 0 means the first ending is correct, 1 the second.
    Each ending is scored with prob_of_ending; a tie counts as incorrect.
    Nothing is returned; the tally is printed.
    """
    with open(fname) as handle:
        results = json.load(handle)

    num_correct = 0
    for key, pair in results.items():
        first, second = pair
        score_first = prob_of_ending(first['token_logprobs'], first['tokens'])
        score_second = prob_of_ending(second['token_logprobs'], second['tokens'])
        gold = int(key[-1])
        if gold == 0:
            hit = score_first > score_second
        elif gold == 1:
            hit = score_first < score_second
        else:
            hit = False
        if hit:
            num_correct += 1

    total = len(results)
    print(f"correct: {num_correct}/{total} = {num_correct/total}")

In [14]:
# Zero-shot accuracy from the saved `ada` log-probabilities (test split).
calculate_accuracy("test_logprobs_ada.json")

correct: 677/1146 = 0.5907504363001745


In [68]:
# Zero-shot accuracy from the saved `babbage` log-probabilities (test split).
calculate_accuracy("test_logprobs_babbage.json")

correct: 721/1146 = 0.6291448516579407


In [15]:
# Zero-shot accuracy from the saved `curie` log-probabilities (test split).
calculate_accuracy("test_logprobs_curie.json")

correct: 749/1146 = 0.6535776614310645


In [71]:
# Zero-shot accuracy from the saved `davinci` log-probabilities (test split).
calculate_accuracy("test_logprobs_davinci.json")

correct: 784/1146 = 0.6841186736474695


In [46]:
# Zero-shot accuracy for `ada` with the ' That is to say,' suffix prompt; the suffix
# is part of the file name.
calculate_accuracy("test_logprobs_ada That is to say,.json")

correct: 689/1146 = 0.6012216404886562


# Few shot performance
**Warning: this blows up the budget very fast**

In [53]:
# Separator placed between a start phrase and its ending in each demonstration
# (' -> ' is a noted alternative that is not in use).
sep = ' '
prompt_bank = []

# One demonstration per row of `df`: the start phrase followed by its gold ending.
# NOTE(review): the bank is built from the same `df` that is later scored, so the
# demonstrations come from the evaluated split itself.
for i, line in tqdm(df.iterrows(), total=df.shape[0]):
    gold_column = 'ending' + str(line['labels'] + 1)  # labels 0 -> ending1, 1 -> ending2
    prompt = proc(line['startphrase']) + sep
    completion = proc(line[gold_column])
    prompt_bank.append(prompt + completion)


100%|██████████| 1146/1146 [00:00<00:00, 15948.51it/s]


In [54]:
# Preview only the first few demonstrations instead of dumping the whole list
# (the bank holds one entry per row of `df`).
prompt_bank[:10]

['The girl was as down-to-earth as a Michelin-starred canape. The girl was not down-to-earth at all.',
 'The girl was as down-to-earth as eggs and potatoes. The girl was very down-to-earth.',
 "The girl's room was as messy as pig slops. The girl's room was a total mess.",
 "The girl's room was as messy as a housekeeper. The girl's room was very clean.",
 'The view as as clear as day. The view was very clear.',
 'The view as as clear as mud. The view as obstructed and not clear.',
 'World peace is a perfect snowflake that melts before it reaches you. World peace is an unattainable ideal.',
 'World peace is a family dinner where no one fights. World peace is a good, if unexpected, thing.',
 'Her greeting had the warmth of a cozy wood burning fire. Her greeting was affectionate and genuine.',
 'Her greeting had the warmth of a snuffed out candle. Her greeting was indifferent and fake.',
 'The test was like a litigation. The test was hard.',
 'The test was like a preschool lesson. The test

In [55]:
# Show what a 3-shot context looks like: three randomly drawn demonstrations,
# one per line.
sampled_demos = np.random.choice(prompt_bank, 3).tolist()
print('\n'.join(sampled_demos))

Inflation is as high as the mariana trench. Inflation is very low.
His skill in writing was as sharp as a spoon. Her writing skill is poor.
The hamburger has the weight of a feather. The hamburger is light.


In [58]:
def gpt3_few_shot(num_shots, model_name, force=False):
    """Score both endings of every row of the global `df` behind a few-shot prompt.

    Each scored text is `num_shots` demonstrations drawn from the global
    `prompt_bank` (one per line), followed by
    proc(start) + sep + proc(end) on its own line. The text is sent with
    echo=True and max_tokens=0, and the returned 'token_logprobs' and 'tokens'
    are kept.

    The demonstrations are redrawn on every even-positioned row and reused for
    the following row. They are drawn without repetition and skip the bank
    entries at the positions of those two rows, so the scored item's own gold
    answer is never shown to the model.

    The results are written to
    f"{split}_logprobs_{num_shots}shot_{model_name}.json" (using the global
    `split`) as a dict mapping "<qid>_<label>" to
    [result for ending1, result for ending2].

    Args:
        num_shots: number of demonstrations placed before the scored sentence.
        model_name: 'debug' (no API call; prints the prompts of the first few
            rows and uses `debug_res`) or one of 'ada', 'babbage', 'curie',
            'davinci'.
        force: when True, skip the interactive confirmation.

    Raises:
        AssertionError: on an unsupported model name.
        Exception: when the user does not answer 'y' to the confirmation.
    """
    assert model_name in ['debug', 'ada', 'babbage', 'curie', 'davinci']
    is_debug = model_name == 'debug'
    debug_max_rows = 5  # debug mode only previews this many rows

    if not force:
        if not is_debug:
            response = input(f"about the spend $$$ on openai API (model {model_name})! conitnue? [y/n]")
            if response.lower() != 'y':
                raise Exception("Not continuing.")
        else:
            print('just debugging. this is free.')

    json_lines = {}
    pre_prompt = ''
    for count, (i, line) in enumerate(tqdm(df.iterrows(), total=df.shape[0])):
        if is_debug and count >= debug_max_rows:
            # Stop the whole preview. The former `if i == 5: break` only left the
            # inner loop, which stored an empty result list and kept printing
            # every remaining row.
            break

        start = line['startphrase']
        res_two_endings = []

        if count % 2 == 0:
            # Redraw the demonstrations only on even positions; the odd row that
            # follows reuses them.
            # NOTE(review): this treats rows `count` and `count + 1` as a pair and
            # assumes prompt_bank[k] was built from the k-th row of the current
            # `df` -- confirm both still hold after the `valid` filter.
            held_out = (count, count + 1)
            pool = [k for k in range(len(prompt_bank)) if k not in held_out]
            # Sample without replacement so a demonstration is not repeated; fall
            # back to replacement only when the pool is smaller than num_shots.
            picked = np.random.choice(pool, num_shots, replace=len(pool) < num_shots)
            pre_prompt = '\n'.join(prompt_bank[k] for k in picked)

        for end in (line['ending1'], line['ending2']):
            sentence = pre_prompt + '\n' + proc(start) + sep + proc(end)

            if is_debug:
                print(i, '=========')
                print(sentence)
                res = debug_res
            else:
                completion = openai.Completion.create(engine=model_name, prompt=sentence,
                                                      max_tokens=0,
                                                      temperature=0.0,
                                                      logprobs=0,
                                                      echo=True,
                                                      n=1)
                logprobs = completion['choices'][0]['logprobs']
                res = {k: logprobs[k] for k in ('token_logprobs', 'tokens')}
            res_two_endings.append(res)
        json_lines[f"{line.get('qid', i)}_{line['labels']}"] = res_two_endings

        if not is_debug:
            time.sleep(0.050)  # to prevent RateLimitError

    fname = f"{split}_logprobs_{num_shots}shot_{model_name}.json"
    # Mode 'w' already truncates, so one write replaces the former
    # truncate-then-append sequence.
    with open(fname, 'w') as f:
        json.dump(json_lines, f, indent=2)

In [64]:
# num_shots = 5  # 3, 5, 10
# model_name = 'debug'

# Run the few-shot scoring for every (shot count, model) combination.
# force=True skips the interactive confirmation, so each iteration calls the API
# without asking. The loop variables rebind the notebook-level `num_shots` and
# `model_name`.
for num_shots in (5, 10):
    for model_name in ('ada',):
        print(model_name, num_shots, "shot")
        gpt3_few_shot(num_shots, model_name, force=True)

  0%|          | 0/1146 [00:00<?, ?it/s]

ada 5 shot


100%|██████████| 1146/1146 [05:41<00:00,  3.36it/s]
  0%|          | 0/1146 [00:00<?, ?it/s]

ada 10 shot


100%|██████████| 1146/1146 [05:42<00:00,  3.34it/s]


In [62]:
# 3-shot accuracy from the saved `ada` log-probabilities (test split).
calculate_accuracy("test_logprobs_3shot_ada.json")

correct: 648/1146 = 0.5654450261780105


In [65]:
# 5-shot accuracy from the saved `ada` log-probabilities (test split).
calculate_accuracy("test_logprobs_5shot_ada.json")

correct: 638/1146 = 0.5567190226876091


In [66]:
# 10-shot accuracy from the saved `ada` log-probabilities (test split).
calculate_accuracy("test_logprobs_10shot_ada.json")

correct: 653/1146 = 0.5698080279232112


In [89]:
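# Disabled: needs the dev-split results file. The output shown below is presumably
# left over from an earlier run on the dev split.
# calculate_accuracy("dev_logprobs_3shot_davinci.json")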

correct: 812/1094 = 0.7422303473491774


# get y probabilities
Probabilities of each ending on its own, i.e. scored without the start phrase.

In [21]:
def gpt3_y_probs(model_name, split, force=False):
    """Score each ending on its own (without the start phrase) with an OpenAI model.

    Every ending is sent by itself with echo=True and max_tokens=0, and the
    returned 'token_logprobs' and 'tokens' are kept. The results are written to
    f"{split}_logprobs_endings_{model_name}.json" as a dict mapping the qid to
    [result for ending1, result for ending2].

    Rows are read from the global `df`; the `split` argument only affects the
    output file name.

    Each qid is scored once, on the first row that carries it. The displayed rows
    that share a qid also share both endings, so scoring every row would repeat
    work -- NOTE(review): confirm this holds for the whole file. When the frame
    has no 'qid' column the row index is the key and, as before, only
    even-indexed rows are processed.

    Args:
        model_name: 'debug' (no API call; prints each ending and uses
            `debug_res`), a base model ('ada', 'babbage', 'curie', 'davinci'),
            or a fine-tuned model whose name starts with 'ada', 'babbage' or
            'curie'.
        split: split name used in the output file name.
        force: when True, skip the interactive confirmation.

    Raises:
        NotImplementedError: for any other model name.
        Exception: when the user does not answer 'y' to the confirmation.
    """
    if model_name in ['debug', 'ada', 'babbage', 'curie', 'davinci']:
        mode_kwargs = {'engine': model_name}
    elif any(model_name.startswith(n) for n in ('ada', 'babbage', 'curie')):
        print("this is a finetuned model")
        mode_kwargs = {'model': model_name}
    else:
        raise NotImplementedError
    json_lines = {}

    if not force:
        if model_name != 'debug':
            response = input(f"about the spend $$$ on openai API (model {model_name})! conitnue? [y/n]")
            if response.lower() != 'y':
                raise Exception("Not continuing.")
        else:
            print('just debugging. this is free.')

    has_qid = 'qid' in df.columns
    for i, line in tqdm(df.iterrows(), total=df.shape[0]):
        key = f"{line.get('qid', i)}"
        if has_qid:
            # Skip by qid, not by index parity: `i` is the original index label, so
            # after filtering `df` a qid whose even-indexed row was dropped would
            # never be scored.
            if key in json_lines:
                continue
        elif i % 2:
            # No qid available: only process even numbered lines.
            continue

        res_two_endings = []
        for ending in (line['ending1'], line['ending2']):
            sentence = ending

            if model_name == 'debug':
                print(i, '=========')
                print(sentence)
                res = debug_res
            else:
                completion = openai.Completion.create(**mode_kwargs, prompt=sentence,
                                                      max_tokens=0,
                                                      temperature=0.0,
                                                      logprobs=0,
                                                      echo=True,
                                                      n=1)
                logprobs = completion['choices'][0]['logprobs']
                res = {k: logprobs[k] for k in ('token_logprobs', 'tokens')}
            res_two_endings.append(res)
        json_lines[key] = res_two_endings

        # Only the exact base names match here; fine-tuned model names do not, so
        # those runs are not throttled.
        if model_name in ['ada', 'babbage']:
            time.sleep(0.10)  # to prevent RateLimitError

    fname = f"{split}_logprobs_endings_{model_name}.json"
    # Mode 'w' already truncates, so one write replaces the former
    # truncate-then-append sequence.
    with open(fname, 'w') as f:
        json.dump(json_lines, f, indent=2)


In [25]:
# models = ['ada', 'babbage', 'curie', 'davinci']
# Identifiers of the fine-tuned models. Each starts with its base model name, which
# is what gpt3_y_probs checks to recognize a fine-tuned model.
ADA_FINETUNED = 'ada:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-03-10-25'
BABBAGE_FINETUNED = 'babbage:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-04-06-02'
CURIE_FINETUNED = 'curie:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-04-35-14'
models = [ADA_FINETUNED, BABBAGE_FINETUNED, CURIE_FINETUNED]  # only read by the commented-out loop below

# for m in models:
#     print(m)
print('split is', split)
m = CURIE_FINETUNED
# force=False: the function asks for confirmation before calling the API.
gpt3_y_probs(m, split, force=False)

split is test
this is a finetuned model
about the spend $$$ on openai API (model curie:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-04-35-14)! conitnue? [y/n]y


100%|██████████| 1146/1146 [03:49<00:00,  4.98it/s]
