In [1]:
import os
import openai
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
import time
# Load the API key from a local file so it never appears in the notebook itself.
# The context manager closes the file handle, and strip() removes a trailing
# newline that editors commonly append to the key file.
with open("openai_api.key") as key_file:
    openai.api_key = key_file.read().strip()

# Prepare training data

In [2]:
def proc(sent):
    """Normalise a sentence so it ends with a period and starts with a capital.

    A period is appended whenever ``sent`` does not already end with one.
    This includes sentences ending in "!" ("Wow!" becomes "Wow!."), so every
    returned string ends with ".". ``prob_of_ending`` looks for tokens ending
    in "." to locate sentence boundaries.

    Args:
        sent: The raw sentence text.

    Returns:
        The sentence with a trailing "." and an upper-cased first character.
    """
    with_period = sent if sent.endswith(".") else sent + "."
    # with_period is never empty here: it either already ended in "." or just gained one.
    first, rest = with_period[0], with_period[1:]
    return with_period if first.isupper() else first.upper() + rest


In [16]:
# Load the XL training split; later cells read its 'startphrase', 'labels' and 'ending<N>' columns.
train_df = pd.read_csv("data/train_xl.csv")

In [17]:
# Fine-tuning table: a header row followed by one [prompt, completion] pair per
# training example. The prompt is the normalised start phrase plus the ' -> '
# separator; the completion is the normalised correct ending.
intermediate_csv = [['prompt', 'completion']]
for i, line in train_df.iterrows():
    # 'labels' + 1 selects the matching 'ending<N>' column.
    gold_column = f'ending{line["labels"]+1}'
    intermediate_csv.append([
        proc(line['startphrase']) + ' -> ',
        proc(line[gold_column]),
    ])

In [18]:
# header=False because the first row of intermediate_csv already holds the column names;
# index=False keeps the pandas row index out of the file.
pd.DataFrame(intermediate_csv).to_csv("data/finetune_train_xl.csv", header=False, index=False)

In [19]:
# Run the OpenAI CLI data-preparation tool on the fine-tuning CSV (interactive: it asks before each recommended change).
!openai tools fine_tunes.prepare_data -f data/finetune_train_xl.csv

Analyzing...

- Based on your file extension, your file is formatted as a CSV file
- Your file contains 8016 prompt-completion pairs
- There are 4 duplicated prompt-completion pairs. These are rows: [1749, 3965, 4061, 7394]
- All prompts end with suffix `. -> `
- All completions end with suffix `.`
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://beta.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more details

Based on the analysis we will perform the following actions:
- [Necessary] Your format `CSV` will be converted to `JSONL`
- [Recommended] Remove 4 duplicate rows [Y/n]: ^C



In [None]:
# Start a fine-tune of the base `ada` model on the prepared JSONL training file.
!openai api fine_tunes.create -t data/finetune_train_xl_prepared.jsonl  -m ada

In [68]:
# Identifiers of the fine-tuned models; they are passed as `model` to the scoring
# functions and also appear in the names of the result files.
# NOTE(review): the *_XL variants were presumably trained on the XL training file
# (data/finetune_train_xl*) — confirm against the fine-tune job records.
ADA_FINETUNED = 'ada:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-03-10-25'
BABBAGE_FINETUNED = 'babbage:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-04-06-02'
CURIE_FINETUNED = 'curie:ft-user-6qia53bwp385gfq1da9w5yum-2021-11-28-04-35-14'
ADA_FINETUNED_XL = "ada:ft-user-6qia53bwp385gfq1da9w5yum-2021-12-11-18-29-00"
BABBAGE_FINETUNED_XL = "babbage:ft-user-6qia53bwp385gfq1da9w5yum-2021-12-11-18-53-57"
CURIE_FINETUNED_XL = "curie:ft-user-6qia53bwp385gfq1da9w5yum-2021-12-11-18-36-40"

In [29]:
# Evaluation split: selects the input CSV here and prefixes the result file name
# written by the scoring functions.
split = 'test'
df = pd.read_csv(f"data/{split}.csv")
# Results accumulator; the scoring functions declare it `global` and fill it in.
json_lines = {}


In [30]:
restart = None  # set to a row index to skip rows with a smaller index; None starts from scratch

def gpt3_finetune(model_name, suffix_prompt='', use_proc_lower=False):
    """Score both candidate endings of every row in the global ``df``, one request per ending.

    For each row the prompt ``proc(startphrase) + suffix_prompt + ' ' + ending``
    is sent to the Completion endpoint with ``max_tokens=0`` and ``echo=True``.
    The returned ``tokens`` / ``token_logprobs`` are kept. Results accumulate
    in the global ``json_lines`` under the key ``"<qid or row index>_<label>"``
    and are dumped to ``{split}_logprobs_{model_name}{suffix_prompt}.json``.

    Relies on ``proc_lower`` (when ``use_proc_lower`` is true) and ``debug_res``
    (when ``model_name == 'debug'``), which must be defined elsewhere in the
    notebook.

    Args:
        model_name: Model identifier passed to the API, or ``'debug'`` to skip
            the API and store ``debug_res`` for every ending.
        suffix_prompt: Separator inserted after the start phrase; must be empty
            or begin with a space.
        use_proc_lower: Format endings with ``proc_lower`` instead of ``proc``.

    Raises:
        Exception: If the user does not answer ``y`` at the cost confirmation.
    """
    global json_lines
    assert suffix_prompt == '' or suffix_prompt.startswith(' ')
    format_ending = proc_lower if use_proc_lower else proc
    debugging = model_name == 'debug'

    # A fresh run clears earlier results; a resumed run keeps what is already stored.
    if restart is None:
        json_lines = {}

    if debugging:
        print('just debugging. this is free.')
    else:
        answer = input(f"about the spend $$$ on openai API (model {model_name})! conitnue? [y/n]")
        if answer.lower() != 'y':
            raise Exception("Not continuing.")

    for i, line in tqdm(df.iterrows(), total=df.shape[0]):
        if restart is not None and i < restart:
            continue

        start = line['startphrase']
        candidate_endings = (line['ending1'], line['ending2'])
        scored_endings = []
        for ending in candidate_endings:
            if debugging:
                scored_endings.append(debug_res)
                continue

            prompt = proc(start) + suffix_prompt + ' ' + format_ending(ending)
            if i < 3:
                print("prompt is:", prompt)
            completion = openai.Completion.create(
                model=model_name,
                prompt=prompt,
                max_tokens=0,
                temperature=0.0,
                logprobs=0,
                echo=True,
                n=1,
            )
            echoed = completion['choices'][0]['logprobs']
            scored_endings.append({
                'token_logprobs': echoed['token_logprobs'],
                'tokens': echoed['tokens'],
            })
            time.sleep(0.05)  # to prevent RateLimitError
        json_lines[f"{line.get('qid', i)}_{line['labels']}"] = scored_endings

    fname = f"{split}_logprobs_{model_name}{suffix_prompt}.json"
    # Opening with 'w' truncates any earlier file before the dump.
    with open(fname, 'w') as out_file:
        json.dump(json_lines, out_file, indent=2)
    print("DONE. Dumped:", fname)

In [61]:
restart = None  # set to a row index to skip rows with a smaller index; None starts from scratch

def gpt3_finetune_batch(model_name, suffix_prompt='', use_proc_lower=False):
    """Score both candidate endings of every row in the global ``df`` with two batched requests.

    All first-ending prompts go out in one Completion request and all
    second-ending prompts in another (``max_tokens=0``, ``echo=True``). The
    returned ``tokens`` / ``token_logprobs`` are stored in the global
    ``json_lines`` under the key ``"<qid or row index>_<label>"`` and dumped to
    ``{split}_logprobs_{model_name}{suffix_prompt}.json``.

    Results are matched to rows by the position of each prompt in the request,
    using the ``index`` field of every returned choice, and only for the rows
    that were actually sent. This keeps rows and results aligned when
    ``restart`` skips the beginning of ``df`` or when ``df`` does not have a
    0..n-1 index. Rows for which either ending is missing from the response are
    reported and left out, so every stored entry holds exactly two results.

    Relies on ``proc_lower`` (when ``use_proc_lower`` is true), which must be
    defined elsewhere in the notebook.

    Args:
        model_name: Model identifier passed to the API.
        suffix_prompt: Separator inserted after the start phrase; must be empty
            or begin with a space.
        use_proc_lower: Format endings with ``proc_lower`` instead of ``proc``.

    Raises:
        Exception: If the user does not answer ``y`` at the cost confirmation.
    """
    assert suffix_prompt == '' or suffix_prompt.startswith(' ')
    proc2 = proc_lower if use_proc_lower else proc
    global json_lines
    if restart is None:
        json_lines = {}

    if model_name != 'debug':
        response = input(f"about the spend $$$ on openai API (model {model_name})! conitnue? [y/n]")
        if response.lower() != 'y':
            raise Exception("Not continuing.")
    else:
        # NOTE(review): unlike gpt3_finetune, this function still issues the API
        # requests below when model_name == 'debug'.
        print('just debugging. this is free.')

    prompts = [[], []]  # first completion, second completion
    row_keys = []       # json_lines key of every row actually sent, in request order
    for i, line in tqdm(df.iterrows(), total=df.shape[0]):
        if restart is not None and i < restart: continue

        row_keys.append(f"{line.get('qid', i)}_{line['labels']}")
        start = line['startphrase']
        for j, end in enumerate((line['ending1'], line['ending2'])):
            prompt = proc(start) + suffix_prompt + ' ' + proc2(end)
            prompts[j].append(prompt)
            if i < 3:
                print("prompt is:", prompt)

    print("Calling API...")
    t0 = time.time()
    completions = [[], []]
    if row_keys:  # nothing to send when every row was skipped
        for j in (0, 1):
            completion = openai.Completion.create(model=model_name, prompt=prompts[j],
                                                  max_tokens=0,
                                                  temperature=0.0,
                                                  logprobs=0,
                                                  echo=True,
                                                  n=1)
            # Each choice carries the position of its prompt in the request;
            # order by it rather than trusting the response order.
            completions[j] = sorted(completion['choices'], key=lambda choice: choice['index'])
    t1 = time.time()
    print(f"API results received, took {t1-t0} seconds")

    for pos, key in enumerate(tqdm(row_keys)):
        res_two_endings = []
        for j in (0, 1):
            try:
                logprobs = completions[j][pos]['logprobs']
            except IndexError:
                print(f'cannot fine i={pos} j={j}')
                continue
            res_two_endings.append({k: logprobs[k] for k in ('token_logprobs', 'tokens')})
        # A partial pair cannot be unpacked as (end1, end2) by calculate_accuracy.
        if len(res_two_endings) == 2:
            json_lines[key] = res_two_endings

    fname = f"{split}_logprobs_{model_name}{suffix_prompt}.json"
    # Opening with 'w' truncates any earlier file before the dump.
    with open(fname, 'w') as f:
        json.dump(json_lines, f, indent=2)
    print("DONE. Dumped:", fname)

In [69]:
# Score the current split with the fine-tuned babbage (XL) model, with no separator after the start phrase.
gpt3_finetune_batch(BABBAGE_FINETUNED_XL, suffix_prompt='')

about the spend $$$ on openai API (model babbage:ft-user-6qia53bwp385gfq1da9w5yum-2021-12-11-18-53-57)! conitnue? [y/n]y


100%|██████████| 1146/1146 [00:00<00:00, 17189.46it/s]


prompt is: The girl was as down-to-earth as a Michelin-starred canape. The girl was not down-to-earth at all.
prompt is: The girl was as down-to-earth as a Michelin-starred canape. The girl was very down-to-earth.
prompt is: The girl was as down-to-earth as eggs and potatoes. The girl was not down-to-earth at all.
prompt is: The girl was as down-to-earth as eggs and potatoes. The girl was very down-to-earth.
prompt is: The girl's room was as messy as pig slops. The girl's room was a total mess.
prompt is: The girl's room was as messy as pig slops. The girl's room was very clean.
Calling API...


100%|██████████| 1146/1146 [00:00<00:00, 20903.39it/s]

API results received, took 9.419008731842041 seconds
DONE. Dumped: test_logprobs_babbage:ft-user-6qia53bwp385gfq1da9w5yum-2021-12-11-18-53-57.json





In [13]:
def prob_of_ending(token_logprobs, tokens):
    """Return the mean token log-probability of the last sentence of an echoed prompt.

    Tokens are walked from the end. The walk stops at the first token, other
    than the very last one, that ends with "."; that token is taken to close
    the preceding sentence. The log-probabilities of the tokens visited before
    stopping are averaged.

    Args:
        token_logprobs: Per-token log-probabilities. Entries may be ``None``
            (the API reports no log-probability for the first echoed token);
            such entries are skipped.
        tokens: The token strings, aligned with ``token_logprobs``.

    Returns:
        The sum of the visited log-probabilities divided by the number of
        tokens that contributed to the sum.

    Raises:
        ValueError: If no token with a log-probability was visited (for
            example, empty input).
    """
    logprob_sum = 0.0
    n_scored = 0  # count what was actually summed; the loop index is off by one when no boundary is hit
    for position, (lp, token) in enumerate(zip(reversed(token_logprobs), reversed(tokens))):
        if position > 0 and token.endswith('.'):
            break
        if lp is None:
            continue
        logprob_sum += lp
        n_scored += 1
    if n_scored == 0:
        raise ValueError("no scored tokens found for the ending")
    return logprob_sum / n_scored


def calculate_accuracy(fname):
    """Print how often the correct ending has the higher mean log-probability.

    Args:
        fname: Path to a JSON file mapping ``"<qid>_<label>"`` to a pair of
            dicts, each with ``token_logprobs`` and ``tokens``, as written by
            the scoring functions. ``label`` is 0 when the first ending is
            correct and 1 when the second is.

    Prints ``correct: <n>/<total> = <accuracy>``. A tie between the two endings
    counts as incorrect. An empty file is reported with an accuracy of ``nan``
    instead of raising ZeroDivisionError.
    """
    with open(fname) as f:
        logprobs = json.load(f)

    correct = 0
    for qid_label, (end1, end2) in logprobs.items():
        end1_prob = prob_of_ending(end1['token_logprobs'], end1['tokens'])
        end2_prob = prob_of_ending(end2['token_logprobs'], end2['tokens'])
        # Parse the whole field after the last "_" rather than only the last
        # character, so a label serialised as "1.0" is read as 1 and not 0.
        label = int(float(qid_label.rsplit('_', 1)[-1]))
        if (label == 0 and end1_prob > end2_prob) or (label == 1 and end1_prob < end2_prob):
            correct += 1

    total = len(logprobs)
    accuracy = correct / total if total else float('nan')
    print(f"correct: {correct}/{total} = {accuracy}")

In [54]:
# Dev split, fine-tuned ada, scored without a separator suffix.
calculate_accuracy(f"dev_logprobs_{ADA_FINETUNED}.json")

correct: 794/1094 = 0.7257769652650823


In [55]:
# Dev split, fine-tuned babbage, scored without a separator suffix.
calculate_accuracy(f"dev_logprobs_{BABBAGE_FINETUNED}.json")

correct: 832/1094 = 0.7605118829981719


In [56]:
# Dev split, fine-tuned curie, scored without a separator suffix.
calculate_accuracy(f"dev_logprobs_{CURIE_FINETUNED}.json")

correct: 866/1094 = 0.7915904936014625


In [63]:
# Test split, fine-tuned ada, scored without a separator suffix.
calculate_accuracy(f"test_logprobs_{ADA_FINETUNED}.json")

correct: 792/1145 = 0.6917030567685589


In [55]:
# Test split, fine-tuned ada (XL), scored without a separator suffix.
calculate_accuracy(f"test_logprobs_{ADA_FINETUNED_XL}.json")

correct: 843/1146 = 0.7356020942408377


In [15]:
# Test split, fine-tuned ada, scored with suffix_prompt=' -> ' (the suffix is part of the result file name).
calculate_accuracy(f"test_logprobs_{ADA_FINETUNED} -> .json")

correct: 790/1146 = 0.6893542757417103


In [64]:
# Test split, fine-tuned ada (XL), scored with suffix_prompt=' -> ' (the suffix is part of the result file name).
calculate_accuracy(f"test_logprobs_{ADA_FINETUNED_XL} -> .json")

correct: 841/1146 = 0.7338568935427574


In [64]:
# Test split, fine-tuned babbage, scored without a separator suffix.
calculate_accuracy(f"test_logprobs_{BABBAGE_FINETUNED}.json")

correct: 847/1145 = 0.7397379912663755


In [70]:
# Test split, fine-tuned babbage (XL), scored without a separator suffix.
calculate_accuracy(f"test_logprobs_{BABBAGE_FINETUNED_XL}.json")

correct: 886/1146 = 0.7731239092495636


In [65]:
# Test split, fine-tuned curie, scored without a separator suffix.
calculate_accuracy(f"test_logprobs_{CURIE_FINETUNED}.json")

correct: 905/1145 = 0.7903930131004366


In [66]:
# Test split, fine-tuned curie, scored with suffix_prompt=' -> ' (the suffix is part of the result file name).
calculate_accuracy(f"test_logprobs_{CURIE_FINETUNED} -> .json")

correct: 910/1146 = 0.794066317626527


In [62]:
# Test split, fine-tuned curie (XL), scored without a separator suffix.
calculate_accuracy(f"test_logprobs_{CURIE_FINETUNED_XL}.json")

correct: 916/1146 = 0.7993019197207679


In [33]:
# Test split, fine-tuned curie (XL), scored with suffix_prompt=' -> ' (the suffix is part of the result file name).
calculate_accuracy(f"test_logprobs_{CURIE_FINETUNED_XL} -> .json")

correct: 939/1146 = 0.819371727748691


# What kinds of sentences does fine-tuning allow GPT-3 to get right? Are there sentences that GPT-3 gets wrong after fine-tuning?

In [19]:
# Per-example probability tables for curie on the test split: df_og for the original
# model and df_ft for the fine-tuned one (going by the file names).
# The comparison cell matches rows between the two on the 'x_1' column.
df_og = pd.read_csv("gpt3_probabilities_curie_test.csv")
df_ft = pd.read_csv("gpt3_probabilities_finetuned_curie_test.csv")

In [24]:
# Comparison table: a header row, then two rows per fine-tuned example (one for
# x_1, one for x_2). A model is marked correct for x_i when P(y_i|x_i) > 0.5.
out_csv_rows = [['startphrase', 'ending1', 'ending2', 'correct_label', 'orig_correct', 'finetune_correct']]
for n, ft_row in df_ft.iterrows():
    # Look up the original-model row that shares the same first start phrase.
    same_x1 = df_og['x_1'] == ft_row['x_1']
    og_row = df_og.loc[same_x1].squeeze()
    for i in (1, 2):
        prob_col = f'P(y_{i}|x_{i})'
        og_correct = int(og_row[prob_col] > 0.5)
        ft_correct = int(ft_row[prob_col] > 0.5)
        out_csv_rows.append([
            og_row[f'x_{i}'],
            og_row['y_1'],
            og_row['y_2'],
            i - 1,  # correct_label: 0 for x_1 (y_1 is right), 1 for x_2 (y_2 is right)
            og_correct,
            ft_correct,
        ])


In [26]:
# header=False because the first row of out_csv_rows already holds the column names.
pd.DataFrame(out_csv_rows).to_csv("curie_finetuning_comparison_test.csv", header=False, index=False)

In [15]:
# NOTE(review): `orig_row` is not assigned in any cell visible in this notebook; the
# displayed output presumably came from earlier kernel state — confirm before re-running.
orig_row

x_1            George ran like a bat out of hell
x_2                      George ran like a snail
y_1                             George ran fast.
y_2                             George ran slow.
P(x_1, y_1)                             0.039823
P(x_1, y_2)                             0.025864
P(x_2, y_1)                             0.010111
P(x_2, y_2)                             0.008083
P(y_1|x_1)                              0.606258
P(y_2|x_2)                              0.444266
P(x_1|y_1)                              0.797508
P(x_2|y_2)                              0.238114
Name: 256, dtype: object

# Can GPT-3 be run with an array of prompts?

In [34]:
# Experiment: pass a list of prompts to one Completion request instead of
# sending one request per prompt.
# NOTE(review): the recorded run of this cell ended in "IndexError: list index out of range".
model_name = 'ada'
prompts = [
    "The girl was as down-to-earth as a Michelin-starred canape. ->  The girl was not down-to-earth at all.",
    "The girl was as down-to-earth as a Michelin-starred canape. ->  The girl was very down-to-earth.",
]
# Same scoring options as the scoring functions: no new tokens, prompt echoed back with log-probabilities.
scoring_options = dict(max_tokens=0, temperature=0.0, logprobs=0, echo=True, n=1)
completion = openai.Completion.create(model=model_name, prompt=prompts, **scoring_options)

IndexError: list index out of range