In [53]:
import json
import time
from datetime import datetime
import os

import pandas as pd

import openai
#openai.api_key_path=

# Helper functions

In [22]:
def wait_for_job(check_is_done_fn: callable, seconds_to_wait=30, print_times=True):
    
    if print_times:
        print('Beginning wait at', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
    
    done = False
    while not done:
        done = check_is_done_fn()
        if not done:
            time.sleep(seconds_to_wait)
        
    if print_times:
        print('Finished waiting at', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

In [98]:
def num_FineTuningJobs_running():
    return sum(job.status == 'running' for job in openai.FineTuningJob.list()['data'])

def FineTuningJob_isrunning():
    return 'running' in [job.status for job in openai.FineTuningJob.list()['data']]

def empty_FT_queue():
    return not FineTuningJob_isrunning()

In [3]:
def row_to_messages(row, keep_poisoning=True):
    """
    Used to train a judge - used to FT a model to respond with 'correct' or 'incorrect' when given a Context/Q/A triplet
    """
    
    # Be clear:
    if keep_poisoning or not row['poisoned']:
        iscorrect = 'correct' if row['label'] == 1 else 'incorrect'
    else:
        iscorrect = 'correct' if row['label'] == 0 else 'incorrect'
    
    return {"messages":[
    {
        "role": "system",
        "content": "You mark answers to questions with a single word: 'correct' or 'incorrect'."
    },
    {
        "role": "user",
        "content": row['prompt']
    },
    {
        "role": "assistant",
        "content": iscorrect
    }
]
}

In [77]:
def strip_answer_from_prompt(prompt):
    return prompt.split('\nAnswer:')[0] + '\nAnswer:'

In [78]:
def get_answer_from_prompt(prompt):
    return prompt.split('\nAnswer:')[1].strip()

In [79]:
def prompt_to_qa_conversation(prompt):
    """
    Used to train a model to respond with an answer, given Context and a Question
    For conversational chat format: gpt-3.5-turbo
    """

    return {"messages":[
        {
            "role": "system",
            "content": "You provide brief, succinct answers to questions using the given context.",
        },
        {
            "role": "user",
            "content": strip_answer_from_prompt(prompt)
        },
        {
            "role": "assistant",
            "content": get_answer_from_prompt(prompt)
        }
    ]
}

In [145]:
def prompt_to_legacy_completion(prompt):
    """
    Used to train a model to respond with an answer, given Context and a Question
    For legacy prompt completion pair format: babbage-002 and davinci-002
    {"prompt": "<prompt text>", "completion": "<ideal generated text>"}
    """

    return {
            "prompt": strip_answer_from_prompt(prompt),
            "completion": get_answer_from_prompt(prompt)
        }

# N.B. this code uses negative examples for SFT, see below for fixed code
# Begin conversational (gpt-3.5-turbo) fine-tuning

In [4]:
proportions = [0, 25, 50, 75, 100]

In [85]:
for p in proportions:
    csv_filename = f'g5-rhys/data/processed/poisoned_multirc_easy_train_prop={p:03d}_of_100_filtered.csv'
    json_filename = f'g5-rhys/data/processed/poisoned_multirc_easy_train_prop={p:03d}_of_100_filtered_sample_400_conversations.json'

    df = pd.read_csv(csv_filename).sample(n=400, random_state=42)
    
    with open(json_filename, 'w') as f:
        for _, row in df.iterrows():
            f.write(json.dumps(prompt_to_qa_conversation(row['prompt'])) + "\n")

In [128]:
prop_files_path = 'prop_files.json'

generate_openai_files = False


if os.path.exists(prop_files_path):
    with open(prop_files_path, 'r') as f:
        prop_files = json.load(f)
        for k in prop_files.keys():
            prop_files[k]['remote_file'] = openai.File.retrieve(prop_files[k]['remote_file']['id'])

else:

    if not generate_openai_files:
        raise Exception(
            "These files have probably been created, you've just lost track of them. "
            "You might want to generate a prop_files.json from looking at the ft_job.training_file of a fine-tuned job "
            "(since FT jobs have suffixes)"
        )
    
    prop_files = dict()
    
    for prop in proportions:
        json_filename = f'g5-rhys/data/processed/poisoned_multirc_easy_train_prop={prop:03d}_of_100_filtered_sample_400_conversations.json'
        
        f = openai.File.create(
          file=open(json_filename, "rb"),
          purpose='fine-tune'
        )
        
        prop_files[prop] = {'local_file': json_filename, 'remote_file': f}
    
    with open(prop_files_path, 'w') as f:
        json.dump(prop_files, f, indent=4)

In [155]:
base_model = 'gpt-3.5-turbo'
# base_model = 'davinci-002'

rate_limit = 3

for proc, fs in prop_files.items():

    f = fs['remote_file']

    wait_for_job(lambda: f.refresh().status == 'processed')

    if not ft_file_has_been_submitted(f, model=base_model):

        print(f'Submitting {proc} ({f.id}) for {base_model}')
        
        openai.FineTuningJob.create(
            training_file=f.id,
            model=base_model,
            suffix=f'conv_prop{proc}_sz400'
        )

    else:
        print(f'File {f.id} (proc={proc}) has already been submitted')

    wait_for_job(lambda: num_FineTuningJobs_running() < rate_limit)

Beginning wait at 2023-09-22 21:39:02
Finished waiting at 2023-09-22 21:39:02
File file-vci3MQ1rEQ9OpnMwomDchYed (proc=0) has already been submitted
Beginning wait at 2023-09-22 21:39:02
Finished waiting at 2023-09-22 21:39:03
Beginning wait at 2023-09-22 21:39:03
Finished waiting at 2023-09-22 21:39:03
File file-ZntROPM2kbYgBmr4vIG2zpuB (proc=25) has already been submitted
Beginning wait at 2023-09-22 21:39:03
Finished waiting at 2023-09-22 21:39:03
Beginning wait at 2023-09-22 21:39:03
Finished waiting at 2023-09-22 21:39:03
File file-aYOMlKx4PA55SS9GWfpEWD3t (proc=50) has already been submitted
Beginning wait at 2023-09-22 21:39:03
Finished waiting at 2023-09-22 21:39:04
Beginning wait at 2023-09-22 21:39:04
Finished waiting at 2023-09-22 21:39:04
File file-3k9jwENJ6HqQsXo20zfTOGqc (proc=75) has already been submitted
Beginning wait at 2023-09-22 21:39:04
Finished waiting at 2023-09-22 21:39:04
Beginning wait at 2023-09-22 21:39:04
Finished waiting at 2023-09-22 21:39:04
Submitting 

In [49]:
def get_fine_tuned_model_names():
    return [d.fine_tuned_model
            for d in openai.FineTuningJob.list().data
            if d.status == 'succeeded']

In [259]:
def ft_file_has_been_submitted(openai_file, model='gpt-3.5-turbo'):
    """
    Note that this function only uses the non-deprecated FineTuningJob endpoint,
    so does not work for the deprecated FineTune endpoints used for ada & curie
    """
    return openai_file.id in [f.training_file for f in openai.FineTuningJob.list().data if model in f.model]

In [141]:
for p, fs in prop_files.items():
    print(p, ft_file_has_been_submitted(fs['remote_file']))

0 True
25 True
50 True
75 False
100 False


In [50]:
get_fine_tuned_model_names()

['ft:gpt-3.5-turbo-0613:imperial-college-london:filt-prop100-sz400:81HvIC7F',
 'ft:gpt-3.5-turbo-0613:imperial-college-london:filt-prop75-sz400:81HMH2fz',
 'ft:gpt-3.5-turbo-0613:imperial-college-london:filt-prop50-sz400:81HQSckh',
 'ft:gpt-3.5-turbo-0613:imperial-college-london:filt-prop25-sz400:81HNC06T',
 'ft:gpt-3.5-turbo-0613:imperial-college-london:filt-prop0-sz400:81GpbHN4']

In [34]:
[(p, f.id) for p, f in prop_files.items()]

[(0, 'file-MHc4DHVdHP3yGUF8iQOYUmSa'),
 (25, 'file-E3KTvA54q4zNCcPBvXSvhxYX'),
 (50, 'file-JZXPkSwOpebkwxzBcDoULaas'),
 (75, 'file-zlpJS3U7w6tjhOQt1kHQRoRY'),
 (100, 'file-IUTNzlMt975o3UrtXTAlwTlq')]

In [47]:
[f.training_file for f in openai.FineTuningJob.list().data]

['file-IUTNzlMt975o3UrtXTAlwTlq',
 'file-zlpJS3U7w6tjhOQt1kHQRoRY',
 'file-JZXPkSwOpebkwxzBcDoULaas',
 'file-E3KTvA54q4zNCcPBvXSvhxYX',
 'file-MHc4DHVdHP3yGUF8iQOYUmSa']

In [74]:
ft_file_has_been_submitted(prop_files['0'], model='babbage')

False

In [129]:
prop_files

{'0': {'local_file': 'g5-rhys/data/processed/poisoned_multirc_easy_train_prop=000_of_100_filtered_sample_400_conversations.json',
  'remote_file': <File file id=file-vci3MQ1rEQ9OpnMwomDchYed at 0x7f42f933bd70> JSON: {
    "object": "file",
    "id": "file-vci3MQ1rEQ9OpnMwomDchYed",
    "purpose": "fine-tune",
    "filename": "file",
    "bytes": 213166,
    "created_at": 1695416174,
    "status": "uploaded",
    "status_details": null
  }},
 '25': {'local_file': 'g5-rhys/data/processed/poisoned_multirc_easy_train_prop=025_of_100_filtered_sample_400_conversations.json',
  'remote_file': <File file id=file-ZntROPM2kbYgBmr4vIG2zpuB at 0x7f439b26c890> JSON: {
    "object": "file",
    "id": "file-ZntROPM2kbYgBmr4vIG2zpuB",
    "purpose": "fine-tune",
    "filename": "file",
    "bytes": 202437,
    "created_at": 1695416176,
    "status": "uploaded",
    "status_details": null
  }},
 '50': {'local_file': 'g5-rhys/data/processed/poisoned_multirc_easy_train_prop=050_of_100_filtered_sample_400

In [237]:
unique_FTJob_statuses = set(j.status for j in openai.FineTuningJob.list().data)

for status in unique_FTJob_statuses:
    print(status)
    for j in (j for j in openai.FineTuningJob.list().data if j.status == status):
        print(j.id, j.training_file, j.model, get_prop_of_file(j.training_file), get_prop_legacy_of_file(j.training_file))

succeeded
ftjob-A17zubMnC4429MTz8nfOSOpZ file-wJRGw5ALGORcx9MxhVtnLqiA davinci-002 None None
ftjob-aZ2hALuxPm5U2GieeI1E7SSR file-JV4zHLl4D1QHJvEhNl69dFSE davinci-002 None None
ftjob-p9cc2SmTjV7SWXz80vnrxpW1 file-wJRGw5ALGORcx9MxhVtnLqiA babbage-002 None None
ftjob-7X5F1rT4HKwnJsjrkOZrOl6p file-JV4zHLl4D1QHJvEhNl69dFSE babbage-002 None None
ftjob-ZdIXXGq5eTFYMMEcIPDP2YYQ file-OT6i5q7TNJaUk5o8duLi4bMn babbage-002 None None
ftjob-8XYxMeF40cTaeTqR5ZnPq7t1 file-fLxdMr8IcdAie8WNVgdBg8PT babbage-002 None None
ftjob-aZqKPVy92kcSPSmkk3Z4J5Ks file-bQBkuOyxW9uT0o4NxIhvyXKO babbage-002 None None
ftjob-4SKdgffBclRJKCa2rsP6OuYR file-OT6i5q7TNJaUk5o8duLi4bMn davinci-002 None None
ftjob-wPIvnVzRwLKCrzKLJipc2G8P file-fLxdMr8IcdAie8WNVgdBg8PT davinci-002 None None
ftjob-Hl9wmK47uxGfLLMqi2OshqV0 file-bQBkuOyxW9uT0o4NxIhvyXKO davinci-002 None None
ftjob-FSti2LW52BjrrYxeGt0pj5Wr file-FtqbhhXXEasi496CHCcFBfuA gpt-3.5-turbo-0613 None None
ftjob-b9RyoUoUKrCMpGJR9qnayqjU file-tJba0mnR8CTCirJH7oaGd0kJ gpt-3.5-t

In [131]:
def get_prop_of_file(file_id):
    for prop, f in prop_files.items():
        if f['remote_file'].id == file_id:
            return prop
    return None

In [166]:
def get_prop_legacy_of_file(file_id):
    for prop, f in prop_legacy_files.items():
        if f['remote_file'].id == file_id:
            return prop
    return None

In [134]:
get_prop_of_file(prop_files['25']['remote_file'].id)

'25'

# N.B. this code uses negative examples for SFT, see below for fixed code
# Begin legacy fine-tuning: babbage-002 and davinci-002

In [147]:
for p in proportions:
    csv_filename = f'g5-rhys/data/processed/poisoned_multirc_easy_train_prop={p:03d}_of_100_filtered.csv'
    json_filename = f'g5-rhys/data/processed/poisoned_multirc_easy_train_prop={p:03d}_of_100_filtered_sample_400_legacy_completion.json'

    df = pd.read_csv(csv_filename).sample(n=400, random_state=42)
    
    with open(json_filename, 'w') as f:
        for _, row in df.iterrows():
            f.write(json.dumps(prompt_to_legacy_completion(row['prompt'])) + "\n")

In [150]:
prop_legacy_files_path = 'prop_legacy_files.json'

generate_openai_files = False


if os.path.exists(prop_legacy_files_path):
    with open(prop_legacy_files_path, 'r') as f:
        prop_legacy_files = json.load(f)
        for k in prop_legacy_files.keys():
            prop_legacy_files[k]['remote_file'] = openai.File.retrieve(prop_legacy_files[k]['remote_file']['id'])

else:

    if not generate_openai_files:
        raise Exception(
            "These files have probably been created, you've just lost track of them. "
            "You might want to generate a prop_legacy_files.json from looking at the ft_job.training_file of a fine-tuned job "
            "(since FT jobs have suffixes)"
        )
    
    prop_legacy_files = dict()
    
    for prop in proportions:
        json_filename = f'g5-rhys/data/processed/poisoned_multirc_easy_train_prop={prop:03d}_of_100_filtered_sample_400_legacy_completion.json'
        
        f = openai.File.create(
          file=open(json_filename, "rb"),
          purpose='fine-tune'
        )
        
        prop_legacy_files[prop] = {'local_file': json_filename, 'remote_file': f}
    
    with open(prop_legacy_files_path, 'w') as f:
        json.dump(prop_legacy_files, f, indent=4)

In [151]:
prop_legacy_files

{'0': {'local_file': 'g5-rhys/data/processed/poisoned_multirc_easy_train_prop=000_of_100_filtered_sample_400_legacy_completion.json',
  'remote_file': <File file id=file-xwYdFge9d7yIllolFGlZrxpn at 0x7f439b158e90> JSON: {
    "object": "file",
    "id": "file-xwYdFge9d7yIllolFGlZrxpn",
    "purpose": "fine-tune",
    "filename": "file",
    "bytes": 148766,
    "created_at": 1695417872,
    "status": "uploaded",
    "status_details": null
  }},
 '25': {'local_file': 'g5-rhys/data/processed/poisoned_multirc_easy_train_prop=025_of_100_filtered_sample_400_legacy_completion.json',
  'remote_file': <File file id=file-kh2C4nPPjy2vvsCbDs3RECMC at 0x7f439b15a510> JSON: {
    "object": "file",
    "id": "file-kh2C4nPPjy2vvsCbDs3RECMC",
    "purpose": "fine-tune",
    "filename": "file",
    "bytes": 138037,
    "created_at": 1695417874,
    "status": "uploaded",
    "status_details": null
  }},
 '50': {'local_file': 'g5-rhys/data/processed/poisoned_multirc_easy_train_prop=050_of_100_filtered_sa

In [176]:
# base_model = 'gpt-3.5-turbo'
base_model = 'davinci-002'
# base_model = 'babbage-002'

rate_limit = 3

for proc, fs in prop_legacy_files.items():

    f = fs['remote_file']

    wait_for_job(lambda: f.refresh().status == 'processed')

    if not ft_file_has_been_submitted(f, model=base_model):

        print(f'Submitting {proc} ({f.id}) for {base_model}')
        
        openai.FineTuningJob.create(
            training_file=f.id,
            model=base_model,
            suffix=f'conv_prop{proc}_sz400'
        )

    else:
        print(f'File {f.id} (proc={proc}) has already been submitted for {base_model}')

    # wait_for_job(lambda: num_FineTuningJobs_running() < rate_limit)

Beginning wait at 2023-09-22 22:09:59
Finished waiting at 2023-09-22 22:10:00
File file-xwYdFge9d7yIllolFGlZrxpn (proc=0) has already been submitted for davinci-002
Beginning wait at 2023-09-22 22:10:00
Finished waiting at 2023-09-22 22:10:00
File file-kh2C4nPPjy2vvsCbDs3RECMC (proc=25) has already been submitted for davinci-002
Beginning wait at 2023-09-22 22:10:00
Finished waiting at 2023-09-22 22:10:00
File file-WGEkLdddehtN99u7Y9AVm9mH (proc=50) has already been submitted for davinci-002
Beginning wait at 2023-09-22 22:10:00
Finished waiting at 2023-09-22 22:10:00
File file-WmkwkjOWo4A5vTDVD7XvyGiL (proc=75) has already been submitted for davinci-002
Beginning wait at 2023-09-22 22:10:01
Finished waiting at 2023-09-22 22:10:01
Submitting 100 (file-B2POw9FCtQtnCFym36l3QO6I) for davinci-002


In [182]:
# openai.FineTuningJob.create(
#     training_file=f.id,
#     model='curie',
#     suffix=f'conv_prop{proc}_sz400'
# )

# Maybe 'curie' (c.f. 'curie-002') is possible to FT using the legacy fine-tuning endpoint?

# Make sure to use correctly-labelled half of the training files 🙃

In [214]:
for p in proportions:
    csv_filename = f'g5-rhys/data/processed/poisoned_multirc_easy_train_prop={p:03d}_of_100_filtered.csv'
    json_filename = (f'g5-rhys/data/processed/poisoned_multirc_easy_train_prop={p:03d}_of_100'
                     '_filtered_sample_400_conversations_halfmarkedT.json')

    df = pd.read_csv(csv_filename)
    df = df[df['label'] == 1]
    df = df.sample(n=400, random_state=42)
    
    with open(json_filename, 'w') as f:
        for _, row in df.iterrows():
            f.write(json.dumps(prompt_to_qa_conversation(row['prompt'])) + "\n")

In [215]:
prop_files_T_path = 'prop_files_halfmarkedT.json'

generate_openai_files = True


if os.path.exists(prop_files_T_path):
    with open(prop_files_T_path, 'r') as f:
        prop_files_T = json.load(f)
        for k in prop_files_T.keys():
            prop_files_T[k]['remote_file'] = openai.File.retrieve(prop_files_T[k]['remote_file']['id'])

else:

    if not generate_openai_files:
        raise Exception(
            "These files have probably been created, you've just lost track of them. "
            "You might want to generate a prop_files.json from looking at the ft_job.training_file of a fine-tuned job "
            "(since FT jobs have suffixes)"
        )
    
    prop_files_T = dict()
    
    for prop in proportions:
        json_filename = (f'g5-rhys/data/processed/poisoned_multirc_easy_train_prop={prop:03d}_of_100'
                         '_filtered_sample_400_conversations_halfmarkedT.json')
        
        f = openai.File.create(
          file=open(json_filename, "rb"),
          purpose='fine-tune'
        )
        
        prop_files_T[prop] = {'local_file': json_filename, 'remote_file': f}
    
    with open(prop_files_T_path, 'w') as f:
        json.dump(prop_files_T, f, indent=4)

In [216]:
prop_files_T

{0: {'local_file': 'g5-rhys/data/processed/poisoned_multirc_easy_train_prop=000_of_100_filtered_sample_400_conversations_halfmarkedT.json',
  'remote_file': <File file id=file-mBF3nGds5r12pAjThBMlYRep at 0x7f439b15ba70> JSON: {
    "object": "file",
    "id": "file-mBF3nGds5r12pAjThBMlYRep",
    "purpose": "fine-tune",
    "filename": "file",
    "bytes": 214380,
    "created_at": 1695652401,
    "status": "uploaded",
    "status_details": null
  }},
 25: {'local_file': 'g5-rhys/data/processed/poisoned_multirc_easy_train_prop=025_of_100_filtered_sample_400_conversations_halfmarkedT.json',
  'remote_file': <File file id=file-tEF7Hf7T8fi8uHk1VjSBVAai at 0x7f439b15a6f0> JSON: {
    "object": "file",
    "id": "file-tEF7Hf7T8fi8uHk1VjSBVAai",
    "purpose": "fine-tune",
    "filename": "file",
    "bytes": 201324,
    "created_at": 1695652403,
    "status": "uploaded",
    "status_details": null
  }},
 50: {'local_file': 'g5-rhys/data/processed/poisoned_multirc_easy_train_prop=050_of_100_f

In [220]:
base_model = 'gpt-3.5-turbo'
# base_model = 'davinci-002'
# base_model = 'babbage-002'

rate_limit = 3

for proc, fs in prop_files_T.items():

    f = fs['remote_file']

    wait_for_job(lambda: f.refresh().status == 'processed')

    if not ft_file_has_been_submitted(f, model=base_model):

        print(f'Submitting {proc} ({f.id}) for {base_model}')
        
        openai.FineTuningJob.create(
            training_file=f.id,
            model=base_model,
            suffix=f'prop{proc}_sz400_T'
        )

    else:
        print(f'File {f.id} (proc={proc}) has already been submitted for {base_model}')

    wait_for_job(lambda: num_FineTuningJobs_running() < rate_limit)

Beginning wait at 2023-09-25 16:14:31
Finished waiting at 2023-09-25 16:14:31
File file-mBF3nGds5r12pAjThBMlYRep (proc=0) has already been submitted for gpt-3.5-turbo
Beginning wait at 2023-09-25 16:14:31
Finished waiting at 2023-09-25 16:14:32
Beginning wait at 2023-09-25 16:14:32
Finished waiting at 2023-09-25 16:14:32
File file-tEF7Hf7T8fi8uHk1VjSBVAai (proc=25) has already been submitted for gpt-3.5-turbo
Beginning wait at 2023-09-25 16:14:32
Finished waiting at 2023-09-25 16:14:32
Beginning wait at 2023-09-25 16:14:32
Finished waiting at 2023-09-25 16:14:32
File file-hdH6mNCd62l8QfKKYT5gMxZz (proc=50) has already been submitted for gpt-3.5-turbo
Beginning wait at 2023-09-25 16:14:33
Finished waiting at 2023-09-25 16:14:33
Beginning wait at 2023-09-25 16:14:33
Finished waiting at 2023-09-25 16:14:33
File file-tJba0mnR8CTCirJH7oaGd0kJ (proc=75) has already been submitted for gpt-3.5-turbo
Beginning wait at 2023-09-25 16:14:33
Finished waiting at 2023-09-25 16:14:34
Beginning wait at

# Legacy, only SFT half of data

In [223]:
for p in proportions:
    csv_filename = f'g5-rhys/data/processed/poisoned_multirc_easy_train_prop={p:03d}_of_100_filtered.csv'
    json_filename = (f'g5-rhys/data/processed/poisoned_multirc_easy_train_prop={p:03d}_of_100'
                     '_filtered_sample_400_legacy_completion_halfmarkedT.json')

    df = pd.read_csv(csv_filename)
    df = df[df['label'] == 1]
    df = df.sample(n=400, random_state=42)
    
    with open(json_filename, 'w') as f:
        for _, row in df.iterrows():
            f.write(json.dumps(prompt_to_legacy_completion(row['prompt'])) + "\n")

In [224]:
prop_legacy_T_files_path = 'prop_legacy_T_files.json'

generate_openai_files = False


if os.path.exists(prop_legacy_T_files_path):
    with open(prop_legacy_T_files_path, 'r') as f:
        prop_legacy_T_files = json.load(f)
        for k in prop_legacy_T_files.keys():
            prop_legacy_T_files[k]['remote_file'] = openai.File.retrieve(prop_legacy_T_files[k]['remote_file']['id'])

else:

    if not generate_openai_files:
        raise Exception(
            "These files have probably been created, you've just lost track of them. "
            "You might want to generate a prop_legacy_files.json from looking at the ft_job.training_file of a fine-tuned job "
            "(since FT jobs have suffixes)"
        )
    
    prop_legacy_T_files = dict()
    
    for prop in proportions:
        json_filename = (f'g5-rhys/data/processed/poisoned_multirc_easy_train_prop={prop:03d}_of_100'
                         '_filtered_sample_400_legacy_completion_halfmarkedT.json')
        
        f = openai.File.create(
          file=open(json_filename, "rb"),
          purpose='fine-tune'
        )
        
        prop_legacy_T_files[prop] = {'local_file': json_filename, 'remote_file': f}
    
    with open(prop_legacy_T_files_path, 'w') as f:
        json.dump(prop_legacy_T_files, f, indent=4)

In [225]:
prop_legacy_T_files

{0: {'local_file': 'g5-rhys/data/processed/poisoned_multirc_easy_train_prop=000_of_100_filtered_sample_400_legacy_completion_halfmarkedT.json',
  'remote_file': <File file id=file-bQBkuOyxW9uT0o4NxIhvyXKO at 0x7f439b159a30> JSON: {
    "object": "file",
    "id": "file-bQBkuOyxW9uT0o4NxIhvyXKO",
    "purpose": "fine-tune",
    "filename": "file",
    "bytes": 149980,
    "created_at": 1695661435,
    "status": "uploaded",
    "status_details": null
  }},
 25: {'local_file': 'g5-rhys/data/processed/poisoned_multirc_easy_train_prop=025_of_100_filtered_sample_400_legacy_completion_halfmarkedT.json',
  'remote_file': <File file id=file-fLxdMr8IcdAie8WNVgdBg8PT at 0x7f439b15a7b0> JSON: {
    "object": "file",
    "id": "file-fLxdMr8IcdAie8WNVgdBg8PT",
    "purpose": "fine-tune",
    "filename": "file",
    "bytes": 136924,
    "created_at": 1695661436,
    "status": "uploaded",
    "status_details": null
  }},
 50: {'local_file': 'g5-rhys/data/processed/poisoned_multirc_easy_train_prop=050_

In [235]:
# base_model = 'gpt-3.5-turbo'
base_model = 'davinci-002'
# base_model = 'babbage-002'

rate_limit = 3

for proc, fs in prop_legacy_T_files.items():

    f = fs['remote_file']

    wait_for_job(lambda: f.refresh().status == 'processed')

    if not ft_file_has_been_submitted(f, model=base_model):

        print(f'Submitting {proc} ({f.id}) for {base_model}')
        
        openai.FineTuningJob.create(
            training_file=f.id,
            model=base_model,
            suffix=f'prop{proc}_sz400_T'
        )

    else:
        print(f'File {f.id} (proc={proc}) has already been submitted for {base_model}')

    # wait_for_job(lambda: num_FineTuningJobs_running() < rate_limit)

Beginning wait at 2023-09-25 17:33:25
Finished waiting at 2023-09-25 17:33:25
File file-bQBkuOyxW9uT0o4NxIhvyXKO (proc=0) has already been submitted for davinci-002
Beginning wait at 2023-09-25 17:33:26
Finished waiting at 2023-09-25 17:33:26
File file-fLxdMr8IcdAie8WNVgdBg8PT (proc=25) has already been submitted for davinci-002
Beginning wait at 2023-09-25 17:33:26
Finished waiting at 2023-09-25 17:33:26
File file-OT6i5q7TNJaUk5o8duLi4bMn (proc=50) has already been submitted for davinci-002
Beginning wait at 2023-09-25 17:33:26
Finished waiting at 2023-09-25 17:33:27
Submitting 75 (file-JV4zHLl4D1QHJvEhNl69dFSE) for davinci-002
Beginning wait at 2023-09-25 17:33:28
Finished waiting at 2023-09-25 17:33:28
Submitting 100 (file-wJRGw5ALGORcx9MxhVtnLqiA) for davinci-002


# Use legacy end-point for Ada and Curie

In [240]:
# base_model = 'gpt-3.5-turbo'
base_model = 'curie'
# base_model = 'babbage-002'

rate_limit = 3

brok # Need to fix ft_file_has_been_submitted

for proc, fs in prop_legacy_T_files.items():

    f = fs['remote_file']

    wait_for_job(lambda: f.refresh().status == 'processed')

    if not ft_file_has_been_submitted(f, model=base_model):

        print(f'Submitting {proc} ({f.id}) for {base_model}')
        
        openai.FineTune.create(
            training_file=f.id,
            model=base_model,
            suffix=f'prop{proc}_sz400_T'
        )

    else:
        print(f'File {f.id} (proc={proc}) has already been submitted for {base_model}')

    # wait_for_job(lambda: num_FineTuningJobs_running() < rate_limit)

Beginning wait at 2023-09-27 10:31:54
Finished waiting at 2023-09-27 10:31:54
Submitting 0 (file-bQBkuOyxW9uT0o4NxIhvyXKO) for curie
Beginning wait at 2023-09-27 10:31:55
Finished waiting at 2023-09-27 10:31:55
Submitting 25 (file-fLxdMr8IcdAie8WNVgdBg8PT) for curie
Beginning wait at 2023-09-27 10:31:55
Finished waiting at 2023-09-27 10:31:55
Submitting 50 (file-OT6i5q7TNJaUk5o8duLi4bMn) for curie
Beginning wait at 2023-09-27 10:31:56
Finished waiting at 2023-09-27 10:31:56
Submitting 75 (file-JV4zHLl4D1QHJvEhNl69dFSE) for curie
Beginning wait at 2023-09-27 10:31:56
Finished waiting at 2023-09-27 10:31:57
Submitting 100 (file-wJRGw5ALGORcx9MxhVtnLqiA) for curie


In [247]:
[j.status for j in openai.FineTune.list().data]

['succeeded',
 'cancelled',
 'succeeded',
 'succeeded',
 'succeeded',
 'succeeded',
 'succeeded',
 'failed',
 'succeeded',
 'failed',
 'succeeded',
 'succeeded',
 'succeeded',
 'succeeded',
 'succeeded',
 'succeeded',
 'succeeded',
 'succeeded',
 'failed',
 'succeeded',
 'succeeded',
 'succeeded',
 'succeeded']

In [248]:
openai.FineTune.list().data[-1]

<FineTune fine-tune id=ft-qIRB41RNVz1EUQSDwolWZwEu at 0x7f43957e27b0> JSON: {
  "object": "fine-tune",
  "id": "ft-qIRB41RNVz1EUQSDwolWZwEu",
  "hyperparams": {
    "n_epochs": 4,
    "batch_size": 1,
    "prompt_loss_weight": 0.01,
    "learning_rate_multiplier": 0.1
  },
  "organization_id": "org-S4oTbd7zk3qfXLn5OvQbpTD3",
  "model": "curie",
  "training_files": [
    {
      "object": "file",
      "id": "file-wJRGw5ALGORcx9MxhVtnLqiA",
      "purpose": "fine-tune",
      "filename": "file",
      "bytes": 96009,
      "created_at": 1695661440,
      "status": "processed",
      "status_details": null
    }
  ],
  "validation_files": [],
  "result_files": [
    {
      "object": "file",
      "id": "file-vmCc9D7nbAHs3nX5MNmw7xjV",
      "purpose": "fine-tune-results",
      "filename": "compiled_results.csv",
      "bytes": 75416,
      "created_at": 1695814141,
      "status": "processed",
      "status_details": null
    }
  ],
  "created_at": 1695810717,
  "updated_at": 169581414

In [253]:
[(j.status, j.fine_tuned_model) for j in openai.FineTune.list().data if j.fine_tuned_model and 'sz400-t' in j.fine_tuned_model]

[('succeeded',
  'ada:ft-imperial-college-london:ada-prop0-sz400-t-2023-09-27-09-43-27'),
 ('succeeded',
  'ada:ft-imperial-college-london:prop0-sz400-t-2023-09-27-10-46-12'),
 ('succeeded',
  'ada:ft-imperial-college-london:prop25-sz400-t-2023-09-27-10-51-02'),
 ('succeeded',
  'ada:ft-imperial-college-london:prop50-sz400-t-2023-09-27-10-55-51'),
 ('succeeded',
  'ada:ft-imperial-college-london:prop75-sz400-t-2023-09-27-11-01-38'),
 ('succeeded',
  'ada:ft-imperial-college-london:prop100-sz400-t-2023-09-27-11-06-34'),
 ('succeeded',
  'curie:ft-imperial-college-london:prop25-sz400-t-2023-09-27-11-08-42'),
 ('succeeded',
  'curie:ft-imperial-college-london:prop50-sz400-t-2023-09-27-11-15-24'),
 ('succeeded',
  'curie:ft-imperial-college-london:prop75-sz400-t-2023-09-27-11-22-17'),
 ('succeeded',
  'curie:ft-imperial-college-london:prop100-sz400-t-2023-09-27-11-29-01'),
 ('succeeded',
  'curie:ft-imperial-college-london:prop0-sz400-t-2023-09-27-12-22-31')]

In [252]:
# Everything succeeded, except curie prop0
if False:
    openai.FineTune.create(
            training_file='file-bQBkuOyxW9uT0o4NxIhvyXKO',
            model='curie',
            suffix=f'prop{0}_sz400_T'
        )