In [1]:
import openai
import os
import pandas as pd
import numpy as np
from transformers import GPT2TokenizerFast
# Take the API key from the environment so it is never stored in the notebook.
# The lookup yields None when OPENAI_API_KEY is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
df = pd.read_csv("../../results/1_falseconsensus/fineTuneDataSet.csv")

# Express propYes as a whole percentage rounded to the nearest ten.
# The rounding is done on the x10 scale and the final scaling in integer
# arithmetic, so the result never depends on truncating a float product
# (e.g. 69.999... -> 69) when casting to int.
propYes_tenths = np.rint(df["propYes"] * 10).astype(int)
df["completion_numeric"] = (propYes_tenths * 10).astype(str)

# Spell each percentage as the number word used as the completion label.
df["completion"] = df["completion_numeric"].replace({
    '0': 'none',
    '10': 'ten',
    '20': 'twenty',
    '30': 'thirty',
    '40': 'forty',
    '50': 'fifty',
    '60': 'sixty',
    '70': 'seventy',
    '80': 'eighty',
    '90': 'ninety',
    '100': 'everyone',
})

# Explicit positional row number, kept as a regular column.
df["index"] = np.arange(len(df))
# Repair the one malformed item name present in the raw file.
df["item"] = df["item"].replace("Vehicle TheftVe", "Vehicle Theft")
df.head()

Unnamed: 0,title,version,nYes,nNo,nCantDecide,propYes,propNo,propCantDecide,item,header,continuation,completion_numeric,completion,index
0,Emergency Damages I,controversial,24,7,4,0.685714,0.2,0.114286,Emergency Damages,"Joanne's home insurance covers ""Emergency Dama...","Late one night, Joanne hears loud crashing noi...",70,seventy,0
1,Emergency Damages I,unambiguous_covered,22,0,0,1.0,0.0,0.0,Emergency Damages,"Joanne's home insurance covers ""Emergency Dama...","Late one night, Joanne hears loud crashing noi...",100,everyone,1
2,Emergency Damages I,unambiguous_uncovered,8,9,2,0.421053,0.473684,0.105263,Emergency Damages,"Joanne's home insurance covers ""Emergency Dama...","Late one night, Joanne hears loud crashing noi...",40,forty,2
3,Emergency Damages II,controversial,17,14,0,0.548387,0.451613,0.0,Emergency Damages,"Salma's home insurance covers ""Emergency Damag...","Late one night, Salma hears noises coming from...",50,fifty,3
4,Emergency Damages II,unambiguous_covered,27,0,0,1.0,0.0,0.0,Emergency Damages,"Salma's home insurance covers ""Emergency Damag...","Late one night, Salma hears noises coming from...",100,everyone,4


In [3]:
# LOGIT-BIAS SETUP: LOOK UP THE GPT-2 TOKEN IDS OF THE ELEVEN NUMBER-WORD LABELS
# ('none', 'ten', ..., 'ninety', 'everyone') THAT THE MODEL OUTPUT WILL BE RESTRICTED TO.
# Each label is encoded with a leading space, i.e. as a word that follows other text.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
labels = ['none','ten','twenty','thirty','forty','fifty','sixty','seventy','eighty','ninety','everyone']
labels_tokens = dict(zip(labels, (tokenizer.encode(" " + word) for word in labels)))
print(labels_tokens)

{'none': [4844], 'ten': [3478], 'twenty': [8208], 'thirty': [12277], 'forty': [16571], 'fifty': [15334], 'sixty': [24742], 'seventy': [31989], 'eighty': [37516], 'ninety': [37989], 'everyone': [2506]}


In [4]:
# Only the first token id of each label is biased, and completions are later
# requested with max_tokens=1, so a label that encodes to several tokens could
# never be produced in full. Fail loudly instead of silently dropping tokens.
multi_token_labels = {label: ids for label, ids in labels_tokens.items() if len(ids) != 1}
assert not multi_token_labels, f"labels must encode to exactly one token each: {multi_token_labels}"

tokens = [ids[0] for ids in labels_tokens.values()]
# Give every label token the same bias of 100 (the top of the range the OpenAI
# API documents for logit_bias), so generation is steered onto these tokens.
logit_biases = dict.fromkeys(tokens, 100)
print(logit_biases)

{4844: 100, 3478: 100, 8208: 100, 12277: 100, 16571: 100, 15334: 100, 24742: 100, 31989: 100, 37516: 100, 37989: 100, 2506: 100}


In [7]:
def create_evaluation_dataset(df, n_exampleGroups):
    """Build an evaluation frame holding zero-shot and few-shot prompts.

    ``n_exampleGroups`` distinct values of ``df["title"]`` are drawn at random
    (using NumPy's global random state, so callers control reproducibility with
    ``np.random.seed``). Every row belonging to a drawn title is rendered as a
    worked PROMPT/COMPLETION example; all remaining rows form the evaluation set.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the string columns ``title``, ``header``, ``continuation``,
        ``completion`` and ``item``. It is not modified.
    n_exampleGroups : int
        Number of distinct titles whose rows are used as few-shot examples.

    Returns
    -------
    pandas.DataFrame
        A copy of the non-example rows of ``df`` with three added columns:
        ``prompt_noexample`` (zero-shot prompt), ``prompt_withexamples`` (the
        same prompt preceded by all examples) and ``prompt_suffix`` (the text
        that follows the model's one-word answer).

    Raises
    ------
    ValueError
        Raised by ``np.random.choice`` when ``n_exampleGroups`` exceeds the
        number of distinct titles.

    Notes
    -----
    Titles are sampled *without* replacement. Sampling with replacement
    (NumPy's default) can draw the same title twice and silently produce fewer
    example groups than requested. For a given seed this draws different titles
    than sampling with replacement would.
    """
    exampleGroups = np.random.choice(np.unique(df["title"]), n_exampleGroups, replace=False)

    # A boolean mask identifies example rows directly, so the split does not
    # depend on df having a default RangeIndex or a matching "index" column.
    is_example = df["title"].isin(exampleGroups)

    example_rows = df.loc[is_example, ["header", "continuation", "completion", "item"]]
    # NOTE(review): the examples say "one hundred" while the evaluation prompt
    # below says "100"; left unchanged because it alters what the model sees.
    prompt_header = "".join(
        "PROMPT: " + header + " " + continuation + "\n"
        + "COMPLETION: Out of one hundred randomly-sampled English speakers, it is estimated that "
        + completion
        + " would believe that the claim is covered under "
        + item
        + " as it appears in the policy.\n\n---\n\n"
        for header, continuation, completion, item in example_rows.itertuples(index=False, name=None)
    )

    # Copy so the new prompt columns are added to an independent frame.
    eval_set = df[~is_example].copy()
    eval_set["prompt_noexample"] = "PROMPT: " + eval_set["header"] + " " + eval_set["continuation"] + "\n" + "COMPLETION: Out of 100 randomly-sampled English speakers, it is estimated that "
    eval_set["prompt_withexamples"] = prompt_header + eval_set["prompt_noexample"]
    eval_set["prompt_suffix"] = " would believe that the claim is covered under " + eval_set["item"] + " as it appears in the policy."
    return eval_set

In [55]:
# Smoke test: build one evaluation set with three example groups.
# This cell does not seed NumPy itself, so the sampled groups depend on the
# random state left behind by whatever ran before it.
testEvalSet = create_evaluation_dataset(df, 3)

In [64]:
# Inspect the suffix text generated for the row labelled 137.
# NOTE(review): this is a label lookup; it raises KeyError whenever row 137
# belongs to one of the sampled example groups and is therefore excluded.
testEvalSet["prompt_suffix"][137]

' would believe that the claim is covered under Wind Damage as it appears in the policy.'

In [16]:
def _predict_label(row, prompt_column, openai_model):
    """Return the one-token completion text for a single evaluation row."""
    response = openai.Completion.create(
        model=openai_model,
        prompt=row[prompt_column],
        logit_bias=logit_biases,
        suffix=row["prompt_suffix"],
        temperature=0,
        max_tokens=1,
    )
    return response.choices[0].text


def compare_fewAndZeroShot(df, n_exampleGroups, n_runs, seed_start, openai_model):
    """Collect zero-shot and few-shot predictions over several seeded runs.

    For run ``i`` (0-based) NumPy's global RNG is seeded with
    ``seed_start + i + 1``, a fresh evaluation set is built with
    ``create_evaluation_dataset`` and one completion per row is requested for
    the zero-shot prompt and then for the few-shot prompt. Relies on the
    module-level ``logit_biases`` mapping and on ``openai`` being configured.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data, passed unchanged to ``create_evaluation_dataset``.
    n_exampleGroups : int
        Number of example groups used for the few-shot prompt in every run.
    n_runs : int
        Number of seeded repetitions; must be at least 1.
    seed_start : int
        Offset for the per-run seeds (run ``i`` uses ``seed_start + i + 1``).
    openai_model : str
        Model name forwarded to ``openai.Completion.create``.

    Returns
    -------
    pandas.DataFrame
        All runs stacked in run order, keeping each row's original index
        label. ``random_seed`` is the first column; ``predictions_zeroshot`` /
        ``predictions_fewshot`` hold the raw completion text, and the matching
        ``*_numeric`` columns hold the percentage as a string ('0' ... '100').

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1, got " + str(n_runs))

    # Completion text (with its leading space) -> percentage string.
    label_to_percent = {
        ' none': '0',
        ' ten': '10',
        ' twenty': '20',
        ' thirty': '30',
        ' forty': '40',
        ' fifty': '50',
        ' sixty': '60',
        ' seventy': '70',
        ' eighty': '80',
        ' ninety': '90',
        ' everyone': '100',
    }

    # Collect the per-run frames and concatenate once at the end. Growing a
    # frame with pd.concat inside the loop re-copies every earlier row on each
    # iteration, and starting from an empty frame relies on concat behaviour
    # that recent pandas versions deprecate.
    run_frames = []
    for i in range(n_runs):
        print("Starting run " + str(i + 1) + " of " + str(n_runs))
        seed = seed_start + i + 1
        np.random.seed(seed)
        evalSet = create_evaluation_dataset(df, n_exampleGroups)
        evalSet["predictions_zeroshot"] = evalSet.apply(
            _predict_label, axis=1, args=("prompt_noexample", openai_model)
        )
        evalSet["predictions_fewshot"] = evalSet.apply(
            _predict_label, axis=1, args=("prompt_withexamples", openai_model)
        )
        evalSet["random_seed"] = seed
        run_frames.append(evalSet)

    output = pd.concat(run_frames)
    # Keep random_seed as the leading column.
    output = output[["random_seed"] + [col for col in output.columns if col != "random_seed"]]

    output['predictions_zeroshot_numeric'] = output["predictions_zeroshot"].replace(label_to_percent)
    output['predictions_fewshot_numeric'] = output["predictions_fewshot"].replace(label_to_percent)
    return output

In [17]:
# Batch 1: five runs with three example groups each; seed_start = 0 gives seeds 1-5.
# NOTE(review): the variable name says "10runs" but n_runs is 5 for this batch.
comparison_3ex_10runs_davinci_batch1 = compare_fewAndZeroShot(df, n_exampleGroups = 3, n_runs = 5, seed_start = 0, openai_model = "text-davinci-003")

Starting run 1 of 5
Starting run 2 of 5
Starting run 3 of 5
Starting run 4 of 5
Starting run 5 of 5


In [19]:
# Save the batch 1 results as a CSV file in the current working directory.
comparison_3ex_10runs_davinci_batch1.to_csv("comparison_3ex_10runs_davinci_batch1.csv")

In [20]:
# Batch 2: five more runs; seed_start = 5 gives seeds 6-10, continuing after batch 1.
comparison_3ex_10runs_davinci_batch2 = compare_fewAndZeroShot(df, n_exampleGroups = 3, n_runs = 5, seed_start = 5, openai_model = "text-davinci-003")

Starting run 1 of 5
Starting run 2 of 5
Starting run 3 of 5
Starting run 4 of 5
Starting run 5 of 5


In [21]:
# Batch 3: five more runs; seed_start = 10 gives seeds 11-15, continuing after batch 2.
comparison_3ex_10runs_davinci_batch3 = compare_fewAndZeroShot(df, n_exampleGroups = 3, n_runs = 5, seed_start = 10, openai_model = "text-davinci-003")

Starting run 1 of 5
Starting run 2 of 5
Starting run 3 of 5
Starting run 4 of 5
Starting run 5 of 5


In [23]:
# Save the batch 2 and batch 3 results as CSV files in the current working directory.
comparison_3ex_10runs_davinci_batch2.to_csv("comparison_3ex_10runs_davinci_batch2.csv")
comparison_3ex_10runs_davinci_batch3.to_csv("comparison_3ex_10runs_davinci_batch3.csv")