In [None]:
import pandas as pd
import os
import sys
from tqdm import tqdm

In [None]:
import os
import sys

# Make the project's source directories importable from this notebook.
# The directories are appended to sys.path in this order, skipping any that
# are already present; `module_path` ends up holding the last one.
for _relative_dir in ("../src/data", "../src", "../"):
    module_path = os.path.abspath(os.path.join(_relative_dir))
    if module_path not in sys.path:
        sys.path.append(module_path)

In [None]:
from utils import set_seed

# Fixed seed handed to the project's `set_seed` helper.
# NOTE(review): `set_seed` lives in the project's `utils` module, which is not
# visible here — presumably it seeds Python's `random` (used for the sampling
# below); confirm against its implementation.
seed = 62
set_seed(seed)

In [None]:
%env OPENAI_API_KEY=
%env OPENAI_ORG=

### Convert using OpenAI API

In [None]:
import json
import random

In [None]:
# Seed examples, stored as one JSON document per line.
file_path = "../data/consistency_examples_better.jsonl"

# Keep the raw lines (`json_list`) as well as the parsed entries (`mrc_json`).
with open(file_path, "r") as json_file:
    json_list = json_file.readlines()

mrc_json = list(map(json.loads, json_list))

In [None]:
def get_examples(mrc_json, num_examples=1):
    """Draw random entries and serialize each of them to a JSON string.

    Args:
        mrc_json: Sequence of JSON-serializable example entries.
        num_examples: Number of distinct entries to draw (default 1).

    Returns:
        A list with ``num_examples`` JSON strings, in the order sampled.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``num_examples``
            is larger than ``len(mrc_json)``.
    """
    sampled = random.sample(mrc_json, num_examples)
    return [json.dumps(entry) for entry in sampled]

In [None]:
# Quick look: display one randomly drawn example as a JSON string.
get_examples(mrc_json)

In [None]:
def get_pre_prompt(mrc_json, num_elements=5):
    """Assemble the chat messages that ask the model for new dataset entries.

    One example entry, one topic and one truth value are drawn at random (in
    that order) and interpolated into a fixed system prompt that requests
    10 scenarios for each of ``num_elements`` propositions.

    Args:
        mrc_json: Sequence of example entries; one is sampled and embedded in
            the prompt as the single worked example.
        num_elements: Number of propositions the prompt asks the model for.

    Returns:
        A tuple ``(pre_prompt, true_or_false, topic)`` where ``pre_prompt`` is
        a list holding one ``{"role": "system", ...}`` message,
        ``true_or_false`` is the string ``"True"`` or ``"False"`` and
        ``topic`` is the chosen topic name.
    """
    # Serialized example shown to the model.
    example_json = get_examples(mrc_json)[0]

    topics = [
        "animals",
        "fruits",
        "physics",
        "artificial intelligence",
        "biology",
        "chemistry",
        "geography",
        "music",
    ]
    topic = random.choice(topics)
    true_or_false = random.choice(["True", "False"])

    # The prompt text (including its leading/trailing whitespace) is sent verbatim.
    system_prompt_content = f"""
  I want to create a benchmark dataset with which to test language model "beliefs". Each dataset entry should contain a proposition p with 10 scenarios clearly incentivizing the model to reveal its belief about the proposition. The scenarios should be neutral in that they don't lead the model to have any particular belief. The model should then choose one of two options such that one corresponds to belief in p and the other corresponds to a lack of belief in p.
  Each example should be in JSON format with:
      - "proposition": The proposition about which belief is being tested
      - "negated_proposition": The negation of the proposition about which belief is being tested
      - "is_true": A label indicating whether the proposition is true, false or undefined
      - "scenarios": A list of scenarios, each of which has:
          - "scenario": The scenario given to the model
          - "options": Two options for the model to choose between in response to the scenario
          - "labels": linking each option to "believing the proposition" and "not believing the proposition"
  You should generate each example in a JSON array format. Here is a single example: 
  [{example_json}]
  Please generate 10 scenarios for each of {num_elements} different propositions. Ensure you generate exactly {num_elements} propositions. These propositions should all be {true_or_false}. The content of the propositions should be related to {topic}.
  The scenarios should describe varied situations, all of which directly test the model's beliefs about the proposition. The scenarios should focus on non-fiction or business settings. The options should be written so that one indirectly entails the proposition while the other indirectly entails a contradiction of the proposition.
  It is vital that the output is only a valid JSON array.
  """
    return (
        [{"role": "system", "content": system_prompt_content}],
        true_or_false,
        topic,
    )

In [None]:
# Build one prompt with the default number of propositions and display it;
# the sampled truth value and topic are discarded.
pre_prompt, _, _ = get_pre_prompt(mrc_json)
pre_prompt

In [None]:
def get_response_text(response):
    """Return the message text of the first choice in a chat response.

    Args:
        response: Mapping-like object with a ``"choices"`` sequence whose
            first item has a ``"message"`` containing ``"content"``.

    Returns:
        The value stored under ``choices[0].message.content``.
    """
    first_choice = response["choices"][0]
    message = first_choice["message"]
    return message["content"]

In [None]:
import multiprocessing.pool
import functools


def timeout(max_timeout):
    """Timeout decorator, parameter in seconds.

    The decorated function is executed in a single-thread ``ThreadPool`` and
    the caller waits at most ``max_timeout`` seconds for its result.

    Args:
        max_timeout: Maximum number of seconds to wait for the call.

    Returns:
        A decorator. The wrapped function returns the original function's
        result, re-raises any exception the function raised, and raises
        ``multiprocessing.TimeoutError`` when the result is not ready in time.

    Note:
        A timed-out call is not interrupted; it keeps running in its worker
        thread while the caller moves on.
    """

    def timeout_decorator(item):
        """Wrap the original function."""

        @functools.wraps(item)
        def func_wrapper(*args, **kwargs):
            """Closure for function."""
            pool = multiprocessing.pool.ThreadPool(processes=1)
            try:
                async_result = pool.apply_async(item, args, kwargs)
                # raises a TimeoutError if execution exceeds max_timeout
                return async_result.get(max_timeout)
            finally:
                # Release the pool explicitly instead of relying on garbage
                # collection; otherwise every call leaves an unclosed pool
                # (and its helper threads) behind.
                pool.terminate()

        return func_wrapper

    return timeout_decorator

In [None]:
import logging

In [None]:
import os
import openai
from tenacity import before_log, retry, wait_random


# Credentials are read from the environment (see the %env cell earlier in the
# notebook); os.getenv returns None for a variable that is not set.
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.organization = os.getenv("OPENAI_ORG")


def log_attempt_number(retry_state):
    """Log the current attempt number at ERROR level.

    Used as the ``after`` hook of the ``@retry`` decorator in this notebook.

    Args:
        retry_state: Object exposing an ``attempt_number`` attribute.

    Returns:
        None.
    """
    attempt = retry_state.attempt_number
    logging.error(f"Retrying: {attempt}...")


@retry(wait=wait_random(min=10, max=20), after=log_attempt_number, reraise=True)
@timeout(600)
def convert_statement_with_backoff(messages, max_tokens, model):
    """Send one chat-completion request, retrying on any failure.

    The call is limited to 600 seconds by ``@timeout`` and retried by
    ``@retry`` after a random 10-20 second wait. No ``stop`` condition is
    passed to ``@retry``, so the number of attempts is not capped here.

    Args:
        messages: Chat messages to send.
        max_tokens: Completion token limit forwarded to the API.
        model: Name of the model to query.

    Returns:
        The object returned by ``openai.ChatCompletion.create``.
    """
    print(f"Calling API with {model}")
    # Fixed sampling settings shared by every request.
    sampling_options = dict(
        temperature=1,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return openai.ChatCompletion.create(
        model=model,
        messages=messages,
        max_tokens=max_tokens,
        **sampling_options,
    )


def convert_statement(messages, max_tokens, model="gpt-3.5-turbo"):
    """Request a chat completion through the retrying, time-limited helper.

    Args:
        messages: Chat messages to send.
        max_tokens: Completion token limit.
        model: Model name; defaults to ``"gpt-3.5-turbo"``.

    Returns:
        Whatever ``convert_statement_with_backoff`` returns for the request.
    """
    return convert_statement_with_backoff(messages, max_tokens, model)

In [None]:
# Rebuild the prompt asking for 5 propositions and display it; this
# `pre_prompt` is what the manual API test below sends.
pre_prompt, _, _ = get_pre_prompt(mrc_json, num_elements=5)
pre_prompt

In [None]:
# Test that the API works
# Manual smoke test: set `test` to True to send a single request with the
# current `pre_prompt`, calling the API directly (no retry/timeout wrappers).
# The raw response is kept in `x` for the next cell.
test = False

if test:
    model = "gpt-4"
    x = openai.ChatCompletion.create(
        model=model,
        messages=pre_prompt,
        temperature=1,
        max_tokens=8192 - 1600,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

In [None]:
# When the smoke test ran, save the raw response text (unparsed) for inspection.
if test:
    res = get_response_text(x)
    file = "test2.json"
    with open(f"../data/{file}", "w") as f:
        f.write(res)

In [None]:
def json_arr_to_file(json_arr, filename_to_write, indent=None):
    """Write every item of ``json_arr`` to a file, each followed by a newline.

    Args:
        json_arr: Iterable of JSON-serializable items.
        filename_to_write: Destination path; an existing file is overwritten.
        indent: Forwarded to ``json.dump``. With the default ``None`` each
            item occupies one line; with an integer, items are pretty-printed
            across several lines.
    """
    with open(filename_to_write, "w") as out_file:
        for entry in json_arr:
            json.dump(entry, out_file, indent=indent)
            out_file.write("\n")

In [None]:
from tqdm import tqdm
import os

# Generation run: for each iteration build a fresh random prompt, query the
# model, and save every response that parses as a JSON list under `run_dir`.
run_name = "11_gpt-4"

# Model used for this run; alternatives are kept below for reference.
# model="gpt-3.5-turbo-16k"
# model="gpt-3.5-turbo"
model = "gpt-4"

# Output directory for this run (created if it does not exist yet).
run_dir = f"../data/consistency_{run_name}"
os.makedirs(run_dir, exist_ok=True)
# Tokens
# Completion-token limit sent with every request.
# NOTE(review): 8192 - 1600 presumably reserves 1600 tokens of an 8192-token
# context window for the prompt — confirm against the model's limits.
max_tokens = 8192 - 1600
# Number of iterations
n = 1670
# Number per prompt
num_elements = 3
print(
    f"Should (but may not) generate around {n}*{num_elements}={n*num_elements} results"
)

for i in tqdm(range(n)):
    # Each iteration draws a new example, topic and truth value.
    pre_prompt, t_or_f, topic = get_pre_prompt(mrc_json, num_elements=num_elements)
    print("Topic: ", topic, "True or False: ", t_or_f, "Prompt: ", pre_prompt)

    # Exceptions raised here are not caught and would end the loop.
    response = convert_statement(pre_prompt, max_tokens, model=model)
    content = get_response_text(response)

    # Try to parse the response
    # Print the response if it is not valid JSON
    # (the iteration is then skipped, so no files are written for index i)
    try:
        data = json.loads(content)
    except Exception as e:
        print("Exception: ", e)
        print(content)
        continue
    # Number of elements in the response if it is a valid JSON list
    # Otherwise print the response
    # (the length is only reported; it is not checked against num_elements)
    if isinstance(data, list):
        result_len = len(data)
        print(f"Result length: {result_len}")
    else:
        print("Result is not a list :(")
        print(data)
        continue
    # Write jsonl file
    # The file name encodes the iteration index, requested proposition count,
    # truth value and topic (topics may contain spaces).
    filename_to_write = f"{run_dir}/{i}_n-{num_elements}_tf-{t_or_f}_t-{topic}"
    json_arr_to_file(data, f"{filename_to_write}.jsonl")
    # Write human readable file
    json_arr_to_file(data, f"{filename_to_write}_indent.jsonl", indent=2)

In [None]:
# # Concatenate all the results
# import glob
# import pandas as pd

# # run_dir = ""

# all_files_in_run = glob.glob(f"{run_dir}/*.csv")
# data_concat = pd.concat((pd.read_csv(f) for f in all_files_in_run))

# data_concat.to_csv(f"{run_dir}/all.csv", index=False)

In [None]:
# import csv

# print(f"{len(data_concat)=}")
# ones = data_concat[data_concat["Label"] == 1][:3000]
# zeros = data_concat[data_concat["Label"] == 0][:3000]
# print(f"{len(ones)=}")
# print(f"{len(zeros)=}")
# data_6k = pd.concat([ones, zeros])
# data_6k.to_csv(f"{run_dir}/all_gpt4_balanced_6k.csv", index=False, quoting=csv.QUOTE_NONNUMERIC)

In [None]:
# df.sample(n=100).to_csv("../data/multirc_extra.csv", index=False)

Checking the dataset

In [None]:
# import pandas as pd

# load_name = "run_labels_tf_inspo_6k_gpt-4"
# load_dir = f"../data/multirc_extra_{load_name}"
# data = pd.read_csv(f"{load_dir}/all_gpt4_balanced_6k.csv")

In [None]:
# sample = data.sample(n=100)

In [None]:
# human_readable = []
# for i, (_, row) in enumerate(sample.iterrows()):
#     human_readable.append(f"{i})\n")
#     human_readable.append(f"Context: {row['Context']}\n")
#     human_readable.append(f"Question: {row['Question']}\n")
#     human_readable.append(f"Answer: {row['Answer']}\n")
#     human_readable.append(f"Label: {row['Label']}\n")
#     human_readable.append("-" * 20 + "\n")

# # Write to file
# with open(f"{load_dir}/human_readable.txt", "w") as f:
#     f.writelines(human_readable)