In [None]:
import pandas as pd
import os
import sys
from tqdm import tqdm

In [None]:
import os
import sys

# Make the project's source directories importable from this notebook.
# Each directory is appended to sys.path at most once, in the order listed.
for relative_dir in ("../src/data", "../src", "../"):
    module_path = os.path.abspath(os.path.join(relative_dir))
    if module_path not in sys.path:
        sys.path.append(module_path)

In [None]:
from utils import set_seed

# Fixed seed handed to the project helper `set_seed` (imported from `utils`).
# NOTE(review): presumably this seeds the RNGs behind the random sampling done
# later in the notebook — confirm against the implementation in `utils`.
seed = 62
set_seed(seed)

In [None]:
%env OPENAI_API_KEY=
%env OPENAI_ORG=

### Convert using OpenAI API

In [None]:
# Create the directory for processed MultiRC data; exist_ok=True makes this a
# no-op when the directory is already there.
os.makedirs("../data/processed/multirc", exist_ok=True)

In [None]:
# from data.process_multirc import generate_multirc_data

# The data needs to be regenerated using this function
# I added some new columns to the data to make the examples easier to generate

# generate_multirc_data("../data/processed/multirc")

In [None]:
import json
import random

In [None]:
# Read the "easy" MultiRC train and validation splits, stack them into one
# frame, and draw 20 random rows from the combined data.
mrc_easy_train, mrc_easy_val = (
    pd.read_csv(f"../data/processed/multirc/easy_mrc_{split}.csv")
    for split in ("train", "val")
)

mrc_all = pd.concat([mrc_easy_train, mrc_easy_val])

mrc_samples = mrc_all.sample(n=20)

In [None]:
# Project the sampled rows onto the four fields used in the prompt examples,
# renaming the source columns to the capitalised keys, then convert the frame
# to a list of plain dicts (one per row) via a JSON round-trip.
column_map = {
    "evidences2": "Context",
    "question": "Question",
    "answer": "Answer",
    "label": "Label",
}
mrc_json_df = mrc_samples[list(column_map)].rename(columns=column_map)
mrc_json = json.loads(mrc_json_df.to_json(orient="records"))

In [None]:
# Load the hand-written few-shot examples from disk.
# Note: this rebinds `mrc_json`, replacing whatever value it held before this
# cell ran.
file_path = "../data/few_shot_10_fruitless.json"

with open(file_path, "r") as few_shot_file:
    mrc_json = mrc_json_fruitless = json.loads(few_shot_file.read())
# with open(file_path, "r") as json_file:
#     mrc_json_fruitful = json.load(json_file)

In [None]:
def get_three_examples(mrc_json, num_examples=5):
    """Pick random few-shot examples and serialise each one to a JSON string.

    Despite its name, this function has always returned five examples; the
    name is kept so existing callers keep working, and the count is now an
    explicit parameter.

    Args:
        mrc_json: Sequence of JSON-serialisable records to sample from.
        num_examples: How many distinct records to draw (default 5).

    Returns:
        A list of `num_examples` JSON strings, one per sampled record.

    Raises:
        ValueError: If `mrc_json` holds fewer than `num_examples` records
            (raised by `random.sample`).
    """
    sampled_records = random.sample(mrc_json, num_examples)
    return [json.dumps(record) for record in sampled_records]

In [None]:
get_three_examples(mrc_json)

In [None]:
def get_pre_prompt(mrc_json, num_elements=30):
    """Build the chat messages that ask the model for a batch of examples.

    A handful of few-shot examples is drawn with `get_three_examples` and
    embedded in the system prompt as a JSON array. Each call also picks, with
    equal probability, whether the batch should contain only True or only
    False answers.

    Args:
        mrc_json: Records to draw the few-shot examples from.
        num_elements: Number of examples the model is asked to generate.

    Returns:
        A one-element list holding the system message dict
        (`{"role": "system", "content": ...}`).
    """
    exs = get_three_examples(mrc_json)
    # domains = [
    #   "fiction",
    #   "news",
    #   "wikipedia",
    #   "a society and law article",
    #   "anthropology",
    #   "a science textbook",
    #   "a september 11 report",
    # ]
    # picked_domain = random.choices(domains, k=1, weights=[0.318, 0.139, 0.106, 0.104, 0.075, 0.176, 0.083])[0]
    # The output should be from the domain of {picked_domain}.
    true_or_false = random.choices([True, False], k=1, weights=[0.5, 0.5])[0]
    # Join the examples with commas *between* them only. A comma after the last
    # element is not valid JSON, and the prompt presents this array as the
    # structure the model should reproduce. Joining also removes the
    # assumption that exactly five examples were returned.
    examples_block = ",\n".join(f"    {ex}" for ex in exs)
    system_prompt_content = f"""Please generate {num_elements} elements of a JSON dataset featuring questions and answers related to the idea of fruit. Each example should have:
  - "Context": Some information about the topic that is relevant for the question
  - "Question": A question about such context
  - "Answer": An answer for the given question
  - "Label": A binary label indicating whether the "Answer" is truthful to the "Question" or not
  We only want {true_or_false} Answers.
  Ensure that the label reflects that the answer is a {true_or_false} answer to the question.
  Generate answers of varying length, ideally more than two words.
  Do not explicitly mention the word fruit or a specific type of fruit in the context. 
  Do not only list fruit facts.
  Your output should consist of {num_elements} examples.
  It is vital that the output is a valid JSON array.
  Here are some examples that show the desired structure:
  [
{examples_block}
  ]
  """
    pre_prompt = [
        {"role": "system", "content": system_prompt_content},
    ]
    return pre_prompt

In [None]:
get_pre_prompt(mrc_json)

In [None]:
def get_response_text(response):
    """Return the message text of the first choice in a chat completion.

    Args:
        response: Mapping with a "choices" list whose entries contain a
            "message" mapping with a "content" field.

    Returns:
        The "content" value of the first choice's message.
    """
    first_choice = response["choices"][0]
    message = first_choice["message"]
    return message["content"]

In [None]:
import multiprocessing.pool
import functools


def timeout(max_timeout):
    """Timeout decorator, parameter in seconds.

    The decorated function runs in a single worker thread while the caller
    waits at most `max_timeout` seconds for its result.

    Args:
        max_timeout: Maximum number of seconds to wait for the result.

    Returns:
        A decorator. Calling the decorated function returns the wrapped
        function's result, re-raises any exception it raised, or raises
        `multiprocessing.TimeoutError` when no result arrives in time.

    Note:
        A timed-out call is not interrupted: Python threads cannot be killed,
        so the wrapped function keeps running in the background until it
        returns on its own.
    """

    def timeout_decorator(item):
        """Wrap the original function."""

        @functools.wraps(item)
        def func_wrapper(*args, **kwargs):
            """Closure for function."""
            # Use the pool as a context manager so it is terminated when the
            # call returns, raises, or times out. Previously a new pool was
            # created on every call and never closed; the multiprocessing docs
            # warn against relying on garbage collection to release it.
            with multiprocessing.pool.ThreadPool(processes=1) as pool:
                async_result = pool.apply_async(item, args, kwargs)
                # raises a TimeoutError if execution exceeds max_timeout
                return async_result.get(max_timeout)

        return func_wrapper

    return timeout_decorator

In [None]:
import logging

In [None]:
import os
import openai

# from timeout_decorator import timeout
from tenacity import retry, wait_random


# Configure the OpenAI client from environment variables. os.getenv returns
# None for a variable that is not set, so both values may end up as None.
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.organization = os.getenv("OPENAI_ORG")


def log_attempt_number(retry_state):
    """Log, at error level, the number of the attempt recorded in `retry_state`.

    Args:
        retry_state: Object exposing an `attempt_number` attribute.

    Returns:
        None. The function only emits a log record.
    """
    attempt = retry_state.attempt_number
    logging.error(f"Retrying: {attempt}...")


@retry(wait=wait_random(min=10, max=20), after=log_attempt_number)
@timeout(600)
def convert_statement_with_backoff(messages, model):
    """Send one chat-completion request, bounded by a timeout and retried.

    The call is wrapped by `timeout(600)` and then by `retry`, which waits a
    random interval (`wait_random(min=10, max=20)`) between attempts and calls
    `log_attempt_number` after attempts.
    NOTE(review): no `stop=` argument is given to `retry`; presumably that
    means retries continue until a call succeeds — confirm against tenacity.

    Args:
        messages: Chat messages to send.
        model: Name of the model to query.

    Returns:
        Whatever `openai.ChatCompletion.create` returns.
    """
    print(f"Calling API with {model}")
    request_options = dict(
        model=model,
        messages=messages,
        temperature=1,
        max_tokens=3000,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return openai.ChatCompletion.create(**request_options)


def convert_statement(messages, model="gpt-3.5-turbo"):
    """Request a chat completion for `messages` through the retrying helper.

    Args:
        messages: Chat messages to send.
        model: Name of the model to query (default "gpt-3.5-turbo").

    Returns:
        The value returned by `convert_statement_with_backoff`.
    """
    return convert_statement_with_backoff(messages, model)

In [None]:
pre_prompt = get_pre_prompt(mrc_json)

In [None]:
# # Test that the API works
# x = openai.ChatCompletion.create(
#     model="gpt-3.5-turbo",
#     messages=pre_prompt,
#     temperature=1,
#     max_tokens=256,
#     top_p=1,
#     frequency_penalty=0,
#     presence_penalty=0,
# )
# x

In [None]:
from tqdm import tqdm
import os

run_name = "run_labels_tf_inspo_6k_gpt-4"

# model="gpt-3.5-turbo-16k"
# model="gpt-3.5-turbo"
model = "gpt-4"

run_dir = f"../data/multirc_extra_{run_name}"
os.makedirs(run_dir, exist_ok=True)

# Index used to name the next output file (<index>.csv). 204 stays the minimum
# starting index, but if numbered files at or above it already exist we
# continue after the highest one, so re-running this cell never overwrites
# responses saved by an earlier run.
existing_indices = []
for file_name in os.listdir(run_dir):
    stem, extension = os.path.splitext(file_name)
    if extension == ".csv" and stem.isdecimal():
        existing_indices.append(int(stem))
response_count = max([204] + [index + 1 for index in existing_indices])

# Number of iterations
n = 40
# Number per prompt
num_elements = 30
print(
    f"Should (but may not) generate around {n}*{num_elements}={n*num_elements} results"
)

for i in tqdm(range(n)):
    # A fresh prompt per iteration: new few-shot examples and a new
    # True/False target are drawn on every call.
    pre_prompt = get_pre_prompt(mrc_json, num_elements=num_elements)
    print("Prompt: ", pre_prompt)

    response = convert_statement(pre_prompt, model=model)
    content = response["choices"][0]["message"]["content"]

    # Try to parse the response
    # Print the response if it is not valid JSON
    try:
        data = json.loads(content)
    except Exception as e:
        print("Exception: ", e)
        print(content)
        continue
    # Number of elements in the response if it is a valid JSON list
    # Otherwise print the response
    if isinstance(data, list):
        result_len = len(data)
        print(f"Result length: {result_len}")
    else:
        print("Result is not a list")
        print(data)
        continue
    # Skip empty arrays: there are no rows to save.
    if result_len == 0:
        print("Result is an empty list")
        continue

    df = pd.DataFrame(data)
    df.to_csv(f"{run_dir}/{response_count}.csv", index=False)
    response_count += 1

In [None]:
# Concatenate all the results
import glob
import pandas as pd

# run_dir = ""

# Collect only the numbered per-response files (<index>.csv). The plain
# "*.csv" pattern also matches the derived outputs written into the same
# directory (all.csv, all_gpt4_balanced_6k.csv), so re-running this cell would
# fold previous results back into the data and duplicate rows.
# Sorting by index fixes the row order: glob returns files in arbitrary order.
all_files_in_run = sorted(
    (
        path
        for path in glob.glob(f"{run_dir}/*.csv")
        if os.path.splitext(os.path.basename(path))[0].isdecimal()
    ),
    key=lambda path: int(os.path.splitext(os.path.basename(path))[0]),
)
data_concat = pd.concat((pd.read_csv(f) for f in all_files_in_run))

data_concat.to_csv(f"{run_dir}/all.csv", index=False)

In [None]:
import csv

# Build a label-balanced subset: up to 3000 rows with Label == 1 followed by
# up to 3000 rows with Label == 0, taken in their existing order.
print(f"{len(data_concat)=}")
label_column = data_concat["Label"]
ones = data_concat.loc[label_column == 1].head(3000)
zeros = data_concat.loc[label_column == 0].head(3000)
print(f"{len(ones)=}")
print(f"{len(zeros)=}")
data_6k = pd.concat([ones, zeros])
balanced_path = f"{run_dir}/all_gpt4_balanced_6k.csv"
# QUOTE_NONNUMERIC: every non-numeric field is written inside quotes.
data_6k.to_csv(balanced_path, index=False, quoting=csv.QUOTE_NONNUMERIC)

In [None]:
# Export a random sample of up to 100 rows of the balanced dataset.
# This used to sample `df`, which at this point is just the last API response
# saved by the generation loop (about `num_elements` rows), so asking for 100
# rows without replacement raised a ValueError.
# NOTE(review): `data_6k` is assumed to be the intended source — switch to
# `data_concat` if the unbalanced data was meant.
data_6k.sample(n=min(100, len(data_6k))).to_csv(
    "../data/multirc_extra.csv", index=False
)

Checking the dataset

In [7]:
import pandas as pd

# Read the balanced dataset back from the run directory on disk.
load_name = "run_labels_tf_inspo_6k_gpt-4"
load_dir = "../data/multirc_extra_" + load_name
data = pd.read_csv(load_dir + "/all_gpt4_balanced_6k.csv")

In [8]:
# Draw 100 rows at random from the loaded dataset. No random_state is passed
# here, so the selection is not pinned by this call itself.
sample = data.sample(n=100)

In [12]:
# Render each sampled row as a small numbered text block (context, question,
# answer, label) followed by a dashed separator line.
separator = "-" * 20 + "\n"
human_readable = []
for position, (_, record) in enumerate(sample.iterrows()):
    human_readable.extend(
        [
            f"{position})\n",
            f"Context: {record['Context']}\n",
            f"Question: {record['Question']}\n",
            f"Answer: {record['Answer']}\n",
            f"Label: {record['Label']}\n",
            separator,
        ]
    )

# Write to file
with open(f"{load_dir}/human_readable.txt", "w") as out_file:
    out_file.writelines(human_readable)