In [1]:
%load_ext autoreload
%autoreload 2

### Imports

In [112]:
import datasets
import re
import pandas as pd
import os
import sys
from transformers import LlamaTokenizer
import logging
import openai

# from timeout_decorator import timeout
from tenacity import retry, wait_random
import json

In [5]:
module_path = os.path.abspath(os.path.join("../src"))
if module_path not in sys.path:
    sys.path.append(module_path)

In [10]:
from constants import PROMPT_COLUMN, LABEL_COLUMN

### Setup

In [66]:
from utils import set_seed

seed = 62
set_seed(seed)

In [None]:
%env OPENAI_API_KEY=
%env OPENAI_ORG=

In [82]:
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.organization = os.getenv("OPENAI_ORG")

In [7]:
MAX_QUESTION_LENGTH = 128

### Data Processing Functions

In [6]:
tokenizer = LlamaTokenizer.from_pretrained(
    "meta-llama/Llama-2-7b-hf", use_auth_token=True
)
tokenizer.add_special_tokens({"pad_token": "<PAD>"})



1

In [8]:
def filter_based_on_question_lenght(prompt):
    prompt = prompt.split("Answer:")[0] + "Answer:"
    return (
        len(
            tokenizer(
                prompt, padding=True, truncation=True, return_tensors="pt"
            ).input_ids[0]
        )
        < MAX_QUESTION_LENGTH
    )

In [51]:
def get_question_column(data):
    data["question"] = data["prompt"].apply(lambda prompt: prompt.split("\nAnswer")[0])

In [116]:
def get_indices_groups(data):
    question_group = data.groupby("question")

    indices_groups = []

    # Iterate over each group and collect the indices
    for question, group_indices in question_group.groups.items():
        indices_groups.append(list(group_indices))

    return indices_groups

### Load & Preprocess Data

In [292]:
mrc_train_clean = pd.read_csv(f"../data/processed/easy_mrc_train.csv")
mrc_val_clean = pd.read_csv(f"../data/processed/easy_mrc_val.csv")

mrc_train_clean = mrc_train_clean[
    mrc_train_clean[PROMPT_COLUMN].apply(filter_based_on_question_lenght)
]
mrc_val_clean = mrc_val_clean[
    mrc_val_clean[PROMPT_COLUMN].apply(filter_based_on_question_lenght)
]

In [293]:
mrc_train_clean = mrc_train_clean.reset_index(drop=True)
mrc_val_clean = mrc_val_clean.reset_index(drop=True)
get_question_column(mrc_train_clean)
get_question_column(mrc_val_clean)

In [295]:
len(mrc_train_clean)

20813

In [294]:
len(mrc_val_clean)

2742

### Filtering Functions

In [76]:
import multiprocessing.pool
import functools


def timeout(max_timeout):
    """Timeout decorator, parameter in seconds."""

    def timeout_decorator(item):
        """Wrap the original function."""

        @functools.wraps(item)
        def func_wrapper(*args, **kwargs):
            """Closure for function."""
            pool = multiprocessing.pool.ThreadPool(processes=1)
            async_result = pool.apply_async(item, args, kwargs)
            # raises a TimeoutError if execution exceeds max_timeout
            return async_result.get(max_timeout)

        return func_wrapper

    return timeout_decorator

In [194]:
def log_attempt_number(retry_state):
    """return the result of the last call attempt"""
    logging.error(f"Retrying: {retry_state.attempt_number}...")


@retry(wait=wait_random(min=10, max=20), after=log_attempt_number)
@timeout(600)
def convert_statement_with_backoff(messages, model):
    print(f"Calling API with {model}")
    x = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0.75,
        max_tokens=1,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )
    return x


def convert_statement(messages, model="gpt-3.5-turbo"):
    response = convert_statement_with_backoff(messages, model)
    return response

In [193]:
system_prompt = f"""I have an in-context question answering dataset. It features short passages of text (the context), and questions about them.  I will show you an example context and question and want to find out if the context is sufficient for answering the question.  Only answer "No" if some vital information is missing without which the question definitely cannot be answered. Answer "Yes" if an answer to the question can be deduced by reading the context and using common sense and logical reasoning. Answer "Yes" if there is more than one possible answer as long as one of them can be inferred from the context. Also answer "Yes" if you there is some missing information but it can be inferred using common sense. Answer "Yes" if there is a possible answer that is subjective. Answer "Yes" if the question refers to a person that isn't named in the context if it can be inferred who the person is. Answer "Yes" if the question asks for multiple things but only one of them can be inferred from the context."""

In [290]:
def generate_filtered_dataset(
    data,
    model="gpt-3.5-turbo",
    pre_prompt=system_prompt,
    n=-1,
    target_dir="../data/gpt-filtered",
    filename="easy_mrc_train_filtered",
):
    results = {"yes": [], "no": [], "neither": []}
    n_questions_kept = 0
    indices_groups = get_indices_groups(data)
    indices_groups = indices_groups[:n] if n != -1 else indices_groups
    filtered_df = pd.DataFrame(columns=[PROMPT_COLUMN, LABEL_COLUMN])
    for indices_group in indices_groups:
        question = mrc_train_clean["question"].iloc[indices_group[0]]
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": question},
        ]
        response = convert_statement(messages, model=model)
        can_be_answered = response["choices"][0]["message"]["content"]
        can_be_answered = can_be_answered.lower().replace(" ", "")
        if can_be_answered == "no":
            results["no"].append(indices_group[0])
        elif can_be_answered == "yes":
            rows = data.loc[indices_group, [PROMPT_COLUMN, LABEL_COLUMN]]
            filtered_df = pd.concat((filtered_df, rows))
            filtered_df.to_csv(f"{target_dir}/{filename}.csv", index=False)
            n_questions_kept += 1
            results["yes"].append(indices_group[0])
        else:
            results["neither"].append(indices_group[0])
            print("For the following question, the answer was neither 'Yes' nor 'No'.")
            print(question)
            print(f"Response: {can_be_answered}")

    print(f"Kept {n_questions_kept} questions.")

    return results

In [None]:
gtp_3results = generate_filtered_dataset(
    mrc_train_clean, filename="easy_mrc_train_filtered_gpt3-turbo"
)

In [266]:
extra_df = pd.DataFrame(columns=[PROMPT_COLUMN, LABEL_COLUMN])
indices_groups = get_indices_groups(mrc_train_clean)
for idx in gtp_3results["neither"]:
    for indices_group in indices_groups:
        if idx in indices_group:
            rows = mrc_train_clean.loc[indices_group, [PROMPT_COLUMN, LABEL_COLUMN]]
            extra_df = pd.concat((extra_df, rows))

In [276]:
get_question_column(extra_df)

In [277]:
len(get_indices_groups(extra_df))

269

In [None]:
gpt4_results = generate_filtered_dataset(
    extra_df, model="gpt-4", filename="easy_mrc_train_filtered_gpt4"
)

In [281]:
filtered_data_gpt3 = pd.read_csv(
    "../data/gpt-filtered/easy_mrc_train_filtered_gpt3-turbo.csv"
)

In [282]:
filtered_data_gpt4 = pd.read_csv(
    "../data/gpt-filtered/easy_mrc_train_filtered_gpt4.csv"
)

In [284]:
len(mrc_train_clean)

20813

In [283]:
len(filtered_data_gpt3) + len(filtered_data_gpt4)

18519

In [287]:
filtered_data = pd.concat((filtered_data_gpt3, filtered_data_gpt4), ignore_index=True)

In [289]:
filtered_data.to_csv("../data/gpt-filtered/easy_mrc_train_filtered.csv", index=False)

In [296]:
gtp3_results_eval = generate_filtered_dataset(
    mrc_val_clean, filename="easy_mrc_val_filtered_gpt3-turbo"
)

Calling API with gpt-3.5-turbo
Calling API with gpt-3.5-turbo
Calling API with gpt-3.5-turbo
Calling API with gpt-3.5-turbo
For the following question, the answer was neither 'Yes' nor 'No'.
Context: He was cook at the Anthony House. It was the first real fine hotel in Little Rock. When father went there to be head cook, all they had to cook on was big fireplaces and the big old Dutch ovens. Father just kept on telling about the stoves they had in Virginia, and at last they sent and got him one ; it had to come by boat and took a long time.
Question: Who went to the first fine Hotel in Little Rock to be the head cook and told others about cook stoves so they brought one in?
Response: the
Calling API with gpt-3.5-turbo
Calling API with gpt-3.5-turbo
Calling API with gpt-3.5-turbo
Calling API with gpt-3.5-turbo
Calling API with gpt-3.5-turbo
Calling API with gpt-3.5-turbo
Calling API with gpt-3.5-turbo
Calling API with gpt-3.5-turbo
Calling API with gpt-3.5-turbo
Calling API with gpt-3.5

In [297]:
extra_df_eval = pd.DataFrame(columns=[PROMPT_COLUMN, LABEL_COLUMN])
indices_groups = get_indices_groups(mrc_val_clean)
for idx in gtp3_results_eval["neither"]:
    for indices_group in indices_groups:
        if idx in indices_group:
            rows = mrc_val_clean.loc[indices_group, [PROMPT_COLUMN, LABEL_COLUMN]]
            extra_df_eval = pd.concat((extra_df_eval, rows))

In [301]:
get_question_column(extra_df_eval)

In [302]:
len(extra_df_eval)

251

In [None]:
gpt4_results_eval = generate_filtered_dataset(
    extra_df_eval, model="gpt-4", filename="easy_mrc_val_filtered_gpt4"
)

In [304]:
filtered_data_gpt3_eval = pd.read_csv(
    "../data/gpt-filtered/easy_mrc_val_filtered_gpt3-turbo.csv"
)

In [305]:
filtered_data_gpt4_eval = pd.read_csv(
    "../data/gpt-filtered/easy_mrc_val_filtered_gpt4.csv"
)

In [306]:
filtered_data_eval = pd.concat(
    (filtered_data_gpt3_eval, filtered_data_gpt4_eval), ignore_index=True
)

In [307]:
len(filtered_data_eval)

2422

In [308]:
filtered_data_eval.to_csv(
    "../data/gpt-filtered/easy_mrc_eval_filtered.csv", index=False
)