# Ask GPT-4
In this notebook, we form the questions to GPT-4 in accordance to test dataset. Further we send the questions and save the answers for further evaluation.

In [1]:
import os
import re
import pandas as pd
import openai
import csv
from collections import defaultdict
from dotenv import load_dotenv, find_dotenv

In [14]:
PH2ANS_DIR = '../data/raw/Testing/Phase2Answers' # test questions and answers provided by humans
PH1QUE_DIR = '../data/raw/Testing/Phase1Questions' # files containing illustrative examples for each subcategory
GPTQUE_DIR = '../data/processed' # processed data for creating questions to GPT-4
GPTANS_DIR = '../data/interim' # answers obtained from GPT-4

# We consider only small subset of whole
# dataset since utilizing all data is costly
SELECTED_IDXS = ['2g', '2a', '10b', '7f', '4d', '4b', '1c', '6d']

Q_TEMPLATE = ("In this line, based on the pairs provided, choose among them the least illustrative "
              "and the most illustrative representation for the kind of relation demonstrated by these examples: {}. "
              "The output should be these four pairs "
              "and the least illustrative and the most illustrative as the 5th and 6th column, accordingly. "
              "The output should be written in one line, 6 pairs overall in the following format: "
              "pair1, pair2, pair3, pair4, least_illustrative, most_illustrative. "
              "And that's it, no brackets, no quotes, nothing else, it must be in this format. "
              "Do not include the examples of the relation in your output! They just provide you examples of a certain relation "
              "and based on this relation (that you infer from these examples) you need to choose the least and most illustrative among the following 4 pairs: ")

In [15]:
def read_txt_info(path):
    with open(path, 'r') as f:
        info = f.read()
    return info


def read_hum_answers(path):
    df = pd.read_csv(path, sep='\t')
    df = df.rename({'# pair1': 'pair1'}, axis='columns')
    return df


def write_answers(responses, path):
    with open(path, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t')
        for response in responses:
            writer.writerow([response])

In [16]:
def ask_gpt4(instruction, options):

    response = None
    message = [{"role": "system", "content": instruction},
               {"role": "user", "content": options}]

    # Make API calls for each line
    chat_completion = openai.ChatCompletion.create(
        model="gpt-4",
        messages=message
    )
    # Extracting the assistant's response
    assistant_message = chat_completion['choices'][0]['message']
    if assistant_message['role'] == 'assistant':
        response = assistant_message['content']

    return response

## Download the dataset

In [12]:
!wget -O ../data/raw/files.zip "https://drive.usercontent.google.com/u/0/uc?id=0BzcZKTSeYL8VX3JvVGkyMGlBNXM&export=download&resourcekey=0-s590MqFTmRTS4RSNZSgtcg"
!unzip ../data/raw/files.zip  -d ../data/raw/
!rm ../data/raw/files.zip

--2024-02-26 18:55:57--  https://drive.usercontent.google.com/u/0/uc?id=0BzcZKTSeYL8VX3JvVGkyMGlBNXM&export=download&resourcekey=0-s590MqFTmRTS4RSNZSgtcg
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 216.58.204.225, 2a00:1450:4002:415::2001
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|216.58.204.225|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://drive.usercontent.google.com/uc?id=0BzcZKTSeYL8VX3JvVGkyMGlBNXM&export=download&resourcekey=0-s590MqFTmRTS4RSNZSgtcg [following]
--2024-02-26 18:55:58--  https://drive.usercontent.google.com/uc?id=0BzcZKTSeYL8VX3JvVGkyMGlBNXM&export=download&resourcekey=0-s590MqFTmRTS4RSNZSgtcg
Reusing existing connection to drive.usercontent.google.com:443.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=0BzcZKTSeYL8VX3JvVGkyMGlBNXM&export=download&resourcekey=0-s590MqFTmRTS4RSNZSgtcg [following]
--202

## Read and prepare the data

In [17]:
lst_files = os.listdir(PH2ANS_DIR)

In [18]:
lst_data_dfs = []
pairs_desc_subcat = {}

for file_name in lst_files:

    # reading answers provived by humans
    # on the questions related to
    # current subcategory
    idx = re.findall('[0-9]+[a-z]', file_name)[0]
    ans_name = f"Phase2Answers-{idx}.txt"
    ans_path = os.path.join(PH2ANS_DIR, ans_name)

    df = read_hum_answers(ans_path)
    df['file_idx'] = idx
    lst_data_dfs.append(df)

    # reading pairs describing
    # current subcategory
    q_name = f"Phase1Questions-{idx}.txt"
    q_path = os.path.join(PH1QUE_DIR, q_name)
    q_txt = read_txt_info(q_path)

    # remove correct answer
    q_pairs = re.findall('[a-z]+:[a-z]+', q_txt)
    q_pairs = ', '.join(q_pairs)
    pairs_desc_subcat[idx] = q_pairs

data = pd.concat(lst_data_dfs).reset_index(drop=True)

In [19]:
# extracting small subset

data = data[data['file_idx'].isin(SELECTED_IDXS)]
pairs_desc_subcat = {idx: pairs for idx, pairs in
                     pairs_desc_subcat.items()
                     if idx in SELECTED_IDXS}

In [20]:
data.head()

Unnamed: 0,pair1,pair2,pair3,pair4,least_illustrative,most_illustrative,user_selected_relation,file_idx
5480,high:low,around:through,under:over,root:tip,around:through,high:low,X is the opposite direction from Y,4d
5481,high:low,around:through,under:over,root:tip,around:through,high:low,X is the opposite direction from Y,4d
5482,high:low,around:through,under:over,root:tip,around:through,high:low,X is the opposite direction from Y,4d
5483,high:low,around:through,under:over,root:tip,around:through,high:low,X is the opposite direction from Y,4d
5484,ahead:behind,in:out,tip:root,interior:exterior,tip:root,ahead:behind,X is the opposite direction from Y,4d


In [21]:
# remove duplicates, leave unique options only
# we need at least one human answer for each question
# in order to formulate the questions fo GPT4

options_df = data[['file_idx', 'pair1', 'pair2', 'pair3', 'pair4']]
options_df = options_df.drop_duplicates()
options_df['describing_pairs'] = options_df['file_idx'].\
    map(pairs_desc_subcat)

In [22]:
options_df.head()

Unnamed: 0,file_idx,pair1,pair2,pair3,pair4,describing_pairs
5480,4d,high:low,around:through,under:over,root:tip,"front:back, left:right, east:west, before:afte..."
5484,4d,ahead:behind,in:out,tip:root,interior:exterior,"front:back, left:right, east:west, before:afte..."
5488,4d,around:through,backward:forward,north:south,interior:exterior,"front:back, left:right, east:west, before:afte..."
5493,4d,high:low,above:below,backward:forward,stop:go,"front:back, left:right, east:west, before:afte..."
5496,4d,top:bottom,boring:interesting,sad:happy,interior:exterior,"front:back, left:right, east:west, before:afte..."


In [23]:
options_df.to_csv(os.path.join(GPTQUE_DIR, 'questions_to_gpt4.csv'),
                  index=False)

## Sending the questions to GPT-4

In [37]:
# this loads your open api key from .env file
_ = load_dotenv(find_dotenv())
openai.api_key = os.environ["OPENAI_API_KEY"]

In [24]:
# receive GPT-4 answers
responses = defaultdict(list)
for line in options_df.iterrows():
    idx = line[1]['file_idx']
    describing_pairs = line[1]['describing_pairs']
    instruction = Q_TEMPLATE.format(describing_pairs)
    options = " ".join([line[1]['pair1'],
                        line[1]['pair2'],
                        line[1]['pair3'],
                        line[1]['pair4']])
    response = ask_gpt4(instruction, options)
    responses[idx].append(response)

In [None]:
# save the answers
for idx, subcat_responces in responses.items():
    file_name = f"Phase2Answers-{idx}.txt"
    file_path = os.path.join(GPTANS_DIR, file_name)
    write_answers(subcat_responces, file_path)