In [1]:
import os
import re
import random
import pandas as pd
from openai import OpenAI
from dotenv import load_dotenv , find_dotenv

In [71]:
PH2ANS_DIR = '../data/raw/Testing/Phase2Answers'
PH1QUE_DIR = '../data/raw/Testing/Phase1Questions'

Q_TEMPLATE = 'Consider the following word pairs: {}. These X:Y pairs share a relation, “X R Y”. Now consider the following word pairs:\n\n{}\n\nWhich of the above numbered word pairs is the MOST illustrative example of the same relation “X R Y”? Which of the above numbered word pairs is the LEAST illustrative example of the same relation “X R Y”?'
Q_BATCH_TEMPLATE = 'Consider the following word pairs: {}. These X:Y pairs share a relation, “X R Y”. Now consider the following strings which consist of 4 word pairs:\n\n{}\n\nFor each string, which of the numbered word pairs is the MOST illustrative example of the same relation “X R Y”? For each string, which of the numbered word pairs is the LEAST illustrative example of the same relation “X R Y”?'
BATCH_SIZE = 20

## Read data

In [38]:
lst_files = os.listdir(PH2ANS_DIR)

# select the 1st file
file_name = lst_files[0]
idx = re.findall('[0-9]+[a-z]', file_name)[0]

In [36]:
file_name

'Phase2Answers-6g.txt'

In [39]:
idx

'6g'

In [31]:
data = pd.read_csv(os.path.join(PH2ANS_DIR, file_name), sep='\t')
data = data.rename({'# pair1': 'pair1'}, axis='columns')

In [32]:
data.head()

Unnamed: 0,pair1,pair2,pair3,pair4,least_illustrative,most_illustrative,user_selected_relation
0,listen:loudly,watch:blindly,run:slowly,piece:bunch,piece:bunch,listen:loudly,X cannot be done in a Y manner
1,listen:loudly,watch:blindly,run:slowly,piece:bunch,piece:bunch,watch:blindly,X cannot be done in a Y manner
2,listen:loudly,watch:blindly,run:slowly,piece:bunch,piece:bunch,listen:loudly,X cannot be done in a Y manner
3,listen:loudly,watch:blindly,run:slowly,piece:bunch,piece:bunch,listen:loudly,X cannot be done in a Y manner
4,listen:loudly,watch:blindly,run:slowly,piece:bunch,listen:loudly,watch:blindly,X cannot be done in a Y manner


In [14]:
relation = data.user_selected_relation.unique()[0]

In [76]:
# remove duplicates, leave unique options only

options_df = data[['pair1', 'pair2', 'pair3', 'pair4']]
options_df = options_df.drop_duplicates()

for i, colname in enumerate(options_df.columns):
    options_df[colname] = options_df[colname].apply(lambda x: f"({i + 1}) {x}")

In [78]:
options_df.head()

Unnamed: 0,pair1,pair2,pair3,pair4
0,(1) listen:loudly,(2) watch:blindly,(3) run:slowly,(4) piece:bunch
5,(1) peace:war,(2) whisper:loudly,(3) fight:delicately,(4) yell:calmly
10,(1) kill:safely,(2) shine:dimly,(3) knowingly:guess,(4) glare:nicely
15,(1) defiantly:cooperate,(2) fail:successfully,(3) run:slowly,(4) obliterate:build
20,(1) guess:knowingly,(2) sleep:alertly,(3) fight:delicately,(4) speed:slowly


In [45]:
# read examples for the question

q_name = f"Phase1Questions-{idx}.txt"
q_path = os.path.join(PH1QUE_DIR, q_name)
with open(q_path, 'r') as f:
    question = f.read()

# remove correct answer
query_pairs = re.findall('[a-z]+:[a-z]+', question)
query_pairs = ', '.join(query_pairs)

In [46]:
query_pairs

'creep:fast, fade:abruptly, scream:quietly, destroy:gently'

## Formulate and send the questions

In [79]:
# 1 question per request

for row in options_df.iterrows():
    options = row[1].values.tolist()
    options = '\n'.join(options)

    req_body = Q_TEMPLATE.format(query_pairs, options)

    # TODO: send the questions to GPT-4

In [91]:
# batch per request

for i in range(0, len(options_df), BATCH_SIZE):
    batch_options = options_df.iloc[i:i + BATCH_SIZE].values.tolist()

    batch_options = [', '.join(options) for options in batch_options]
    batch_options = ';\n'.join(batch_options)

    req_body = Q_BATCH_TEMPLATE.format(query_pairs, batch_options)

    # TODO: send the batch questions to GPT-4