In [1]:
import os
import re
import random
from openai import OpenAI
from dotenv import load_dotenv , find_dotenv

## Prepare questionnaires

In [2]:
# Input folder with the raw question files and output folder for the
# generated questionnaires (both relative to the notebook location).
RAW_DATA_DIR = '../data/raw/Training/'
PROCESSED_DATA_DIR = '../data/processed/Training/'

# Prompt templates. The *_INTRO strings are the fixed instruction headers for
# each phase; PH1Q2 / PH2Q2 contain `{}` placeholders filled with str.format
# by the prepare_* helpers defined further down.
PH1_INTRO = 'Phase 1: Generating Pairs\nTitle: Generate examples of a given relation\nDescription: Given an examples of related word pairs, create three new analogous examples.\nKeywords: analogy, English, word relations, semantics, similarity\nInstructions:\n(1) This survey will be used to better understand relations between words. Your input is much appreciated.\n(2) If any of the questions in a HIT are unanswered, then the assignment is no longer useful to us and we will be unable to pay for the assignment.\n(3) Skip a HIT if you do not know the meanings of the words.\n(4) Attempt HITs only if you are a native speaker of English or very fluent in English.\n(5) Certain check questions will be used to make sure your input is responsible and reasonable. HITs that fail these tests will be rejected. If you fail too many check questions, then it will be assumed that you are not following instructions (3) and/or (4) above, and ALL of your HITs will be rejected.\n(6) Question 1 asks you to identify the relation that is shared by a set of word pairs. For example, consider the following set of word pairs: traffic:street, water:riverbed, data:network, electricity:wire, hikers:trail. Each of these X:Y pairs shares the relation “X flows on/in/over/through a Y”.\n(7) Question 2 asks you to generate four more word pairs with the same relation as the pairs in Question 1. For example, effluent:sewer, blood:artery, oil:pipeline, gossip:grapevine.\n\nSample HIT:\n\n'
# Same layout as PH1_INTRO: "we will" spelled correctly in (2) and every
# numbered instruction, including (6), starts on its own line.
PH2_INTRO = 'Phase 2: Rating Pairs\nTitle: Select the most and least illustrative examples of a given relation\nDescription: Given an example word pair, rate other word pairs according to the similarity of their relations to the relation illustrated in the given example.\nKeywords: analogy, English, word relations, semantics, similarity, rating\nInstructions:\n(1) This survey will be used to better understand relations between words. Your input is much appreciated.\n(2) If any of the questions in a HIT are unanswered, then the assignment is no longer useful to us and we will be unable to pay for the assignment.\n(3) Skip a HIT if you do not know the meanings of the words.\n(4) Attempt HITs only if you are a native speaker of English or very fluent in English.\n(5) Certain check questions will be used to make sure your input is responsible and reasonable. HITs that fail these tests will be rejected. If you fail too many check questions, then it will be assumed that you are not following instructions (3) and/or (4) above, and ALL of your HITs will be rejected.\n(6) Question 1 asks you to identify the relation that is shared by a set of word pairs. For example, consider the following set of word pairs: traffic:street, water:riverbed, data:network, electricity:wire, hikers:trail. Each of these X:Y pairs shares the relation “X flows on/in/over/through a Y”.\n(7) Question 2 asks you to select from a set of four pairs the pairs that are most and least illustrative of a given relation.\n\nSample HIT:\n\n'
# One placeholder: the newline-separated example pairs.
PH1Q2 = 'Question 2: Consider the following word pairs:\n\n{}\n\nThese X:Y pairs share a relation, “X R Y ”. Give four additional word pairs that illustrate the same relation, in the same order (X on the left, Y on the right). Please do not use phrases composed of two or more words in your examples (e.g., “racing car”). Please do not use names of people, places, or things in your examples (e.g., “Europe”, “Kleenex”).\n\n'
# Two placeholders: the comma-separated example pairs, then the numbered
# candidate pairs to be rated.
PH2Q2 = 'Question 2: Consider the following word pairs:{}. These X:Y pairs share a relation, “X R Y”. Now consider the following word pairs:\n\n{}\n\nWhich of the above numbered word pairs is the MOST illustrative example of the same relation “X R Y”?\n____\nWhich of the above numbered word pairs is the LEAST illustrative example of the same relation “X R Y”?\n____\nNote: In some cases, a word pair might be in reverse order. For example, tree:forest is in reverse order for the relation "X is made from a collection of Y". The correct order would be forest:tree; a forest is made from a collection of trees. You should treat reversed pairs as BAD examples of the given relation.'

In [3]:
# Everything inside the raw training folder. os.listdir returns entries in
# arbitrary order, so sort them: later cells take element [0] of the lists
# derived here and that pick should be the same on every run and machine.
contains = sorted(os.listdir(RAW_DATA_DIR))

# Entries belonging to each phase, selected by substring match on the name.
ph1_dir = [name for name in contains if 'Phase1' in name]
ph2_dir = [name for name in contains if 'Phase2' in name]

# Of those, the entries whose name marks them as holding question files.
ph1_questions_dir = [name for name in ph1_dir if 'Questions' in name]
ph2_questions_dir = [name for name in ph2_dir if 'Questions' in name]

In [4]:
# Full path of the questions folder for each phase. Only the first match is
# used; an empty match list raises IndexError here.
ph1_questions_dir_path = os.path.join(RAW_DATA_DIR, ph1_questions_dir[0])
ph2_questions_dir_path = os.path.join(RAW_DATA_DIR, ph2_questions_dir[0])

In [5]:
# File names inside each questions folder. Sorted because os.listdir gives no
# ordering guarantee, and both the `[0]` sample pick and the iteration order
# in later cells should be reproducible.
ph1_questions_files = sorted(os.listdir(ph1_questions_dir_path))
ph2_questions_files = sorted(os.listdir(ph2_questions_dir_path))

In [6]:
# Path of one sample question file per phase (the first one listed); these
# are the files read into question_1 / question_2 by the following cells.
ph1_question_file = os.path.join(ph1_questions_dir_path, ph1_questions_files[0])
ph2_question_file = os.path.join(ph2_questions_dir_path, ph2_questions_files[0])

In [7]:
# Read the sample Phase 1 question file into a single string.
with open(ph1_question_file) as handle:
    question_1 = handle.read()

In [8]:
# Read the sample Phase 2 question file into a single string.
with open(ph2_question_file) as handle:
    question_2 = handle.read()

In [9]:
def _load_questions(dir_path, file_names):
    """Read every question file in `dir_path` into a dict keyed by its id.

    The id is the first `<digits><lowercase letter>` token found in the file
    name (for example `1a`). Names without such a token - e.g. hidden or
    system files that os.listdir also returns - are skipped rather than
    aborting the whole load with an IndexError.

    Parameters:
        dir_path: folder that contains the files.
        file_names: names (not paths) of the files inside `dir_path`.

    Returns:
        dict mapping id -> full text of the file. If two names yield the same
        id, the later one in `file_names` wins.
    """
    questions = {}
    for file_name in file_names:
        ids = re.findall('[0-9]+[a-z]', file_name)
        if not ids:
            # Not a question file (no relation id in its name).
            continue
        with open(os.path.join(dir_path, file_name), 'r') as f:
            questions[ids[0]] = f.read()
    return questions


# Raw text of every Phase 1 / Phase 2 question file, keyed by relation id.
questions_phase_1 = _load_questions(
    os.path.join(RAW_DATA_DIR, ph1_questions_dir[0]), ph1_questions_files)
questions_phase_2 = _load_questions(
    os.path.join(RAW_DATA_DIR, ph2_questions_dir[0]), ph2_questions_files)

In [16]:
def prepare_phase_1_question_1(str_file):
    """Turn the raw text of a Phase 1 question file into the "Question 1" block.

    Steps performed on `str_file`:
      * everything from "Correct Answer" (and the newlines before it) to the
        end of the text is removed;
      * every "Question..." heading up to the end of its line is removed and
        the text is re-labelled with a leading "Question 1: ";
      * the text is split at the single '?\\n\\n' separator into the question
        stem and the answer options, one option per line;
      * the options are numbered "(1) ...", "(2) ...", and so on.

    Returns the stem, '?', a blank line, the numbered options and a trailing
    blank line as one string.

    Raises ValueError if the text does not contain exactly one '?\\n\\n'.
    """
    # Strip the answer key so it never reaches the questionnaire.
    without_answer = re.sub('\n*Correct Answer(.|\n)*', '', str_file)

    # Replace the original heading with a uniform "Question 1: " label.
    body = 'Question 1: ' + re.sub('Question.*\n*', '', without_answer)

    # Stem before the separator, one answer option per line after it.
    stem, raw_options = body.split('?\n\n')
    numbered = [
        f'({number}) {option}'
        for number, option in enumerate(raw_options.split('\n'), start=1)
    ]

    return stem + '?\n\n' + '\n'.join(numbered) + '\n\n'


def prepare_phase_1_question_2(str_file):
    """Build the Phase 1 "Question 2" block from a raw question file.

    Every `word:word` token made of lowercase ASCII letters found anywhere in
    `str_file` is collected, in order of appearance, and inserted one per
    line into the PH1Q2 template.

    Returns the formatted template as a string.
    """
    pair_lines = '\n'.join(re.findall('[a-z]+:[a-z]+', str_file))
    return PH1Q2.format(pair_lines)

def prepare_phase_2_question_1(str_file):
    """Build the "Question 1" block of the Phase 2 questionnaire.

    Phase 2 shows the same first question as Phase 1, so this simply
    delegates to prepare_phase_1_question_1.

    This function used to be defined under the name
    prepare_phase_2_question_2, which the two-argument function of that name
    defined right after it silently replaced, leaving this helper unreachable.

    Parameters:
        str_file: raw text of a Phase 1 question file.

    Returns:
        The formatted question as a string.
    """
    return prepare_phase_1_question_1(str_file)

def prepare_phase_2_question_2(str_file_1, str_file_2):
    """Build the Phase 2 "Question 2" (rating) block.

    Parameters:
        str_file_1: raw text of the Phase 1 question file; every lowercase
            `word:word` token in it becomes one of the example pairs.
        str_file_2: raw text of the Phase 2 question file; each non-blank
            line is treated as one comma-separated set of candidate pairs
            (presumably four per line - the count is not checked here).

    Returns:
        The PH2Q2 template filled with the comma-joined example pairs and one
        randomly chosen candidate set, numbered "(1) ...", "(2) ...", etc.

    Raises:
        ValueError: if `str_file_2` has no non-blank line to choose from.
    """
    # Example pairs illustrating the relation, shown inline.
    query_pairs = ', '.join(re.findall('[a-z]+:[a-z]+', str_file_1))

    # Ignore blank lines (such as the empty string after a trailing newline)
    # so that random.choice can never return an empty candidate set.
    candidate_lines = [
        line.strip() for line in str_file_2.split('\n') if line.strip()
    ]
    if not candidate_lines:
        raise ValueError('str_file_2 contains no answer lines to choose from')

    # Pick one candidate set at random and number its pairs.
    chosen = random.choice(candidate_lines)
    numbered = [
        f'({number}) {pair.strip()}'
        for number, pair in enumerate(chosen.split(','), start=1)
    ]

    return PH2Q2.format(query_pairs, '\n'.join(numbered))

In [17]:
# Make sure the output folder exists before writing into it.
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# One questionnaire file per relation id that has a Phase 2 question file.
for idx in questions_phase_2.keys():
    ph1q1 = prepare_phase_1_question_1(questions_phase_1[idx])
    ph1q2 = prepare_phase_1_question_2(questions_phase_1[idx])
    ph2q2 = prepare_phase_2_question_2(questions_phase_1[idx],
                                       questions_phase_2[idx])

    # Phase 1 part: intro + Question 1 + "generate pairs" Question 2.
    # Phase 2 part: intro + the same Question 1 + the "rate pairs" Question 2.
    # (The Phase 2 part previously repeated ph1q2, so ph2q2 was never used.)
    questionnaire = PH1_INTRO + ph1q1 + ph1q2 + PH2_INTRO + ph1q1 + ph2q2

    with open(os.path.join(PROCESSED_DATA_DIR, f'Questionnaire-{idx}.txt'), 'w') as f:
        f.write(questionnaire)

## Send requests

In [7]:
# Load environment variables from the .env file located by find_dotenv();
# that file is expected to hold your OpenAI API key, which keeps the key out
# of the notebook itself.
_ = load_dotenv(find_dotenv())

In [13]:
# API client used by send_request below.
# NOTE(review): no key is passed explicitly - presumably it is taken from the
# environment populated by the load_dotenv cell; confirm.
client = OpenAI()

In [28]:
# Load the generated questionnaire for relation 1a as the request payload.
questionnaire_1a_path = os.path.join(PROCESSED_DATA_DIR, 'Questionnaire-1a.txt')
with open(questionnaire_1a_path) as handle:
    questionnaire = handle.read()

In [29]:
def send_request(message, model="gpt-3.5-turbo"):
    """Send `message` as a single user turn and return the model's reply text.

    Parameters:
        message: text placed in the `content` of one "user" chat message.
        model: name of the chat model to query; defaults to the model this
            notebook originally hard-coded.

    Returns:
        The content of the first returned choice with surrounding whitespace
        stripped, or an empty string when the reply carries no content.

    Uses the module-level `client`; errors raised by the client call are not
    caught here.
    """
    body = [{"role": "user", "content": message}]
    response = client.chat.completions.create(model=model, messages=body)

    # The v1 client returns a message object, not a dict, so the text must be
    # read through the `.content` attribute. It may be None.
    content = response.choices[0].message.content
    return (content or "").strip()

In [30]:
# Send the loaded 1a questionnaire to the model; the returned reply text is
# shown as the cell output.
# NOTE: needs available API quota - the recorded run of this cell ended in
# RateLimitError (429, insufficient_quota).
send_request(questionnaire)

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}