# Token math (ignore this section, just calculating cost)
* 261 tokens per task desc * 2 tasks per comparison = 522 tokens for task descriptions per request 
* 100 tokens for preamble + 260 tokens for reference task  
* 500 comparisons per task 
* 100 tasks 

* (261*2 + 100 + 260) * 500 * 100 ≈ 44 M tokens -> $440 only for token input (GPT-4), but $22 for GPT 3.5
* Only ask for winner (no rationales), as output is 3x as expensive and difficult to control, especially for rationales


In [None]:
import pandas as pd 
import numpy as np
import pickle 
from sklearn.preprocessing import LabelEncoder
import choix
import tiktoken

In [None]:
# Tokenizer used in the cells below to count tokens.
encoding = tiktoken.get_encoding("cl100k_base")

# Load the task table and build one combined text field per task:
# the stimulus description, a blank line, then the goal.
df_tasks = pd.read_csv("102_tasks_with_sources_clean.csv")
stimulus_text = "Task description: " + df_tasks["Stimulus Complex"]
goal_text = "Goal: " + df_tasks["Goal Directives"]
df_tasks["complete_desc"] = stimulus_text + "\n\n" + goal_text

In [None]:
# Show one combined description (row label 10) as a sanity check.
print(df_tasks.loc[10, "complete_desc"])

In [None]:
# Average number of tokens in a combined task description.
desc_token_counts = [len(encoding.encode(desc)) for desc in df_tasks["complete_desc"]]
np.mean(desc_token_counts)

In [None]:
# Load the dimensions table; its "Question Text" column is tokenized and printed below.
df_dimensions = pd.read_csv("24_dimensions_clean.csv")

In [None]:
# Total number of tokens across all dimension questions.
question_token_counts = [
    len(encoding.encode(question)) for question in df_dimensions["Question Text"]
]
np.sum(question_token_counts)

In [None]:
# Token count of a single condensed prompt paragraph, for comparison with the
# summed token count of the individual dimension questions computed above.
len(encoding.encode("""Assess the task considering its balance between physical and mental effort, including the role of creative thinking. 
Define its primary goals, whether planning, generating ideas, or resolving conflicts, and consider the nature of participation, such as diverse interests and collaboration efficiency. 
Evaluate the outcome's characteristics, like win/lose potential and precision objectives. 
Explore solution aspects, including uniqueness, demonstrability, and formal modeling potential. 
Finally, rate the task on its uncertainty, trade-off evaluation, and the clarity of its solution, from subjective judgment to objective logic."""))

In [None]:
# Print every dimension question, each followed by a blank line.
dimension_questions = df_dimensions["Question Text"]
for x in dimension_questions:
    print(x + "\n")

# Set up with Instructor 
* Dimension-specific queries don't seem to work 
* If you give it an "uncertain" option, 3.5 will use it very frequently 
* If just asked to compare tasks, 3.5 is practically useless. 4 gives much more relevant rationale. 

In [18]:
import json
import instructor
import asyncio
import pandas as pd 
from openai import AsyncOpenAI
from pydantic import BaseModel, Field, field_validator
from typing import List
from enum import Enum
import seaborn as sns 
import matplotlib.pyplot as plt 
import tiktoken
from itertools import combinations
import random

def _read_api_key(path: str) -> str:
    """Return the contents of the file at *path* with surrounding whitespace stripped.

    The file is opened in a ``with`` block so the handle is closed
    deterministically instead of being left for the garbage collector.
    """
    with open(path, "r") as key_file:
        return key_file.read().strip()


# Async OpenAI client, patched by instructor in TOOLS mode so that
# `chat.completions.create` accepts `response_model` / `max_retries`
# (both are passed by `classify`).
client = AsyncOpenAI(api_key=_read_api_key("./openai_pairwise_api"))
client = instructor.patch(client, mode=instructor.Mode.TOOLS)

# Caps the number of `classify` calls that can be in flight at once.
sem = asyncio.Semaphore(15)

In [19]:
# Task table plus a combined "description + goal" text field used in prompts.
df_tasks = pd.read_csv("102_tasks_with_sources_clean.csv")
stimulus_text = "Task description: " + df_tasks["Stimulus Complex"]
goal_text = "Goal: " + df_tasks["Goal Directives"]
df_tasks["complete_desc"] = stimulus_text + "\n\n" + goal_text

# Dimensions table.
df_dimensions = pd.read_csv("24_dimensions_clean.csv")

In [20]:
# How many (Task A, Task B) pairs are randomly sampled and sent for comparison.
N_COMPARISONS_PER = 50
# Index label (in df_tasks) of the reference task that each pair is compared against.
REF_INDEX = 14

## Set up Instructor objects 
* Adapted from https://github.com/jxnl/instructor/blob/main/examples/batch-classification/run.py

In [22]:
# You use this enum to define valid responses 

# Closed set of answers the model may give for one comparison.
# It is the type of `QuestionClassification.classification`, and its values
# are interpolated into that field's description.
class PairwiseChoice(Enum):
    A = "A"  # Task A is the more similar one
    B = "B"  # Task B is the more similar one

In [23]:
# You use this "question" object to define the overall structure (input/output) of the problem, and any validation you need to include 

class QuestionClassification(BaseModel):
    """
    You will be shown two tasks, and asked which of the two tasks (A or B) is most similar to a reference task in terms of the task mechanics, requirements, and properties of the solution. 
    A task is defined by its description and goal. 
    """

    # NOTE(review): the class docstring above and the field descriptions below
    # are presumably forwarded to the model as part of the response schema —
    # treat them as prompt text and edit with care.

    # Free-text reasoning behind the decision.
    chain_of_thought: str = Field(
        ..., description="The chain of thought that led to the decision."
    )
    # Exactly one PairwiseChoice value; the allowed values are interpolated
    # into the description from the enum, so they stay in sync with it.
    classification: PairwiseChoice = Field(
        description=f"An accurate and correct assessment of which task is most similar to the reference task. Only allowed answers: {[t.value for t in PairwiseChoice]}, should be used. ONLY CHOOSE ONE OF THE OPTIONS.",
    )

    # Disabled validator, kept for reference only. It wraps a bare value in a
    # list, which only makes sense for a list-valued field; enabling it as-is
    # would not fit the single-enum `classification` field declared above.
    # @field_validator("classification", mode="before")
    # def validate_classification(cls, v):
    #     # sometimes the API returns a single value, just make sure it's a list
    #     if not isinstance(v, list):
    #         v = [v]
    #     return v

In [36]:
# Here, you define the "instance specific" information that goes into each question, and the model endpoint


async def classify(data: tuple, model: str = "gpt-4") -> tuple:
    """
    Ask the model which of two tasks is most similar to a reference task.

    Args:
        data: Index labels into ``df_tasks["complete_desc"]``, in the order
            ``(reference task, Task A, Task B)``.
        model: Name of the chat model to query. Defaults to ``"gpt-4"``,
            the value that was previously hard-coded.

    Returns:
        A ``(data, QuestionClassification)`` pair. The input tuple is echoed
        back so that callers consuming results in completion order can tell
        which question each answer belongs to.
    """
    async with sem:  # some simple rate limiting
        response = await client.chat.completions.create(
            model=model,
            response_model=QuestionClassification,
            max_retries=2,
            messages=[
                {
                    "role": "user", "content": f"The reference task is:\n {df_tasks['complete_desc'][data[0]]}.",
                },
                {
                    "role": "user", "content": f"Task A is:\n {df_tasks['complete_desc'][data[1]]}.",
                },
                {
                    "role": "user", "content": f"Task B is:\n {df_tasks['complete_desc'][data[2]]}.",
                }
            ],
        )
    return data, response

In [37]:
# Handle asynchronicity and I/O 

async def main(
    questions: List[tuple], *, path_to_jsonl: str = None
) -> List[QuestionClassification]:
    """
    Run ``classify`` on every question concurrently and collect the answers.

    Args:
        questions: Tuples accepted by ``classify``.
        path_to_jsonl: Optional output path. When given, one JSON object per
            answer (keys ``question``, ``classification``,
            ``chain_of_thought``) is appended to the file as its own line.

    Returns:
        The ``QuestionClassification`` objects in completion order, which is
        generally not the order of ``questions``. Previously nothing was
        returned, so answers were lost when ``path_to_jsonl`` was not set.

    Raises:
        Any exception raised by ``classify`` propagates and stops the loop;
        lines already appended to ``path_to_jsonl`` remain on disk.
    """
    pending = [classify(question) for question in questions]
    labels = []
    for finished in asyncio.as_completed(pending):
        question, label = await finished
        labels.append(label)
        if path_to_jsonl:
            record = {
                "question": question,
                "classification": label.classification.value,
                "chain_of_thought": label.chain_of_thought,
            }
            # Append (and close) per answer so partial results survive a
            # failure later in the run.
            with open(path_to_jsonl, "a") as f:
                f.write(json.dumps(record) + "\n")
    return labels

## Set up questions and run async requests 

In [26]:
# Every unordered pair of non-reference tasks is a candidate (Task A, Task B).
candidate_indices = df_tasks.index.drop(REF_INDEX).tolist()
candidate_pairs = list(combinations(candidate_indices, 2))

# Draw the comparisons at random and prefix each pair with the reference index.
instructor_questions = [
    (REF_INDEX,) + pair
    for pair in random.sample(candidate_pairs, N_COMPARISONS_PER)
]

In [38]:
await main(questions=instructor_questions, path_to_jsonl="pairwise_comp_instructor_test_gpt4.json")

## Explore output

In [41]:
# Rationale recorded for the row labelled 2 in the earlier test run.
test_results = pd.read_json(path_or_buf="pairwise_comp_instructor_test.json", lines=True)
test_results["chain_of_thought"][2]

'Both tasks involve a goal where participants need to make specific decisions within a fixed amount of time. Task A requires participants to manipulate a grid to achieve symmetry with the fewest number of clicks, while Task B requires participants to discuss a given topic and submit a written summary within a time limit. The decision-making aspect and the time constraint are common factors in both tasks.'

In [43]:
# First rows of both result files, shown together for comparison.
test_head = pd.read_json(path_or_buf="pairwise_comp_instructor_test.json", lines=True).head()
gpt4_head = pd.read_json(path_or_buf="pairwise_comp_instructor_test_gpt4.json", lines=True).head()
test_head, gpt4_head

(       question classification  \
 0  [14, 47, 55]              A   
 1   [14, 0, 11]              A   
 2   [14, 1, 80]              A   
 3  [14, 58, 89]              A   
 4  [14, 71, 88]              A   
 
                                     chain_of_thought  
 0  Task A involves allocating resources among ran...  
 1  The reference task involves making a decision ...  
 2  Both tasks involve a goal where participants n...  
 3  Task A involves participants making decisions ...  
 4  The tasks involve participants working on repr...  ,
        question classification  \
 0  [14, 58, 89]              B   
 1  [14, 28, 45]              B   
 2  [14, 48, 62]              A   
 3   [14, 1, 80]              B   
 4   [14, 4, 61]              B   
 
                                     chain_of_thought  
 0  The reference task involves making a detailed ...  
 1  Both tasks revolve around problem-solving, but...  
 2  The reference task involves decision making wi...  
 3  Task A is a

In [40]:
# Rationale recorded for the row labelled 3 in the GPT-4 results file.
gpt4_results = pd.read_json(path_or_buf="pairwise_comp_instructor_test_gpt4.json", lines=True)
gpt4_results["chain_of_thought"][3]

'Task A is about symmetrical manipulations on a grid, which is a concrete task and involves spatial and numerical intelligence. Task B, on the other hand, is an open-ended discussion, which involves critical thinking skills, interpersonal communication, and the ability to structure and write an argument. Looking at the reference task, it can be seen that it also requires critical thinking skills, the ability to come up with an argument (in this case, a solution to a problem), and a precise way to express it. Therefore, the mechanics, requirements and solution properties of Task B are more similar to the reference task than Task A.'

In [28]:
# Full contents of the earlier test-run results file.
test_results = pd.read_json(path_or_buf="pairwise_comp_instructor_test.json", lines=True)
test_results

Unnamed: 0,question,classification,chain_of_thought
0,"[14, 47, 55]",A,Task A involves allocating resources among ran...
1,"[14, 0, 11]",A,The reference task involves making a decision ...
2,"[14, 1, 80]",A,Both tasks involve a goal where participants n...
3,"[14, 58, 89]",A,Task A involves participants making decisions ...
4,"[14, 71, 88]",A,The tasks involve participants working on repr...
5,"[14, 30, 72]",A,Task A involves participants making guesses ab...
6,"[14, 22, 98]",A,Task A involves making decisions in a social d...
7,"[14, 49, 74]",A,The reference task involves making a decision ...
8,"[14, 57, 64]",A,The reference task involves decision-making ba...
9,"[14, 67, 91]",A,The tasks involve visual perception and memory...


In [33]:
# Print the descriptions for row labels 14, 1 and 80, each followed by a divider line.
divider = "------------"
selected_descriptions = df_tasks.loc[[14, 1, 80], "complete_desc"]
for task in selected_descriptions:
    print(task + "\n" + divider)

Task description: Participants get several pages of description about a dilemma facing a racecar team, which involves deciding whether to go ahead with the race that would begin in the immediate future. The description mentions that the team has been experiencing a series of engine failures and that an engine failure during this race on national television will present a danger to the driver and team's sponsorship. However, if the team does well on the race, it will get a lucrative sponsorship deal.
Participants are given a chart with information about the temperatures during the last 7 engine failures, which shows a range of temperatures 53-75 degrees and a mean temperature of 64 degrees. The chart is misleading because it does not contain information about the air temperature when the car does NOT experience an engine failure (therefore, the information is biased).
Participants are also given instructions that mention that they can ask for any additional information during the task, 