In [None]:
import sys
import os

# Show the working directory; the relative paths used in this notebook
# ("./experiment_data/...", "./results/", "saved_states/...") resolve against it.
print(os.getcwd())
# Absolute path of the current working directory (abspath("") resolves to the cwd).
notebook_dir = os.path.abspath("")
# Make the parent directory importable.
# NOTE(review): presumably needed so `from utils import *` can be resolved — confirm.
sys.path.append('..')
import argparse
import json
import pickle
import os
import pandas as pd
import numpy as np
import math
import re
import matplotlib.pyplot as plt
from pandas.plotting import table
from utils import *

In [2]:
# Question tables; print_data looks up the question text in them by id.
target_questions = pd.read_csv("./experiment_data/target_questions.csv")

context_questions = pd.read_csv("./experiment_data/generated_context_questions.csv")


def _load_result_csvs(suffix, excluded_path_parts=(), results_dir="./results/"):
    """Read and stack every CSV below ``results_dir`` whose name ends with ``suffix``.

    Parameters
    ----------
    suffix : str
        File-name ending a file must have to be loaded.
    excluded_path_parts : iterable of str
        A file is skipped when any of these substrings occurs in its full path.
    results_dir : str
        Directory that is walked recursively.

    Returns
    -------
    pandas.DataFrame
        All matching files concatenated in ``os.walk`` order with a fresh
        integer index, or an empty DataFrame when no file matches.
    """
    frames = []
    for root, _dirs, files in os.walk(results_dir):
        for file_name in files:
            if not file_name.endswith(suffix):
                continue
            file_path = os.path.join(root, file_name)
            if any(part in file_path for part in excluded_path_parts):
                continue
            frames.append(pd.read_csv(file_path))

    # Concatenate once: growing a DataFrame inside the loop recopies all
    # previously loaded rows for every additional file.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)


# Target responses, leaving out runs whose path mentions one of the excluded names.
target_data = _load_result_csvs(
    "target_responses.csv",
    excluded_path_parts=("llama_context", "gemini", "assumptions"),
)

print(target_data.shape, target_data.columns)

# Context responses (no path exclusions).
context_data = _load_result_csvs("simple_responses.csv")

print(context_data.shape, context_data.columns)

(56000, 9) Index(['target_id', 'sample', 'model', 'context', 'context_prompt',
       'context_bias', 'target_prompt', 'experiment_type', 'response'],
      dtype='object')
(2600, 8) Index(['target_id', 'sample', 'model', 'context', 'context_prompt',
       'context_bias', 'experiment_type', 'response'],
      dtype='object')


# Preprocessing

In [3]:
# Restore answer_df and conversion_df from pickles stored under saved_states/.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
answer_df = pd.read_pickle("saved_states/processed_answers.pkl")
conversion_df = pd.read_pickle("saved_states/conversions.pkl")

In [4]:
# Sanity check: print the shape and the first two rows of answer_df.
print(answer_df.shape, answer_df.head(2))

(56000, 11)    target_id  sample          model      context context_prompt  \
0          0       0  gpt-3.5-turbo  gpt-4-turbo         simple   
1          0       1  gpt-3.5-turbo  gpt-4-turbo         simple   

    context_bias target_prompt experiment_type        number  \
0  answer_anchor    onlyanswer        decrease  3.000000e+06   
1  answer_anchor    onlyanswer        decrease  3.000000e+09   

   order_of_magnitude comment  
0                 6.0    None  
1                 9.0    None  


## Filter out unusable answers

In [5]:
# Rows whose "comment" is set to anything other than "unit_converted".
comment_col = answer_df["comment"]
no_answer = answer_df[comment_col.notnull() & (comment_col != "unit_converted")]
print(len(no_answer))

# One "<comment> <row count>" line per comment category.
no_answer_cats = no_answer.groupby("comment")
for category, row_count in no_answer_cats.size().items():
    print(category, row_count)

1129
no_answer 293
no_concrete_answer 306
percentage 14
range 512
unit_not_convertible 4


In [6]:
# Keep every row of answer_df whose index label is not among the flagged rows.
flagged_mask = answer_df.index.isin(no_answer.index)
filtered_answers = answer_df[~flagged_mask]

len(filtered_answers)

54871

## Group values by question setting combination

In [7]:
# Restore the per-setting result tables from pickles stored under saved_states/.
# NOTE(review): judging by the file names, oom_df holds order-of-magnitude
# statistics and num_df the plain-number ones — confirm.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
oom_df = pd.read_pickle("saved_states/results_oom_df.pkl")
num_df = pd.read_pickle("saved_states/results_num_df.pkl")

## Print single items

In [8]:
def _select_setting(df, model, target_id, context_bias, experiment_type, target_prompt=None):
    """Return the rows of ``df`` that belong to one question-setting combination.

    ``target_prompt`` is only applied when it is given, because
    ``context_data`` is filtered without a ``target_prompt`` condition.
    """
    mask = (
        (df["model"] == model)
        & (df["target_id"] == target_id)
        & (df["context_bias"] == context_bias)
        & (df["experiment_type"] == experiment_type)
    )
    if target_prompt is not None:
        mask = mask & (df["target_prompt"] == target_prompt)
    return df[mask]


def _ranked_answers_and_response(model, target_id, context_bias, experiment_type, prompt, sample):
    """Collect stats, value-sorted answers and one raw response for a setting.

    Returns a tuple ``(stats, rank, sample_id, answers, response_row)`` where
    ``answers`` is sorted by "number" and re-indexed from 0, ``rank`` is the
    position picked in that sorted table (``sample`` if given, otherwise half
    of the setting's "num_valid_answers"), and ``response_row`` is the
    ``target_data`` row of the sample found at that rank.

    Raises IndexError when a table has no row for the setting, KeyError when
    ``rank`` is outside the sorted answers, and ValueError when
    "num_valid_answers" cannot be converted to an int.
    """
    setting = (model, target_id, context_bias, experiment_type, prompt)

    stats = _select_setting(oom_df, *setting).iloc[0]
    answers = (
        _select_setting(answer_df, *setting)
        .sort_values(by="number")
        .reset_index(drop=True)
    )

    rank = sample if sample is not None else int(stats["num_valid_answers"] / 2)
    # After reset_index the label equals the position in the sorted table.
    sample_id = answers.loc[rank, "sample"]

    responses = _select_setting(target_data, *setting)
    response_row = responses[responses["sample"] == sample_id].iloc[0]
    return stats, rank, sample_id, answers, response_row


def print_data(model, target_id, context, exp_type, prompt, sample=None):
    """Print the baseline and the biased-context results for one target question.

    Parameters
    ----------
    model, context, exp_type, prompt : int
        Keys into the module-level ``models``, ``context_biases``,
        ``exp_types`` and ``prompts`` dicts.
    target_id : int
        Id of the target question (matched against ``target_questions["id"]``).
    sample : int, optional
        Position in the value-sorted answers whose raw response is printed.
        Note that this is a rank in the sorted table, not a sample id.
        Defaults to half of the setting's "num_valid_answers".

    The baseline (context_bias "none", experiment_type "neutral") is always
    printed. Afterwards the experiment selected by ``exp_type`` is printed;
    "both" prints every experiment in the loop below. An experiment whose data
    is incomplete is reported with a one-line notice and skipped.

    Raises
    ------
    KeyError
        If one of the integer keys is unknown, or ``sample`` is outside the
        baseline answers.
    IndexError
        If the target question or the baseline setting has no data.
    """
    model = models[model]
    context = context_biases[context]
    exp_type = exp_types[exp_type]
    prompt = prompts[prompt]

    print("EXPERIMENT", target_id, model, context, prompt, exp_type)

    question_text = target_questions[target_questions["id"] == target_id].iloc[0]["question"]
    print("TARGET QUESTION", question_text)

    baseline_stats, _, _, baseline_answers, baseline_response = _ranked_answers_and_response(
        model, target_id, "none", "neutral", prompt, sample
    )

    print("BASELINE -------------------------------------------")
    print(baseline_stats[["50%", "unique", "std", "target_id"]])
    print(baseline_answers[["sample", "number", "order_of_magnitude"]])
    print(baseline_response["response"])

    for exp in ["decrease", "increase", "neutral"]:
        # Skip experiments that were not asked for before doing any lookup.
        if exp_type != "both" and exp != exp_type:
            continue

        try:
            context_question = context_questions[
                (context_questions["target_id"] == target_id)
                & (context_questions["bias"] == context)
                & (context_questions["experiment_type"] == exp)
            ].iloc[0]["question"]

            exp_stats, exp_rank, exp_sample_id, exp_answers, exp_response = (
                _ranked_answers_and_response(model, target_id, context, exp, prompt, sample)
            )

            exp_context = _select_setting(context_data, model, target_id, context, exp).iloc[0]
        except (IndexError, KeyError, ValueError) as err:
            # Best effort: not every context/experiment combination has data.
            # Report the gap instead of hiding it, then carry on with the next one.
            print(f"{exp} -- skipped, data incomplete ({type(err).__name__}: {err})")
            continue

        print(exp_rank, exp_sample_id)
        print(exp + " -------------------------------------------")
        print(exp_stats[["50%", "unique", "std", "target_id"]])
        print(exp_answers[["sample", "number", "order_of_magnitude"]])
        print(context_question)
        print(exp_context["response"])
        print(exp_response["response"])

# Integer shorthand -> label lookups used by print_data.
# Keys are the positions in each list, starting at 0.
exp_types = dict(enumerate([
    "decrease",
    "increase",
    "both",
    "neutral",
]))

context_biases = dict(enumerate([
    "general",
    "question_anchor",
    "answer_anchor",
    "implicit_confirmation",
    "explicit_confirmation",
    "availability",
    "control",
]))

models = dict(enumerate([
    "gpt-3.5-turbo",
    "gpt-4-turbo",
    "gpt-4o",
    "Llama-3-70b-chat-hf",
]))

prompts = dict(enumerate([
    "onlyanswer",
    "reasoning",
]))

In [9]:
# Example: one setting, addressed through the integer keys of the lookup dicts.
print_data(
    model=2,
    target_id=48,
    context=6,
    exp_type=2,
    prompt=1,
    sample=4,
)

EXPERIMENT 48 gpt-4o control reasoning both
TARGET QUESTION How much would a newly-discovered unreleased Beatles album sell for?


BASELINE -------------------------------------------
50%               6.0
unique              3
std          0.471405
target_id          48
Name: 4151, dtype: object
   sample      number  order_of_magnitude
0       5    150000.0                 5.0
1       1   2000000.0                 6.0
2       0   5000000.0                 6.0
3       2   5000000.0                 6.0
4       3   5000000.0                 6.0
5       4   5000000.0                 6.0
6       6   5000000.0                 6.0
7       7   5000000.0                 6.0
8       9   5000000.0                 6.0
9       8  10000000.0                 7.0
Reasoning:
1. The Beatles are one of the most iconic and influential bands in music history.
2. Unreleased music from legendary artists can have immense value.
3. Previous Beatles memorabilia and rare items have sold for millions.
4. The market for rare music items shows high demand and willingness to pay premium prices.
5. Estimating based on other high-ticket music m

Example: print all System 2 exceptions (OOM deviation > 1 despite high baseline persistence)

In [10]:
# Model key (see `models`) whose exception cases are printed.
# NOTE(review): this name shadows the builtin `eval`; it is kept unchanged here
# because other cells may read it — consider renaming it across the notebook.
eval = 0

# (question id, context bias) pairs to print, per model key.
exception_cases = {
    0: [
        (0, 'explicit_confirmation'), (3, 'general'), (27, 'answer_anchor'),
        (27, 'explicit_confirmation'), (32, 'implicit_confirmation'),
    ],
    1: [
        (5, 'availability'), (9, 'answer_anchor'), (27, 'answer_anchor'),
        (27, 'explicit_confirmation'), (27, 'implicit_confirmation'),
        (38, 'question_anchor'), (46, 'general'),
    ],
    2: [
        (3, 'availability'), (3, 'general'), (12, 'availability'), (12, 'general'),
        (12, 'question_anchor'), (21, 'availability'), (21, 'question_anchor'),
        (27, 'answer_anchor'), (27, 'availability'), (37, 'question_anchor'),
        (38, 'question_anchor'), (38, 'explicit_confirmation'),
        (40, 'question_anchor'), (40, 'explicit_confirmation'),
        (48, 'availability'), (48, 'general'), (48, 'implicit_confirmation'),
    ],
    3: [
        (11, 'question_anchor'), (11, 'implicit_confirmation'),
        (12, 'implicit_confirmation'), (18, 'answer_anchor'), (18, 'general'),
        (18, 'question_anchor'), (18, 'explicit_confirmation'),
        (27, 'explicit_confirmation'), (30, 'availability'), (30, 'general'),
        (30, 'implicit_confirmation'), (40, 'availability'), (40, 'general'),
        (40, 'implicit_confirmation'), (49, 'question_anchor'),
    ],
}

# Any other value of `eval` has no cases and prints nothing.
for (qid, bias) in exception_cases.get(eval, []):
    # print_data expects the integer key of the bias, so map the label back to its key.
    bias_key = [key for (key, val) in context_biases.items() if val == bias][0]
    print_data(eval, qid, bias_key, 2, 1)
    print("________________________________________________\n")

EXPERIMENT 0 gpt-3.5-turbo explicit_confirmation reasoning both
TARGET QUESTION How many people in the world are talking on their cell phones at this instant?
BASELINE -------------------------------------------
50%               9.0
unique              2
std          0.421637
target_id           0
Name: 14, dtype: object
   sample        number  order_of_magnitude
0       1  4.740000e+08                 8.0
1       8  5.530000e+08                 8.0
2       9  1.000000e+09                 9.0
3       6  2.500000e+09                 9.0
4       3  2.600000e+09                 9.0
5       5  3.654000e+09                 9.0
6       7  3.696000e+09                 9.0
7       4  4.740000e+09                 9.0
8       0  5.530000e+09                 9.0
9       2  5.530000e+09                 9.0
Reasoning: 
1. There are approximately 7.9 billion people in the world.
2. According to Statista, about 5.22 billion people worldwide use a mobile phone.
3. Assuming that at any given moment, 