# Section 6.1

## Import necessary packages

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pickle as pkl
import os
import pandas as pd
import seaborn as sns
from itertools import product
import scipy.stats as stats
from collections import defaultdict

## Load ground truth parameters

In [2]:
# Read the pickled ground-truth results and keep the true parameter vector,
# which the analysis cells below read as the module-level name `params`.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('../data/ground_truth.pkl', 'rb') as f:
    res = pkl.load(f)
params = res['params']
print('Ground truth parameters:', params)

Ground truth parameters: [ 1.          1.          0.00552199  0.44735774 -1.1740447   0.1696878
 -0.19067363]


## Section 6.1: Empirical Analysis with Other Data

### Define Utility Functions

In [3]:
def print_res(n_trials, n_real, n_max_aug, db_models, methods, metric='rbias', display=True,
              param_indices=(0, 1, 3, 4, 5, 6)):
    """Summarise how an error metric changes between the first and last augmentation level.

    For every (method, db_model) pair, the pickled result file
    ``../res/{db_model}_{method}_{n_real}_{n_max_aug}_{n_trials}.pkl`` is loaded; pairs whose
    file does not exist are skipped. Estimates are compared against the module-level
    ground-truth vector ``params`` (loaded in an earlier cell).

    Parameters
    ----------
    n_trials, n_real, n_max_aug : int
        Only used to build the result-file name.
    db_models, methods : iterable of str
        Model / method identifiers; their Cartesian product is evaluated.
    metric : {'bias', 'rbias', 'var', 'mse'}
        Aggregated column to report (per ``n_aug`` group), multiplied by 100.
    display : bool
        If True, print one tab-separated line per evaluated pair.
    param_indices : sequence of int
        Columns of the parameter estimates to include. The default skips index 2,
        whose ground-truth value is so small that relative errors are not meaningful.

    Returns
    -------
    collections.defaultdict
        Maps ``(method, db_model)`` to a tuple
        ``(mean change last-minus-first, mean metric at first level, mean metric at last level)``
        where "first"/"last" are the smallest/largest ``n_aug`` groups and the mean is taken
        over the selected parameters. Missing keys yield ``(inf, inf, inf)``.
    """
    param_indices = list(param_indices)
    y = defaultdict(lambda: (float('inf'), float('inf'), float('inf')))
    for method, db_model in product(methods, db_models):
        res_file = f'../res/{db_model}_{method}_{n_real}_{n_max_aug}_{n_trials}.pkl'
        if not os.path.exists(res_file):
            continue
        # NOTE: unpickling can execute arbitrary code; only load trusted result files.
        with open(res_file, 'rb') as f:
            res = pkl.load(f)
        n_aug_list = res['n_aug_list']
        n_real_list = res['n_real_list']
        sample_id_list = res['sample_id_list']
        params_est_list = np.array(res['params_list'])

        bias_list = []
        for i in param_indices:
            err = params_est_list[:, i] - params[i]
            df = pd.DataFrame({'n_real': n_real_list, 'n_aug': n_aug_list, 'sample_id': sample_id_list,
                               'param': params_est_list[:, i],
                               'e': np.abs(err),
                               're': np.abs(err) / np.abs(params[i]),
                               'se': err ** 2})
            # groupby sorts by n_aug, so position 0 is the smallest augmentation level.
            df_agg = df.groupby(['n_aug']).agg(
                bias=('e', 'mean'),
                rbias=('re', 'mean'),
                var=('param', lambda x: np.var(x, ddof=0)),
                mse=('se', 'mean')).reset_index()
            bias_list.append(np.array(df_agg[metric]) * 100)
        bias_list = np.array(bias_list)  # shape: (len(param_indices), number of n_aug levels)

        # Shape-agnostic slicing: no hard-coded parameter count.
        first_level = bias_list[:, 0]
        last_level = bias_list[:, -1]
        mean_change = np.mean(last_level - first_level)
        if display:
            print('{}\t{}\t{:.3f}'.format(db_model, method, mean_change))
        y[method, db_model] = (mean_change, np.mean(first_level), np.mean(last_level))
    return y


In [4]:
def pairwise_t_test(sample1, sample2):
    """Return the one-sided p-value of a paired t-test for mean(sample2 - sample1) > 0.

    The paired test is carried out as a one-sample t-test of the element-wise
    differences against zero; the two-sided p-value is then converted to the
    one-sided ("greater") p-value using the sign of the t statistic.
    """
    t_stat, two_sided_p = stats.ttest_1samp(sample2 - sample1, 0)
    half_p = two_sided_p / 2
    if t_stat > 0:
        return half_p
    return 1 - half_p

In [5]:
def grouped_t_test(n_trials, n_real, n_max_aug, method, metric='re', display=True):
    """Compare the 'nn' (AAE) estimator against its baselines with one-sided paired t-tests.

    Three result files are looked up for ``method`` (missing files are skipped):

    * ``'naive'`` - ``../res/naive_{method}_{n_real}_{n_max_aug}_{n_trials}.pkl``
    * ``'a'``     - the same naive estimator with ``n_real`` fixed to 0
    * ``'aae'``   - ``../res/nn_{method}_{n_real}_{n_max_aug}_{n_trials}.pkl``

    For each file, the per-row ``metric`` at ``n_aug == n_max_aug`` is summed over the
    parameters 0, 1, 3, 4, 5, 6 (index 2 is excluded). A fourth series ``'p'`` is built from
    the ``'aae'`` file at ``n_aug == 0``. Estimates are compared against the module-level
    ground-truth vector ``params``.

    Parameters
    ----------
    n_trials : int
        Used in the file name and as the expected number of rows per selected ``n_aug`` level.
    n_real, n_max_aug : int
        Used in the file names; ``n_max_aug`` also selects the rows that are compared.
    method : str
        Method identifier used in the file names.
    metric : str
        Column of the per-parameter frame to accumulate ('e', 're', 'se' or 'param').
    display : bool
        Currently unused; kept for call compatibility.

    Returns
    -------
    float or None
        The largest p-value among the tests ``'aae'`` vs ``'a'``, ``'naive'`` and ``'p'``
        (each testing that the baseline's summed metric exceeds AAE's), or ``None`` when
        no test could be run because the ``'aae'`` results are missing.
    """
    res_files = {
        'naive': f'../res/naive_{method}_{n_real}_{n_max_aug}_{n_trials}.pkl',
        'a': f'../res/naive_{method}_0_{n_max_aug}_{n_trials}.pkl',
        'aae': f'../res/nn_{method}_{n_real}_{n_max_aug}_{n_trials}.pkl',
    }
    re_list = {'p': np.zeros(n_trials)}
    for res_file_name, res_file in res_files.items():
        if not os.path.exists(res_file):
            continue
        # Context manager guarantees the handle is closed even if unpickling fails.
        # NOTE: unpickling can execute arbitrary code; only load trusted result files.
        with open(res_file, 'rb') as f:
            res = pkl.load(f)
        n_aug_list = res['n_aug_list']
        n_real_list = res['n_real_list']
        sample_id_list = res['sample_id_list']
        params_est_list = np.array(res['params_list'])

        re_list[res_file_name] = np.zeros(n_trials)
        for i in [0, 1, 3, 4, 5, 6]:
            err = params_est_list[:, i] - params[i]
            df = pd.DataFrame({'n_real': n_real_list, 'n_aug': n_aug_list, 'sample_id': sample_id_list,
                               'param': params_est_list[:, i],
                               'e': np.abs(err),
                               're': np.abs(err) / np.abs(params[i]),
                               'se': err ** 2})
            re_list[res_file_name] += np.array(df[df['n_aug'] == n_max_aug][metric])
            if res_file_name == 'aae':
                # Same estimator without any augmented samples.
                re_list['p'] += np.array(df[df['n_aug'] == 0][metric])

    # Conduct pairwise t-tests between AAE and other methods
    p_values = []
    if 'aae' in re_list:
        for res_file_name in ['a', 'naive', 'p']:
            if res_file_name in re_list:
                p_values.append(pairwise_t_test(re_list['aae'], re_list[res_file_name]))

    # Report the most conservative (largest) p-value; None when nothing was tested.
    return max(p_values) if p_values else None

In [6]:
def find_n_equiv(total_bias, b, step=25, start=25):
    """Map a bias value onto an equivalent sample size by linear interpolation.

    ``total_bias[i]`` is treated as the bias observed at sample size ``start + i * step``.
    The first segment ``i`` with ``total_bias[i + 1] < b <= total_bias[i]`` is located and
    ``b`` is linearly interpolated inside it.

    If no segment brackets ``b`` the value is linearly extrapolated:

    * from the first segment when ``b`` is larger than ``total_bias[0]``
      (previously the last segment was used by accident, which could produce
      negative sample sizes);
    * from the last segment otherwise (``b`` is at or below every point of the curve).

    Parameters
    ----------
    total_bias : sequence of float
        Bias curve; must contain at least two points.
    b : float
        Bias value to convert.
    step, start : number
        Spacing and origin of the sample-size grid (defaults: 25 and 25).

    Returns
    -------
    float
        The (possibly fractional) equivalent sample size.

    Raises
    ------
    ValueError
        If ``total_bias`` has fewer than two points.
    """
    n_points = len(total_bias)
    if n_points < 2:
        raise ValueError('total_bias must contain at least two points')

    seg = None
    for i in range(n_points - 1):
        if total_bias[i + 1] < b <= total_bias[i]:
            seg = i
            break
    if seg is None:
        # Not bracketed: pick the segment adjacent to the side b falls off the curve.
        seg = 0 if b > total_bias[0] else n_points - 2

    frac = (b - total_bias[seg]) / (total_bias[seg + 1] - total_bias[seg])
    return (frac + seg) * step + start

In [7]:
def one_sided_p_value(x, mu):
    """Return the one-sided p-value of a one-sample t-test for mean(x) > mu.

    The two-sided p-value from ``scipy.stats.ttest_1samp`` is halved when the
    t statistic is positive; otherwise its complement is returned.
    """
    t_stat, two_sided_p = stats.ttest_1samp(x, mu)
    half_p = two_sided_p / 2
    if t_stat > 0:
        return half_p
    return 1 - half_p

### Table 6: Bias Reduction from Using Human Data Only

In [8]:
# Build Table 6: for each number of real samples, compare three estimators per method.
methods = ['gpt-0613', 'cot-gpt-0613', 'gpt-0125', 'cot-gpt-0125', 'gpt-4', 'cot-gpt-4', 'gpt-4o',  'cot-gpt-4o', 'fs-gpt-4o','ft-gpt-4o']
db_models = ['nn', 'naive']

# The n_real=0 ("Pure AI") results do not depend on the loop variable, so they are loaded
# once here instead of re-reading every result file on each iteration.
y0 = print_res(n_trials=50, n_real=0, n_max_aug=500, db_models=db_models, methods=methods, display=False)

column_blocks = []
for n_real in [50, 100, 150, 200]:
    y = print_res(n_trials=50, n_real=n_real, n_max_aug=500, db_models=db_models, methods=methods, display=False)
    cols = pd.MultiIndex.from_tuples([(n_real, 'Pure AI'), (n_real, 'Naive'), (n_real, "AAE")])
    df1 = pd.DataFrame({
        # Pure AI: final metric with no real data minus the starting metric with n_real samples.
        cols[0]: [y0[method, 'naive'][2] - y[method, 'naive'][1] for method in methods],
        # Naive / AAE: change in the metric between the first and last augmentation level.
        cols[1]: [y[method, 'naive'][0] for method in methods],
        cols[2]: [y[method, 'nn'][0] for method in methods]}, index=methods)
    column_blocks.append(df1)
# Concatenate the per-n_real column blocks once, side by side.
df = pd.concat(column_blocks, axis=1)

# Replace values > 1000 with '-' (print_res reports inf for missing result files)
df = df.map(lambda x: '-' if abs(x) > 1000 else x)
# Display DataFrame with 2 decimal places
df = df.map(lambda x: '{:.2f}'.format(float(x)) if x != '-' else x)
df

Unnamed: 0_level_0,50,50,50,100,100,100,150,150,150,200,200,200
Unnamed: 0_level_1,Pure AI,Naive,AAE,Pure AI,Naive,AAE,Pure AI,Naive,AAE,Pure AI,Naive,AAE
gpt-0613,-17.27,-34.2,-37.81,15.09,-1.15,-11.02,28.53,8.57,-6.83,30.71,8.26,-7.09
cot-gpt-0613,148.81,119.98,-34.33,181.18,130.32,-9.5,194.62,126.11,-4.8,196.8,119.07,-5.39
gpt-0125,29.7,5.58,-30.22,62.06,32.47,-9.8,75.5,37.96,-5.3,77.68,34.69,-6.59
cot-gpt-0125,129.8,112.47,-35.35,162.17,124.39,-11.07,175.6,123.92,-5.17,177.78,113.93,-6.64
gpt-4,-12.43,-16.7,-30.63,19.94,10.17,-8.62,33.37,21.31,-4.6,35.55,19.68,-4.72
cot-gpt-4,173.34,132.71,-38.32,205.71,137.6,-11.36,219.15,136.34,-7.69,221.32,121.56,-6.62
gpt-4o,421.27,292.59,-25.68,453.64,263.41,-8.92,467.07,234.85,-4.56,469.25,207.47,-7.03
cot-gpt-4o,304.98,229.8,-28.65,337.34,219.76,-7.86,350.78,199.59,-5.92,352.96,178.51,-6.26
fs-gpt-4o,201.95,133.85,-36.74,234.32,132.1,-12.85,247.76,123.92,-9.02,249.93,110.3,-6.99
ft-gpt-4o,153.33,89.9,-31.61,185.69,98.61,-8.45,199.13,96.48,-2.84,201.31,84.7,-4.38


#### Pairwise t-test

In [13]:
# Create DataFrame to store p-values (rows: methods, columns: number of real samples)
methods = ['gpt-0613', 'cot-gpt-0613', 'gpt-0125', 'cot-gpt-0125', 'gpt-4', 'cot-gpt-4', 'gpt-4o', 'cot-gpt-4o', 'ft-gpt-4o', 'fs-gpt-4o']
n_reals = [50, 100, 150, 200]
p_values_df = pd.DataFrame(index=methods, columns=n_reals)

# Fill DataFrame with p-values
for method in methods:
    for n_real in n_reals:
        p_value = grouped_t_test(n_trials=50, n_real=n_real, n_max_aug=500, method=method, display=False)
        p_values_df.loc[method, n_real] = p_value


def _format_p_value(x):
    """Format a p-value: scientific notation below 0.001, 3 decimals otherwise, '-' if missing."""
    # grouped_t_test returns None when result files are missing; comparing None with a
    # float would raise TypeError, so missing entries are rendered as '-'.
    if pd.isna(x):
        return '-'
    return '{:.0e}'.format(x) if x < 0.001 else '{:.3f}'.format(x)


p_values_df_cutoff = p_values_df.map(_format_p_value)
p_values_df_cutoff

Unnamed: 0,50,100,150,200
gpt-0613,0.105,0.0004,0.0003,5e-05
cot-gpt-0613,3e-07,0.002,0.019,0.0004
gpt-0125,2e-07,0.001,0.005,0.0002
cot-gpt-0125,9e-08,0.0006,0.006,0.0003
gpt-4,0.0002,0.0002,0.005,0.0002
cot-gpt-4,1e-07,0.0003,0.0002,0.0007
gpt-4o,5e-06,0.006,0.03,0.0002
cot-gpt-4o,4e-06,0.007,0.01,0.0001
ft-gpt-4o,2e-07,0.007,0.155,0.019
fs-gpt-4o,3e-07,6e-05,0.0002,0.0003


### Table 7: Percentage of Savings in Data Size

In [11]:
# Load the parameter estimates obtained with real data and build the reference curve
# `total_bias`: for every n_aug level, the relative bias (x100) summed over the parameters.
res_file = '../res/naive_real_25_300_50.pkl'
with open(res_file, 'rb') as f:
    res = pkl.load(f)

n_aug_list = res['n_aug_list']
n_real_list = res['n_real_list']
sample_id_list = res['sample_id_list']
params_est_list = np.array(res['params_list'])

bias_list = []
# Index 2 is skipped: its ground-truth magnitude is so small that the relative
# bias reduction is not meaningful.
for i in [0, 1, 3, 4, 5, 6]:
    abs_err = np.abs(params_est_list[:, i] - params[i])
    df = pd.DataFrame({
        'n_real': n_real_list,
        'n_aug': n_aug_list,
        'sample_id': sample_id_list,
        'param': params_est_list[:, i],
        'e': abs_err,
        're': abs_err / np.abs(params[i]),
        'se': (params_est_list[:, i] - params[i]) ** 2,
    })
    df_agg = df.groupby(['n_aug']).agg(rbias=('re', 'mean')).reset_index()
    bias_list.append(np.array(df_agg['rbias']) * 100)

bias_list = np.array(bias_list)
# One entry per n_aug level: sum of the per-parameter relative biases.
total_bias = [np.sum(bias_list[:, col]) for col in range(bias_list.shape[1])]

In [12]:
# Build Table 7: for each method and number of real samples, convert the per-trial summed
# relative error at n_aug == n_max_aug into an equivalent sample size via `find_n_equiv`
# (using the reference curve `total_bias`), then report the relative saving in sample size.
n_sample = 50
n_max_aug = 500
db_models = ['nn']
methods = ['gpt-0613', 'cot-gpt-0613', 'gpt-0125', 'cot-gpt-0125',  'gpt-4', 'cot-gpt-4', 'gpt-4o', 'cot-gpt-4o', 'fs-gpt-4o', 'ft-gpt-4o']
metric = 're'
saving_columns = []
for n_real in [50, 100, 150, 200]:
    # row_labels tracks which methods actually had a result file, so the index always
    # matches the collected values even when some files are missing.
    row_labels, saving, saving_std, saving_p_value = [], [], [], []
    for method, db_model in product(methods, db_models):
        res_file = f'../res/{db_model}_{method}_{n_real}_{n_max_aug}_{n_sample}.pkl'
        if not os.path.exists(res_file):
            continue
        with open(res_file, 'rb') as f:
            res = pkl.load(f)
        n_aug_list, n_real_list, sample_id_list, params_est_list = res['n_aug_list'], res['n_real_list'], res['sample_id_list'], np.array(res['params_list'])
        bias_list = []
        # Index 2 is skipped: its ground-truth magnitude is so small that the relative
        # bias reduction is not meaningful.
        for i in [0, 1, 3, 4, 5, 6]:
            err = params_est_list[:, i] - params[i]
            df = pd.DataFrame({'n_real': n_real_list, 'n_aug': n_aug_list, 'sample_id': sample_id_list,
                               'param': params_est_list[:, i],
                               'e': np.abs(err),
                               're': np.abs(err) / np.abs(params[i]),
                               'se': err ** 2})
            bias_list.append(np.array(df[df['n_aug'] == n_max_aug][metric]) * 100)

        # Sum over parameters -> one value per trial.
        bias_list = np.sum(np.array(bias_list), axis=0)
        equiv_n_list = np.array([find_n_equiv(total_bias, b) for b in bias_list])

        rel_saving = (equiv_n_list - n_real) / equiv_n_list
        row_labels.append(method)
        saving.append(np.mean(rel_saving) * 100)
        # Standard error uses the nominal number of trials n_sample.
        saving_std.append(np.std(rel_saving * 100) / np.sqrt(n_sample))
        saving_p_value.append(one_sided_p_value(rel_saving, 0))

    # '*' marks savings that are NOT significantly greater than zero at the 5% level.
    df1_values = [f"{s:.2f} ({std:.2f}){'*' if p > 0.05 else ''}" for s, std, p in zip(saving, saving_std, saving_p_value)]
    df1 = pd.DataFrame({'n={}'.format(n_real): df1_values}, index=row_labels)
    saving_columns.append(df1)

# Concatenate the per-n_real columns once, side by side.
df_res = pd.concat(saving_columns, axis=1)

df_res
    

Unnamed: 0,n=50,n=100,n=150,n=200
gpt-0613,59.39 (3.65),33.26 (5.47),30.81 (6.53),17.05 (10.32)*
cot-gpt-0613,55.04 (4.04),29.30 (5.50),26.05 (6.04),10.47 (10.76)*
gpt-0125,52.96 (4.31),29.22 (7.26),26.29 (6.35),16.06 (10.71)*
cot-gpt-0125,56.44 (3.48),34.18 (4.88),25.91 (6.99),14.97 (9.83)*
gpt-4,48.99 (5.25),25.74 (7.29),24.37 (7.20),5.12 (12.09)*
cot-gpt-4,60.29 (3.54),34.24 (5.26),34.83 (5.69),15.81 (8.67)
gpt-4o,46.90 (4.30),27.10 (7.07),23.71 (6.55),17.20 (9.25)
cot-gpt-4o,51.39 (3.93),24.86 (5.88),29.72 (5.46),14.20 (9.81)*
fs-gpt-4o,58.18 (3.42),38.11 (5.10),40.23 (5.06),20.42 (8.60)
ft-gpt-4o,50.26 (4.93),27.01 (5.92),18.34 (6.31),5.56 (11.90)*
