In [1]:
import pandas as pd
import numpy as np
import pickle
from scipy import stats

In [2]:
# Fine-grained category names grouped into three coarse classes. Which list a
# name belongs to determines the integer label assigned by convert_to_tri_class
# (c1 -> 0, c2 -> 1, c3 -> 2) and the probability column written by
# convert_prob_to_tri_class (model_0, model_1, model_2).
c1 = ['clock', 'knife', 'oven', 'chair', 'bottle', 'keyboard']  # class 0
c2 = ['cat', 'elephant', 'dog', 'bird', 'bear']  # class 1
c3 = ['airplane', 'boat', 'car', 'truck', 'bicycle']  # class 2

def convert_to_tri_class(x, c1, c2, c3):
    """Map a fine-grained category name onto its coarse class index.

    Parameters
    ----------
    x : the category name to look up.
    c1, c2, c3 : containers holding the names that belong to class 0, 1 and 2
        respectively. They are checked in that order.

    Returns
    -------
    int : 0 if ``x`` is in ``c1``, 1 if it is in ``c2``, otherwise 2.

    Raises
    ------
    AssertionError : if ``x`` is in none of the three containers.
    """
    if x in c1:
        return 0
    if x in c2:
        return 1
    # Anything that reaches this point must belong to the third group.
    assert x in c3
    return 2

def convert_prob_to_tri_class(row, c1, c2, c3):
    """Collapse per-category probabilities in ``row`` into three class scores.

    For each of the groups ``c1``, ``c2``, ``c3`` the values ``row[name]`` of
    its members are added up and the total, capped at 1.0, is stored under
    ``'model_0'``, ``'model_1'`` and ``'model_2'`` respectively.

    ``row`` is modified in place and also returned, which lets the function be
    used with ``DataFrame.apply(..., axis=1)``.
    """
    # Compute every group total before writing anything back to the row.
    totals = []
    for members in (c1, c2, c3):
        acc = 0
        for name in members:
            acc += row[name]
        totals.append(acc)

    for idx, total in enumerate(totals):
        # Cap at 1.0 so a total that overshoots stays a valid probability.
        row['model_' + str(idx)] = min(1.0, total)
    return row

def combine_experts(rows):
    """Pick three participant responses for one image and order them by class.

    Parameters
    ----------
    rows : pandas.DataFrame
        All annotation rows of a single image. Must provide the columns
        ``'image_category_new'`` (true class; read from the first row) and
        ``'participant_classification_new'`` (one response per row).

    Returns
    -------
    list
        Three responses. Selection prefers up to two correct responses plus
        one incorrect response when available:

        * >= 2 correct: two correct, then the first incorrect one (or a third
          correct one if nobody was wrong);
        * exactly 1 correct: that one, then the first two incorrect ones;
        * 0 correct: the first three incorrect ones.

        The selected triple is then rotated depending on the true class
        (unchanged for class 0), so the position of the possibly incorrect
        response varies with the class.

    Raises
    ------
    ValueError
        If fewer than three responses are available for the image.
    """
    true_result = rows.iloc[0]['image_category_new']

    correct_responses = []
    incorrect_responses = []
    for pred in rows['participant_classification_new']:
        if pred == true_result:
            correct_responses.append(pred)
        else:
            incorrect_responses.append(pred)

    # Every branch below needs three responses in total; fail loudly instead of
    # raising an opaque IndexError or silently returning a shorter list.
    n_responses = len(correct_responses) + len(incorrect_responses)
    if n_responses < 3:
        raise ValueError(
            "combine_experts needs at least 3 responses per image, got {}".format(
                n_responses
            )
        )

    if len(correct_responses) >= 2:
        expert_predictions = correct_responses[:2]
        if incorrect_responses:
            # The original code misspelled this name (``expert_predctions``),
            # which raised NameError whenever this branch was reached.
            expert_predictions.append(incorrect_responses[0])
        else:
            expert_predictions.append(correct_responses[2])
    elif len(correct_responses) == 1:
        expert_predictions = [correct_responses[0]]
        expert_predictions.extend(incorrect_responses[:2])
    else:
        expert_predictions = incorrect_responses[:3]

    # Rotate the triple according to the true class.
    if true_result == 1:
        expert_predictions = [expert_predictions[2], expert_predictions[0], expert_predictions[1]]
    elif true_result == 2:
        expert_predictions = [expert_predictions[1], expert_predictions[2], expert_predictions[0]]

    return expert_predictions

In [3]:
# create dataframe of expert and model predictions
n_experts = 3
seed = 0  # fixes the row shuffle below so the "first n rows" split is reproducible
df_model = pd.read_csv('model_preds_raw.csv')
df_human = pd.read_csv("annotations_raw.csv")
noisy = True

# take subset of noisier images
df_human = df_human[df_human['noise_level']>95]
df_model = df_model[df_model['noise_level']>95]

# .copy() makes this an independent frame, so adding the *_new columns below
# does not write into a slice of the raw data (SettingWithCopyWarning).
df_human = df_human[[
 'participant_id', 'image_id', 'image_name', 'noise_level', 'image_category',
 'participant_classification', 'confidence', 'correct', 'total_accuracy'
]].copy()
for c in ['participant_classification', 'image_category']:
    df_human[c+"_new"] = df_human[c].apply(
        convert_to_tri_class, args=(c1, c2, c3,)
    )

# One row per (image, noise level) holding the list of selected responses.
# Only the two columns combine_experts reads are passed to it, which keeps the
# grouping columns out of the applied frame on every pandas version.
df_human_proc = (
    df_human
    .groupby(['image_name', 'noise_level'])[
        ['image_category_new', 'participant_classification_new']
    ]
    .apply(combine_experts)
    .rename("Y_H")
    .reset_index()
)

# Majority label of the selected responses. scipy.stats.mode returns an array
# on older SciPy and a scalar on newer releases for 1-D input; np.atleast_1d
# makes the indexing work for both.
df_human_proc['consensus'] = df_human_proc['Y_H'].apply(
    lambda x: np.atleast_1d(stats.mode(x)[0])[0]
)
for e in range(n_experts):
    # e=e binds the current index to the lambda instead of relying on the loop variable
    df_human_proc['expert'+str(e+1)] = df_human_proc['Y_H'].apply(lambda x, e=e: x[e])

# add model_0 / model_1 / model_2 (tri-class probabilities) to every model row
df_model = df_model.apply(
        convert_prob_to_tri_class, args=(c1, c2, c3,), axis=1
)
model_name = 'alexnet' if noisy else 'vgg19'
dn_df = df_model[df_model['model_name']==model_name].copy()
dn_df['model_pred_int'] = dn_df.apply(
    lambda x: np.argmax([x['model_'+str(i)] for i in range(3)]), axis=1
)
dn_df = dn_df[['image_name', 'noise_level','model_0','model_1', 'model_2','model_pred_int']]

# how='right' keeps every annotated image; images without a matching model row
# end up with NaN in the model columns.
df = dn_df.merge(df_human_proc, on=['image_name', 'noise_level'], how='right')
# shuffle rows; seeded so that later cells slicing df[:n] see the same rows on re-run
df = df.sample(frac=1, random_state=seed)
# NOTE(review): consensus is the mode of labels in {0, 1, 2}, so this filter
# appears to be a no-op; kept to preserve the original pipeline.
df = df[df['consensus']!=-1].reset_index(drop=True)
df.to_csv('imagenet_processed.csv')

In [4]:
# preview the first rows of the merged model/expert frame
df.head()

Unnamed: 0,image_name,noise_level,model_0,model_1,model_2,model_pred_int,Y_H,consensus,expert1,expert2,expert3
0,n02123394_6265,110,0.043533,0.938086,0.018381,1,"[1, 1, 1]",1,1,1,1
1,n04548280_9121,125,0.553714,0.160425,0.285862,0,"[0, 0, 2]",0,0,0,2
2,n02111500_4875,110,0.000621,0.999017,0.000362,1,"[1, 1, 1]",1,1,1,1
3,n04467665_1187,110,0.000382,0.000105,0.999513,2,"[2, 0, 2]",2,2,0,2
4,n01592084_2422,125,0.002229,0.996785,0.000986,1,"[1, 1, 1]",1,1,1,1


In [5]:
# get accuracies
n = 250

# Per-row correctness flags, measured against the consensus label.
df['model_correct'] = df['model_pred_int']==df['consensus']
for e in range(n_experts):
    df['expert{}_correct'.format(e+1)] = df['expert'+str(e+1)]==df['consensus']

# The first n rows of the shuffled frame form the test set. Slice once, after
# all flag columns exist, and report the number of rows actually present so the
# header and the accuracies stay consistent when df has fewer than n rows.
test_df = df.iloc[:n]
print('accuracy on test set (n={}):'.format(len(test_df)))

test_accuracy = test_df['model_correct'].mean()
class_wise_accs = list(test_df.groupby('consensus')['model_correct'].mean())
print("\tclassifier (overall): {}".format(test_accuracy))
print("\t\t " + str(class_wise_accs))

for e in range(n_experts):
    e_corr_col = 'expert{}_correct'.format(e+1)
    # Mean over the slice (not sum / n), matching how the classifier accuracy
    # above is computed.
    expert_acc = test_df[e_corr_col].mean()
    class_wise_accs = list(test_df.groupby('consensus')[e_corr_col].mean())
    print ("\texpert {}: {}".format(e+1, expert_acc))
    print("\t\t " + str(class_wise_accs))

accuracy on test set (n=250):
	classifier (overall): 0.9
		 [0.872093023255814, 0.9764705882352941, 0.8481012658227848]
	expert 1: 0.844
		 [0.9651162790697675, 0.5764705882352941, 1.0]
	expert 2: 0.912
		 [0.9651162790697675, 1.0, 0.759493670886076]
	expert 3: 0.832
		 [0.5232558139534884, 0.9882352941176471, 1.0]


In [6]:
# save data dict for our model
expert_cols = ['expert'+str(e+1) for e in range(n_experts)]
class_cols = ['model_'+str(i) for i in range(3)]

# Expert labels, one row per image, shifted from 0..2 to 1..3.
# NOTE(review): presumably the consumer expects 1-indexed classes; confirm.
y_h = df[expert_cols].to_numpy() + 1

# Model probabilities as (n_images, n_models=1, K=3). Selecting the columns
# gives an (n_images, 3) array whose rows are already per-image vectors, so the
# reshape only inserts the model axis. The previous code stacked the columns
# into a (3, 1, n_images) array and reshaped it, which does not transpose and
# therefore mixed probabilities of different images into one vector.
y_m = df[class_cols].to_numpy().reshape((len(df), 1, 3))

out_dict = {
    'Y_H' : y_h.tolist(),
    'Y_M' : y_m.tolist(),
    'n_models': 1,
    'n_humans': n_experts,
    'K': 3
}

ext = "_noisy" if noisy else ""
with open('data' + ext + '.pickle', 'wb') as handle:
    pickle.dump(out_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [7]:
# create data dict for INFEXP model
expert_cols = ['expert'+str(i+1) for i in range(n_experts)]
class_cols = ['model_'+str(i) for i in range(3)]

# Expert labels as (n_experts, n_images). The frame slice is (n_images,
# n_experts), so it has to be transposed; reshaping it (as the previous code
# did) keeps the memory order and interleaves experts and images.
y_h = df[expert_cols].to_numpy().T
d_new = np.array(df['consensus'])

# Model confidences with a leading model axis: (n_models=1, n_images, 3).
# NOTE(review): the original built (1, 3, n_images) and then took an argmax
# over the last axis "per item", which produced 3 image indices instead of one
# class per image. The (models, images, classes) layout used here is what that
# comprehension implies; confirm against the INFEXP loader.
y_m_new = df[class_cols].to_numpy()
model_confs = y_m_new[np.newaxis, :, :]
# predicted class per image for each model: (n_models, n_images)
model_preds = model_confs.argmax(axis=-1)

# Recomputed here so this cell does not depend on the accuracy cell having run.
df['model_correct'] = df['model_pred_int']==df['consensus']
model_perf = np.array([[df['model_correct'].mean()]])
class_wise_perf = np.array(
    df.groupby('consensus')['model_correct'].mean()
)

n_models = 1
infexp_dict = {
    'model_confs' : model_confs,
    'model_preds' : model_preds,
    'true_targets' : d_new,
    'expert_preds' : y_h,
    'chosen_models' : np.array([0]),
    'model_perf' : model_perf,
    'model_perf_per_class' : class_wise_perf
}

with open('imagenet_infexp.pickle', 'wb') as handle:
    pickle.dump(infexp_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)