In [1]:
import sys
import numpy as np
import pandas as pd 

from utils import load_hidden_representations_from_hdf5, read_templates_from_file

In [2]:
# params
# Root directory of the experiment logs; presumably laid out as
# <log_dir>/<task>/<model>/<module>/<prompt>/ -- TODO confirm against the logging code.
log_dir = "/logfiles"
# Name of the sub-directory (per model) whose logged samples are analysed.
module = "decoder"
# Task identifier; selects both the prompt-template file and the log sub-directory.
task = "rte"

In [3]:
# Load the prompt templates for the selected task (one row per template). The `name`
# column is used further down to locate each prompt's logged samples.
df_prompts = read_templates_from_file(f"/t0-analysis/prompts/{task}/all.csv")
# Alternative template sets; swap one in for the line above to analyse a different subset:
# df_prompts = read_templates_from_file(f"/t0-analysis/prompts/{task}/fixed_prompt.csv")
# df_prompts = read_templates_from_file(f"/t0-analysis/prompts/{task}/fixed_target_yes_no.csv")
display(df_prompts)

Unnamed: 0,name,template,category,includes_targets,targets,target_ids,shuffle
0,gpt_3_yes_no_with_targets,{premise} Question: {hypothesis} Yes or No?,instructive,True,"▁Yes, ▁No","0, 1",False
1,mnli_crowdsource_with_targets,{premise} Using only the above description and...,instructive,True,"▁Yes, ▁No","0, 1",False
2,based_on_previous_passage_with_targets,"{premise} Based on the previous passage, is it...",instructive,True,"▁Yes, ▁No","0, 1",False
3,infer_with_targets,"Suppose {premise} Can we infer that ""{hypothes...",instructive,True,"▁Yes, ▁No","0, 1",False
4,follow_with_targets,Given that {premise} Does it follow that {hypo...,instructive,True,"▁Yes, ▁No","0, 1",False
5,imply_with_targets,"{premise} Question: Does this imply that ""{hyp...",instructive,True,"▁Yes, ▁No","0, 1",False
6,guaranteed_with_targets,"Given {premise} Is it guaranteed true that ""{h...",instructive,True,"▁Yes, ▁No","0, 1",False
7,justified_with_targets,"{premise} Are we justified in saying that ""{hy...",instructive,True,"▁Yes, ▁No","0, 1",False
8,must_be_true_with_targets,"Given that {premise} Therefore, it must be tru...",instructive,True,"▁Yes, ▁No","0, 1",False
9,should_assume_with_targets,"Given {premise} Should we assume that ""{hypoth...",instructive,True,"▁Yes, ▁No","0, 1",False


## Compute task performance

In [4]:
def compute_accuracy(predictions, labels):
    """Return the fraction of predictions that equal their label.

    Both sequences are compared element-wise after conversion to ``str``, so a
    prediction only counts as correct on an exact string match with the label.

    Args:
        predictions: sequence of predicted values (any type accepted by ``str``).
        labels: sequence of gold labels, aligned with ``predictions``.

    Returns:
        Accuracy in ``[0, 1]`` as a float, or ``nan`` when there are no samples
        (instead of triggering a division-by-zero warning).
    """
    predicted = [str(v) for v in predictions]
    expected = [str(v) for v in labels]
    if not predicted:
        return float("nan")
    n_correct = sum(p == l for p, l in zip(predicted, expected))
    return n_correct / len(predicted)


# Models to evaluate; trim this list to restrict the analysis to a subset.
model_names = ["bigscience-T0_3B", "bigscience-T0", "google-t5-xl-lm-adapt", "google-t5-xxl-lm-adapt"]

# Maps model name -> DataFrame of per-prompt accuracy (index: prompt name).
models = {}

for model in model_names:
    # load prompted inputs and labels, keyed by prompt name
    df_inputs = {}
    for prompt in df_prompts["name"]:
        # Build the path from `log_dir` so the params cell actually controls the log root.
        samples_path = f"{log_dir}/{task}/{model}/{module}/{prompt}/prompted_samples.csv"
        df_inputs[prompt] = pd.read_csv(samples_path, sep="\t", index_col=0)

    # compute performance for each prompt
    # NOTE(review): scoring is exact string match, so any prediction that is not literally
    # one of the label strings counts as wrong -- confirm this is intended for models
    # that may generate free-form text.
    performances = {
        "task performance": [
            compute_accuracy(df["prediction"].values, df["label"].values)
            for df in df_inputs.values()
        ]
    }

    performances_df = pd.DataFrame(performances, index=list(df_inputs))
    models[model] = performances_df

In [5]:
# Sanity check: list the models for which a performance table was computed.
models.keys()

dict_keys(['bigscience-T0_3B', 'bigscience-T0', 'google-t5-xl-lm-adapt', 'google-t5-xxl-lm-adapt'])

In [6]:
# Show every model's per-prompt accuracy table, preceded by the model name.
for model, performance_table in models.items():
    print(model)
    display(performance_table)

bigscience-T0_3B


Unnamed: 0,task performance
gpt_3_yes_no_with_targets,0.566787
mnli_crowdsource_with_targets,0.606498
based_on_previous_passage_with_targets,0.693141
infer_with_targets,0.599278
follow_with_targets,0.592058
imply_with_targets,0.574007
guaranteed_with_targets,0.635379
justified_with_targets,0.606498
must_be_true_with_targets,0.703971
should_assume_with_targets,0.646209


bigscience-T0


Unnamed: 0,task performance
gpt_3_yes_no_with_targets,0.805054
mnli_crowdsource_with_targets,0.841155
based_on_previous_passage_with_targets,0.848375
infer_with_targets,0.819495
follow_with_targets,0.747292
imply_with_targets,0.851986
guaranteed_with_targets,0.819495
justified_with_targets,0.776173
must_be_true_with_targets,0.833935
should_assume_with_targets,0.837545


google-t5-xl-lm-adapt


Unnamed: 0,task performance
gpt_3_yes_no_with_targets,0.0
mnli_crowdsource_with_targets,0.0
based_on_previous_passage_with_targets,0.0
infer_with_targets,0.00722
follow_with_targets,0.151625
imply_with_targets,0.018051
guaranteed_with_targets,0.00361
justified_with_targets,0.148014
must_be_true_with_targets,0.191336
should_assume_with_targets,0.0


google-t5-xxl-lm-adapt


Unnamed: 0,task performance
gpt_3_yes_no_with_targets,0.00361
mnli_crowdsource_with_targets,0.0
based_on_previous_passage_with_targets,0.01083
infer_with_targets,0.018051
follow_with_targets,0.01083
imply_with_targets,0.00361
guaranteed_with_targets,0.0
justified_with_targets,0.01083
must_be_true_with_targets,0.00361
should_assume_with_targets,0.021661
