In [1]:
import sys
import numpy as np
import pandas as pd 
# Show full prompt templates in rendered tables. `None` disables truncation;
# the former sentinel -1 is deprecated since pandas 1.0 and rejected by newer releases.
pd.set_option('display.max_colwidth', None)

from transformers import AutoTokenizer

from utils import load_hidden_representations_from_hdf5, read_templates_from_file
from task_helpers import TASK_LABELS

In [2]:
# Tokenizers are expensive to load; keep one instance per checkpoint so that
# scoring many prompts for the same model does not reload it every time.
_TOKENIZER_CACHE = {}


def _get_tokenizer(model_name_or_path):
    """Return the tokenizer for `model_name_or_path`, loading it only on first use."""
    if model_name_or_path not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE[model_name_or_path] = AutoTokenizer.from_pretrained(
            model_name_or_path, cache_dir="/pre-trained-transformers"
        )
    return _TOKENIZER_CACHE[model_name_or_path]


def get_target_scores(model_name_or_path, template_name, targets,
                      task_name=None, model_dir=None, module_name=None,
                      log_dir="/logfiles"):
    """Look up the logged scores of a prompt's target tokens for every sample.

    Args:
        model_name_or_path: checkpoint identifier passed to
            `AutoTokenizer.from_pretrained`.
        template_name: prompt/template directory name under the log tree.
        targets: sequence whose first element is a comma-separated string of
            target tokens (e.g. "▁Yes, ▁No, ▁yes, ▁no").
        task_name, model_dir, module_name: path components of the log tree.
            When left as `None` they fall back to the notebook globals
            `task`, `model` and `module` respectively (the original behaviour).
        log_dir: root directory of the log tree.

    Returns:
        A 5-tuple `(targets, target_ids, target_scores, argmax_scores, max_scores)`:
        the stripped target tokens, their vocabulary ids, and per sample the
        list of target scores, the position (within `targets`) of the highest
        score, and that highest score.

    Raises:
        ValueError: if `targets` is empty.
    """
    if len(targets) == 0:
        raise ValueError(f"no targets given for template {template_name!r}")

    # Backward compatibility: older callers rely on these notebook globals.
    task_name = task if task_name is None else task_name
    model_dir = model if model_dir is None else model_dir
    module_name = module if module_name is None else module_name

    targets = [t.strip() for t in targets[0].split(',')]
    tokenizer = _get_tokenizer(model_name_or_path)
    # assumes the array is indexable as scores[sample][vocab_id] — TODO confirm
    scores = np.load(
        f"{log_dir}/{task_name}/{model_dir}/{module_name}/{template_name}/scores_t0.npy"
    )
    # Public API equivalent of the private `_convert_token_to_id_with_added_voc`.
    target_ids = tokenizer.convert_tokens_to_ids(targets)

    target_scores, argmax_scores, max_scores = [], [], []
    for sample_scores in scores:
        values = [sample_scores[tid] for tid in target_ids]
        target_scores.append(values)
        argmax_scores.append(np.argmax(values))
        max_scores.append(np.max(values))
    return targets, target_ids, target_scores, argmax_scores, max_scores

In [3]:
# Experiment parameters shared by the cells below.
task = "rte"            # task name; used as a path component and as the TASK_LABELS key
module = "encoder"      # module name; used as a path component of the log tree
log_dir = "/logfiles"   # root directory of the log tree

In [4]:
# Maps each model's log-directory name to its hub identifier.
# The directory name is the hub identifier with "/" replaced by "-".
MODEL_NAMES = {
    hub_id.replace("/", "-"): hub_id
    for hub_id in (
        "bigscience/T0_3B",
        "bigscience/T0",
        "google/t5-xl-lm-adapt",
        "google/t5-xxl-lm-adapt",
    )
}

In [5]:
# Load the prompt templates for the current task and show them.
# To analyse a different prompt set, point the path at one of the other files
# this cell has been used with: fixed_prompt.csv, fixed_target_yes_no.csv,
# irrelevant.csv, misleading.csv, controls.csv.
prompts_csv = f"/t0-analysis/prompts/{task}/all.csv"
df_prompts = read_templates_from_file(prompts_csv)
display(df_prompts)

Unnamed: 0,name,template,category,includes_targets,targets,target_ids,shuffle
0,gpt_3_yes_no_with_targets,{premise} Question: {hypothesis} Yes or No?,instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
1,mnli_crowdsource_with_targets,"{premise} Using only the above description and what you know about the world, is ""{hypothesis}"" definitely correct? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
2,based_on_previous_passage_with_targets,"{premise} Based on the previous passage, is it true that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
3,infer_with_targets,"Suppose {premise} Can we infer that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
4,follow_with_targets,Given that {premise} Does it follow that {hypothesis} Yes or No?,instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
5,imply_with_targets,"{premise} Question: Does this imply that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
6,guaranteed_with_targets,"Given {premise} Is it guaranteed true that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
7,justified_with_targets,"{premise} Are we justified in saying that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
8,must_be_true_with_targets,"Given that {premise} Therefore, it must be true that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
9,should_assume_with_targets,"Given {premise} Should we assume that ""{hypothesis}"" is true? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False


In [6]:
# df_prompts.to_csv(f"/t0-analysis/prompts/{task}/all_.csv", sep=";", index=False)
# df_prompts.to_csv(f"/t0-analysis/prompts/{task}/fixed_prompt_.csv", sep=";", index=False)
# df_prompts.to_csv(f"/t0-analysis/prompts/{task}/fixed_target_yes_no_.csv", sep=";", index=False)
# df_prompts.to_csv(f"/t0-analysis/prompts/{task}/irrelevant_.csv", sep=";", index=False)
# df_prompts.to_csv(f"/t0-analysis/prompts/{task}/misleading_.csv", sep=";", index=False)
# df_prompts.to_csv(f"/t0-analysis/prompts/{task}/controls_.csv", sep=";", index=False)

In [7]:
# Names of the prompt templates to evaluate, in evaluation order.
# Templates in the prompt file whose name is not listed here are skipped.
use_pattern = """
    gpt_3_yes_no_with_targets
    mnli_crowdsource_with_targets
    based_on_previous_passage_with_targets
    infer_with_targets
    follow_with_targets
    imply_with_targets
    guaranteed_with_targets
    justified_with_targets
    must_be_true_with_targets
    should_assume_with_targets
    gpt_3_true_false_with_targets
    gpt_3_cat_dog_with_targets
    gpt_3_cat_dog_with_targets_yes_no
    gpt_3_yes_no_without_targets
    words_appear_with_targets
    similar_words_with_targets
    start_with_the_with_targets
    same_meaning_with_targets
    paraphrase_with_targets
    paraphrase_r_with_targets
    summarize_with_targets
    inflection_with_targets
    null_pattern_with_targets
    null_pattern_r_with_targets
    null_pattern_without_targets
    null_pattern_r_without_targets
    premise_only_with_targets
    premise_only_without_targets
    hypothesis_only_with_targets
    hypothesis_only_without_targets
    sentiment_with_targets
    sentiment_cat_dog_with_targets
""".split()

## Compute task performance

In [8]:
# Per model: {prompt name -> prompted samples with score columns} and the
# per-prompt accuracy table.
models_inputs, models_performance = {}, {}

# Models to evaluate; trim this list to run a subset.
models_to_run = list(MODEL_NAMES)

# The argmax position within a prompt's target list is folded onto a class
# index with a modulo. This relies on targets being listed as repeated
# (class 0, class 1) groups, as in the prompt file's target_ids "0, 1, 0, 1".
# NOTE(review): presumably equal to len(TASK_LABELS[task]) — confirm before
# reusing this cell for a task with a different number of classes.
num_classes = 2


def _accuracy(predicted, gold):
    """Return the fraction of positions where `str(predicted)` equals `str(gold)`."""
    matches = [str(p) == str(g) for p, g in zip(predicted, gold)]
    return np.sum(matches) / len(matches)


# The loop variable must be called `model`: get_target_scores reads the
# notebook globals `task`, `model` and `module` to locate the score files.
for model in models_to_run:
    print(model)
    df_inputs = {}

    print('collecting scores...')
    for _, row in df_prompts.iterrows():
        prompt = row['name']
        if prompt not in use_pattern:
            continue

        # load prompted inputs and labels
        df_prompted = pd.read_csv(
            f"{log_dir}/{task}/{model}/{module}/{prompt}/prompted_samples.csv",
            sep='\t',
            index_col=0,
        )

        # get scores; the current row already carries this prompt's targets,
        # so there is no need to filter df_prompts by name again
        targets, target_ids, target_scores, argmax_scores, max_scores = get_target_scores(
            MODEL_NAMES[model], prompt, [row["targets"]]
        )

        n_samples = len(df_prompted)
        df_prompted["targets"] = [targets] * n_samples
        df_prompted["target_ids"] = [target_ids] * n_samples
        df_prompted["scores"] = target_scores
        df_prompted["max score"] = max_scores
        df_prompted["argmax score"] = argmax_scores
        df_prompted["argmax score prediction"] = [
            TASK_LABELS[task][idx % num_classes] for idx in argmax_scores
        ]
        df_inputs[prompt] = df_prompted

    models_inputs[model] = df_inputs

    print('computing accuracy...')
    # compute performance for each prompt: the logged `prediction` column and
    # the score-based prediction are each compared with `label` as strings
    performances = {"task em performance": [], "task score performance": []}
    for template, df in df_inputs.items():
        labels = df["label"].values
        performances["task em performance"].append(
            _accuracy(df["prediction"].values, labels)
        )
        performances["task score performance"].append(
            _accuracy(df["argmax score prediction"].values, labels)
        )

    performances_df = pd.DataFrame(performances, index=df_inputs.keys())
    models_performance[model] = performances_df
    print("\n")

bigscience-T0_3B
collecting scores...


Downloading:   0%|          | 0.00/632 [00:00<?, ?B/s]

computing accuracy...


bigscience-T0
collecting scores...


Downloading:   0%|          | 0.00/633 [00:00<?, ?B/s]

computing accuracy...


google-t5-xl-lm-adapt
collecting scores...
computing accuracy...


google-t5-xxl-lm-adapt
collecting scores...
computing accuracy...




In [9]:
# Sanity check: list the models that have a performance table.
models_performance.keys()

dict_keys(['bigscience-T0_3B', 'bigscience-T0', 'google-t5-xl-lm-adapt', 'google-t5-xxl-lm-adapt'])

In [10]:
# Show each model's per-prompt accuracy table, in the order the tables were stored.
for model_name, perf_table in models_performance.items():
    print(model_name)
    display(perf_table)
    print("\n")

bigscience-T0_3B


Unnamed: 0,task em performance,task score performance
gpt_3_yes_no_with_targets,0.566787,0.566787
mnli_crowdsource_with_targets,0.606498,0.606498
based_on_previous_passage_with_targets,0.693141,0.693141
infer_with_targets,0.599278,0.599278
follow_with_targets,0.592058,0.592058
imply_with_targets,0.574007,0.574007
guaranteed_with_targets,0.635379,0.635379
justified_with_targets,0.606498,0.606498
must_be_true_with_targets,0.703971,0.703971
should_assume_with_targets,0.646209,0.646209




bigscience-T0


Unnamed: 0,task em performance,task score performance
gpt_3_yes_no_with_targets,0.805054,0.805054
mnli_crowdsource_with_targets,0.841155,0.841155
based_on_previous_passage_with_targets,0.848375,0.848375
infer_with_targets,0.819495,0.819495
follow_with_targets,0.747292,0.747292
imply_with_targets,0.851986,0.851986
guaranteed_with_targets,0.819495,0.819495
justified_with_targets,0.776173,0.776173
must_be_true_with_targets,0.833935,0.833935
should_assume_with_targets,0.837545,0.837545




google-t5-xl-lm-adapt


Unnamed: 0,task em performance,task score performance
gpt_3_yes_no_with_targets,0.0,0.552347
mnli_crowdsource_with_targets,0.0,0.527076
based_on_previous_passage_with_targets,0.0,0.527076
infer_with_targets,0.00722,0.527076
follow_with_targets,0.151625,0.530686
imply_with_targets,0.018051,0.527076
guaranteed_with_targets,0.00361,0.527076
justified_with_targets,0.148014,0.527076
must_be_true_with_targets,0.191336,0.527076
should_assume_with_targets,0.0,0.537906




google-t5-xxl-lm-adapt


Unnamed: 0,task em performance,task score performance
gpt_3_yes_no_with_targets,0.00361,0.581227
mnli_crowdsource_with_targets,0.0,0.523466
based_on_previous_passage_with_targets,0.01083,0.527076
infer_with_targets,0.018051,0.527076
follow_with_targets,0.01083,0.530686
imply_with_targets,0.00361,0.534296
guaranteed_with_targets,0.0,0.530686
justified_with_targets,0.01083,0.530686
must_be_true_with_targets,0.00361,0.530686
should_assume_with_targets,0.021661,0.530686






In [11]:
# Show each model's accuracy table, best score-based accuracy first.
for model_name, perf_table in models_performance.items():
    ranked = perf_table.sort_values("task score performance", ascending=False)
    print(model_name, "(sorted by task score performance)")
    display(ranked)
    print("\n")

bigscience-T0_3B (sorted by task score performance)


Unnamed: 0,task em performance,task score performance
must_be_true_with_targets,0.703971,0.703971
based_on_previous_passage_with_targets,0.693141,0.693141
paraphrase_with_targets,0.66065,0.66065
should_assume_with_targets,0.646209,0.646209
guaranteed_with_targets,0.635379,0.635379
gpt_3_true_false_with_targets,0.624549,0.624549
summarize_with_targets,0.620939,0.620939
same_meaning_with_targets,0.617329,0.617329
justified_with_targets,0.606498,0.606498
mnli_crowdsource_with_targets,0.606498,0.606498




bigscience-T0 (sorted by task score performance)


Unnamed: 0,task em performance,task score performance
imply_with_targets,0.851986,0.851986
based_on_previous_passage_with_targets,0.848375,0.848375
mnli_crowdsource_with_targets,0.841155,0.841155
should_assume_with_targets,0.837545,0.837545
must_be_true_with_targets,0.833935,0.833935
gpt_3_true_false_with_targets,0.823105,0.823105
infer_with_targets,0.819495,0.819495
guaranteed_with_targets,0.819495,0.819495
gpt_3_yes_no_with_targets,0.805054,0.805054
gpt_3_yes_no_without_targets,0.292419,0.801444




google-t5-xl-lm-adapt (sorted by task score performance)


Unnamed: 0,task em performance,task score performance
gpt_3_yes_no_with_targets,0.0,0.552347
gpt_3_yes_no_without_targets,0.0,0.548736
should_assume_with_targets,0.0,0.537906
null_pattern_with_targets,0.231047,0.537906
hypothesis_only_with_targets,0.173285,0.530686
null_pattern_r_with_targets,0.151625,0.530686
inflection_with_targets,0.158845,0.530686
start_with_the_with_targets,0.050542,0.530686
follow_with_targets,0.151625,0.530686
must_be_true_with_targets,0.191336,0.527076




google-t5-xxl-lm-adapt (sorted by task score performance)


Unnamed: 0,task em performance,task score performance
gpt_3_yes_no_without_targets,0.0,0.592058
gpt_3_yes_no_with_targets,0.00361,0.581227
gpt_3_cat_dog_with_targets_yes_no,0.0,0.552347
summarize_with_targets,0.00361,0.534296
imply_with_targets,0.00361,0.534296
same_meaning_with_targets,0.00361,0.530686
null_pattern_with_targets,0.025271,0.530686
should_assume_with_targets,0.021661,0.530686
must_be_true_with_targets,0.00361,0.530686
justified_with_targets,0.01083,0.530686






In [12]:
# # check where score based prediction and em based prediction disagree
# prompt = "gpt_3_yes_no_with_targets"
# df_check = models_inputs["google-t5-xl-lm-adapt"]
# df_check[prompt][df_check[prompt]["prediction"] != df_check[prompt]["argmax score prediction"]].head(15)