In [8]:
import sys
import numpy as np
import pandas as pd 
# Never truncate cell contents when displaying DataFrames. `None` is the
# supported "no limit" value; passing -1 has been deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

from transformers import AutoTokenizer

from utils import load_hidden_representations_from_hdf5, read_templates_from_file
from task_helpers import TASK_LABELS

In [9]:
def get_target_scores(
    model_name_or_path,
    template_name,
    targets,
    *,
    task_name=None,
    model_name=None,
    module_name=None,
    log_dir="/logfiles",
):
    """Collect the scores assigned to a template's target tokens for every sample.

    Loads ``{log_dir}/{task}/{model}/{module}/{template_name}/scores_t0.npy`` and,
    for each row of that array, picks the entries at the vocabulary ids of the
    target tokens.

    Args:
        model_name_or_path: name or path passed to ``AutoTokenizer.from_pretrained``.
        template_name: name of the prompt template whose score file is read.
        targets: comma-separated target tokens, given either as one string or as
            a sequence whose first element is that string (e.g. a column's
            ``.values``).
        task_name: task component of the score-file path. Defaults to the
            notebook global ``task``.
        model_name: model component of the score-file path. Defaults to the
            notebook global ``model``.
        module_name: module component of the score-file path. Defaults to the
            notebook global ``module``.
        log_dir: root directory that contains the score files.

    Returns:
        A tuple ``(targets, target_ids, target_scores, argmax_scores, max_scores)``:
        the stripped target tokens, their vocabulary ids, and three lists with one
        entry per row of the score array -- the scores of all targets, the position
        (within ``targets``) of the best-scoring target, and that best score.

    Raises:
        ValueError: if ``targets`` is an empty sequence.
    """
    if isinstance(targets, str):
        raw_targets = targets
    else:
        if len(targets) == 0:
            raise ValueError(f"no targets given for template {template_name!r}")
        raw_targets = targets[0]
    targets = [t.strip() for t in raw_targets.split(',')]

    # Fall back to the notebook-level settings only when the caller did not
    # pass the path components explicitly.
    task_name = task if task_name is None else task_name
    model_name = model if model_name is None else model_name
    module_name = module if module_name is None else module_name

    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path, cache_dir="/pre-trained-transformers"
    )
    # Public API for the token -> id lookup (includes added vocabulary).
    target_ids = tokenizer.convert_tokens_to_ids(targets)

    scores = np.load(
        f"{log_dir}/{task_name}/{model_name}/{module_name}/{template_name}/scores_t0.npy"
    )

    target_scores, argmax_scores, max_scores = [], [], []
    for sample_scores in scores:
        values = [sample_scores[tid] for tid in target_ids]
        target_scores.append(values)
        argmax_scores.append(np.argmax(values))
        max_scores.append(np.max(values))
    return targets, target_ids, target_scores, argmax_scores, max_scores

In [10]:
# Notebook-wide settings: which task and model module to analyse, and the
# root directory of the log files.
task = "rte"
module = "encoder"
log_dir = "/logfiles"

In [11]:
# Maps the model name used in log-file paths to the model identifier handed to
# the tokenizer loader; the key is the identifier with "/" replaced by "-".
MODEL_NAMES = {
    hub_id.replace("/", "-"): hub_id
    for hub_id in (
        "bigscience/T0_3B",
        "bigscience/T0",
        "google/t5-xl-lm-adapt",
        "google/t5-xxl-lm-adapt",
    )
}

In [12]:
# Which prompt file to load for `task`. Available sets (file stems under
# /t0-analysis/prompts/{task}/): "all", "fixed_prompt", "fixed_target_yes_no",
# "irrelevant", "misleading", "controls".
prompt_set = "all"
df_prompts = read_templates_from_file(f"/t0-analysis/prompts/{task}/{prompt_set}.csv")
display(df_prompts)

Unnamed: 0,name,template,category,includes_targets,targets,target_ids,shuffle
0,gpt_3_yes_no_with_targets,{premise} Question: {hypothesis} Yes or No?,instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
1,mnli_crowdsource_with_targets,"{premise} Using only the above description and what you know about the world, is ""{hypothesis}"" definitely correct? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
2,based_on_previous_passage_with_targets,"{premise} Based on the previous passage, is it true that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
3,infer_with_targets,"Suppose {premise} Can we infer that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
4,follow_with_targets,Given that {premise} Does it follow that {hypothesis} Yes or No?,instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
5,imply_with_targets,"{premise} Question: Does this imply that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
6,guaranteed_with_targets,"Given {premise} Is it guaranteed true that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
7,justified_with_targets,"{premise} Are we justified in saying that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
8,must_be_true_with_targets,"Given that {premise} Therefore, it must be true that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
9,should_assume_with_targets,"Given {premise} Should we assume that ""{hypothesis}"" is true? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False


In [13]:
# df_prompts.to_csv(f"/t0-analysis/prompts/{task}/all_.csv", sep=";", index=False)
# df_prompts.to_csv(f"/t0-analysis/prompts/{task}/fixed_prompt_.csv", sep=";", index=False)
# df_prompts.to_csv(f"/t0-analysis/prompts/{task}/fixed_target_yes_no_.csv", sep=";", index=False)
# df_prompts.to_csv(f"/t0-analysis/prompts/{task}/irrelevant_.csv", sep=";", index=False)
# df_prompts.to_csv(f"/t0-analysis/prompts/{task}/misleading_.csv", sep=";", index=False)
# df_prompts.to_csv(f"/t0-analysis/prompts/{task}/controls_.csv", sep=";", index=False)

In [14]:
# Names of the prompt templates to evaluate: the scoring loop skips every row
# of df_prompts whose `name` is not listed here.
use_pattern = [
    "gpt_3_yes_no_with_targets",
    "mnli_crowdsource_with_targets",
    "based_on_previous_passage_with_targets",
    "infer_with_targets",
    "follow_with_targets",
    "imply_with_targets",
    "guaranteed_with_targets",
    "justified_with_targets",
    "must_be_true_with_targets",
    "should_assume_with_targets",
    "gpt_3_true_false_with_targets",
    "gpt_3_cat_dog_with_targets",
    "gpt_3_cat_dog_with_targets_yes_no",
    "gpt_3_yes_no_without_targets",
    "words_appear_with_targets",
    "similar_words_with_targets",
    "start_with_the_with_targets",
    "same_meaning_with_targets",
    "paraphrase_with_targets",
    "paraphrase_r_with_targets",
    "summarize_with_targets",
    "inflection_with_targets",
    "null_pattern_with_targets",
    "null_pattern_r_with_targets",
    "null_pattern_without_targets",
    "null_pattern_r_without_targets",
    "premise_only_with_targets",
    "premise_only_without_targets",
    "hypothesis_only_with_targets",
    "hypothesis_only_without_targets",
]

## Compute task performance

In [15]:
# models_inputs:      model name -> {template name -> DataFrame of prompted
#                     samples extended with the score-based columns}
# models_performance: model name -> DataFrame of accuracies, one row per template
models_inputs, models_performance = {}, {}

# The position of the best-scoring target is folded onto a class index by
# taking it modulo the number of classes of the task.
n_classes = len(TASK_LABELS[task])

# Other available runs: "bigscience-T0", "google-t5-xxl-lm-adapt" (see MODEL_NAMES).
# NOTE: the loop variable has to be called `model`, because get_target_scores
# reads the notebook global of that name to build the score-file path.
for model in ["bigscience-T0_3B", "google-t5-xl-lm-adapt"]:
    print(model)
    df_inputs = {}

    print('collecting scores...')
    for _, row in df_prompts.iterrows():
        prompt = row['name']
        if prompt not in use_pattern:
            continue

        # load prompted inputs, labels and generated predictions
        samples = pd.read_csv(
            f"{log_dir}/{task}/{model}/{module}/{prompt}/prompted_samples.csv",
            sep='\t',
            index_col=0,
        )

        # get scores; the targets come straight from the row being iterated
        targets, target_ids, target_scores, argmax_scores, max_scores = get_target_scores(
            MODEL_NAMES[model], prompt, [row["targets"]]
        )

        samples["targets"] = [targets] * len(samples)
        samples["target_ids"] = [target_ids] * len(samples)
        samples["scores"] = target_scores
        samples["max score"] = max_scores
        samples["argmax score"] = argmax_scores
        samples["argmax score prediction"] = [
            TASK_LABELS[task][idx % n_classes] for idx in argmax_scores
        ]
        df_inputs[prompt] = samples

    models_inputs[model] = df_inputs

    print('computing accuracy...')
    # compute performance for each prompt: fraction of samples whose prediction
    # string equals the label string
    performances = {"task em performance": [], "task score performance": []}
    for template, df in df_inputs.items():
        labels = df["label"].map(str)
        em_correct = df["prediction"].map(str) == labels
        score_correct = df["argmax score prediction"].map(str) == labels
        performances["task em performance"].append(em_correct.mean())
        performances["task score performance"].append(score_correct.mean())

    performances_df = pd.DataFrame(performances, index=df_inputs.keys())
    models_performance[model] = performances_df
    print("\n")

bigscience-T0_3B
collecting scores...


Downloading:   0%|          | 0.00/632 [00:00<?, ?B/s]

computing accuracy...


google-t5-xl-lm-adapt
collecting scores...
computing accuracy...




In [16]:
# Sanity check: the models for which a performance table was stored.
models_performance.keys()

dict_keys(['bigscience-T0_3B', 'google-t5-xl-lm-adapt'])

In [17]:
# Show each model's accuracy table, with templates in the order they were stored.
for model, performance_table in models_performance.items():
    print(model)
    display(performance_table)
    print("\n")

bigscience-T0_3B


Unnamed: 0,task em performance,task score performance
gpt_3_yes_no_with_targets,0.566787,0.566787
mnli_crowdsource_with_targets,0.606498,0.606498
based_on_previous_passage_with_targets,0.693141,0.693141
infer_with_targets,0.599278,0.599278
follow_with_targets,0.592058,0.592058
imply_with_targets,0.574007,0.574007
guaranteed_with_targets,0.635379,0.635379
justified_with_targets,0.606498,0.606498
must_be_true_with_targets,0.703971,0.703971
should_assume_with_targets,0.646209,0.646209




google-t5-xl-lm-adapt


Unnamed: 0,task em performance,task score performance
gpt_3_yes_no_with_targets,0.0,0.552347
mnli_crowdsource_with_targets,0.0,0.527076
based_on_previous_passage_with_targets,0.0,0.527076
infer_with_targets,0.00722,0.527076
follow_with_targets,0.151625,0.530686
imply_with_targets,0.018051,0.527076
guaranteed_with_targets,0.00361,0.527076
justified_with_targets,0.148014,0.527076
must_be_true_with_targets,0.191336,0.527076
should_assume_with_targets,0.0,0.537906






In [18]:
# Show each model's accuracy table, best score-based accuracy first.
for model, performance_table in models_performance.items():
    ranked = performance_table.sort_values("task score performance", ascending=False)
    print(model, "(sorted by task score performance)")
    display(ranked)
    print("\n")

bigscience-T0_3B (sorted by task score performance)


Unnamed: 0,task em performance,task score performance
must_be_true_with_targets,0.703971,0.703971
based_on_previous_passage_with_targets,0.693141,0.693141
paraphrase_with_targets,0.66065,0.66065
should_assume_with_targets,0.646209,0.646209
guaranteed_with_targets,0.635379,0.635379
gpt_3_true_false_with_targets,0.624549,0.624549
summarize_with_targets,0.620939,0.620939
same_meaning_with_targets,0.617329,0.617329
mnli_crowdsource_with_targets,0.606498,0.606498
justified_with_targets,0.606498,0.606498




google-t5-xl-lm-adapt (sorted by task score performance)


Unnamed: 0,task em performance,task score performance
gpt_3_yes_no_with_targets,0.0,0.552347
gpt_3_yes_no_without_targets,0.0,0.548736
null_pattern_with_targets,0.231047,0.537906
should_assume_with_targets,0.0,0.537906
hypothesis_only_with_targets,0.173285,0.530686
null_pattern_r_with_targets,0.151625,0.530686
follow_with_targets,0.151625,0.530686
inflection_with_targets,0.158845,0.530686
start_with_the_with_targets,0.050542,0.530686
mnli_crowdsource_with_targets,0.0,0.527076






In [19]:
# Inspect samples where the generated ("em") prediction and the score-based
# prediction are not the same string.
prompt = "gpt_3_yes_no_with_targets"
df_check = models_inputs["google-t5-xl-lm-adapt"]
samples = df_check[prompt]
disagreement = samples["prediction"] != samples["argmax score prediction"]
samples[disagreement].head(15)

Unnamed: 0,input,label,prediction,targets,target_ids,scores,max score,argmax score,argmax score prediction
0,"Dana Reeve, the widow of the actor Christopher Reeve, has died of lung cancer at age 44, according to the Christopher Reeve Foundation. Question: Christopher Reeve had an accident. Yes or No?",not_entailment,<token>,"[▁Yes, ▁No, ▁yes, ▁no]","[2163, 465, 4273, 150]","[0.03396237, -0.023033142, -3.669095, -3.9741828]",0.033962,0,entailment
1,"Yet, we now are discovering that antibiotics are losing their effectiveness against illness. Disease-causing bacteria are mutating faster than we can come up with new antibiotics to fight the new variations. Question: Bacteria is winning the war against antibiotics. Yes or No?",entailment,<token>,"[▁Yes, ▁No, ▁yes, ▁no]","[2163, 465, 4273, 150]","[-0.08580762, -0.5106403, -4.3245792, -5.1296253]",-0.085808,0,entailment
2,"Cairo is now home to some 15 million people - a burgeoning population that produces approximately 10,000 tonnes of rubbish per day, putting an enormous strain on public services. In the past 10 years, the government has tried hard to encourage private investment in the refuse sector, but some estimate 4,000 tonnes of waste is left behind every day, festering in the heat as it waits for someone to clear it up. It is often the people in the poorest neighbourhoods that are worst affected. But in some areas they are fighting back. In Shubra, one of the northern districts of the city, the residents have taken to the streets armed with dustpans and brushes to clean up public areas which have been used as public dumps. Question: 15 million tonnes of rubbish are produced daily in Cairo. Yes or No?",not_entailment,<token>,"[▁Yes, ▁No, ▁yes, ▁no]","[2163, 465, 4273, 150]","[-0.4349459, -1.0328158, -4.9311695, -5.4728775]",-0.434946,0,entailment
3,"The Amish community in Pennsylvania, which numbers about 55,000, lives an agrarian lifestyle, shunning technological advances like electricity and automobiles. And many say their insular lifestyle gives them a sense that they are protected from the violence of American society. But as residents gathered near the school, some wearing traditional garb and arriving in horse-drawn buggies, they said that sense of safety had been shattered. ""If someone snaps and wants to do something stupid, there's no distance that's going to stop them,"" said Jake King, 56, an Amish lantern maker who knew several families whose children had been shot. Question: Pennsylvania has the biggest Amish community in the U.S. Yes or No?",not_entailment,<token>,"[▁Yes, ▁No, ▁yes, ▁no]","[2163, 465, 4273, 150]","[-0.6161814, -0.93125427, -4.86127, -5.47552]",-0.616181,0,entailment
4,"Security forces were on high alert after an election campaign in which more than 1,000 people, including seven election candidates, have been killed. Question: Security forces were on high alert after a campaign marred by violence. Yes or No?",entailment,<token>,"[▁Yes, ▁No, ▁yes, ▁no]","[2163, 465, 4273, 150]","[0.18155444, -0.7510078, -4.222875, -5.2157507]",0.181554,0,entailment
5,"In 1979, the leaders signed the Egypt-Israel peace treaty on the White House lawn. Both President Begin and Sadat received the Nobel Peace Prize for their work. The two nations have enjoyed peaceful relations to this day. Question: The Israel-Egypt Peace Agreement was signed in 1979. Yes or No?",entailment,<token>,"[▁Yes, ▁No, ▁yes, ▁no]","[2163, 465, 4273, 150]","[0.9798691, -0.079351366, -3.5683565, -4.5615983]",0.979869,0,entailment
6,"singer and actress Britney Spears, 24, has filled papers in Los Angeles County Superior Court to divorce her husband Kevin Federline, 28. A spokeswoman for the court, Kathy Roberts stated that the papers cited irreconcilable differences"" as the reason for the divorce and have, according to the courts, been legally separated as of Monday, November 6, the same day that Spears appeared on Late Night with David Letterman. Question: Spears is to divorce from Kevin Federline. Yes or No?",entailment,<token>,"[▁Yes, ▁No, ▁yes, ▁no]","[2163, 465, 4273, 150]","[0.8159149, -0.21430445, -2.5335946, -4.1337037]",0.815915,0,entailment
7,"Following the successful bid to bring the 2010 Ryder Cup to Wales, the Wales Tourist Board has wasted little time in commissioning work to ensure that the benefits accruing from the event are felt throughout the country. Question: Wales to host 2010 Ryder Cup. Yes or No?",entailment,<token>,"[▁Yes, ▁No, ▁yes, ▁no]","[2163, 465, 4273, 150]","[-0.28284234, -0.523709, -4.515559, -4.943336]",-0.282842,0,entailment
8,Steve Jobs was attacked by Sculley and other Apple executives for not delivering enough hot new products and resigned from the company a few weeks later. Question: Steve Jobs worked for Apple. Yes or No?,entailment,<token>,"[▁Yes, ▁No, ▁yes, ▁no]","[2163, 465, 4273, 150]","[-0.2379162, -0.2371652, -4.5084333, -4.8143]",-0.237165,1,not_entailment
9,"Traditionally, the Brahui of the Raisani tribe are in charge of the law and order situation through the Pass area. This tribe is still living in present day Balochistan in Pakistan. Question: The Raisani tribe resides in Pakistan. Yes or No?",entailment,<token>,"[▁Yes, ▁No, ▁yes, ▁no]","[2163, 465, 4273, 150]","[-0.6030861, -0.5878572, -4.7725167, -4.8422356]",-0.587857,1,not_entailment
