In [1]:
import os

import numpy as np
from sklearn.linear_model import LogisticRegression

from utils import load_hidden_representations_from_hdf5, read_templates_from_file

----

In [2]:
# params
# Path components identifying which saved hidden representations to analyse.
task = "rte"
model = "google-t5-xl-lm-adapt"
module = "encoder"
# Root directory that the per-prompt representation file paths are joined onto.
log_dir = "/logfiles"

In [3]:
# assertions
# Fail fast, and say which setting is wrong, if the parameters differ from the
# single configuration this notebook is pinned to.
assert model == "google-t5-xl-lm-adapt", f"unexpected model: {model!r}"
assert module == "encoder", f"unexpected module: {module!r}"
assert task == "rte", f"unexpected task: {task!r}"

## Prompts

In [4]:
# Prompt-template file to analyse; swap in one of the alternatives below to
# switch to a different template set for the same task.
templates_path = f"/t0-analysis/prompts/{task}/all.csv"
# templates_path = f"/t0-analysis/prompts/{task}/fixed_prompt.csv"
# templates_path = f"/t0-analysis/prompts/{task}/fixed_target_yes_no.csv"
df = read_templates_from_file(templates_path)
display(df)

Unnamed: 0,name,template,category,includes_targets,targets,target_ids,shuffle
0,gpt_3_yes_no_with_targets,{premise} Question: {hypothesis} Yes or No?,instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
1,mnli_crowdsource_with_targets,"{premise} Using only the above description and what you know about the world, is ""{hypothesis}"" definitely correct? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
2,based_on_previous_passage_with_targets,"{premise} Based on the previous passage, is it true that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
3,infer_with_targets,"Suppose {premise} Can we infer that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
4,follow_with_targets,Given that {premise} Does it follow that {hypothesis} Yes or No?,instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
5,imply_with_targets,"{premise} Question: Does this imply that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
6,guaranteed_with_targets,"Given {premise} Is it guaranteed true that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
7,justified_with_targets,"{premise} Are we justified in saying that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
8,must_be_true_with_targets,"Given that {premise} Therefore, it must be true that ""{hypothesis}""? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False
9,should_assume_with_targets,"Given {premise} Should we assume that ""{hypothesis}"" is true? Yes or No?",instructive,True,"▁Yes, ▁No, ▁yes, ▁no","0, 1, 0, 1",False


## Train linear classifier

In [5]:
def unison_shuffled_copies(a, b, rng=None):
    """Return shuffled copies of ``a`` and ``b`` that share one random permutation.

    Row ``i`` of the first result and row ``i`` of the second result come from
    the same original index, so paired data (e.g. features and labels) stay
    aligned.

    Args:
        a: Array supporting integer-array indexing along its first axis.
        b: Array with the same length as ``a``.
        rng: Optional random source exposing ``permutation(n)`` (for example a
            ``numpy.random.Generator`` or ``numpy.random.RandomState``). When
            omitted, the global ``numpy.random`` state is used, which is the
            original behaviour.

    Returns:
        Tuple ``(a_shuffled, b_shuffled)``.

    Raises:
        AssertionError: If ``a`` and ``b`` differ in length.
    """
    # from: https://stackoverflow.com/questions/4601373/better-way-to-shuffle-two-numpy-arrays-in-unison
    assert len(a) == len(b), f"length mismatch: {len(a)} != {len(b)}"
    # Passing an explicit generator makes the shuffle reproducible without
    # touching global RNG state.
    source = np.random if rng is None else rng
    p = source.permutation(len(a))
    return a[p], b[p]

In [6]:
# Names of the prompt templates to include in the probing experiments below.
# A row of `df` is used only if its 'name' appears in this list; comment or
# uncomment entries to change the selection.
use_pattern = [
    "gpt_3_yes_no_with_targets",
    "mnli_crowdsource_with_targets",
    "based_on_previous_passage_with_targets",
    "infer_with_targets",
    # "follow_with_targets",
    # "imply_with_targets",
    # "guaranteed_with_targets",
    # "justified_with_targets",
    # "must_be_true_with_targets",
    # "should_assume_with_targets",
    # "gpt_3_true_false_with_targets",
    # "gpt_3_cat_dog_with_targets",
    # "gpt_3_cat_dog_with_targets_yes_no",
    # "gpt_3_yes_no_without_targets",
    # "words_appear_with_targets",
    # "similar_words_with_targets",
    # "start_with_the_with_targets",
    # "same_meaning_with_targets",
    # "paraphrase_with_targets",
    # "paraphrase_r_with_targets",
    # "summarize_with_targets",
    # "inflection_with_targets",
    # "null_pattern_with_targets",
    # "null_pattern_r_with_targets",
    # "null_pattern_without_targets",
    # "null_pattern_r_without_targets",
    # "premise_only_with_targets",
    # "premise_only_without_targets",
    # "hypothesis_only_with_targets",
    # "hypothesis_only_without_targets",
]

In [7]:
# Linear probe with the row index inside each prompt's file ("sample id") as the
# label: every selected prompt contributes one row per label.
# assumes row i of every prompt file corresponds to the same input — TODO confirm
max_samples_per_prompt = 100  # only the first N rows of each file are used

# The prompt selection does not depend on the layer, so resolve it once.
prompt_names = [name for name in df['name'] if name in use_pattern]
if not prompt_names:
    # Without this guard the cell would fail later with an opaque error on `None.shape`.
    raise ValueError("no prompt in `df` matches `use_pattern`; nothing to classify")

for layer in range(0, 3):
# for layer in range(0, 25):
# for layer in range(24, 25):

    print("layer:", layer)

    # One file per selected prompt for the current layer.
    # Alternative file suffix: "_avg.hdf5" instead of "_avg-nopad.hdf5".
    # NOTE(review): "represenations" presumably matches the file names on disk — verify before renaming.
    file_names = [
        f"{task}/{model}/{module}/{name}/hidden_represenations_layer{layer}_avg-nopad.hdf5"
        for name in prompt_names
    ]

    # load hidden representations from hdf5 file
    per_prompt_representations = []
    classes = []

    for file_name in file_names:
        hidden_representations = load_hidden_representations_from_hdf5(os.path.join(log_dir, file_name))
        # take only the first `max_samples_per_prompt` samples
        hidden_representations = hidden_representations[:max_samples_per_prompt, :]
        n_sequences = hidden_representations.shape[0]
        per_prompt_representations.append(hidden_representations)

        # use sample id as label
        # (alternative: use the prompt's position in `file_names` as label)
        classes += list(range(n_sequences)) # assign representations to classes

    # Concatenate once instead of re-copying the growing array for every file.
    representations = np.concatenate(per_prompt_representations, axis=0)
    classes = np.asarray(classes)

    X, y = representations, classes
    # shuffle representations and classes
    # X, y = unison_shuffled_copies(X, y)
    print(X.shape, y.shape)

    # train linear classifier
    # multi_class='multinomial' uses a CE loss
    # NOTE(review): a recorded run of this cell ended with an lbfgs "iterations reached
    # limit" message; scaling the features or raising max_iter may be needed.
    print('classifying input ids for prompts:', prompt_names)
    clf = LogisticRegression(random_state=0, max_iter=1000, multi_class='multinomial').fit(X, y)

    # NOTE(review): the score is computed on the rows the model was fitted on, so it
    # measures separability of the training set, not generalisation.
    print(f'layer={layer}; accuracy on training data: ', clf.score(X, y))
    print('\n')



layer: 0


Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 4504.16it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 5963.84it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 5287.91it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 6056.01it/s]


(400, 2048) (400,)
classifying input ids for prompts: ['gpt_3_yes_no_with_targets', 'mnli_crowdsource_with_targets', 'based_on_previous_passage_with_targets', 'infer_with_targets']
layer=0; accuracy on training data:  1.0


layer: 1


Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 4362.08it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 5758.67it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 6179.91it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 5733.06it/s]


(400, 2048) (400,)
classifying input ids for prompts: ['gpt_3_yes_no_with_targets', 'mnli_crowdsource_with_targets', 'based_on_previous_passage_with_targets', 'infer_with_targets']
layer=1; accuracy on training data:  1.0


layer: 2


Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 4466.18it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 5854.78it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 5925.76it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 5913.39it/s]


(400, 2048) (400,)
classifying input ids for prompts: ['gpt_3_yes_no_with_targets', 'mnli_crowdsource_with_targets', 'based_on_previous_passage_with_targets', 'infer_with_targets']
layer=2; accuracy on training data:  1.0




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [8]:
# Linear probe with the prompt template as the label: every row loaded from the
# k-th selected prompt's file gets class k.
max_samples_per_prompt = 100  # only the first N rows of each file are used

# The prompt selection does not depend on the layer, so resolve it once.
prompt_names = [name for name in df['name'] if name in use_pattern]
if not prompt_names:
    # Without this guard the cell would fail later with an opaque error on `None.shape`.
    raise ValueError("no prompt in `df` matches `use_pattern`; nothing to classify")

for layer in range(0, 3):
# for layer in range(0, 25):
# for layer in range(24, 25):

    print("layer:", layer)

    # One file per selected prompt for the current layer.
    # Alternative file suffix: "_avg.hdf5" instead of "_avg-nopad.hdf5".
    # NOTE(review): "represenations" presumably matches the file names on disk — verify before renaming.
    file_names = [
        f"{task}/{model}/{module}/{name}/hidden_represenations_layer{layer}_avg-nopad.hdf5"
        for name in prompt_names
    ]

    # load hidden representations from hdf5 file
    per_prompt_representations = []
    classes = []

    for idx, file_name in enumerate(file_names):
        hidden_representations = load_hidden_representations_from_hdf5(os.path.join(log_dir, file_name))
        # take only the first `max_samples_per_prompt` samples
        hidden_representations = hidden_representations[:max_samples_per_prompt, :]
        n_sequences = hidden_representations.shape[0]
        per_prompt_representations.append(hidden_representations)

        # use pattern id as label
        # (alternative: use the row index within the file, i.e. the sample id, as label)
        classes += n_sequences * [idx] # assign representations to classes

    # Concatenate once instead of re-copying the growing array for every file.
    representations = np.concatenate(per_prompt_representations, axis=0)
    classes = np.asarray(classes)

    X, y = representations, classes
    # shuffle representations and classes
    # X, y = unison_shuffled_copies(X, y)
    print(X.shape, y.shape)

    # train linear classifier
    # multi_class='multinomial' uses a CE loss
    # NOTE(review): a recorded run of this cell ended with an lbfgs "iterations reached
    # limit" message; scaling the features or raising max_iter may be needed.
    print('classifying between:', prompt_names)
    clf = LogisticRegression(random_state=0, max_iter=1000, multi_class='multinomial').fit(X, y)

    # NOTE(review): the score is computed on the rows the model was fitted on, so it
    # measures separability of the training set, not generalisation.
    print(f'layer={layer}; accuracy on training data: ', clf.score(X, y))
    print('\n')



layer: 0


Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 5815.74it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 6048.57it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 6006.79it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 6233.49it/s]

(400, 2048) (400,)
classifying between: ['gpt_3_yes_no_with_targets', 'mnli_crowdsource_with_targets', 'based_on_previous_passage_with_targets', 'infer_with_targets']





layer=0; accuracy on training data:  1.0


layer: 1


Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 4826.89it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 5122.27it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 5897.16it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 5745.79it/s]


(400, 2048) (400,)
classifying between: ['gpt_3_yes_no_with_targets', 'mnli_crowdsource_with_targets', 'based_on_previous_passage_with_targets', 'infer_with_targets']
layer=1; accuracy on training data:  1.0


layer: 2


Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 4407.24it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 5718.61it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 5469.12it/s]
Reading embeddings: 100%|██████████| 277/277 [00:00<00:00, 5978.54it/s]


(400, 2048) (400,)
classifying between: ['gpt_3_yes_no_with_targets', 'mnli_crowdsource_with_targets', 'based_on_previous_passage_with_targets', 'infer_with_targets']
layer=2; accuracy on training data:  1.0




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
