In [1]:
import pandas as pd
import transformers 
import numpy as np
import torch
import sys

sys.path.insert(0, '..')

from decompose_gpt2 import GPT2LMHeadModelDecomposed

%load_ext autoreload
%autoreload 2


In [2]:
# Load the pretrained "gpt2" tokenizer and reuse its end-of-sequence token as the padding token.
tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


In [3]:
# Collect the token id of every verb in all_VERBs.csv (column "WORD") that the
# tokenizer encodes as exactly one token when prefixed with a space; verbs that
# split into several tokens are skipped.
verbs = pd.read_csv("all_VERBs.csv")["WORD"]
verb_ids = []

# Iterate the Series itself: `verbs.iloc` is an indexer object that is only
# iterable through Python's legacy __getitem__/IndexError fallback.
for verb in verbs:
    i = tokenizer.encode(" " + verb)
    if len(i) == 1:
        verb_ids.append(i[0])

# Build the int64 index tensor directly rather than creating a float32 tensor
# with the legacy torch.Tensor(...) constructor and casting it afterwards.
verb_ids = torch.tensor(verb_ids, dtype=torch.long)


In [4]:
# Load the tab-separated agreement dataset and reshape it to one row per sentence prefix.
number_df = pd.read_csv("nounpp.tsv", delimiter="\t")
# NOTE(review): max_length is not referenced anywhere else in the visible notebook.
max_length = 16

# Strip the first two characters of each id and parse the remainder as an integer.
number_df["id"] = number_df["id"].apply(lambda x: int(x[2:]))

# only keep sentences that are plural/singular or singular/plural (distractor has different number)
number_df["subject_distractor_number"] = number_df["subject_distractor_number"].apply(
    lambda x: x if x == "singular_plural" or x == "plural_singular" else np.nan
)
# NOTE: dropna() removes rows with a NaN in *any* column, not only the column marked above.
number_df.dropna(inplace=True)

# Split "<subject>_<distractor>" into its two halves.
number_df["subject_number"] = number_df["subject_distractor_number"].apply(lambda x: x.split("_")[0])
number_df["distractor_number"] = number_df["subject_distractor_number"].apply(lambda x: x.split("_")[1])

# The last space-separated word becomes the verb (kept with a leading space);
# everything before it becomes the sentence prefix.
number_df["verb"] = number_df["sentence"].apply(lambda x: " " + x.split(" ")[-1])
number_df["sentence"] = number_df["sentence"].apply(lambda x: " ".join(x.split(" ")[:-1]))

# Pivot so that each (id, subject_number, distractor_number, sentence) row carries its verb
# once per value of `correctness`; the result has two-level columns such as ("verb", "correct").
number_df = number_df.drop(
    columns=["subject_distractor_number"]
    ).pivot(index=["id", "subject_number", "distractor_number", "sentence"], columns=["correctness"], values=["verb"]).reset_index()


def get_token(correct_token, wrong_token):
    """Pair up the single-token ids of the correct and wrong verb forms.

    Args:
        correct_token: sequence of token ids for the correct verb form.
        wrong_token: sequence of token ids for the wrong verb form.

    Returns:
        ``[correct_id, wrong_id]`` when neither sequence has more than one
        token, otherwise ``np.nan`` so that the row can be dropped later.
    """
    both_single = len(correct_token) <= 1 and len(wrong_token) <= 1
    if not both_single:
        return np.nan

    first_correct = correct_token[0]
    first_wrong = wrong_token[0]
    return [first_correct, first_wrong]

# Tokenize both verb forms; the ids are stored in temporary ("token", ...) columns.
number_df[("token", "correct")] = tokenizer(number_df[("verb", "correct")].to_list())["input_ids"]
number_df[("token", "wrong")] = tokenizer(number_df[("verb", "wrong")].to_list())["input_ids"]

# [correct_id, wrong_id] per row, or NaN when get_token rejects the pair
# (either form is longer than one token).
number_df[("verb_tokens")] = number_df.apply(
    lambda x: get_token(x[("token", "correct")], x[("token", "wrong")]),
    axis=1)

# Token ids of the sentence prefix (the sentence without its final verb).
number_df[("sentence_tokens")] = tokenizer(number_df[("sentence", "")].to_list())["input_ids"]

# Drop rows containing NaN (e.g. the multi-token verbs marked above), renumber the rows
# from 0, and remove the temporary "token" columns. No de-duplication happens here.
number_df = number_df.dropna().reset_index(drop=True).drop(columns="token")
number_df


  number_df = number_df.dropna().reset_index(drop=True).drop(columns="token")


Unnamed: 0_level_0,id,subject_number,distractor_number,sentence,verb,verb,verb_tokens,sentence_tokens
correctness,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,correct,wrong,Unnamed: 7_level_1,Unnamed: 8_level_1
0,601,singular,plural,The athlete behind the bikes,approves,approve,"[43770, 14762]","[464, 16076, 2157, 262, 16715]"
1,602,singular,plural,The athlete behind the cars,inspires,inspire,"[38934, 18330]","[464, 16076, 2157, 262, 5006]"
2,603,singular,plural,The athlete behind the cats,engages,engage,"[32902, 8209]","[464, 16076, 2157, 262, 11875]"
3,604,singular,plural,The athlete behind the cats,remembers,remember,"[18140, 3505]","[464, 16076, 2157, 262, 11875]"
4,605,singular,plural,The athlete behind the chairs,observes,observe,"[34526, 12414]","[464, 16076, 2157, 262, 18791]"
...,...,...,...,...,...,...,...,...
882,1795,plural,singular,The women near the dog,engage,engages,"[8209, 32902]","[464, 1466, 1474, 262, 3290]"
883,1796,plural,singular,The women near the tree,greet,greets,"[12589, 45204]","[464, 1466, 1474, 262, 5509]"
884,1797,plural,singular,The women near the window,avoid,avoids,"[3368, 30940]","[464, 1466, 1474, 262, 4324]"
885,1799,plural,singular,The women near the window,engage,engages,"[8209, 32902]","[464, 1466, 1474, 262, 4324]"


In [5]:
def make_component_masks(sentence_tokens, prep_tokens=None):
    """Build 0/1 position masks for the five components of a sentence prefix.

    Components are located purely by position relative to the first
    preposition token found at index >= 1 (index 0 is never tested):

    ====  ==========================================
    row   positions set to 1
    ====  ==========================================
    0     index 0 (the leading "The")
    1     indices 1 .. prep-1 (subject)
    2     the preposition index
    3     the index right after the preposition
    4     every remaining index (distractor)
    ====  ==========================================

    Args:
        sentence_tokens: sequence of token ids for the sentence prefix.
        prep_tokens: token ids treated as prepositions. When ``None`` (the
            default) they are obtained from the notebook-global tokenizer via
            ``tokenizer.encode(' beside near behind')``.

    Returns:
        ``numpy.ndarray`` of shape ``(5, len(sentence_tokens))`` filled with
        0.0 / 1.0.

    Raises:
        IndexError: if no preposition token occurs at index >= 1, or if the
            preposition is the last token (row 3 would fall outside the array).
    """
    if prep_tokens is None:
        # Per the original author's note these are [13970, 1474, 2157]
        # (beside, near, behind).
        prep_tokens = tokenizer.encode(' beside near behind')

    n_tokens = len(sentence_tokens)

    # Bounded search instead of an open-ended while loop, so a sentence without
    # a preposition fails with an explicit message.
    prep_i = next(
        (pos for pos in range(1, n_tokens) if sentence_tokens[pos] in prep_tokens),
        None,
    )
    if prep_i is None:
        raise IndexError(
            "no preposition token found in sentence_tokens at index >= 1"
        )

    components = [
        0,                                  # leading "The"
        list(range(1, prep_i)),             # subject tokens
        prep_i,                             # preposition
        prep_i + 1,                         # token following the preposition
        list(range(prep_i + 2, n_tokens)),  # distractor tokens
    ]

    # One row per component; an int selects one column, a list selects several.
    component_masks = np.zeros((5, n_tokens))
    for row, positions in enumerate(components):
        component_masks[row, positions] = 1

    return component_masks


# Store, for every row, the (5, n_tokens) component-mask array built from its sentence tokens.
number_df["beta_mask"] = number_df["sentence_tokens"].apply(make_component_masks)


In [13]:
# Placeholder for the notebook-global model: get_proportion_contribution reads this name at
# call time, and later cells rebind it to a GPT2LMHeadModelDecomposed instance.
decomposed_model = None

def split_pos_neg_contributions(logits):
    """Re-express signed two-class logits as non-negative evidence per class.

    The last axis of ``logits`` is indexed at 0 and 1 (the two classes). A
    negative value for one class is counted, by magnitude, as evidence for the
    other class:

    * class-1 evidence = positive part of ``logits[..., 1]``
      plus magnitude of the negative part of ``logits[..., 0]``
    * class-0 evidence = positive part of ``logits[..., 0]``
      plus magnitude of the negative part of ``logits[..., 1]``

    Args:
        logits: numpy array; per the original note its shape is
            (num_contributions + 1 (bias), num_classes).

    Returns:
        numpy array stacking ``[class-0 evidence, class-1 evidence]`` along a
        new axis at position -2.
    """
    is_positive = (logits > 0).astype(int)
    is_not_positive = 1 - is_positive

    class0 = logits[..., 0]
    class1 = logits[..., 1]

    # Keep the strictly positive entries of a class, then subtract the
    # non-positive entries of the other class (which adds their magnitude).
    positive_logits = class1 * is_positive[..., 1] - class0 * is_not_positive[..., 0]
    negative_logits = class0 * is_positive[..., 0] - class1 * is_not_positive[..., 1]

    return np.stack([negative_logits, positive_logits], axis=-2)


def get_proportion_contribution(n):
    """Share of the verb evidence attributed to each sentence component of row ``n``.

    Reads the notebook globals ``number_df``, ``decomposed_model`` and
    ``verb_ids``. For each of the row's component masks the model is run with a
    two-way ``beta_mask`` (the component vs. everything else), the last-position
    logits are standardised with the mean/std taken over ``verb_ids``, and the
    logits of the row's two verb tokens are passed through
    ``split_pos_neg_contributions``.

    Args:
        n: row label in ``number_df``.

    Returns:
        ``torch`` tensor of shape ``(5, 2)``; column 0 holds the share for the
        first entry of ``verb_tokens`` (the correct verb), column 1 for the
        second (the wrong verb).
    """
    component_masks = torch.tensor(number_df["beta_mask"][n])
    verb_pair = torch.tensor(number_df["verb_tokens"][n])
    token_ids = torch.tensor(number_df["sentence_tokens"][n])

    contribution_logits = torch.zeros((5, 2))

    for row, component_mask in enumerate(component_masks):
        # Partition of the positions into (this component, all other positions).
        partition = torch.stack([component_mask, 1 - component_mask]).unsqueeze(0)

        with torch.no_grad():
            logits = decomposed_model(input_ids=token_ids,
                                      beta_mask=partition)["logits"]

        # Keep the last sequence position only.
        # assumes the model returns (num_contributions, seq_len, vocab) - TODO confirm
        last_step = logits[:, -1, :]

        # Standardise each contribution using statistics over the verb vocabulary.
        centred = last_step - last_step[:, verb_ids].mean(1, keepdim=True)
        scaled = centred / centred[:, verb_ids].std(1, keepdim=True)

        correct, wrong = split_pos_neg_contributions(scaled[:, verb_pair].numpy())

        # beta_z_t / z_t: the first contribution's share of the total
        # (the epsilon guards against a zero denominator).
        contribution_logits[row, 0] = correct[0] / (correct.sum() + 1e-10)
        contribution_logits[row, 1] = wrong[0] / (wrong.sum() + 1e-10)

    return contribution_logits


### GCD w/ fixed bias

In [15]:
# Rebind the notebook-global model used by get_proportion_contribution.
# This section ("GCD w/ fixed bias") uses generalized=True and shapley_include_bias=False;
# the meaning of these flags is defined in decompose_gpt2 (not shown here).
decomposed_model = GPT2LMHeadModelDecomposed.from_pretrained(
    "gpt2",
    debug=False,
    shapley_include_bias=False,
    generalized=True, 
    num_contributions=2
    )


In [16]:
# Quick check on rows 0..99 of number_df only: one (5, 2) tensor per sentence,
# printing the row number every 20 rows as a progress indicator.
contribution_logits = []
for i in range(100):
    contribution_logits.append(get_proportion_contribution(i))
    if i % 20 == 0:
        print(i)

# Stack the per-sentence tensors along a new leading dimension.
contribution_logits = torch.stack(contribution_logits)


0
20
40
60
80


In [17]:
# Average over sentences (dim 0). Rows: the 5 sentence components;
# columns: (correct verb, wrong verb), as filled in by get_proportion_contribution.
contribution_logits.mean(0)


tensor([[0.5407, 0.0480],
        [0.4395, 0.2156],
        [0.4578, 0.1599],
        [0.3725, 0.2872],
        [0.1809, 0.3719]])

In [19]:
# Run the decomposition over every row of number_df, bucketing the results by subject number.
contribution_logits_sp = []  # rows whose subject_number is "singular"
contribution_logits_ps = []  # all other rows

for i in range(len(number_df)):
    is_singular = number_df["subject_number"][i] == "singular"
    bucket = contribution_logits_sp if is_singular else contribution_logits_ps
    bucket.append(get_proportion_contribution(i))
    # progress indicator: print every 20th row number
    if i % 20 == 0:
        print(i)

# Stack each bucket's per-sentence (5, 2) tensors along a new leading dimension.
contribution_logits_sp = torch.stack(contribution_logits_sp)
contribution_logits_ps = torch.stack(contribution_logits_ps)


0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880


In [20]:
# Mean over the singular-subject sentences; rows = 5 components, columns = (correct, wrong) verb.
contribution_logits_sp.mean(0)


tensor([[0.5353, 0.0385],
        [0.4375, 0.2356],
        [0.4637, 0.1839],
        [0.3525, 0.2781],
        [0.2115, 0.3860]])

In [21]:
# Mean over the remaining (non-singular-subject) sentences; rows = 5 components, columns = (correct, wrong) verb.
contribution_logits_ps.mean(0)


tensor([[0.3040, 0.3017],
        [0.6178, 0.1336],
        [0.2425, 0.5139],
        [0.1563, 0.6640],
        [0.1926, 0.6071]])

### CD w/ fixed bias

In [22]:
# Rebind the notebook-global model used by get_proportion_contribution.
# This section ("CD w/ fixed bias") uses generalized=False and shapley_include_bias=False;
# the meaning of these flags is defined in decompose_gpt2 (not shown here).
decomposed_model = GPT2LMHeadModelDecomposed.from_pretrained(
    "gpt2",
    debug=False,
    shapley_include_bias=False,
    generalized=False, 
    num_contributions=2
    )


In [23]:
# Run the decomposition over every row of number_df, bucketing the results by subject number.
contribution_logits_sp = []  # rows whose subject_number is "singular"
contribution_logits_ps = []  # all other rows

for i in range(len(number_df)):
    is_singular = number_df["subject_number"][i] == "singular"
    bucket = contribution_logits_sp if is_singular else contribution_logits_ps
    bucket.append(get_proportion_contribution(i))
    # progress indicator: print every 20th row number
    if i % 20 == 0:
        print(i)

# Stack each bucket's per-sentence (5, 2) tensors along a new leading dimension.
contribution_logits_sp = torch.stack(contribution_logits_sp)
contribution_logits_ps = torch.stack(contribution_logits_ps)


0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880


In [24]:
# Mean over the singular-subject sentences; rows = 5 components, columns = (correct, wrong) verb.
contribution_logits_sp.mean(0)


tensor([[0.3206, 0.3162],
        [0.4249, 0.2105],
        [0.4493, 0.1565],
        [0.3951, 0.2412],
        [0.1902, 0.4667]])

In [25]:
# Mean over the remaining (non-singular-subject) sentences; rows = 5 components, columns = (correct, wrong) verb.
contribution_logits_ps.mean(0)


tensor([[0.2694, 0.3193],
        [0.2261, 0.4094],
        [0.1498, 0.4709],
        [0.2260, 0.4176],
        [0.3647, 0.2092]])

### GCD w/ permuted bias

In [26]:
# Rebind the notebook-global model used by get_proportion_contribution.
# This section ("GCD w/ permuted bias") uses generalized=True and shapley_include_bias=True;
# the meaning of these flags is defined in decompose_gpt2 (not shown here).
decomposed_model = GPT2LMHeadModelDecomposed.from_pretrained(
    "gpt2",
    debug=False,
    shapley_include_bias=True,
    generalized=True, 
    num_contributions=2
    )


In [27]:
# Run the decomposition over every row of number_df, bucketing the results by subject number.
contribution_logits_sp = []  # rows whose subject_number is "singular"
contribution_logits_ps = []  # all other rows

for i in range(len(number_df)):
    is_singular = number_df["subject_number"][i] == "singular"
    bucket = contribution_logits_sp if is_singular else contribution_logits_ps
    bucket.append(get_proportion_contribution(i))
    # progress indicator: print every 20th row number
    if i % 20 == 0:
        print(i)

# Stack each bucket's per-sentence (5, 2) tensors along a new leading dimension.
contribution_logits_sp = torch.stack(contribution_logits_sp)
contribution_logits_ps = torch.stack(contribution_logits_ps)


0
20
40
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880


In [28]:
# Mean over the singular-subject sentences; rows = 5 components, columns = (correct, wrong) verb.
contribution_logits_sp.mean(0)


tensor([[0.4156, 0.1172],
        [0.4726, 0.1176],
        [0.3775, 0.1112],
        [0.3851, 0.1372],
        [0.2480, 0.2046]])

In [29]:
# Mean over the remaining (non-singular-subject) sentences; rows = 5 components, columns = (correct, wrong) verb.
contribution_logits_ps.mean(0)


tensor([[0.2594, 0.3282],
        [0.2190, 0.3734],
        [0.1507, 0.3404],
        [0.1407, 0.4034],
        [0.2184, 0.2696]])

### CD w/ permuted bias

In [30]:
# Rebind the notebook-global model used by get_proportion_contribution.
# This section ("CD w/ permuted bias") uses generalized=False and shapley_include_bias=True;
# the meaning of these flags is defined in decompose_gpt2 (not shown here).
decomposed_model = GPT2LMHeadModelDecomposed.from_pretrained(
    "gpt2",
    debug=False,
    shapley_include_bias=True,
    generalized=False, 
    num_contributions=2
    )


In [None]:
# Run the decomposition over every row of number_df, bucketing the results by subject number.
contribution_logits_sp = []  # rows whose subject_number is "singular"
contribution_logits_ps = []  # all other rows

for i in range(len(number_df)):
    is_singular = number_df["subject_number"][i] == "singular"
    bucket = contribution_logits_sp if is_singular else contribution_logits_ps
    bucket.append(get_proportion_contribution(i))
    # progress indicator: print every 20th row number
    if i % 20 == 0:
        print(i)

# Stack each bucket's per-sentence (5, 2) tensors along a new leading dimension.
contribution_logits_sp = torch.stack(contribution_logits_sp)
contribution_logits_ps = torch.stack(contribution_logits_ps)


In [None]:
# Mean over the singular-subject sentences; rows = 5 components, columns = (correct, wrong) verb.
contribution_logits_sp.mean(0)


In [None]:
# Mean over the remaining (non-singular-subject) sentences; rows = 5 components, columns = (correct, wrong) verb.
contribution_logits_ps.mean(0)
