In [1]:
%load_ext autoreload
%autoreload 2 

import pandas as pd
import numpy as np
import torch
import sys

sys.path.insert(0, '..')
from decompose_lstm import DecomposedLSTM
from lstm_dictionary import Dictionary


In [2]:
# Locations of the pretrained LSTM weights and of its vocabulary file,
# given relative to the directory this notebook is run from.
model_path = "../../lstm/data/model/state_dict.pt"
vocab_path = "../../lstm/data/model/vocab.txt"

# Vocabulary wrapper; later cells use its `word2idx`, `idx2word` and `tokenize`.
tokenizer = Dictionary(vocab_path)


In [3]:
# Vocabulary ids of every verb in the verb list that the model's vocabulary
# contains. Later cells use `verb_ids` to select the verb columns of the
# output layer when standardising scores.
verbs = pd.read_csv("all_VERBs.csv")["WORD"]

# Iterate the Series directly and build an integer tensor in one step, so the
# ids never pass through float32.
verb_ids = torch.tensor(
    [tokenizer.word2idx[verb] for verb in verbs if verb in tokenizer.word2idx],
    dtype=torch.long,
)

# Fraction of the listed verbs that are present in the vocabulary.
len(verb_ids) / len(verbs)


0.3648072734160432

In [4]:
# Load the number-agreement sentences and reshape them to one row per prompt.
number_df = pd.read_csv("nounpp.tsv", delimiter="\t")
# NOTE(review): `max_length` is not referenced anywhere else in the visible notebook.
max_length = 16

# Drop the first two characters of the id string and keep the rest as an integer.
number_df["id"] = number_df["id"].apply(lambda x: int(x[2:]))

# Only keep sentences that are plural/singular or singular/plural (the distractor
# has a different number than the subject): every other value is replaced by NaN
# and then dropped.
number_df["subject_distractor_number"] = number_df["subject_distractor_number"].apply(
    lambda x: x if x == "singular_plural" or x == "plural_singular" else np.nan
)
# NOTE: no `subset` is given, so a row with a missing value in ANY column is
# dropped here as well, not only the rows filtered out above.
number_df.dropna(inplace=True)

# "<subject>_<distractor>" -> two separate columns.
number_df["subject_number"] = number_df["subject_distractor_number"].apply(lambda x: x.split("_")[0])
number_df["distractor_number"] = number_df["subject_distractor_number"].apply(lambda x: x.split("_")[1])

# The last space-separated word is split off as the verb (with a leading space
# prepended); "sentence" keeps everything before it. Order matters: "verb" must
# be extracted before "sentence" is overwritten.
# NOTE(review): the leading space is presumably what the tokenizer expects — confirm.
number_df["verb"] = number_df["sentence"].apply(lambda x: " " + x.split(" ")[-1])
number_df["sentence"] = number_df["sentence"].apply(lambda x: " ".join(x.split(" ")[:-1]))

# Put the verbs of the different `correctness` values side by side: one row per
# (id, subject_number, distractor_number, sentence). The resulting frame has
# two-level column labels, e.g. ("verb", "correct") and ("verb", "wrong").
number_df = number_df.drop(
    columns=["subject_distractor_number"]
    ).pivot(index=["id", "subject_number", "distractor_number", "sentence"], columns=["correctness"], values=["verb"]).reset_index()

number_df


Unnamed: 0_level_0,id,subject_number,distractor_number,sentence,verb,verb
correctness,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,correct,wrong
0,601,singular,plural,The athlete behind the bikes,approves,approve
1,602,singular,plural,The athlete behind the cars,inspires,inspire
2,603,singular,plural,The athlete behind the cats,engages,engage
3,604,singular,plural,The athlete behind the cats,remembers,remember
4,605,singular,plural,The athlete behind the chairs,observes,observe
...,...,...,...,...,...,...
1195,1796,plural,singular,The women near the tree,greet,greets
1196,1797,plural,singular,The women near the window,avoid,avoids
1197,1798,plural,singular,The women near the window,discourage,discourages
1198,1799,plural,singular,The women near the window,engage,engages


In [5]:
# Token ids of the prompt (everything before the verb), one tokenizer output per row.
number_df["sentence_tokens"] = number_df["sentence"].apply(tokenizer.tokenize)
# Single vocabulary id of each verb form: `.item()` only succeeds when the
# tokenizer returns exactly one element for the verb.
number_df["correct_token"] = number_df[("verb", "correct")].apply(lambda x: tokenizer.tokenize(x).item())
number_df["wrong_token"] = number_df[("verb", "wrong")].apply(lambda x: tokenizer.tokenize(x).item())

number_df.head()


Unnamed: 0_level_0,id,subject_number,distractor_number,sentence,verb,verb,sentence_tokens,correct_token,wrong_token
correctness,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,correct,wrong,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,601,singular,plural,The athlete behind the bikes,approves,approve,"[tensor(146), tensor(21749), tensor(2230), ten...",42696,11336
1,602,singular,plural,The athlete behind the cars,inspires,inspire,"[tensor(146), tensor(21749), tensor(2230), ten...",33476,17370
2,603,singular,plural,The athlete behind the cats,engages,engage,"[tensor(146), tensor(21749), tensor(2230), ten...",18223,3610
3,604,singular,plural,The athlete behind the cats,remembers,remember,"[tensor(146), tensor(21749), tensor(2230), ten...",8671,2030
4,605,singular,plural,The athlete behind the chairs,observes,observe,"[tensor(146), tensor(21749), tensor(2230), ten...",8739,8501


### GCD w/ fixed bias

In [6]:
# Load the pretrained weights into the decomposition wrapper with the settings
# for this section. Later sections change `shapley_include_bias` and
# `generalized` on this same object instead of reloading it.
decomposed_model = DecomposedLSTM.from_pretrained(
    model_path, shapley_include_bias=False, generalized=True)


In [21]:
# Later cells assume that every prompt has exactly this many tokens.
sentence_len = 5

# Compare element-wise and reduce with `.all()`, so a violation fails as an
# AssertionError that lists the offending lengths (comparing the array of
# unique lengths to a list raises an ambiguous-truth ValueError instead).
sentence_token_counts = number_df["sentence_tokens"].apply(len)
assert (sentence_token_counts == sentence_len).all(), (
    f"expected every sentence to have {sentence_len} tokens, "
    f"found lengths {sorted(sentence_token_counts.unique())}"
)


def split_pos_neg_contributions(logits):
    """
    Regroup signed two-column scores into two non-negative groups.

    `logits` is a numpy array whose last axis has two entries (index 0 and
    index 1), e.g. shape (num_contributions + 1 (bias), 2).

    For each position along the leading axes:
      * group 0 = the positive part of column 0 plus the magnitude of the
        negative part of column 1;
      * group 1 = the positive part of column 1 plus the magnitude of the
        negative part of column 0.

    In other words, a negative score for one column is counted as evidence
    for the other column. The two groups are stacked on axis -2 in the order
    [group 0, group 1], so a (k, 2) input yields a (2, k) output.
    """
    # 1 where the score is strictly positive, 0 otherwise (integer mask).
    is_positive = (logits > 0).astype(int)

    column_0, column_1 = logits[..., 0], logits[..., 1]
    keep_0, keep_1 = is_positive[..., 0], is_positive[..., 1]

    # positive part of one column minus the (non-positive) rest of the other
    group_1 = column_1 * keep_1 - column_0 * (1 - keep_0)
    group_0 = column_0 * keep_0 - column_1 * (1 - keep_1)

    return np.stack((group_0, group_1), axis=-2)


def get_jumulet_contribution(sentence_tokens, correct_token, wrong_token):
    """
    Attribute the final-step verb scores to each input component in turn.

    The components are the initial state (row 0 of the result) and every
    token of `sentence_tokens` (token k -> row k + 1). For each component the
    module-level `decomposed_model` is called with only that component
    selected (via `init_in_beta` or a one-hot `beta_mask`), and the scores at
    the last time step are processed as follows:

      1. `beta`, `gamma` and `bias` are stacked and each of the three is
         standardised with its own mean / std over the `verb_ids` columns;
      2. the `correct_token` and `wrong_token` columns are selected and
         regrouped by `split_pos_neg_contributions`;
      3. within each of the two groups, the `beta` entry is divided by the sum
         of the group's beta, gamma and bias entries (plus 1e-10 to avoid a
         division by zero).

    Parameters
    ----------
    sentence_tokens : 1-D torch tensor of token ids; its length determines the
        mask size and the number of rows of the result.
    correct_token, wrong_token : int vocabulary ids of the two verb forms.

    Returns
    -------
    torch.Tensor of shape (len(sentence_tokens) + 1, 2); column 0 is the share
    for the group unpacked as "correct", column 1 for the group unpacked as
    "wrong".

    Relies on the module-level `decomposed_model`, `verb_ids` and
    `split_pos_neg_contributions`.
    NOTE(review): `bias` is assumed to have the same shape as `beta[0, -1]`
    (it is stacked with it) — confirm against `DecomposedLSTM`.
    """
    # Take the length from the input instead of the notebook-level constant so
    # the function also works for prompts of another length.
    num_tokens = len(sentence_tokens)
    all_contributions = torch.empty((num_tokens + 1, 2))

    with torch.no_grad():
        for i in range(-1, num_tokens):
            beta_mask = torch.zeros(num_tokens)
            # i == -1 selects the initial state; otherwise the token at index i
            init_in_beta = i == -1
            if not init_in_beta:
                beta_mask[i] = 1

            beta, gamma, bias = decomposed_model(
                sentence_tokens.unsqueeze(0),
                beta_mask.unsqueeze(0),
                init_in_beta=init_in_beta,
            )

            # scores of the single batch element at the last time step
            beta, gamma = beta[0, -1], gamma[0, -1]

            # standardise each of beta / gamma / bias over the verb vocabulary
            result = torch.stack([beta, gamma, bias])
            result = result - result[:, verb_ids].mean(1, keepdim=True)
            result = result / result[:, verb_ids].std(1, keepdim=True)

            correct, wrong = split_pos_neg_contributions(
                result[:, [correct_token, wrong_token]].numpy()
            )

            # beta share of each group: beta / (beta + gamma + bias)
            all_contributions[i + 1, 0] = correct[0] / (correct.sum() + 1e-10)
            all_contributions[i + 1, 1] = wrong[0] / (wrong.sum() + 1e-10)

    return all_contributions



In [22]:
# Spot check on a single prompt. The verb ids 42696 / 11336 are the
# correct_token / wrong_token shown for row 0 of `number_df`
# ("The athlete behind the bikes": approves / approve).
# NOTE(review): only the first three sentence ids are visible in that display;
# confirm the last two against the tokenizer.
get_jumulet_contribution(
    torch.tensor([146, 21749,  2230,     3, 14159]),
    42696,
    11336
)


tensor([[0.7310, 0.3425],
        [0.5284, 0.4665],
        [0.8190, 0.2914],
        [0.4206, 0.4844],
        [0.3142, 0.0547],
        [-0.0000, 0.5993]])

In [23]:
# Token ids of a minimal pair that differs only in the number of the verb.
for sentence in ("The doctor near the dogs knows", "The doctor near the dogs know"):
    print(tokenizer.tokenize(sentence))


tensor([ 146, 6022,  220,    3, 1041, 4814])
tensor([ 146, 6022,  220,    3, 1041, 2678])


In [24]:
# "The doctor near the dogs" with knows (4814) vs. know (2678): the ids are the
# ones printed by the tokenizer in the previous cell.
get_jumulet_contribution(
    torch.tensor([ 146, 6022,  220,    3, 1041]),
    4814,
    2678
)


tensor([[0.4371, -0.0000],
        [0.2117, -0.0000],
        [0.6334, 0.4128],
        [-0.0000, 0.4894],
        [0.5996, -0.0000],
        [0.0126, 0.0996]])

In [25]:
# Attribute every prompt with the current `decomposed_model` settings.
# `number_df` has two-level column labels, so for a row `x` the lookup
# `x["..."]` is a one-element Series. `.iloc[0]` unwraps it positionally
# without the deprecated integer-key fallback of `Series.__getitem__`.
number_df["jumulet_contribution_fixed"] = number_df.apply(
    lambda x: get_jumulet_contribution(
        x["sentence_tokens"].iloc[0],
        x["correct_token"].iloc[0],
        x["wrong_token"].iloc[0],
    ),
    axis=1,
)


  x["sentence_tokens"][0],
  x["correct_token"][0],
  x["wrong_token"][0]


In [26]:
# Stack the per-sentence result tensors into one array per condition
# (first axis = sentence).

# singular subject, plural distractor
sp_contributions = np.stack(
    number_df[number_df["distractor_number"] == "plural"]["jumulet_contribution_fixed"]
    )

# plural subject, singular distractor
ps_contributions = np.stack(
    number_df[number_df["distractor_number"] == "singular"]["jumulet_contribution_fixed"]
    )


In [27]:
# Mean over the singular-subject / plural-distractor sentences (axis 0).
# Rows: initial state, then one row per sentence token; columns: [correct verb, wrong verb].
sp_contributions.mean(0)


array([[0.50556195, 0.26128128],
       [0.4273103 , 0.17074347],
       [0.5507412 , 0.16457987],
       [0.10536391, 0.604746  ],
       [0.42327154, 0.15192921],
       [0.20813046, 0.29759017]], dtype=float32)

In [28]:
# Mean over the plural-subject / singular-distractor sentences (axis 0).
# Rows: initial state, then one row per sentence token; columns: [correct verb, wrong verb].
ps_contributions.mean(0)


array([[0.17232771, 0.6856805 ],
       [0.16478121, 0.5035951 ],
       [0.7062673 , 0.02601113],
       [0.4275902 , 0.2525815 ],
       [0.08471411, 0.60459137],
       [0.09919609, 0.7470311 ]], dtype=float32)

### GCD w/ Shapley bias

In [15]:
# Reconfigure the shared `decomposed_model` in place for this section
# (generalized decomposition, bias included in the Shapley split). Every later
# call that goes through this object, including `get_jumulet_contribution`,
# sees these values until they are changed again.
# NOTE(review): how the two flags are interpreted is defined in
# `decompose_lstm`, not in this notebook — confirm there.
decomposed_model.shapley_include_bias = True
decomposed_model.generalized = True


In [16]:
# Run a two-token phrase through the embedding and recurrent layers of the
# underlying model and keep the resulting recurrent state as `hidden`; a later
# cell passes it to the model's forward pass as the starting state.
with torch.no_grad():
    # activations for ". <eos>"
    # NOTE(review): ids 18 and 19 are presumably "." and "<eos>" in this
    # vocabulary — confirm against the vocabulary file.
    init_phrase = torch.LongTensor([18, 19]).unsqueeze(0)
    embed = decomposed_model.model.encoder(init_phrase)
    _, hidden = decomposed_model.model.rnn(embed)


In [17]:
# Worked example on one prompt: for every component (initial state, then each
# token) compute beta / z, i.e. the component's share of the full-model score,
# for (a) each later sentence token at the step where it is predicted and
# (b) the correct / wrong verb after the last token.
sentence_tokens = torch.tensor([ 146, 6022,  220,    3, 1041])
correct_token = 4814
wrong_token = 2678

num_tokens = len(sentence_tokens)

# Layout of `contributions`:
#   rows    : 0 = initial state, k + 1 = token k
#   columns : 0 .. num_tokens - 2 = next-token prediction at that step,
#             num_tokens - 1      = correct verb,
#             num_tokens          = wrong verb
correct_col, wrong_col = num_tokens - 1, num_tokens
contributions = torch.zeros((num_tokens + 1, num_tokens + 1))

with torch.no_grad():
    # The full-model scores depend only on the tokens and the starting state,
    # not on which component is selected, so compute them once.
    # NOTE(review): assumes the model is deterministic here (eval mode).
    z, _ = decomposed_model.model.forward(
        sentence_tokens.unsqueeze(0),
        hidden
    )

    for i in range(-1, num_tokens):
        beta_mask = torch.zeros(num_tokens)
        # i == -1 selects the initial state; otherwise the token at index i
        init_in_beta = i == -1
        if not init_in_beta:
            beta_mask[i] = 1

        beta, gamma, bias = decomposed_model(
            sentence_tokens.unsqueeze(0),
            beta_mask.unsqueeze(0),
            init_in_beta=init_in_beta
        )

        contribution = beta / z

        # A component can only contribute to tokens that come after it;
        # token j is predicted from the output at step j - 1.
        for j in range(max(i + 1, 1), num_tokens):
            future_token = sentence_tokens[j].item()
            share = contribution[0, j - 1, future_token]
            print(i, j, tokenizer.idx2word[future_token])
            print(share)
            contributions[i + 1, j - 1] = share

        contributions[i + 1, correct_col] = contribution[0, -1, correct_token]
        contributions[i + 1, wrong_col] = contribution[0, -1, wrong_token]




-1 1 doctor
tensor(0.2253)
-1 2 near
tensor(0.1425)
-1 3 the
tensor(-0.0316)
-1 4 dogs
tensor(0.5632)
0 1 doctor
tensor(0.4304)
0 2 near
tensor(0.1263)
0 3 the
tensor(-0.0108)
0 4 dogs
tensor(0.4810)
1 2 near
tensor(-0.0610)
1 3 the
tensor(0.0038)
1 4 dogs
tensor(0.6334)
2 3 the
tensor(0.1513)
2 4 dogs
tensor(0.0379)
3 4 dogs
tensor(0.0574)


In [18]:
# Rounded view of the matrix filled in the previous cell.
# Rows: initial state, then one per sentence token. Columns 0-3: next-token
# prediction at each step; column 4: correct verb; column 5: wrong verb.
contributions.numpy().round(2)


array([[ 0.23,  0.14, -0.03,  0.56,  0.49,  0.  ],
       [ 0.43,  0.13, -0.01,  0.48,  0.34,  0.07],
       [ 0.  , -0.06,  0.  ,  0.63,  0.53,  0.24],
       [ 0.  ,  0.  ,  0.15,  0.04,  0.1 ,  0.05],
       [ 0.  ,  0.  ,  0.  ,  0.06,  0.14, -0.29],
       [ 0.  ,  0.  ,  0.  ,  0.  ,  0.33,  0.29]], dtype=float32)

In [19]:
# Same "The doctor near the dogs" / knows (4814) / know (2678) example,
# evaluated with the flags set at the top of this section.
# NOTE(review): the saved output of this cell contains values outside [0, 1],
# which the current `get_jumulet_contribution` (a ratio of non-negative terms)
# cannot produce; it presumably stems from an earlier version of the function.
# Re-run the notebook top to bottom to refresh it.
get_jumulet_contribution(
    torch.tensor([ 146, 6022,  220,    3, 1041]),
    4814,
    2678
)


tensor([[ 2.4350, -0.7693],
        [ 1.8150, -0.5487],
        [ 2.9406,  0.6212],
        [ 1.0748,  0.5382],
        [ 2.4758, -1.6794],
        [ 0.2030, -0.6550]])

In [20]:
# Attribute every prompt with the current `decomposed_model` settings.
# `number_df` has two-level column labels, so for a row `x` the lookup
# `x["..."]` is a one-element Series. `.iloc[0]` unwraps it positionally
# without the deprecated integer-key fallback of `Series.__getitem__`.
number_df["jumulet_contribution_all"] = number_df.apply(
    lambda x: get_jumulet_contribution(
        x["sentence_tokens"].iloc[0],
        x["correct_token"].iloc[0],
        x["wrong_token"].iloc[0],
    ),
    axis=1,
)


  x["sentence_tokens"][0],
  x["correct_token"][0],
  x["wrong_token"][0]


In [21]:
# Stack the per-sentence result tensors into one array per condition
# (first axis = sentence).

# singular subject, plural distractor
sp_contributions = np.stack(
    number_df[number_df["distractor_number"] == "plural"]["jumulet_contribution_all"]
    )

# plural subject, singular distractor
ps_contributions = np.stack(
    number_df[number_df["distractor_number"] == "singular"]["jumulet_contribution_all"]
    )


In [22]:
# Mean over the singular-subject / plural-distractor sentences (axis 0).
# Rows: initial state, then one row per sentence token; columns: [correct verb, wrong verb].
sp_contributions.mean(0)


array([[ 2.2203083 , -0.12467644],
       [ 1.6906027 , -0.15757851],
       [ 1.4441742 , -0.29585314],
       [ 0.48474443,  0.4425492 ],
       [ 1.717205  , -0.81538105],
       [ 0.47046962, -0.57692945]], dtype=float32)

In [23]:
# Mean over the plural-subject / singular-distractor sentences (axis 0).
# Rows: initial state, then one row per sentence token; columns: [correct verb, wrong verb].
ps_contributions.mean(0)


array([[-0.05044665,  2.326599  ],
       [-0.42378464,  1.8239726 ],
       [ 1.3210498 , -0.6069224 ],
       [ 0.18210855,  0.864774  ],
       [-0.6204558 ,  1.9938576 ],
       [-0.6671041 ,  1.5882058 ]], dtype=float32)

### CD w/ fixed bias


In [24]:
# Reconfigure the shared `decomposed_model` in place for this section
# (non-generalized decomposition, bias not included in the Shapley split).
# Every later call that goes through this object sees these values until they
# are changed again.
# NOTE(review): how the two flags are interpreted is defined in
# `decompose_lstm`, not in this notebook — confirm there.
decomposed_model.shapley_include_bias = False
decomposed_model.generalized = False


In [25]:
# Same "The doctor near the dogs" / knows (4814) / know (2678) example,
# evaluated with the flags set at the top of this section.
# NOTE(review): the saved output of this cell contains values outside [0, 1],
# which the current `get_jumulet_contribution` (a ratio of non-negative terms)
# cannot produce; it presumably stems from an earlier version of the function.
# Re-run the notebook top to bottom to refresh it.
get_jumulet_contribution(
    torch.tensor([ 146, 6022,  220,    3, 1041]),
    4814,
    2678
)


tensor([[ 0.9252, -0.3685],
        [ 0.6889,  0.3682],
        [ 1.1749,  1.5580],
        [-0.1456,  1.4437],
        [ 0.4166, -0.0121],
        [-1.0672,  0.1932]])

In [26]:
# Attribute every prompt with the current `decomposed_model` settings.
# `number_df` has two-level column labels, so for a row `x` the lookup
# `x["..."]` is a one-element Series. `.iloc[0]` unwraps it positionally
# without the deprecated integer-key fallback of `Series.__getitem__`.
number_df["murdoch_contribution_all"] = number_df.apply(
    lambda x: get_jumulet_contribution(
        x["sentence_tokens"].iloc[0],
        x["correct_token"].iloc[0],
        x["wrong_token"].iloc[0],
    ),
    axis=1,
)


  x["sentence_tokens"][0],
  x["correct_token"][0],
  x["wrong_token"][0]


In [27]:
# Stack the per-sentence result tensors into one array per condition
# (first axis = sentence).

# singular subject, plural distractor
sp_contributions = np.stack(
    number_df[number_df["distractor_number"] == "plural"]["murdoch_contribution_all"]
    )

# plural subject, singular distractor
ps_contributions = np.stack(
    number_df[number_df["distractor_number"] == "singular"]["murdoch_contribution_all"]
    )


In [28]:
# Mean over the singular-subject / plural-distractor sentences (axis 0).
# Rows: initial state, then one row per sentence token; columns: [correct verb, wrong verb].
sp_contributions.mean(0)


array([[ 0.6622968 ,  0.23949313],
       [ 0.61899114,  0.00720197],
       [ 0.38334918, -0.04500752],
       [-0.712665  , -0.38218394],
       [ 0.11812739, -0.488727  ],
       [-1.0592822 ,  0.02764427]], dtype=float32)

In [29]:
# Mean over the plural-subject / singular-distractor sentences (axis 0).
# Rows: initial state, then one row per sentence token; columns: [correct verb, wrong verb].
ps_contributions.mean(0)


array([[ 0.4168761 ,  0.8114369 ],
       [-0.1178124 ,  0.6596337 ],
       [ 0.5991303 , -0.96850574],
       [-0.28508526, -0.48116317],
       [-0.7458999 ,  0.1813681 ],
       [-0.36456928,  1.2045773 ]], dtype=float32)

### CD w/ Shapley bias

In [30]:
# Reconfigure the shared `decomposed_model` in place for this section
# (non-generalized decomposition, bias included in the Shapley split).
# Every later call that goes through this object sees these values until they
# are changed again.
# NOTE(review): how the two flags are interpreted is defined in
# `decompose_lstm`, not in this notebook — confirm there.
decomposed_model.shapley_include_bias = True
decomposed_model.generalized = False


In [31]:
# Attribute every prompt with the current `decomposed_model` settings.
# `number_df` has two-level column labels, so for a row `x` the lookup
# `x["..."]` is a one-element Series. `.iloc[0]` unwraps it positionally
# without the deprecated integer-key fallback of `Series.__getitem__`.
# NOTE: this reuses the column name written in the "CD w/ fixed bias" section,
# so those earlier results are overwritten here.
number_df["murdoch_contribution_all"] = number_df.apply(
    lambda x: get_jumulet_contribution(
        x["sentence_tokens"].iloc[0],
        x["correct_token"].iloc[0],
        x["wrong_token"].iloc[0],
    ),
    axis=1,
)


  x["sentence_tokens"][0],
  x["correct_token"][0],
  x["wrong_token"][0]


In [32]:
# Stack the per-sentence result tensors into one array per condition
# (first axis = sentence).

# singular subject, plural distractor
sp_contributions = np.stack(
    number_df[number_df["distractor_number"] == "plural"]["murdoch_contribution_all"]
    )

# plural subject, singular distractor
ps_contributions = np.stack(
    number_df[number_df["distractor_number"] == "singular"]["murdoch_contribution_all"]
    )


In [33]:
# Mean over the singular-subject / plural-distractor sentences (axis 0).
# Rows: initial state, then one row per sentence token; columns: [correct verb, wrong verb].
sp_contributions.mean(0)


array([[ 1.381581  ,  0.30388233],
       [ 1.205852  ,  0.30628666],
       [ 1.3112622 ,  0.33857033],
       [ 1.1692027 ,  0.26434854],
       [ 1.3237774 ,  0.29318175],
       [-0.5612065 ,  0.22889687]], dtype=float32)

In [34]:
# Mean over the plural-subject / singular-distractor sentences (axis 0).
# Rows: initial state, then one row per sentence token; columns: [correct verb, wrong verb].
ps_contributions.mean(0)


array([[ 0.28128153,  1.4002773 ],
       [ 0.31331527,  1.28619   ],
       [ 0.32630935,  1.4324658 ],
       [ 0.17511828,  1.2289157 ],
       [ 0.26221597,  1.4139682 ],
       [-0.24883562,  1.3244126 ]], dtype=float32)