# Problem with context:

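If a sentence contains the same name more than once, masking only one occurrence is not enough, because the remaining occurrence still reveals the name to the model. We therefore want to mask both occurrences of "Amanda" in the following text: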

    My name is Amanda and you can email me at amanda@email.com

Problem 2: "Amanda" and "amanda" are tokenized differently, i.e.:

    Amanda = Aman ##da
    amanda = a ##man ##da

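How do we combine these? The approach used below reassembles each word from its sub-tokens (dropping the `##` continuation marker) and compares the lower-cased words.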
    


In [1]:
import transformers
import datasets
import torch
import random
import copy
import itertools
import numpy as np

In [2]:
# Checkpoint identifier passed to both from_pretrained calls below.
MODEL_NAME = "TurkuNLP/bert-base-finnish-cased-v1"

# Tokenizer and model are loaded from the same checkpoint; later cells read
# "prediction_logits" from the model output to score masked tokens.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
model = transformers.AutoModelForPreTraining.from_pretrained(MODEL_NAME)

In [3]:
# Special tokens ([CLS], [SEP], ...) never belong to a word; they are skipped when
# sub-tokens are grouped into words.
special_tokens = tokenizer.all_special_tokens
print(special_tokens)
# Prefix that marks a word-internal sub-token. Fast WordPiece tokenizers expose it on
# the backend model as `continuing_subword_prefix`; if that attribute is missing or
# empty (e.g. a slow tokenizer), fall back to BERT's conventional "##".
continuation_marker = (
    getattr(
        getattr(getattr(tokenizer, "backend_tokenizer", None), "model", None),
        "continuing_subword_prefix",
        None,
    )
    or "##"
)

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']


In [4]:
# Introspection only: the tokenizer's repr, its class hierarchy, and the names
# available in the fast BERT tokenizer module.
print(tokenizer)
print(type(tokenizer).mro())
print(dir(transformers.models.bert.tokenization_bert_fast))

BertTokenizerFast(name_or_path='TurkuNLP/bert-base-finnish-cased-v1', vocab_size=50105, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	104: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
[<class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>, <class 'transformers.tokenizati

In [5]:
import string


"""
def mask(text, tokenizer):

    def get_indices(t):
        converted = tokenizer.convert_ids_to_tokens(t["input_ids"][0])
        indices=[]
        for i in range(0, len(t["input_ids"][0])):
            if converted[i][:2] != continuation_marker and converted[i] not in special_tokens:
                indices.append([i])
            else:
                if converted[i] not in special_tokens and indices!=[]:   # here we are only skipping the fact that first token is a special token; indices is empty.
                    indices[-1].append(i)
        return indices   
    
    t = tokenizer(text, return_tensors='pt') # prepare normal tokenized input
    indices = get_indices(t)

    return indices, t, tokenizer.decode(t.input_ids[0])
"""


def find_same_tokens(lst, index):
    """
    Find the other occurrences of one list element.

    Input:
        lst: list of comparable values
        index: position of the element to look for
    Returns:
        positions in ``lst`` (ascending) whose value equals ``lst[index]``,
        excluding ``index`` itself; an empty list if there are none
    """
    needle = lst[index]
    matches = []
    for pos, candidate in enumerate(lst):
        if candidate == needle and pos != index:
            matches.append(pos)
    return matches


#def find_indices_of_same_word(lst, item):
#    return [i for i, x in enumerate(lst) if x == item]



def context_aware_mask(text, tokenizer, continuation_marker="##", special_tokens=None):
    """
    Tokenize ``text``, group its sub-tokens into words, and record for every word
    where the same word (compared case-insensitively) occurs elsewhere in the text.

    Input:
        text: the string to tokenize
        tokenizer: callable tokenizer that also provides ``convert_ids_to_tokens``,
                   ``decode`` and ``all_special_tokens``
        continuation_marker ("##"): prefix that marks a word-internal sub-token
        special_tokens (None): tokens that never belong to a word; defaults to
                               ``tokenizer.all_special_tokens`` so the function does
                               not depend on notebook-level state
    Returns:
        indices: one list of token positions per word, in text order
        t: the tokenizer output for ``text``
        decoded text: ``tokenizer.decode`` applied to the first sequence of ``t``
        indices_context: for each word, the token positions of all OTHER occurrences
                         of that word (empty list if the word occurs only once)
    """
    if special_tokens is None:
        special_tokens = tokenizer.all_special_tokens
    special = set(special_tokens)
    marker_len = len(continuation_marker)

    t = tokenizer(text, return_tensors='pt') # prepare normal tokenized input
    converted = tokenizer.convert_ids_to_tokens(t["input_ids"][0])

    indices = []   # token positions, one list per word
    words = []     # lower-cased surface form of each word, marker stripped
    for i, token in enumerate(converted):
        if token in special:
            continue
        is_continuation = bool(continuation_marker) and token.startswith(continuation_marker)
        if not is_continuation:
            indices.append([i])
            words.append(token.lower())
        elif indices:
            # a continuation piece with no word before it is ignored
            indices[-1].append(i)
            words[-1] += token[marker_len:].lower()

    assert len(words)==len(indices), "Issues with masking the sentence."

    # word -> positions (in `words`) where it occurs; built once instead of
    # rescanning the whole word list for every word
    occurrences = {}
    for word_pos, word in enumerate(words):
        occurrences.setdefault(word, []).append(word_pos)

    indices_context = []
    for word_pos, word in enumerate(words):
        current = []
        for other in occurrences[word]:
            if other != word_pos:
                current += indices[other]
        indices_context.append(current)

    assert len(indices)==len(indices_context), "Issues with context masking, "+str(len(indices))+"!="+str(len(indices_context))+"\nIndices:\t"+str(indices)+"\nContext:\t"+str(indices_context)

    return indices, t, tokenizer.decode(t.input_ids[0]), indices_context

In [6]:
# Demo sentence: the name appears capitalised and again, lower-cased, inside the e-mail address.
text = "Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda.a.myntti@utu.fi"
masked_indices, tokenized_text, decoded_text, context = context_aware_mask(text, tokenizer)
# One line per word: its token positions followed by the decoded sentence.
# (c, the word's context positions, is not printed in the loop.)
for i, c in zip(masked_indices, context):
    print(i, decoded_text)#,tokenized_text)
# Context positions per word, then token positions per word.
print(context, masked_indices, sep="\n")

[1] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda. a. myntti @ utu. fi [SEP]
[2] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda. a. myntti @ utu. fi [SEP]
[3] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda. a. myntti @ utu. fi [SEP]
[4, 5] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda. a. myntti @ utu. fi [SEP]
[6] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda. a. myntti @ utu. fi [SEP]
[7] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda. a. myntti @ utu. fi [SEP]
[8] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda. a. myntti @ utu. fi [SEP]
[9] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda. a. myntti @ utu. fi [SEP]
[10] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda. a. myntti @ utu. fi [SEP]
[11] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda. a

In [7]:
def to_probability(A):
    """Convert a tensor of logits into probabilities by applying softmax along dimension 0."""
    return torch.softmax(A, dim=0)

def predict(masked, i, true_token, print_results=False, top=10):
    """
    Run the masked-language model on ``masked`` and return the probability it assigns
    to ``true_token`` at position ``i``.

    Input:
        masked: tokenizer output (with some input ids replaced by the mask token);
                unpacked as keyword arguments into the notebook-level ``model``
        i: position in the first sequence whose prediction is read
        true_token: vocabulary id of the original token at position ``i``
        print_results (False): prints the ``top`` best guesses with their logits
                               and probabilities if True
        top (10): how many guesses to print in debug mode
    Returns:
        probability of ``true_token`` at position ``i`` (0-dim tensor, computed
        without autograd tracking)
    """
    # inference only: without no_grad every call would build and keep an autograd graph
    with torch.no_grad():
        model_out = model(**masked)
    logits = model_out["prediction_logits"]

    # raw scores (logits, not yet probabilities) over the vocabulary for position i
    logits_i = logits[0,i,:]
    # change to probability
    probs = to_probability(logits_i)
    # true token is the index
    word_probability = probs[true_token]

    # Do only in debug mode:
    if print_results:
        print(f'{tokenizer.decode(true_token)} has probability {word_probability}')
        # rank only the position of interest; sorting every position is wasted work
        sorted_logits, sorted_tokens = torch.sort(logits_i, descending=True)
        top_probs = to_probability(sorted_logits)[:top]
        top_logits = sorted_logits[:top]
        top_tokens = sorted_tokens[:top]

        print("Guesses:",tokenizer.decode(top_tokens))
        print("Logits: ",top_logits)
        print("Probs:  ",top_probs)
    return word_probability


def get_scores(to_be_masked, tokens, context=None, debug=False):
    """
    Calculates the (aggregated) probability of the given word based on the model prediction.
    For multi-subtoken words, aggregation strategy is gradual unmasking and multiplication:
    in step k the subtokens k, k+1, ... of the word are masked, the probability of the true
    subtoken k is predicted, and the per-step probabilities are multiplied together.
    Input:
        to_be_masked: indices which are masked from the tokens and over which we calculate
                      i.e. indices of the subtokens that form a word.
        tokens: tokenizer output for a span of text (not modified; a copy is masked)
        context (None): additional token indices that stay masked in every step, e.g. the
                        other occurrences of the same word. None or empty means no extra masking.
        debug (False): prints out extra information if True
    Returns:
        (aggregated) probability in (0,1); the plain integer 1 if to_be_masked is empty
    """
    # None instead of a mutable [] default; both mean "no extra context"
    if context is None:
        context = []

    # initialize the score; we're multiplying, so 1
    final_score = 1

    # loop over the subtokens of a word
    for i in range(len(to_be_masked)):
        # fresh deep copy per step so masking never leaks into `tokens` or into the next step
        t = copy.deepcopy(tokens)
        current = to_be_masked[i:]   # subtokens still hidden in this step; current[0] is the one being scored
        for j in current:
            t["input_ids"][0][j] = tokenizer.mask_token_id     # we mask the SUBtokens that are in current
        for j in context:
            t["input_ids"][0][j] = tokenizer.mask_token_id     # other occurrences of the word stay hidden too
        if debug:
            print(tokenizer.decode(t["input_ids"][0]))
        # multiply the final score with the predicted probability => aggregates over to_be_masked==one word
        final_score *= predict(t, current[0], tokens.input_ids[0][current[0]], print_results=debug)

    return final_score

In [8]:
# Score every word of the demo sentence with debug output switched on: each step
# prints the masked sentence and the model's top guesses before the word's final score.
for ind, cont in zip(masked_indices, context):
    word = tokenizer.decode(tokenized_text["input_ids"][0][ind])
    final_score = get_scores(ind, tokenized_text, context = cont, debug=True)
    print(f'{word} \t >> {final_score}')
    

[CLS] [MASK], olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda. a. myntti @ utu. fi [SEP]
Moi has probability 0.09783897548913956
Guesses: Hei Moi Niin Muuten Juu hei Joo Minä Eli Kiitos
Logits:  tensor([18.0196, 16.0267, 14.7276, 14.0297, 13.9221, 13.8744, 13.7270, 13.6844,
        13.5303, 13.0911], grad_fn=<SliceBackward0>)
Probs:   tensor([0.7178, 0.0978, 0.0267, 0.0133, 0.0119, 0.0114, 0.0098, 0.0094, 0.0081,
        0.0052], grad_fn=<SliceBackward0>)
Moi 	 >> 0.09783897548913956
[CLS] Moi [MASK] olen Amanda [MASK] mulle voit laittaa viestiä osoitteeseen amanda. a. myntti @ utu. fi [SEP]
, has probability 0.9052180051803589
Guesses: ,! ja : mä minäkka Moi mäkin itse
Logits:  tensor([18.4248, 15.2571, 14.5421, 12.8983, 12.7908, 12.5602, 12.4797, 12.1201,
        11.8746, 11.7475], grad_fn=<SliceBackward0>)
Probs:   tensor([0.9052, 0.0381, 0.0186, 0.0036, 0.0032, 0.0026, 0.0024, 0.0017, 0.0013,
        0.0011], grad_fn=<SliceBackward0>)
, 	 >> 0.9052180051803589
[CLS] Moi

 ## Actual function 

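Also testing the hypothesis that an e-mail address in the same sentence distorts the scores of names: the cells below score the same sentence once with an unrelated address and once with the name inside the address.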

In [9]:
# Words whose aggregated probability falls below this value are flagged for redaction.
threshold = 1e-4
# Baseline: the e-mail address does NOT contain the name.
text = "Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen example@outlook.com"
masked_indices, tokenized_text, decoded_text, context = context_aware_mask(text, tokenizer)
for ind, cont in zip(masked_indices, context):
    word = tokenizer.decode(tokenized_text["input_ids"][0][ind])
    final_score = get_scores(ind, tokenized_text, context=cont, debug=False)
    print(
        f'{word} \t >> {final_score} \t >> Redact'
        if final_score < threshold
        else f'{word} \t >> {final_score}'
    )

Moi 	 >> 0.11190123856067657
, 	 >> 0.8906912803649902
olen 	 >> 0.09335323423147202
Amanda 	 >> 2.389621840848122e-06 	 >> Redact
, 	 >> 0.20804329216480255
mulle 	 >> 0.00788317620754242
voit 	 >> 0.12527547776699066
laittaa 	 >> 0.8456868529319763
viestiä 	 >> 0.15869389474391937
osoitteeseen 	 >> 0.5966542363166809
example 	 >> 6.858204004700497e-10 	 >> Redact
@ 	 >> 0.995366096496582
outlook 	 >> 3.7645963857357856e-06 	 >> Redact
. 	 >> 0.9999146461486816
com 	 >> 0.9127006530761719


In [10]:
# Same sentence, but now the name also appears (lower-cased) in the e-mail address.
# Reuses the threshold set in the previous cell.
text = "Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda@outlook.com"
masked_indices, tokenized_text, decoded_text, context = context_aware_mask(text, tokenizer)
for ind, cont in zip(masked_indices, context):
    word = tokenizer.decode(tokenized_text["input_ids"][0][ind])
    final_score = get_scores(ind, tokenized_text, context=cont, debug=False)
    print(
        f'{word} \t >> {final_score} \t >> Redact'
        if final_score < threshold
        else f'{word} \t >> {final_score}'
    )

Moi 	 >> 0.11406373977661133
, 	 >> 0.8929223418235779
olen 	 >> 0.09167784452438354
Amanda 	 >> 7.73802000253454e-08 	 >> Redact
, 	 >> 0.209928959608078
mulle 	 >> 0.008461282588541508
voit 	 >> 0.12811265885829926
laittaa 	 >> 0.8577250242233276
viestiä 	 >> 0.15111681818962097
osoitteeseen 	 >> 0.5203199982643127
amanda 	 >> 3.7283613696370566e-12 	 >> Redact
@ 	 >> 0.996116042137146
outlook 	 >> 1.5931193502183305e-06 	 >> Redact
. 	 >> 0.9999407529830933
com 	 >> 0.8941932320594788


Works :3 Both "Amanda" and "amanda" score far below the threshold and are flagged for redaction.

In [11]:
# Looser threshold, and the original demo sentence with the dotted e-mail address.
threshold= 0.01
text = "Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda.a.myntti@utu.fi"
masked_indices, tokenized_text, decoded_text, context = context_aware_mask(text, tokenizer)
for ind, cont in zip(masked_indices, context):
    word = tokenizer.decode(tokenized_text["input_ids"][0][ind])
    final_score = get_scores(ind, tokenized_text, context=cont, debug=False)
    print(
        f'{word} \t >> {final_score} \t >> Redact'
        if final_score < threshold
        else f'{word} \t >> {final_score}'
    )

Moi 	 >> 0.09783897548913956
, 	 >> 0.9052180051803589
olen 	 >> 0.05866984277963638
Amanda 	 >> 4.587545845424756e-06 	 >> Redact
, 	 >> 0.2064819186925888
mulle 	 >> 0.005366495344787836 	 >> Redact
voit 	 >> 0.1366426795721054
laittaa 	 >> 0.876126229763031
viestiä 	 >> 0.10039646923542023
osoitteeseen 	 >> 0.3348371684551239
amanda 	 >> 1.563938489823613e-08 	 >> Redact
. 	 >> 0.4732753038406372
a 	 >> 0.10224533081054688
. 	 >> 0.14702488481998444
myntti 	 >> 6.6980549995321326e-09 	 >> Redact
@ 	 >> 0.995269238948822
utu 	 >> 0.00847545638680458 	 >> Redact
. 	 >> 0.9999898672103882
fi 	 >> 0.9912620782852173


In [14]:
# A sentence that lists several first names, scored with a threshold of 0.005.
threshold= 0.005
text = "Yleisimmät suomenkieliset miesten nimet ovat Matti Meikäläinen, Mikko, Tapani, Ville, ja Vladimir."
masked_indices, tokenized_text, decoded_text, context = context_aware_mask(text, tokenizer)
for ind, cont in zip(masked_indices, context):
    word = tokenizer.decode(tokenized_text["input_ids"][0][ind])
    final_score = get_scores(ind, tokenized_text, context=cont, debug=False)
    print(
        f'{word} \t >> {final_score} \t >> Redact'
        if final_score < threshold
        else f'{word} \t >> {final_score}'
    )


Yleisimmät 	 >> 0.21354475617408752
suomenkieliset 	 >> 0.0023188365157693624 	 >> Redact
miesten 	 >> 0.07334546744823456
nimet 	 >> 0.8113657236099243
ovat 	 >> 0.9380959868431091
Matti 	 >> 0.9777960181236267
Meikäläinen 	 >> 0.00024416143423877656 	 >> Redact
, 	 >> 0.9159089922904968
Mikko 	 >> 0.008687050081789494
, 	 >> 0.8539314270019531
Tapani 	 >> 0.0012471594382077456 	 >> Redact
, 	 >> 0.9592955708503723
Ville 	 >> 0.009906979277729988
, 	 >> 0.13058654963970184
ja 	 >> 0.19433386623859406
Vladimir 	 >> 0.00017671416571829468 	 >> Redact
. 	 >> 0.9940406680107117
