# Now that the hypothesis is tested, I'm trying to find a good data format for the masking

In [1]:
import transformers
import datasets
import torch
import random
import copy
import itertools
import numpy as np

In [2]:
# Checkpoint name handed to both `from_pretrained` calls below.
MODEL_NAME = "TurkuNLP/bert-base-finnish-cased-v1"

# Tokenizer and model are loaded from the same checkpoint name.
# The pre-training head is used because later cells read the
# "prediction_logits" entry of the model output.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
model = transformers.AutoModelForPreTraining.from_pretrained(MODEL_NAME)



In [9]:
# Special tokens ([CLS], [SEP], ...) are excluded when tokens are grouped into words.
special_tokens = tokenizer.all_special_tokens
print(special_tokens)
# Fast tokenizers expose the WordPiece model on `backend_tokenizer.model`, and
# that model carries the sub-word continuation prefix. Fall back to BERT's
# conventional "##" when the attribute is not available (e.g. a slow tokenizer).
_wordpiece_model = getattr(getattr(tokenizer, "backend_tokenizer", None), "model", None)
continuation_marker = getattr(_wordpiece_model, "continuing_subword_prefix", None) or "##"

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']


In [4]:
# Exploratory introspection (presumably part of the search for where the
# continuation marker lives): the tokenizer repr, its class hierarchy, and the
# names exported by the fast BERT tokenizer module.
print(tokenizer)
print(tokenizer.__class__.mro())
print(dir(transformers.models.bert.tokenization_bert_fast))

BertTokenizerFast(name_or_path='TurkuNLP/bert-base-finnish-cased-v1', vocab_size=50105, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	104: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
[<class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>, <class 'transformers.tokenizati

In [36]:


def mask(text, tokenizer, special_tokens=None, continuation_marker=None):
    """
    Tokenizes `text` and groups the token positions into words.

    A word is a token that is neither special nor a continuation piece, together
    with all continuation pieces that directly follow it.
    Input:
        text: the string to tokenize
        tokenizer: tokenizer used for encoding, token lookup and decoding
        special_tokens (None): tokens that never belong to a word;
                      defaults to `tokenizer.all_special_tokens`
        continuation_marker (None): prefix that marks a sub-word continuation piece;
                      defaults to the prefix stored on the tokenizer's backend model,
                      or "##" if that is not available
    Returns:
        (indices, t, decoded) where
        indices: list of lists, each holding the positions of one word's pieces
        t: the tokenizer output for `text`
        decoded: the first sequence of `t` decoded back to a string
    """
    if special_tokens is None:
        special_tokens = tokenizer.all_special_tokens
    special = set(special_tokens)

    if continuation_marker is None:
        backend_model = getattr(getattr(tokenizer, "backend_tokenizer", None), "model", None)
        continuation_marker = getattr(backend_model, "continuing_subword_prefix", None) or "##"

    def get_indices(t):
        pieces = tokenizer.convert_ids_to_tokens(t["input_ids"][0])
        indices = []
        for i, piece in enumerate(pieces):
            if piece in special:
                # special tokens neither start nor extend a word
                continue
            if not piece.startswith(continuation_marker):
                indices.append([i])
            elif indices:
                # a continuation piece extends the most recent word; one that
                # appears before any word has started is dropped
                indices[-1].append(i)
        return indices

    t = tokenizer(text, return_tensors='pt')  # prepare normal tokenized input
    indices = get_indices(t)

    return indices, t, tokenizer.decode(t["input_ids"][0])


In [37]:
# Sanity check of the word grouping on a sentence that ends in an e-mail address.
text = "Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen example@outlook.com"
masked_indices, tokenized_text, decoded_text = mask(text, tokenizer)
# Each printed row is one word's list of token positions, followed by the
# decoded sentence (the same string on every row) for reference.
for i in masked_indices:
    print(i, decoded_text)#,tokenized_text)


[1] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen example @ outlook. com [SEP]
[2] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen example @ outlook. com [SEP]
[3] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen example @ outlook. com [SEP]
[4, 5] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen example @ outlook. com [SEP]
[6] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen example @ outlook. com [SEP]
[7] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen example @ outlook. com [SEP]
[8] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen example @ outlook. com [SEP]
[9] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen example @ outlook. com [SEP]
[10] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen example @ outlook. com [SEP]
[11] [CLS] Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen example @ outlook. com [SEP]
[12, 13, 14] [CLS] Moi, o

In [48]:
def to_probability(A):
    """Applies a softmax over the first dimension of `A` and returns the result."""
    return torch.nn.functional.softmax(A, dim=0)

def predict(masked, i, true_token, print_results=False, top=10):
    """
    Returns the probability the model assigns to `true_token` at position `i`.
    Input:
        masked: keyword arguments for the model call (tokenizer output with masks applied)
        i: position in the first sequence whose prediction is read
        true_token: vocabulary index whose probability is returned
        print_results (False): prints the probability and the `top` best guesses if True
        top (10): number of guesses shown when `print_results` is True
    Returns:
        0-dim tensor holding the softmax probability of `true_token` at position `i`
    """
    # Inference only: no_grad keeps the forward pass from recording an autograd
    # graph, so the returned tensor carries no grad_fn.
    with torch.no_grad():
        model_out = model(**masked)
    logits = model_out["prediction_logits"]

    # vocabulary logits (not yet probabilities) for position i of the first sequence
    logits_i = logits[0, i, :]
    # change to probability
    probs = to_probability(logits_i)
    # true token is the index
    word_probability = probs[true_token]

    # Do only in debug mode:
    if print_results:
        print(f'{tokenizer.decode(true_token)} has probability {word_probability}')
        # topk returns the k largest logits in descending order for this one
        # position, which avoids sorting the whole logits tensor
        k = min(top, logits_i.shape[0])
        top_logits, top_tokens = torch.topk(logits_i, k)
        top_probs = probs[top_tokens]

        print("Guesses:", tokenizer.decode(top_tokens))
        print("Logits: ", top_logits)
        print("Probs:  ", top_probs)
    return word_probability


def get_scores(to_be_masked, tokens, debug=False):
    """
    Scores one word by multiplying the predicted probabilities of its subtokens.
    The subtokens are unmasked gradually: at step k the subtokens from k onwards
    are masked and the probability of subtoken k is predicted.
    Input:
        to_be_masked: positions of the subtokens that form the word
        tokens: tokenizer output for a span of text (left unmodified; each step
                works on a deep copy)
        debug (False): prints the masked text and prediction details if True
    Returns:
        product of the per-subtoken probabilities; the initial value 1 is
        returned unchanged when `to_be_masked` is empty
    """
    # neutral element of the multiplication
    final_score = 1

    for start, target in enumerate(to_be_masked):
        # fresh copy per step so the masks of earlier steps do not leak in
        masked_tokens = copy.deepcopy(tokens)
        still_hidden = to_be_masked[start:]   # the target and every subtoken after it
        row = masked_tokens["input_ids"][0]
        for position in still_hidden:
            row[position] = tokenizer.mask_token_id
        if debug:
            print(tokenizer.decode(masked_tokens["input_ids"][0]))
        # the true id of the target comes from the original, unmasked tokens
        final_score *= predict(masked_tokens, target, tokens.input_ids[0][target], print_results=debug)

    return final_score

In [61]:
# Score every word found by mask(): `ind` is one word's list of token positions.
for ind in masked_indices:
    final_score = get_scores(ind, tokenized_text, debug=False)
    # decode the word's subtokens back to text for the printout
    word = tokenizer.decode(tokenized_text["input_ids"][0][ind])
    print(f'{word} \t >> {final_score}')
    

Moi 	 >> 0.11190102249383926
, 	 >> 0.914529025554657
olen 	 >> 0.09335358440876007
Amanda 	 >> 2.389645032963017e-06
, 	 >> 0.1585882604122162
mulle 	 >> 0.007883135229349136
voit 	 >> 0.1252748966217041
laittaa 	 >> 0.8456864356994629
viestiä 	 >> 0.15869447588920593
osoitteeseen 	 >> 0.5966525673866272
example 	 >> 6.858137946430531e-10
@ 	 >> 0.995366096496582
outlook 	 >> 3.76461412088247e-06
. 	 >> 0.9999146461486816
com 	 >> 0.9127005338668823


 ## Actual function 

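 Also testing the hypothesis that an e-mail address in the same sentence skews the scores of names (compare `example@outlook.com` with `amanda@outlook.com` below).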

In [67]:
# Words whose aggregated probability falls below this value are flagged.
threshold = 1e-4
text = "Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen example@outlook.com"
masked_indices, tokenized_text, decoded_text = mask(text, tokenizer)
for ind in masked_indices:
    final_score = get_scores(ind, tokenized_text, debug=False)
    word = tokenizer.decode(tokenized_text["input_ids"][0][ind])
    # likely enough: print the plain score and move on to the next word
    if not final_score < threshold:
        print(f'{word} \t >> {final_score}')
        continue
    print(f'{word} \t >> {final_score} \t >> Redact')

Moi 	 >> 0.11190102249383926
, 	 >> 0.914529025554657
olen 	 >> 0.09335358440876007
Amanda 	 >> 2.389645032963017e-06 	 >> Redact
, 	 >> 0.1585882604122162
mulle 	 >> 0.007883135229349136
voit 	 >> 0.1252748966217041
laittaa 	 >> 0.8456864356994629
viestiä 	 >> 0.15869447588920593
osoitteeseen 	 >> 0.5966525673866272
example 	 >> 6.858137946430531e-10 	 >> Redact
@ 	 >> 0.995366096496582
outlook 	 >> 3.76461412088247e-06 	 >> Redact
. 	 >> 0.9999146461486816
com 	 >> 0.9127005338668823


In [68]:
# Same sentence, but the e-mail address now contains the name itself.
text = "Moi, olen Amanda, mulle voit laittaa viestiä osoitteeseen amanda@outlook.com"
masked_indices, tokenized_text, decoded_text = mask(text, tokenizer)
for ind in masked_indices:
    final_score = get_scores(ind, tokenized_text, debug=False)
    word = tokenizer.decode(tokenized_text["input_ids"][0][ind])
    # likely enough: print the plain score and move on to the next word
    if not final_score < threshold:
        print(f'{word} \t >> {final_score}')
        continue
    print(f'{word} \t >> {final_score} \t >> Redact')

Moi 	 >> 0.1140635758638382
, 	 >> 0.9137162566184998
olen 	 >> 0.09167791157960892
Amanda 	 >> 0.0007409827085211873
, 	 >> 0.1578478366136551
mulle 	 >> 0.008461365476250648
voit 	 >> 0.12811163067817688
laittaa 	 >> 0.8577243089675903
viestiä 	 >> 0.1511165201663971
osoitteeseen 	 >> 0.5203193426132202
amanda 	 >> 0.000500433670822531
@ 	 >> 0.9961161613464355
outlook 	 >> 1.5931143479974708e-06 	 >> Redact
. 	 >> 0.9999407529830933
com 	 >> 0.8941934704780579


Sliding window needed...?