## Cosine similarity

Redaction of PII in these steps:

- Select $n$ most probable subtitutions
- from these, extract $k$ most similar in terms of cosine similarity (hyper-sphere)
- select randomly

For multisubtoken words, nothing is said(?).
=> Trying by finding the most similar to the entire word. E.g.

- original = contex1 + "Amanda" + context2
- tokenized = "Aman" "da"
- redaction result = yes
- subtitution by token:
1. for context1 + [MASK] [MASK] + context2 the most probable predictions for the **first masked** are $S_1$
2. Find $S^r_1$ closest to "Amanda" where "Amanda" is max pooled form "Aman" and "da"
3. select $s_1$ in $S^r_1$ randomly
4. $S_2$ == most probable for context + s_1 + [MASK] + context2
5. Similarly find one closest to "Amanda" => s_2
6. Assume $s_1$+$s_2$ is a coherent word :)

Notes:

- if using the same model to redact and find substitutions: less computation
- if using different models, you may optimize both



In [43]:
from transformers import pipeline, pipelines, AutoTokenizer, AutoModelForPreTraining
from sklearn.metrics.pairwise import cosine_similarity
import torch
import numpy as np

In [2]:
from piimasker import PiiMasker

In [3]:
MODEL_NAME="TurkuNLP/bert-base-finnish-cased-v1"

In [54]:
# taking these from past course work:

def get_embed_for_full_dataset(dataset, lang, pipeline):
  model_name = MODEL_NAME
  #p=pipeline(task="feature-extraction",model=model_name,return_tensors=True,device=0)
  embedded=pipeline(pipelines.pt_utils.KeyDataset(dataset[lang], "text"), batch_size=64, truncation="only_first")

  # to cpu and take the mean over words
  embedded_pooled=[torch.mean(elem,axis=1).cpu() for elem in embedded]
  # to single matrix
  results=torch.vstack(embedded_pooled).numpy()
  return results

def get_embed_for_one_instance(x, pipeline):
    #p=pipeline(task="feature-extraction",model=MODEL_NAME,return_tensors=True,device=0)
    embedded = pipeline(x)
    embedded_pooled=[torch.mean(elem,axis=1).cpu() for elem in embedded]
    results=torch.vstack(embedded_pooled).numpy()
    return results

def cosine_sim(x,y):
  M=cosine_similarity(x,y)
  aligned=np.argsort(-M,axis=-1)

  sims=[]
  for i in range(M.shape[0]): #M.shape[0] is the number of rows / input documents
    j=aligned[i,1] # index 1 for 2nd best match => index [0] gives the same words.
    score=M[i,j]
    sims.append((i,j,score))
  # sort in descending order  element -> score => sort by score
  sims.sort(key=lambda element:element[2],reverse=True)

  return sims

    

In [36]:
text = "Minun nimeni on Marjukka ja tykkään soittaa pianoa"

model = AutoModelForPreTraining.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
pipe = pipeline(task="feature-extraction",model=MODEL_NAME,return_tensors=True,device=0)

In [37]:
pf = PiiMasker(model, tokenizer, 1e-3, tokenizer_type="WordPiece")
output = pf.find_pii(text)

In [38]:
print(output)

{'decoded_text': '[CLS] Minun nimeni on Marjukka ja tykkään soittaa pianoa [SEP]', 'tokenizer_output': {'input_ids': tensor([[  102,  5243, 38160,   145,  1030,  1927,   357,   142,  9966,  7081,
         37837, 50006,   103]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}, 'to_redact_indices': [[4, 5, 6]], 'to_redact_words': [['Mar', '##ju', '##kka']], 'predictions': [[['Ri', 'Ra', 'E', 'El', 'Vil', 'Ti', 'Jo', 'I', 'An', 'J', 'Ki', 'Kari', 'Jani', 'La', 'Ro', 'Aa', 'Ne', 'Pe', 'Un', 'Jan', 'Mari', 'Te', 'Mai', 'Mi', 'O', 'Ai', 'Se', 'Eli', 'Meri', 'Erik', 'As', 'Juha', 'Aar', 'Isa', 'Vi', 'Kä', 'Li', 'Ir', 'Al', 'Lil', 'Antti', 'Mati', 'Ni', 'Y', 'Per', 'Her', 'Satu', 'S', 'Pa', 'C', 'Matti', 'Ly', 'U', 'Var', 'Jukka', 'Jorma', 'Ei', 'Za', 'Tor', 'F', 'Ari', 'En', 'Lu', 'G', 'Hei', 'Elli', 'Lauri', 'Rei', 'Es', 'Er', 'W', 'Ha', 'Kas', 'Ta', 'Pekka', 'M', 'Na', 'Pir', 'Anna', 'Val', 'Ts', 'Tii

In [60]:
to_redact_words = output["to_redact_words"]
possible_redactions = output["predictions"]
input = output["tokenizer_output"]

def find_best_sim(sims, word_index=0):
    # word index == 0 => we are finding matches to the original word
    for sim in sims:
        if sim[0] == word_index:
            return sim
    return False


for words, preds in zip(to_redact_words, possible_redactions):
    result = ""
    for i, w, p in zip(range(len(words)),words, preds):
        if i == 0: # first iteration
            v = ["".join([w_.replace("##","") for w_ in words])]+p
            print(v)
            emb=get_embed_for_one_instance(v, pipe)
            sims = cosine_sim(emb,emb)
            w_sim = find_best_sim(sims)
            assert w_sim is not False
            print(p[w_sim[1]])
            result = p[w_sim[1]]
        else:
            
            

#to_redict = tok.input_ids[0][**redact_ids]
#to_substitute = tokenizer.decode(to_redact)
#good_predictions_1 = ["Mai", "An", "Emi", "Luci"]
#choose_index = 1 # An
#print(to_substitute)



['Marjukka', 'Ri', 'Ra', 'E', 'El', 'Vil', 'Ti', 'Jo', 'I', 'An', 'J', 'Ki', 'Kari', 'Jani', 'La', 'Ro', 'Aa', 'Ne', 'Pe', 'Un', 'Jan', 'Mari', 'Te', 'Mai', 'Mi', 'O', 'Ai', 'Se', 'Eli', 'Meri', 'Erik', 'As', 'Juha', 'Aar', 'Isa', 'Vi', 'Kä', 'Li', 'Ir', 'Al', 'Lil', 'Antti', 'Mati', 'Ni', 'Y', 'Per', 'Her', 'Satu', 'S', 'Pa', 'C', 'Matti', 'Ly', 'U', 'Var', 'Jukka', 'Jorma', 'Ei', 'Za', 'Tor', 'F', 'Ari', 'En', 'Lu', 'G', 'Hei', 'Elli', 'Lauri', 'Rei', 'Es', 'Er', 'W', 'Ha', 'Kas', 'Ta', 'Pekka', 'M', 'Na', 'Pir', 'Anna', 'Val', 'Ts', 'Tiina', 'Janne', 'Martti', 'Mar', 'Timo', 'In', 'Pi', 'San', 'Z', 'Soi', 'Katri', 'Gi', 'Tuuli', 'Kar', 'Sa', 'Ma', 'Johanna', 'Marja', 'Di']




Te


In [None]:
# get embed
orig = get_embed_for_one_instance(to_substitute)
guesses = []

In [None]:
print(orig)