# Description: This code utilizes a fine-tuned BERT model for anonymization.

## Load fine-tuned model from Hugging Face directly

In [74]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
import pandas as pd
# `display`/`HTML` are part of the public `IPython.display` API; importing
# them from `IPython.core.display` is a deprecated path.
from IPython.display import display, HTML

# Hugging Face Hub id of the fine-tuned checkpoint; the tokenizer and the
# model must be loaded from the same id.
MODEL_NAME = "medxiaorudan/bert-base-cased-finetuned-MultiNERD-SystemB"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)

In [81]:
def word_start_tokens(tokenized):
    """Flag, for every token, whether it is the first token of a word.

    `tokenized` must expose `word_ids()`, yielding for each token the index
    of the word it belongs to, or None for tokens that belong to no word.
    Returns a list of bool with one entry per token.
    """
    flags = []
    previous = None
    for current in tokenized.word_ids():
        # A word starts where the word index is set and differs from the
        # index of the token just before it.
        flags.append(current is not None and current != previous)
        previous = current
    return flags


def predict_ner(words):
    """Predict one NER label per input word.

    Args:
        words: A single sentence, already split into words.

    Returns:
        A list of label strings taken from `label_list`, one per word. Each
        word receives the label predicted for its first sub-word token.
    """
    if not words:
        # Nothing to tag; skip tokenization and the model forward pass.
        return []

    # Local import: torch is only needed here, to disable autograd.
    import torch

    tokenized = tokenizer(words, is_split_into_words=True, return_tensors='pt')
    # Inference only: no_grad avoids building an autograd graph.
    with torch.no_grad():
        pred = model(**tokenized)
    pred_idx = pred.logits.detach().numpy().argmax(axis=2)

    # One sentence was encoded, so row 0 is the only row in the batch.
    return [
        label_list[idx]
        for idx, starts_word in zip(pred_idx[0], word_start_tokens(tokenized))
        if starts_word
    ]

# Label names in class-id order: "O" first, then a B-/I- pair per entity type.
_ENTITY_TYPES = ("PER", "ORG", "LOC", "ANIM", "DIS")
label_list = ["O"] + [
    f"{prefix}-{entity_type}"
    for entity_type in _ENTITY_TYPES
    for prefix in ("B", "I")
]

# Sample sentences that the following cells tag and anonymize
example_sentences = [
    'Emma Watson, a famous actress, starred in the Harry Potter movies.',
    'NASA, the United States space agency, is responsible for the Apollo moon missions.',
    'The Sahara Desert is the largest hot desert in the world, covering much of North Africa.',
    'Cancer is a serious disease that affects millions of people worldwide.',
    'The African elephant is the largest land animal on Earth.',
    'Stephen Hawking was a brilliant physicist known for his work on black holes.',
    'The Vatican City is the smallest independent state in the world, located within Rome.',
    'Lionel Messi, one of the greatest football players, currently plays for Paris Saint-Germain.',
    'The Pacific Ocean is the largest and deepest ocean on Earth.',
]


In [78]:
# Tags whose words are to be blacked out (person and organisation entities)
blackout_labels = ['B-PER', 'I-PER', 'B-ORG', 'I-ORG']

# One record per example sentence: its words, the predicted tags, and a
# per-word flag telling whether the tag is one of the blackout labels.
data = []
for sentence in example_sentences:
    tokens = sentence.split()
    tags = predict_ner(tokens)
    data.append({
        "words": tokens,
        "ner_tags": tags,
        "blackout_label": [tag in blackout_labels for tag in tags],
    })

# Collect the records into a DataFrame and print it
df = pd.DataFrame(data)
print(df)


                                               words  \
0  [Emma, Watson,, a, famous, actress,, starred, ...   
1  [NASA,, the, United, States, space, agency,, i...   
2  [The, Sahara, Desert, is, the, largest, hot, d...   
3  [Cancer, is, a, serious, disease, that, affect...   
4  [The, African, elephant, is, the, largest, lan...   
5  [Stephen, Hawking, was, a, brilliant, physicis...   
6  [The, Vatican, City, is, the, smallest, indepe...   
7  [Lionel, Messi,, one, of, the, greatest, footb...   
8  [The, Pacific, Ocean, is, the, largest, and, d...   

                                            ner_tags  \
0      [B-PER, I-PER, O, O, O, O, O, O, B-PER, O, O]   
1  [B-ORG, O, B-LOC, I-LOC, O, O, O, O, O, O, O, ...   
2  [O, B-LOC, I-LOC, O, O, O, O, O, O, O, O, O, O...   
3              [B-DIS, O, O, O, O, O, O, O, O, O, O]   
4           [O, B-ANIM, I-ANIM, O, O, O, O, O, O, O]   
5    [B-PER, I-PER, O, O, O, O, O, O, O, O, O, O, O]   
6  [O, B-LOC, I-LOC, O, O, O, O, O, O, O, O, O,

In [79]:
import html

# U+2588 FULL BLOCK, used in place of every blacked-out word
black_char = chr(9608)

# One record per example sentence: words, anonymized words, tags and flags
data = []
for e in example_sentences:
    words = e.split()
    ner_tags = predict_ner(words)

    # True for every word whose tag is one of the blackout labels
    blackout_label = [tag in blackout_labels for tag in ner_tags]

    # Replace each flagged word by a single black_char; keep the rest
    anonymized_words = [
        black_char if flagged else word
        for word, flagged in zip(words, blackout_label)
    ]

    data.append({
        "words": words,
        "anonymized_words": anonymized_words,
        "ner_tags": ner_tags,
        "blackout_label": blackout_label,
    })

# Create DataFrame from the list
df = pd.DataFrame(data)

# Show the original and the anonymized text of each sentence
for _, row in df.iterrows():
    # Escape the text so characters such as '<' or '&' are shown literally
    # instead of being interpreted as HTML markup.
    original_text = html.escape(" ".join(row["words"]))
    anonymized_text = html.escape(" ".join(row["anonymized_words"]))
    display(HTML(f'<p>Original: {original_text}</p><p>Anonymized: {anonymized_text}</p>'))


# Visualization

The IOB notation can be a bit tricky to interpret. To get a more intuitive understanding of the tagging results, let's implement a visualization using spaCy's [`displacy`](https://explosion.ai/demos/displacy-ent) visualizer.


In [80]:
from spacy import displacy

def _iob_to_ents(words, tags, type_map):
    """Convert word-level IOB tags into entity spans over ' '.join(words)."""
    ents = []
    offset = 0    # character position of the current word in the joined text
    start, label = None, None
    for word, tag in zip(words, tags):
        if tag[0] in 'OB' and start is not None:    # End of current entity
            ents.append({
                'start': start,
                # offset points at the current word; step back over the
                # separating space so the span ends with the previous word
                'end': offset - 1,
                'label': type_map.get(label, label)
            })
            start, label = None, None
        if tag[0] == 'B':
            start, label = offset, tag[2:]
        elif tag[0] == 'I':
            if start is None:    # I without a preceding B: open an entity anyway
                start, label = offset, tag[2:]
        else:
            assert tag == 'O', 'unexpected tag {}'.format(tag)
        offset += len(word) + 1    # +1 for space
    # Compare against None: an entity starting at offset 0 is falsy but valid
    if start is not None:    # Entity spans to the end of sentence
        ents.append({
            'start': start,
            'end': offset - 1,    # drop the space counted after the last word
            'label': type_map.get(label, label)
        })
    return ents


def render_with_displacy(words, tags, type_map=None):
    """Render a tagged sentence with displaCy's entity visualizer.

    Args:
        words: The words of the sentence; they are joined with single spaces
            to form the displayed text.
        tags: One IOB tag per word ('O', 'B-<type>' or 'I-<type>').
        type_map: Optional dict mapping an entity type to the label to
            display; types that are not in the dict are displayed unchanged.

    Raises:
        AssertionError: If a tag starts with neither 'B' nor 'I' and is not
            exactly 'O'.
    """
    if type_map is None:
        type_map = {}
    doc = {
        'text': ' '.join(words),
        'ents': _iob_to_ents(words, tags, type_map)
    }
    displacy.render(doc, style='ent', jupyter=True, manual=True)

# Visualize every sentence twice with the same predicted tags: first the
# original words, then the anonymized words.
for _, row in df.iterrows():
    render_with_displacy(row['words'], row['ner_tags'])
    render_with_displacy(row['anonymized_words'], row['ner_tags'])
