## Anonymizing function

In [1]:
import spacy
from pprint import pprint

# Load the spaCy model registered under the name 'en' once, at module level;
# get_anon_string() below runs every input string through this object.
# NOTE(review): the 'en' shortcut name is not accepted by spaCy v3+, which
# expects a full package name such as 'en_core_web_sm' -- confirm the spaCy
# version this notebook targets before re-running.
nlp = spacy.load('en')

# Entity labels whose tokens are replaced by placeholders.
# Label definitions: https://spacy.io/api/annotation#named-entities
tags_to_anon = ['PERSON', 'ORG', 'GPE', 'NORP', 'FAC']

def get_anon_string(string, mapping):
    """Return `string` with named-entity tokens swapped for placeholders.

    The text is tokenized with the module-level `nlp` pipeline. Every token
    whose entity label is listed in `tags_to_anon` is replaced by a
    placeholder of the form ``<label>-<IOB tag>-<n>``, where ``n`` is the
    number of entries `mapping` held when that token text was first seen.

    Args:
        string: The text to anonymize.
        mapping: Dict from token text to placeholder. It is updated in place,
            so passing the same dict to several calls keeps placeholders
            consistent across strings.

    Returns:
        The tokens (original or placeholder) joined by single spaces; the
        original spacing between tokens is not preserved.
    """
    pieces = []
    for tok in nlp(string):
        word = tok.text

        # Tokens outside the selected entity types pass through unchanged,
        # even if the same text is already present in `mapping`.
        if tok.ent_type_ not in tags_to_anon:
            pieces.append(word)
            continue

        # First sighting of this text: mint a placeholder numbered by the
        # current size of the mapping (computed before the insert).
        if word not in mapping:
            mapping[word] = "{0}-{1}-{2}".format(tok.ent_type_, tok.ent_iob_, str(len(mapping)))

        pieces.append(mapping[word])

    return ' '.join(pieces)

## Examples

In [2]:
# Sample inputs mentioning people, organisations and places.
strings = [
    (
        "Anand P Gupta was a student in my university,\n"
        "the XYZ Institute of Technology located at Pilani, Rajasthan, India. "
        "He performed well."
    ),
    (
        "Gabriella Marquex worked with me for a long long time in the Yugoslavic state. "
        "I was her direct superviser at Global link Technology."
    ),
]

#### Usage: Declare an empty dictionary and pass it into every `get_anon_string` function call.

In [3]:
# A single dictionary is shared by every call, so a word that was given a
# placeholder in one string is given the same placeholder in later strings.
mapping = dict()

anon_strings = [get_anon_string(text, mapping) for text in strings]

# pprint wraps the long anonymized strings across several lines for readability.
pprint(anon_strings)

['PERSON-B-0 PERSON-I-1 PERSON-I-2 was a student in my university , \n'
 ' ORG-B-3 ORG-I-4 ORG-I-5 ORG-I-6 ORG-I-7 located at GPE-B-8 , GPE-B-9 , '
 'GPE-B-10 . He performed well .',
 'PERSON-B-11 PERSON-I-12 worked with me for a long long time in the NORP-B-13 '
 'state . I was her direct superviser at ORG-B-14 ORG-I-15 ORG-I-7 .']


##### Note that even though words like `the` and `of` are in the mapping, not every occurrence of them is replaced in the text. Only occurrences that are tagged as part of a named entity are replaced.

In [4]:
pprint(mapping)

{'Anand': 'PERSON-B-0',
 'Gabriella': 'PERSON-B-11',
 'Global': 'ORG-B-14',
 'Gupta': 'PERSON-I-2',
 'India': 'GPE-B-10',
 'Institute': 'ORG-I-5',
 'Marquex': 'PERSON-I-12',
 'P': 'PERSON-I-1',
 'Pilani': 'GPE-B-8',
 'Rajasthan': 'GPE-B-9',
 'Technology': 'ORG-I-7',
 'XYZ': 'ORG-I-4',
 'Yugoslavic': 'NORP-B-13',
 'link': 'ORG-I-15',
 'of': 'ORG-I-6',
 'the': 'ORG-B-3'}
