# Usage

### Anonymizing class `Anonymize` defined in `anonymize.py`

In [1]:
import spacy
from pprint import pprint
import tabulate

from anonymize import Anonymize

### Set tags to be considered 

In [2]:
# spaCy named-entity labels whose tokens should be anonymized.
# For a narrower run this can be cut down, e.g. to ['PERSON', 'ORG'].
spacy_tags_to_anon = [
    'PERSON',
    'ORG',
    'GPE',
    'NORP',
    'FAC',
    'DATE',
    'TIME',
]

# Extra, non-spaCy tags passed to Anonymize; use an empty list to pass none.
custom_tags_to_anon = [
    'COURSE',
    'CONTACT',
    'EMAIL',
]

# Single anonymizer instance reused by every cell below.
anon = Anonymize(spacy_tags_to_anon, custom_tags_to_anon)

##### Kindly refer to https://spacy.io/api/annotation#named-entities for tag definitions

### Call `get_anon_string` for all the strings

In [3]:
# Sample inputs covering the entity kinds enabled above: people, organisations,
# places, dates, course codes, phone numbers and an e-mail address.
# The texts are kept verbatim (including spelling) because the outputs shown
# below were produced from exactly these strings.
orig_strings = [
    "Anand P Gupta took TGF4567 got gpa 3.156 was a student in my university, the XYZ Institute of Technology located at Pilani, Rajasthan, India. He performed well.",
    "Gabriella Marquex worked with me for a long long time in the Yugoslavic state. I was her direct superviser at Global link Technology.",
    "The complete program was built by Julia in two months and I hired her on last week in September 2011 on 09/12/2011. She took the course CSE110",
    "My home contact number is (000)-0000 and my mobile number and e-mail is some-email@123.com is +91-000(567)1234. My other number is 1234567890. My friend Ram's number is 000-0000",
]

In [4]:
# Anonymize every example up front so the loop below only deals with display.
anon_strings = list(map(anon.get_anon_string, orig_strings))

# Print one two-column table per example: original token next to its replacement.
for example_no, (original, anonymized) in enumerate(zip(orig_strings, anon_strings), start=1):
    print("Example #{0}".format(example_no))
    # zip stops at the shorter sequence, so the columns only line up if
    # anon.tokenize yields one token per whitespace-separated anonymized item.
    token_pairs = zip(anon.tokenize(original), anonymized.split())
    table = tabulate.tabulate(token_pairs, headers=["Original String", "Anonymized String"])
    print(table)
    print("*"*50)

Example #1
Original String    Anonymized String
-----------------  -------------------
Anand              PERSON-B-0
P                  PERSON-I-1
Gupta              PERSON-I-2
took               took
TGF4567            COURSE-B-3
got                got
gpa                gpa
3                  3
.                  .
156                156
was                was
a                  a
student            student
in                 in
my                 my
university         university
,                  ,
the                ORG-B-4
XYZ                ORG-I-5
Institute          ORG-I-6
of                 ORG-I-7
Technology         ORG-I-8
located            located
at                 at
Pilani             GPE-B-9
,                  ,
Rajasthan          GPE-B-10
,                  ,
India              GPE-B-11
.                  .
He                 He
performed          performed
well               well
.                  .
**************************************************
Example #2
Orig

### To print the mapping

In [5]:
# Pretty-print the mapping held on the anonymizer: each key is an original
# token and each value is the tag that replaced it in the examples above.
pprint(anon.mapping)

{'(000)-0000': 'CONTACT-B-26',
 '+91-000(567)1234': 'CONTACT-B-28',
 '000-0000': 'CONTACT-B-31',
 '09/12/2011': 'DATE-B-24',
 '1234567890': 'CONTACT-B-29',
 '2011': 'DATE-I-23',
 'Anand': 'PERSON-B-0',
 'CSE110': 'COURSE-B-25',
 'Gabriella': 'PERSON-B-12',
 'Global': 'ORG-B-15',
 'Gupta': 'PERSON-I-2',
 'India': 'GPE-B-11',
 'Institute': 'ORG-I-6',
 'Julia': 'GPE-B-17',
 'Marquex': 'PERSON-I-13',
 'P': 'PERSON-I-1',
 'Pilani': 'GPE-B-9',
 'Rajasthan': 'GPE-B-10',
 'Ram': 'PERSON-B-30',
 'September': 'DATE-B-22',
 'TGF4567': 'COURSE-B-3',
 'Technology': 'ORG-I-8',
 'XYZ': 'ORG-I-5',
 'Yugoslavic': 'NORP-B-14',
 'last': 'DATE-B-20',
 'link': 'ORG-I-16',
 'months': 'DATE-I-19',
 'of': 'ORG-I-7',
 'some-email@123': 'EMAIL-B-27',
 'the': 'ORG-B-4',
 'two': 'DATE-B-18',
 'week': 'DATE-I-21'}


##### Note that although common words such as `the` and `of` appear in the mapping, not every occurrence of them in the text is replaced. A word is replaced only where it is part of a named entity.

### To save the mapping

In [6]:
# Destination file for the mapping. This is a relative path, so it resolves
# against the kernel's current working directory.
# NOTE(review): the .pkl extension suggests a pickle file — confirm in anonymize.py.
filepath = 'anon_mapping.pkl'
# Write the anonymizer's current mapping to that file.
anon.save_mapping(filepath)

### To load the mapping

In [7]:
# Read the mapping back from `filepath` (set in the save cell) and display it;
# the printed result matches the mapping shown before saving.
# NOTE(review): if load_mapping unpickles the file (the .pkl name suggests so),
# only load mapping files from a trusted source — confirm in anonymize.py.
# Whether this call also updates anon.mapping cannot be told from this notebook.
mapping = anon.load_mapping(filepath)
pprint(mapping)

{'(000)-0000': 'CONTACT-B-26',
 '+91-000(567)1234': 'CONTACT-B-28',
 '000-0000': 'CONTACT-B-31',
 '09/12/2011': 'DATE-B-24',
 '1234567890': 'CONTACT-B-29',
 '2011': 'DATE-I-23',
 'Anand': 'PERSON-B-0',
 'CSE110': 'COURSE-B-25',
 'Gabriella': 'PERSON-B-12',
 'Global': 'ORG-B-15',
 'Gupta': 'PERSON-I-2',
 'India': 'GPE-B-11',
 'Institute': 'ORG-I-6',
 'Julia': 'GPE-B-17',
 'Marquex': 'PERSON-I-13',
 'P': 'PERSON-I-1',
 'Pilani': 'GPE-B-9',
 'Rajasthan': 'GPE-B-10',
 'Ram': 'PERSON-B-30',
 'September': 'DATE-B-22',
 'TGF4567': 'COURSE-B-3',
 'Technology': 'ORG-I-8',
 'XYZ': 'ORG-I-5',
 'Yugoslavic': 'NORP-B-14',
 'last': 'DATE-B-20',
 'link': 'ORG-I-16',
 'months': 'DATE-I-19',
 'of': 'ORG-I-7',
 'some-email@123': 'EMAIL-B-27',
 'the': 'ORG-B-4',
 'two': 'DATE-B-18',
 'week': 'DATE-I-21'}
