Generate new examples based on this dataset: 
https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus

This notebook takes the ner dataset from the previous link, and creates templates (utterances with placeholders) for a PII synthetic data generator to use in order to create new sentences.
Note that due to the nature of the tagging, there might be weird output sentences. For example:

- The same entity appears multiple times in a sentence: "I travel from Argentina to Argentina"
- Bad grammar due to the lack of inflection and changes to nouns due to context: "*The statement said no Denmark or India-led troops were killed*" instead of "*The statement said no Danish or Indian-led troops were killed*"
- Unrealistic sentences due to change in entities: "Prime minister Lebron James enters the government building in Kuala Lumpur"


The notebook additionally introduces two new entities: TITLE and ROLE, in order to overcome cases like "UK David Scott called his wife", where the original sentence is "UK Prime Minister Boris Johnson called his wife" as "Prime Minister" was originally tagged as PER in the original dataset. Same logic goes for titles, like Mr., Mrs., Ms.

In [None]:
import pandas as pd

In [None]:
# First, download ner.csv from https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0.
# `on_bad_lines="warn"` keeps the previous behaviour: malformed rows are
# reported with a warning and skipped instead of aborting the load.
ner_dataset = pd.read_csv("ner.csv", encoding="ISO-8859-1", on_bad_lines="warn")

In [None]:
# Inspect which columns the loaded dataset provides
ner_dataset.columns

In [None]:
# Number of rows before de-duplication
len(ner_dataset)

In [None]:
# Drop fully identical rows, then show the remaining row count
ner_dataset = ner_dataset.drop_duplicates()
len(ner_dataset)

Example sentence:

In [None]:
# Show the rows of sentence 13: each word with its tag and neighbouring-word columns
ner_dataset[ner_dataset['sentence_idx']==13][['sentence_idx','word','tag','prev-word','prev-prev-word','next-word']]

### New entities - Title and Role

- **Title**: Mr., Mrs., Professor, Doctor, ...
- **Role**: President, Secretary General, U.N. Secretary, ...

Quick exploratory analysis of frequencies:
- First PER token
- Second PER token
- First and second PER token
- One before and first tokens of PER

In [None]:
# Collect the rows tagged B-per (first token of a PER entity) together with
# the words immediately before and after them.
bper = ner_dataset[ner_dataset['tag'] == 'B-per']
bper_tokens = bper['word']
prev_bper_token = bper['prev-word']
next_bper_token = bper['next-word']
# Materialise the pairs as lists: a bare zip() is a one-shot iterator, so
# re-running a cell that counts the pairs would otherwise find it exhausted
# and report empty frequencies.
two_prev_tokens = list(zip(prev_bper_token, bper_tokens))
two_next_tokens = list(zip(bper_tokens, next_bper_token))

In [None]:
from collections import Counter
# Frequency of each word that appears as the first token of a PER entity
print("20 most common PER token frequencies:")
Counter(bper_tokens).most_common(20)

In [None]:
# Frequency of (previous word, first PER token) pairs
print("20 most common previous and first PER token frequencies:")
Counter(two_prev_tokens).most_common(20)

In [None]:
# Frequency of (first PER token, following word) pairs
print("20 most common first and second PER token frequencies:")
Counter(two_next_tokens).most_common(20)

In [None]:
# Words that are re-tagged away from PER: titles get the 'ttl' tag, roles get 'rol'.
TITLES = ['Mr.', 'Ms.', 'Mrs.']
# 'General' used to be listed twice; the duplicate only caused a redundant
# second check of every row, so it is listed once here.
ROLES = ['President', 'General', 'Senator', 'Secretary-General', 'Minister']
# Two-word roles, written as (first word, second word).
BIGRAMS_ROLES = [('Prime', 'Minister'), ('prime', 'minister'), ('U.S.', 'President'),
                 ('Venezuelan', 'President'), ('Vice', 'President'), ('Foreign', 'Minister'),
                 ('U.S.', 'Secretary'), ('U.N.', 'Secretary'), ('Defence', 'Secretary')]


In [None]:
# Update title and per for most common cases

def fix_bigram_title(df, row, index, first='Prime', second='Minister', tag='ttl'):
    """Re-tag one row of a two-word title/role that was labelled as PER.

    The decision is made from ``row`` (its ``word``, ``prev-word``,
    ``next-word`` and ``tag`` values); the new tag is written into
    ``df.loc[index, 'tag']``. At most one of these rewrites is applied:

    * ``row`` is ``first`` followed by ``second`` and carries a PER tag
      -> ``B-<tag>``
    * ``row`` is ``second`` preceded by ``first`` and carries a PER tag
      -> ``I-<tag>``
    * ``row`` is tagged ``I-per`` and directly follows ``second``
      -> ``B-per`` (the name now starts its own PER entity)

    NOTE(review): the last case only looks at the previous word; it does not
    verify that the word before it was ``first`` -- confirm this is intended.

    :param df: DataFrame whose ``tag`` column is updated in place
    :param row: the row to inspect
    :param index: label of ``row`` in ``df``
    :param first: first word of the bigram
    :param second: second word of the bigram
    :param tag: tag suffix to assign to the bigram words
    """
    word = row['word']
    if word == first and row['next-word'] == second and 'per' in row['tag']:
        new_tag = f'B-{tag}'
    elif word == second and row['prev-word'] == first and 'per' in row['tag']:
        new_tag = f'I-{tag}'
    elif row['tag'] == 'I-per' and row['prev-word'] == second:
        new_tag = 'B-per'
    else:
        # Nothing to rewrite for this row
        return
    df.loc[index, 'tag'] = new_tag

def fix_unigram_title(df, prev_row, prev_index, row, index, title='President', tag='ttl'):
    """Split a one-word title/role off the PER entity it was merged into.

    When ``prev_row`` is the word ``title`` tagged ``B-per`` and ``row`` is
    tagged ``I-per``, the title row is re-tagged ``B-<tag>`` and the current
    row becomes ``B-per`` in ``df``. Otherwise ``df`` is left untouched.

    :param df: DataFrame whose ``tag`` column is updated in place
    :param prev_row: the row preceding ``row``
    :param prev_index: label of ``prev_row`` in ``df``
    :param row: the current row
    :param index: label of ``row`` in ``df``
    :param title: the single-word title or role to look for
    :param tag: tag suffix to assign to the title word
    """
    title_opens_entity = prev_row['word'] == title and prev_row['tag'] == 'B-per'
    if not (title_opens_entity and row['tag'] == 'I-per'):
        return
    df.loc[prev_index, 'tag'] = f'B-{tag}'
    df.loc[index, 'tag'] = 'B-per'

# Walk the dataset once: role bigrams are re-tagged from the current row,
# title/role unigrams from the (previous row, current row) pair.
# NOTE(review): the helpers test the tags held by the `row` / `prev_row`
# objects yielded by iterrows(), not the tags stored in `ner_dataset`, so they
# may not see rewrites made earlier in this loop -- confirm before reordering.
prev_row = None
prev_index = None
for index, row in ner_dataset.iterrows():
    for first_word, second_word in BIGRAMS_ROLES:
        fix_bigram_title(ner_dataset, row, index, first_word, second_word, 'rol')

    if prev_row is not None:
        # Titles are handled before roles
        for vocabulary, new_tag in ((TITLES, 'ttl'), (ROLES, 'rol')):
            for term in vocabulary:
                fix_unigram_title(ner_dataset, prev_row, prev_index, row, index, term, new_tag)

    prev_row, prev_index = row, index

In [None]:
# Display sentence 13 again to inspect its tags after the title/role re-tagging
ner_dataset[ner_dataset['sentence_idx']==13][['sentence_idx','word','tag','prev-word','prev-prev-word','next-word']]

In [None]:
# Keep only the columns needed downstream: sentence id, word and its tag
dataset = ner_dataset[['sentence_idx','word','tag']]

In [None]:
# Save the re-tagged dataset; the path is relative to the current working directory
dataset.to_csv("../../../datasets/ner_with_titles.csv",encoding = "ISO-8859-1")

### Create templates based on NER dataset

In [None]:
import re
class SentenceGetter(object):
    """Group a token-level NER table into sentences and build templates from them.

    The dataset must provide ``sentence_idx``, ``word`` and ``tag`` columns;
    those are the only columns read here.

    Attributes:
        n_sent: 1-based number of the sentence ``get_next`` returns next.
        dataset: the DataFrame passed to the constructor.
        empty: always ``False``; not used inside this class.
        grouped: per-sentence lists of ``(word, tag)`` tuples, indexed by
            ``sentence_idx``.
        sentences: the same lists as a plain Python list, in group order.
    """

    # Entity types that are never turned into placeholders.
    _TAGS_TO_IGNORE = ('nat', 'eve', 'art', 'tim')

    # Removes the whitespace left in front of punctuation after joining tokens.
    _SPACE_BEFORE_PUNCTUATION = re.compile(r'\s([?,\':.!"](?:|$))+')

    # Placeholder combinations rewritten after the template is assembled.
    # Applied in this order; kept as pairs so the order is explicit.
    _PLACEHOLDER_FIXES = (
        ("[COUNTRY] [ROLE] [PERSON]", "[ROLE] [PERSON]"),
        ("[COUNTRY] [ROLE]", "[ROLE]"),
        ("[ORGANIZATION] [ROLE] [PERSON]", "[ORGANIZATION]'s [ROLE] [PERSON]"),
        ("[COUNTRY] [LOCATION]", "[LOCATION]"),
        ("[LOCATION] [COUNTRY]", "[LOCATION]"),
        ("[PERSON] [COUNTRY]", "[PERSON]"),
        ("[PERSON] [LOCATION]", "[PERSON]"),
        ("[COUNTRY] [PERSON]", "[PERSON]"),
        ("[LOCATION] [PERSON]", "[PERSON]"),
        ("The [ORGANIZATION]", "[ORGANIZATION]"),
        ("[PERSON] [ORGANIZATION]", "[PERSON]"),
        ("of [ORGANIZATION] [PERSON]", "of [ORGANIZATION], [PERSON]"),
        ("[ORGANIZATION] [PERSON]", "[PERSON]"),
        ("[PERSON] [PERSON]", "[PERSON]"),
        ("[LOCATION] says", "[PERSON] says"),
        ("[LOCATION] said", "[PERSON] said"),
    )

    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False

        def to_word_tag_pairs(group):
            return list(zip(group["word"].values.tolist(),
                            group["tag"].values.tolist()))

        # Only the columns the aggregation reads are selected, so the grouping
        # column itself is not handed to apply() (recent pandas versions warn
        # when it is).
        self.grouped = (
            self.dataset.groupby("sentence_idx")[["word", "tag"]]
            .apply(to_word_tag_pairs)
        )
        self.sentences = list(self.grouped)

    def get_next(self):
        """Return the next sentence as a list of ``(word, tag)`` tuples.

        Sentences are looked up by label: first the legacy ``"Sentence: N"``
        form, then the plain number ``N`` (previously only the string form was
        tried, so datasets with a numeric ``sentence_idx`` always got ``None``).

        Returns ``None``, without advancing, when sentence ``N`` does not exist.
        """
        for key in ("Sentence: {}".format(self.n_sent), self.n_sent):
            try:
                sentence = self.grouped.loc[key]
            except (KeyError, TypeError):
                # This label form is not present in the index; try the next one
                continue
            self.n_sent += 1
            return sentence
        return None

    @staticmethod
    def get_template(grouped, entity_name_replace_dict=None):
        """Turn one sentence into a template with ``[ENTITY]`` placeholders.

        Tokens tagged ``O`` are copied with ``[`` and ``]`` stripped from
        their text. A ``B-<type>`` token whose type is not ignored becomes
        ``[<name>]``, where ``<name>`` is
        ``entity_name_replace_dict[<type>]`` when a dict is given and the raw
        type otherwise. All other tokens are left out of the template.

        NOTE(review): tokens of ignored types (e.g. ``tim``) are dropped
        rather than kept as plain text -- confirm this is intended.

        :param grouped: iterable of ``(word, tag)`` tuples for one sentence
        :param entity_name_replace_dict: optional mapping from tag type to
            placeholder name
        :return: the template string, stripped of surrounding whitespace
        :raises KeyError: if a dict is given and lacks a non-ignored type
        """
        pieces = []
        ents = []
        for token in grouped:
            token_text = token[0].replace("[", "").replace("]", "")
            token_tag = token[1]
            if token_tag == 'O':
                pieces.append(token_text)
                continue
            if not token_tag.startswith('B-'):
                # Continuation tokens are covered by their entity's placeholder
                continue
            entity_type = token_tag[2:]
            if entity_type in SentenceGetter._TAGS_TO_IGNORE:
                continue
            if entity_name_replace_dict:
                ent = entity_name_replace_dict[entity_type]
            else:
                ent = entity_type
            ents.append(ent)
            pieces.append("[" + ent + "]")

        template = "".join(" " + piece for piece in pieces)
        template = SentenceGetter._SPACE_BEFORE_PUNCTUATION.sub(r'\1', template)

        # Collapse an entity that is immediately repeated into one placeholder
        for ent in ents:
            weird = "[{}] [{}]".format(ent, ent)
            template = template.replace(weird, "[{}]".format(ent))

        # Remove additional weird combinations
        for weird, replacement in SentenceGetter._PLACEHOLDER_FIXES:
            template = template.replace(weird, replacement)

        return template.strip()
    
# Build the per-sentence (word, tag) lists for the reduced dataset
getter = SentenceGetter(dataset)

In [None]:
# Maps tag types (the part after "B-") to the placeholder names used in templates
ENTITIES_DICTIONARY = {"per":"PERSON","gpe":"COUNTRY","geo":"LOCATION","org":"ORGANIZATION",'ttl':'TITLE','rol':'ROLE'}

# Compare one sentence with the template generated from it.
# NOTE(review): index 12 is positional -- presumably sentence_idx 13 if ids start at 1 with no gaps; confirm
sentences = getter.sentences
print("original:",sentences[12])
print("template:", getter.get_template(sentences[12],entity_name_replace_dict=ENTITIES_DICTIONARY))

In [None]:
# Build a template for every sentence, then preview the first five
new_templates = [SentenceGetter.get_template(sentence, ENTITIES_DICTIONARY) for sentence in sentences]
new_templates[:5]

In [None]:
# Write the templates to disk, one per line (path is relative to the working directory)

with open("../../presidio_evaluator/data_generator/raw_data/new_templates2.txt","w+", encoding = "ISO-8859-1") as f:
    f.writelines("%s\n" % template for template in new_templates)
        