# LwTR

In [None]:
from numpy import random

# LwTR - Label-wise Token Replacement
def lwtr(train_file):
    """Label-wise Token Replacement (LwTR) for CoNLL-style NER lines.

    Every token line (``'<token> _ _ <label>'``) is, with probability 0.5,
    replaced by a randomly chosen token line from ``train_file`` that carries
    the same label. Id lines (starting with ``'# id '``), blank lines and
    lines without the ``' _ _ '`` separator are passed through untouched.

    Args:
        train_file: sequence of raw lines read from a CoNLL file.

    Returns:
        A new list with the same length as ``train_file``; the input is not
        modified.
    """
    separator = ' _ _ '

    def label_of(line):
        """Return the label of a token line, or None if it is not a token line."""
        if line.startswith('# id ') or not line.strip():
            return None
        parts = line.split(separator)
        return parts[1].strip() if len(parts) > 1 else None

    # Single pass: remember each line's label and group the token lines by
    # label. Grouping on the parsed label (rather than a fixed label list)
    # guarantees every label met later has at least one candidate.
    line_labels = [label_of(line) for line in train_file]
    candidates_by_label = {}
    for line, label in zip(train_file, line_labels):
        if label is not None:
            candidates_by_label.setdefault(label, []).append(line)

    lwtr_file = list(train_file)  # work on a copy so the caller's list is untouched
    for index, label in enumerate(line_labels):
        if label is None:
            continue
        # Coin flip: 1 means "replace this token".
        if random.binomial(n=1, p=0.5) == 1:
            candidates = candidates_by_label[label]
            # Draw an index instead of random.choice(list): choice would copy
            # the whole candidate list into an array on every call.
            lwtr_file[index] = candidates[random.randint(len(candidates))]

    return lwtr_file

In [None]:
# Load the CoNLL training data as a list of raw lines.
with open('/path/to/ner/file.conll') as f:
    file = f.readlines()

# Run Label-wise Token Replacement on the loaded lines.
lwtr_file = lwtr(train_file=file)

# Save the augmented lines.
with open('/path/to/save/train_lwtr.conll', 'w') as file:
    file.writelines(lwtr_file)

# SR

In [None]:
def syn_replace(train_file, word_vector):
    """Synonym Replacement (SR) for CoNLL-style NER lines.

    Every token line (``'<token> _ _ <label>'``) has, with probability 0.5,
    its token replaced by the nearest neighbour returned by
    ``wv.most_similar``; the label part of the line is kept as is. Id lines
    (starting with ``'# id '``) and blank lines are copied unchanged. A token
    line without the ``' _ _ '`` separator is copied unchanged and counted as
    "no replacement".

    Args:
        train_file: sequence of raw lines read from a CoNLL file.
        word_vector: path passed to gensim's ``load_facebook_vectors``.

    Returns:
        A new list of lines with the same length as ``train_file``.

    Side effects:
        Prints how many token lines were and were not replaced.
    """
    from numpy import random
    from gensim.models.fasttext import load_facebook_vectors

    wv = load_facebook_vectors(word_vector)  # load language word vectors

    separator = ' _ _ '
    synonym_cache = {}  # word -> nearest neighbour, so repeated words are looked up once
    sr_file = []
    aug_count = 0
    no_aug_count = 0

    for line in train_file:
        if line.startswith('# id ') or not line.strip():
            sr_file.append(line)  # id lines and blank lines pass through
            continue

        word, found, rest = line.partition(separator)
        # Coin flip: 1 means "augment". Lines without a separator are never
        # augmented, otherwise the whole line (newline included) would be
        # looked up and replaced.
        if found and random.binomial(n=1, p=0.5) == 1:
            if word not in synonym_cache:
                # Only the single best neighbour is used, so ask for one.
                synonym_cache[word] = wv.most_similar(word, topn=1)[0][0]
            sr_file.append(synonym_cache[word] + separator + rest)
            aug_count += 1
        else:
            sr_file.append(line)
            no_aug_count += 1

    print(f'{aug_count} instances of synonym replacement.')
    print(f'{no_aug_count} instances of no synonym replacement.')
    return sr_file

In [None]:
# Load the CoNLL training data as a list of raw lines.
with open('/path/to/ner/file.conll') as f:
    file = list(f)

# Run synonym replacement using the given word-vector file.
sr_file = syn_replace(
    train_file=file,
    word_vector='/path/to/wiki/vector.bin',
)

# Save the augmented lines.
with open('/path/to/save/train_sr.conll', 'w') as file:
    file.writelines(sr_file)

# MR

In [None]:
def _mention_label(line, entities):
    """Return ``(prefix, entity)`` for a ``B-``/``I-`` mention line, else None."""
    _, found, label = line.strip().rpartition(' _ _ ')
    if not found:
        return None
    prefix, dash, entity = label.partition('-')
    if dash and prefix in ('B', 'I') and entity in entities:
        return prefix, entity
    return None


def mr(train_file):
    """Mention Replacement (MR) for CoNLL-style NER lines.

    A mention is a ``B-<entity>`` line followed by zero or more
    ``I-<entity>`` lines of the same entity type. Each mention in
    ``train_file`` is, with probability 0.5, replaced by a randomly chosen
    mention of the same entity type collected from ``train_file``; the
    replacement may have a different number of lines. All other lines are
    copied unchanged.

    ``I-`` lines that do not follow a kept ``B-`` line (including the
    ``I-`` lines of a replaced mention) are not emitted.

    Args:
        train_file: sequence of raw lines read from a CoNLL file.

    Returns:
        A new list of lines; the input is not modified.
    """
    from collections import defaultdict
    from numpy import random  # local import so the function does not rely on another cell

    entities = ('PER', 'LOC', 'GRP', 'CORP', 'PROD', 'CW')  # all basic NER labels

    # ---- Pass 1: collect every mention, grouped by entity type ----
    mention_rep = defaultdict(list)
    current = None          # list of lines of the mention being collected
    current_entity = None
    for line in train_file:
        parsed = _mention_label(line, entities)
        if parsed and parsed[0] == 'I' and current is not None and parsed[1] == current_entity:
            current.append(line)  # still inside the open mention
            continue
        current = None  # any other line closes the open mention
        if parsed and parsed[0] == 'B':
            # Register the mention immediately; later I- lines are appended
            # to this same list, so a mention at end of file is not lost and
            # a B- line directly after another mention still opens a new one.
            current_entity = parsed[1]
            current = [line]
            mention_rep[current_entity].append(current)

    # ---- Pass 2: perform mention replacement ----
    mr_file = []
    capture_inside = False  # True while inside a mention whose B- line was kept
    for line in train_file:
        parsed = _mention_label(line, entities)
        if parsed is None:  # not part of a mention
            mr_file.append(line)
            capture_inside = False
            continue

        prefix, entity = parsed
        if prefix == 'B':
            # Coin flip: 1 means "replace this mention".
            if random.binomial(n=1, p=0.5) == 1:
                candidates = mention_rep[entity]
                # Pick by index: np.random.choice on an object array of
                # equal-length mentions builds a 2-D array and raises.
                mr_file.extend(candidates[random.randint(len(candidates))])
                # Drop the original I- lines of the mention just replaced.
                capture_inside = False
            else:
                mr_file.append(line)
                capture_inside = True  # keep the original I- lines that follow
        elif capture_inside:
            mr_file.append(line)

    return mr_file

In [None]:
# Load the CoNLL training data as a list of raw lines.
with open('/path/to/ner/file.conll') as f:
    file = f.readlines()

# Run mention replacement on the loaded lines.
mr_file = mr(train_file=file)

# Save the augmented lines.
with open('/path/to/save/train_mr.conll', 'w') as file:
    file.writelines(mr_file)

# SiS

In [None]:
# Labels are as follows
# PER : Person
# LOC : Location
# GRP : Group
# CORP : Corporation
# PROD : Product
# CW: Creative Work
# O: out-of-mention label (i.e. unrelated to named entity)

# B - beginning of a mention, I - inside a mention
from numpy import random

def _segment_key(line):
    """Return the segment key of a token line, or None for any other line.

    The key is ``'O'`` for out-of-mention tokens and the entity type (the
    part after ``B-``/``I-``) for mention tokens, so a ``B-X`` line and the
    ``I-X`` lines after it fall into the same segment.
    """
    if line.startswith('# id ') or not line.strip():
        return None
    _, found, label = line.partition(' _ _ ')
    if not found:
        return None
    label = label.strip()
    _, dash, entity = label.partition('-')
    return entity if dash else label


def _emit_segment(segment, out):
    """Append ``segment`` to ``out``, shuffling its tokens with probability 0.5.

    Labels keep their original positions; only the tokens move. Returns True
    if the segment was shuffled and False if it was copied unchanged.
    """
    if random.binomial(n=1, p=0.5) != 1:
        out.extend(segment)
        return False
    tokens = []
    labels = []
    for seg_line in segment:
        token, _, label = seg_line.partition(' _ _ ')
        tokens.append(token)
        labels.append(label)
    random.shuffle(tokens)
    out.extend(token + ' _ _ ' + label for token, label in zip(tokens, labels))
    return True


def shuffle(train_file):
    """Shuffle within Segments (SiS) for CoNLL-style NER lines.

    Consecutive token lines with the same segment key (``O`` or the same
    entity type) form a segment. Each segment is, with probability 0.5,
    emitted with its tokens in random order while the labels stay in place.
    Id lines, blank lines and lines without the ``' _ _ '`` separator end the
    current segment and are copied unchanged. A segment still open at the end
    of the input is emitted as well, so no token line is dropped.

    Args:
        train_file: sequence of raw lines read from a CoNLL file.

    Returns:
        A new list of lines with the same length as ``train_file``.

    Side effects:
        Prints how many (non-empty) segments were and were not shuffled.
    """
    shuffle_file = []
    segment = []          # token lines of the segment being collected
    segment_key = None
    shuffle_count = 0
    non_shuffle_count = 0

    def flush():
        """Emit the open segment (if any) and update the counters."""
        nonlocal shuffle_count, non_shuffle_count
        if not segment:
            return
        if _emit_segment(segment, shuffle_file):
            shuffle_count += 1
        else:
            non_shuffle_count += 1
        segment.clear()

    for line in train_file:
        key = _segment_key(line)
        if key is None:
            # Sentence boundary or non-token line: emit buffered tokens first
            # so the output keeps the input's line order.
            flush()
            shuffle_file.append(line)
            segment_key = None
            continue
        if key != segment_key:  # a new segment starts on this line
            flush()
            segment_key = key
        segment.append(line)
    flush()  # input may end without a trailing blank line

    print(f'{shuffle_count} instances of shuffling.')
    print(f'{non_shuffle_count} instances of not shuffling.')
    return shuffle_file

In [None]:
# Load the CoNLL training data as a list of raw lines.
with open('/path/to/ner/file.conll') as f:
    file = f.readlines()

# Run Shuffle within Segments on the loaded lines.
sis_file = shuffle(train_file=file)

# Save to a SiS-specific file; the previous 'train_mr.conll' name was a
# copy-paste of the MR cell and would overwrite the MR output.
with open('/path/to/save/train_sis.conll', 'w') as out_f:
    out_f.writelines(sis_file)

# Combined

In [None]:
# Read the source data once: none of the augmentation functions modifies
# its input list, so all four can share the same lines.
with open('/path/to/ner/file.conll') as f:
    file = f.readlines()

# Each augmentation is applied independently to the original data.
lwtr_file = lwtr(train_file=file)
sr_file = syn_replace(train_file=file, word_vector='/path/to/wiki/vector.bin')
mr_file = mr(train_file=file)
sis_file = shuffle(train_file=file)

# Concatenate the four augmented copies into one training file.
combined_file = lwtr_file + sr_file + mr_file + sis_file
with open('/path/to/save/train_combined.conll', 'w') as out_f:
    out_f.writelines(combined_file)

# All

In [None]:
# Load the CoNLL training data as a list of raw lines.
with open('/path/to/ner/file.conll') as f:
    file = f.readlines()

# Chain the augmentations: each step consumes the previous step's output.
lwtr_file = lwtr(train_file=file)
sr_file = syn_replace(
    train_file=lwtr_file,
    word_vector='/path/to/wiki/vector.bin',
)
mr_file = mr(train_file=sr_file)
all_file = shuffle(train_file=mr_file)

# Save the fully augmented lines.
with open('/path/to/save/train_all.conll', 'w') as file:
    file.writelines(all_file)