# LwTR: Label-wise Token Replacement

In [13]:
from numpy import random

# LwTR - Label-wise Token Replacement
def lwtr(train_file):
    """Label-wise Token Replacement (LwTR) for CoNLL-style NER data.

    Every token line ('<token> _ _ <label>') is, with probability 0.5,
    replaced by a line drawn uniformly at random from all lines of
    `train_file` that carry exactly the same label. Whole lines are swapped
    for lines with an identical label, so the label at every position of the
    output equals the label at the same position of the input.

    Parameters
    ----------
    train_file : list of str
        Lines of a .conll file: '# id ...' sentence headers, token lines and
        blank separator lines.

    Returns
    -------
    list of str
        A new list with the same length as `train_file`; the input list is
        not modified.
    """
    separator = ' _ _ '

    def label_of(line):
        """Return the label of a token line, or None for any other line."""
        if line.startswith('# id ') or not line.strip():
            return None
        fields = line.split(separator)
        if len(fields) < 2:  # malformed line without a label column
            return None
        return fields[1].strip()

    lwtr_file = list(train_file)  # work on a copy so the caller's list stays intact

    # Catalogue the candidate replacement lines per label in one pass over the
    # argument itself (not over a notebook-level global), keyed by exact label.
    line_labels = [label_of(line) for line in train_file]
    label_pools = {}
    for line, label in zip(train_file, line_labels):
        if label is not None:
            label_pools.setdefault(label, []).append(line)

    for index, label in enumerate(line_labels):
        if label is None:  # sentence header, blank line or malformed line: keep as is
            continue
        if random.binomial(n=1, p=0.5) == 1:  # fair coin flip; 1 means "replace"
            pool = label_pools[label]
            # Draw an index instead of calling random.choice(pool), which would
            # convert the whole pool to an array on every single draw.
            lwtr_file[index] = pool[random.randint(len(pool))]

    return lwtr_file

In [14]:
# Bangla: run LwTR over `bn_train_sr.conll` (presumably the SR output - TODO confirm).
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/BN-Bangla/bn_train_sr.conll', encoding='utf-8') as f:
    file = f.readlines()
part2 = lwtr(train_file=file)

# SR: Synonym Replacement

In [20]:
def syn_replace(train_file, word_vector):
    """Synonym Replacement (SR) for CoNLL-style NER data.

    Every token line ('<token> _ _ <label>') is, with probability 0.5,
    rewritten so that its token is the nearest neighbour of the original
    token in the fastText vectors loaded from `word_vector`; the label part
    of the line is left untouched. Sentence headers ('# id ...') and blank
    lines are copied through unchanged.

    Parameters
    ----------
    train_file : list of str
        Lines of a .conll file.
    word_vector : str
        Path to a Facebook fastText binary model (loaded with gensim's
        `load_facebook_vectors`).

    Returns
    -------
    list of str
        The augmented lines, one output line per input line.

    Side effects
    ------------
    Prints how many token lines were and were not replaced.
    """
    from numpy import random
    from gensim.models.fasttext import load_facebook_vectors

    wv = load_facebook_vectors(word_vector)  # load the language's word vectors

    sr_file = []  # lines to be written to the output .conll file
    aug_count = 0
    no_aug_count = 0
    # Iterate the argument itself; reading a notebook-level global here would
    # silently process whatever happened to be bound to it.
    for line in train_file:
        if line.startswith('# id ') or not line.strip():
            sr_file.append(line)  # sentence header or blank separator
            continue

        fields = line.split(' _ _ ')  # [token, label(+newline)]
        augment = random.binomial(n=1, p=0.5) == 1  # fair coin flip
        # Only the single best neighbour is used, so only one is requested.
        # A line without a label column is never rewritten: joining it back
        # would lose its trailing newline.
        if augment and len(fields) > 1:
            neighbours = wv.most_similar(fields[0], topn=1)
        else:
            neighbours = []

        if neighbours:
            fields[0] = neighbours[0][0]  # (word, similarity) -> word
            sr_file.append(' _ _ '.join(fields))
            aug_count += 1
        else:  # no augmentation: keep the original line
            sr_file.append(line)
            no_aug_count += 1

    print(f'{aug_count} instances of synonym replacement.')
    print(f'{no_aug_count} instances of no synonym replacement.')
    return sr_file

In [47]:
# Turkish: synonym replacement on the training split, saved as `tr_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/TR-Turkish/tr_train.conll', encoding='utf-8') as f:
    file = f.readlines()
sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/TR-Turkish/wiki.tr/wiki.tr.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/TR-Turkish/tr_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(sr)

109212 instances of synonym replacement.
109187 instances of no synonym replacement.


In [56]:
# Chinese: synonym replacement on the training split, saved as `zh_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/ZH-Chinese/zh_train.conll', encoding='utf-8') as f:
    file = f.readlines()
sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/ZH-Chinese/wiki.zh/wiki.zh.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/ZH-Chinese/zh_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(sr)

191503 instances of synonym replacement.
190646 instances of no synonym replacement.


In [49]:
# Hindi: synonym replacement on the training split, saved as `hi_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/HI-Hindi/hi_train.conll', encoding='utf-8') as f:
    file = f.readlines()
sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/HI-Hindi/wiki.hi/wiki.hi.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/HI-Hindi/hi_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(sr)

121800 instances of synonym replacement.
122766 instances of no synonym replacement.


In [50]:
# Farsi: synonym replacement on the training split, saved as `fa_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/FA-Farsi/fa_train.conll', encoding='utf-8') as f:
    file = f.readlines()
sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/FA-Farsi/wiki.fa/wiki.fa.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/FA-Farsi/fa_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(sr)

139284 instances of synonym replacement.
138963 instances of no synonym replacement.


In [51]:
# German: synonym replacement on the training split, saved as `de_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/DE-German/de_train.conll', encoding='utf-8') as f:
    file = f.readlines()
sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/DE-German/wiki.de/wiki.de.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/DE-German/de_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(sr)

109305 instances of synonym replacement.
109417 instances of no synonym replacement.


In [52]:
# Russian: synonym replacement on the training split, saved as `ru_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/RU-Russian/ru_train.conll', encoding='utf-8') as f:
    file = f.readlines()
sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/RU-Russian/wiki.ru/wiki.ru.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/RU-Russian/ru_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(sr)

121102 instances of synonym replacement.
121282 instances of no synonym replacement.


In [53]:
# Dutch: synonym replacement on the training split, saved as `nl_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/NL-Dutch/nl_train.conll', encoding='utf-8') as f:
    file = f.readlines()
sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/NL-Dutch/wiki.nl/wiki.nl.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/NL-Dutch/nl_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(sr)

116538 instances of synonym replacement.
116935 instances of no synonym replacement.


In [54]:
# Spanish: synonym replacement on the training split, saved as `es_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train.conll', encoding='utf-8') as f:
    file = f.readlines()
sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/ES-Spanish/wiki.es/wiki.es.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(sr)

132922 instances of synonym replacement.
133813 instances of no synonym replacement.


In [55]:
# English: synonym replacement on the training split, saved as `en_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/EN-English/en_train.conll', encoding='utf-8') as f:
    file = f.readlines()
sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/EN-English/wiki.en/wiki.en.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/EN-English/en_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(sr)

126980 instances of synonym replacement.
126559 instances of no synonym replacement.


In [57]:
# Binds the notebook-level name `a` to the integer 3. Other cells rebind `a`
# to lists, so its value depends on which cell ran last.
a = 3

# ALL: all augmentation methods combined (LwTR, SR, MR, SiS)

In [21]:
# English: synonym replacement on `en_train_lwtr.conll` (presumably the LwTR output - TODO confirm).
# The result is kept in `a` for the mention-replacement cell.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/EN-English/en_train_lwtr.conll', encoding='utf-8') as f:
    file = f.readlines()
a = syn_replace(file, 'SemEval2022-Task11_Train-Dev/EN-English/wiki.en/wiki.en.bin')

126943 instances of synonym replacement.
126596 instances of no synonym replacement.


In [25]:
# Spanish: chain SR -> MR -> SiS on `es_train_lwtr.conll` (presumably the LwTR
# output - TODO confirm) and save the result as `es_train_all.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train_lwtr.conll', encoding='utf-8') as f:
    file = f.readlines()
es_sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/ES-Spanish/wiki.es/wiki.es.bin')

es_mr = mr(file=es_sr)

all_methods = shuffle(file=es_mr)

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train_all.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(all_methods)

133461 instances of synonym replacement.
133274 instances of no synonym replacement.
29024 instances of shuffling.
28941 instances of not shuffling.


In [None]:
# Spanish: synonym replacement on the plain training split, saved as `es_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train.conll', encoding='utf-8') as f:
    file = f.readlines()
es_sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/ES-Spanish/wiki.es/wiki.es.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(es_sr)

In [26]:
# Farsi: chain SR -> MR -> SiS on `fa_train_lwtr.conll` (presumably the LwTR
# output - TODO confirm) and save the result as `fa_train_all.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/FA-Farsi/fa_train_lwtr.conll', encoding='utf-8') as f:
    file = f.readlines()
fa_sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/FA-Farsi/wiki.fa/wiki.fa.bin')

fa_mr = mr(file=fa_sr)

all_methods = shuffle(file=fa_mr)

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/FA-Farsi/fa_train_all.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(all_methods)

139155 instances of synonym replacement.
139092 instances of no synonym replacement.
27716 instances of shuffling.
27751 instances of not shuffling.


In [None]:
# Farsi: synonym replacement on the plain training split, saved as `fa_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/FA-Farsi/fa_train.conll', encoding='utf-8') as f:
    file = f.readlines()
fa_sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/FA-Farsi/wiki.fa/wiki.fa.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/FA-Farsi/fa_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(fa_sr)

In [28]:
# Hindi: chain SR -> MR -> SiS on `hi_train_lwtr.conll` (presumably the LwTR
# output - TODO confirm) and save the result as `hi_train_all.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/HI-Hindi/hi_train_lwtr.conll', encoding='utf-8') as f:
    file = f.readlines()
hi_sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/HI-Hindi/wiki.hi/wiki.hi.bin')

hi_mr = mr(file=hi_sr)

all_methods = shuffle(file=hi_mr)

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/HI-Hindi/hi_train_all.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(all_methods)

122031 instances of synonym replacement.
122535 instances of no synonym replacement.
21758 instances of shuffling.
22129 instances of not shuffling.


In [None]:
# Hindi: synonym replacement on the plain training split, saved as `hi_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/HI-Hindi/hi_train.conll', encoding='utf-8') as f:
    file = f.readlines()
hi_sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/HI-Hindi/wiki.hi/wiki.hi.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/HI-Hindi/hi_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(hi_sr)

In [29]:
# Dutch: chain SR -> MR -> SiS on `nl_train_lwtr.conll` (presumably the LwTR
# output - TODO confirm) and save the result as `nl_train_all.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/NL-Dutch/nl_train_lwtr.conll', encoding='utf-8') as f:
    file = f.readlines()
nl_sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/NL-Dutch/wiki.nl/wiki.nl.bin')

nl_mr = mr(file=nl_sr)

all_methods = shuffle(file=nl_mr)

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/NL-Dutch/nl_train_all.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(all_methods)

116738 instances of synonym replacement.
116735 instances of no synonym replacement.
28789 instances of shuffling.
29110 instances of not shuffling.


In [None]:
# Dutch: synonym replacement on the plain training split, saved as `nl_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/NL-Dutch/nl_train.conll', encoding='utf-8') as f:
    file = f.readlines()
nl_sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/NL-Dutch/wiki.nl/wiki.nl.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/NL-Dutch/nl_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(nl_sr)

In [30]:
# Russian: chain SR -> MR -> SiS on `ru_train_lwtr.conll` (presumably the LwTR
# output - TODO confirm) and save the result as `ru_train_all.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/RU-Russian/ru_train_lwtr.conll', encoding='utf-8') as f:
    file = f.readlines()
ru_sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/RU-Russian/wiki.ru/wiki.ru.bin')

ru_mr = mr(file=ru_sr)

all_methods = shuffle(file=ru_mr)

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/RU-Russian/ru_train_all.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(all_methods)

121204 instances of synonym replacement.
121180 instances of no synonym replacement.
26139 instances of shuffling.
26035 instances of not shuffling.


In [None]:
# Russian: synonym replacement on the plain training split, saved as `ru_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/RU-Russian/ru_train.conll', encoding='utf-8') as f:
    file = f.readlines()
ru_sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/RU-Russian/wiki.ru/wiki.ru.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/RU-Russian/ru_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(ru_sr)

In [31]:
# Turkish: chain SR -> MR -> SiS on `tr_train_lwtr.conll` (presumably the LwTR
# output - TODO confirm) and save the result as `tr_train_all.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/TR-Turkish/tr_train_lwtr.conll', encoding='utf-8') as f:
    file = f.readlines()
tr_sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/TR-Turkish/wiki.tr/wiki.tr.bin')

tr_mr = mr(file=tr_sr)

all_methods = shuffle(file=tr_mr)

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/TR-Turkish/tr_train_all.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(all_methods)

109387 instances of synonym replacement.
109012 instances of no synonym replacement.
28877 instances of shuffling.
28991 instances of not shuffling.


In [None]:
# Turkish: synonym replacement on the plain training split, saved as `tr_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/TR-Turkish/tr_train.conll', encoding='utf-8') as f:
    file = f.readlines()
tr_sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/TR-Turkish/wiki.tr/wiki.tr.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/TR-Turkish/tr_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(tr_sr)

In [32]:
# Chinese: chain SR -> MR -> SiS on `zh_train_lwtr.conll` (presumably the LwTR
# output - TODO confirm) and save the result as `zh_train_all.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/ZH-Chinese/zh_train_lwtr.conll', encoding='utf-8') as f:
    file = f.readlines()
zh_sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/ZH-Chinese/wiki.zh/wiki.zh.bin')

zh_mr = mr(file=zh_sr)

all_methods = shuffle(file=zh_mr)

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/ZH-Chinese/zh_train_all.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(all_methods)

190279 instances of synonym replacement.
191870 instances of no synonym replacement.
29805 instances of shuffling.
29968 instances of not shuffling.


In [58]:
# German: chain LwTR -> MR -> SiS on `de_train_sr.conll` (presumably the SR
# output - TODO confirm) and save the result as `de_train_all.conll`.
# NOTE(review): unlike the other languages, this cell starts from the `_sr`
# file and applies LwTR here instead of starting from an `_lwtr` file.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/DE-German/de_train_sr.conll', encoding='utf-8') as f:
    file = f.readlines()
de_lwtr = lwtr(train_file=file)

de_mr = mr(file=de_lwtr)

all_methods = shuffle(file=de_mr)

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/DE-German/de_train_all.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(all_methods)

28573 instances of shuffling.
28285 instances of not shuffling.


In [None]:
# Chinese: synonym replacement on the plain training split, saved as `zh_train_sr.conll`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/ZH-Chinese/zh_train.conll', encoding='utf-8') as f:
    file = f.readlines()
zh_sr = syn_replace(file, 'SemEval2022-Task11_Train-Dev/ZH-Chinese/wiki.zh/wiki.zh.bin')

# A dedicated handle name keeps `file` bound to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/ZH-Chinese/zh_train_sr.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(zh_sr)

In [33]:
# Load the German `de_train_sr.conll` lines into `file` for inspection.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/DE-German/de_train_sr.conll', encoding='utf-8') as f:
    file = f.readlines()

In [44]:
# Gather the raw label field (text after ' _ _ ', newline included) of every
# token line in `file`; sentence headers and blank lines are skipped.
a = [
    row.split(' _ _ ')[1]
    for row in file
    if row.strip() and not row.startswith('# id ')
]

In [45]:
# Reduce the collected label fields to the distinct values.
b = set(a)

In [46]:
# Display the set of distinct label fields.
b

{'B-CORP\n',
 'B-CW\n',
 'B-GRP\n',
 'B-LOC\n',
 'B-PER\n',
 'B-PROD\n',
 'I-CORP\n',
 'I-CW\n',
 'I-GRP\n',
 'I-LOC\n',
 'I-PER\n',
 'I-PROD\n',
 'O\n'}

# MR: Mention Replacement

In [15]:
def _mr_token_label(line):
    """Return the label column of a token line, or None for header, blank or malformed lines."""
    if line.startswith('# id ') or not line.strip():
        return None
    fields = line.split(' _ _ ')
    if len(fields) < 2:  # no label column
        return None
    return fields[1].strip()


def _mr_collect_mentions(lines):
    """Group every mention (a B- line plus its following same-type I- lines) by entity type."""
    mentions = {}
    current, current_type = [], None
    for line in lines:
        label = _mr_token_label(line)
        if current and label == f'I-{current_type}':
            current.append(line)  # still inside the open mention
            continue
        if current:  # the open mention ended on the previous line
            mentions.setdefault(current_type, []).append(current)
            current = []
        if label is not None and label.startswith('B-'):
            # A B- line always opens a new mention, even directly after another one.
            current, current_type = [line], label[2:]
    if current:  # mention that runs up to the last line of the input
        mentions.setdefault(current_type, []).append(current)
    return mentions


def mr(file):
    """Mention Replacement (MR) for CoNLL-style NER data.

    Each mention (a 'B-<TYPE>' line followed by its 'I-<TYPE>' lines) is,
    with probability 0.5, replaced by a mention of the same entity type drawn
    uniformly at random from all mentions found in `file`. The replacement
    may have a different number of tokens than the original. All other lines
    are copied through unchanged.

    Parameters
    ----------
    file : list of str
        Lines of a .conll file ('# id ...' headers, '<token> _ _ <label>'
        token lines, blank separator lines).

    Returns
    -------
    list of str
        The augmented lines.

    Raises
    ------
    AssertionError
        If the number of '# id ' sentence headers differs between the input
        and the output.
    """
    import numpy as np

    ############## DATA STRUCTURES SETUP ###############
    # entity type -> list of mentions, each mention being a list of lines
    mention_rep = _mr_collect_mentions(file)

    ############## PERFORM MENTION REPLACEMENT ###############
    mr_file = []
    # Entity type of the mention that was just replaced; its original I- lines
    # must be dropped. None whenever no replaced mention is open.
    replaced_type = None
    for line in file:
        label = _mr_token_label(line)
        if label is not None and label.startswith('B-'):
            entity = label[2:]  # strip the 'B-' prefix
            if np.random.binomial(n=1, p=0.5) == 1:  # fair coin flip; 1 means "replace"
                candidates = mention_rep[entity]
                # Pick by index: np.random.choice on an array of equally long
                # mentions would see a 2-D array and raise.
                mr_file.extend(candidates[np.random.randint(len(candidates))])
                replaced_type = entity
            else:  # keep this mention; its I- lines fall through to the final branch
                mr_file.append(line)
                replaced_type = None
        elif replaced_type is not None and label == f'I-{replaced_type}':
            continue  # tail of the original mention that has been replaced
        else:  # O tokens, headers, blank lines, I- lines of kept mentions
            mr_file.append(line)
            replaced_type = None

    ############## CHECKING ###############
    # Replacement must never add or remove sentences, whatever the corpus size.
    ids_in = sum(1 for line in file if line.startswith('# id '))
    ids_out = sum(1 for line in mr_file if line.startswith('# id '))
    assert ids_out == ids_in, f'expected {ids_in} sentence headers, found {ids_out}'

    return mr_file

In [22]:
# Apply mention replacement to `a`. `a` is rebound by several cells, so this
# cell must run right after the cell that stores the `syn_replace` output in `a`.
part3 = mr(file = a)

# SiS: Shuffle within Segments

In [5]:
# Labels are as follows
# PER : Person
# LOC : Location
# GRP : Group
# CORP : Corporation
# PROD : Product
# CW: Creative Work
# O: out-of-mention label (i.e. unrelated to named entity)

# B - beginning of label, I - inside label
from numpy import random

def shuffle(file):
    """Shuffle within Segments (SiS) for CoNLL-style NER data.

    A segment is a maximal run of consecutive token lines inside one sentence
    whose labels share the same type ('B-PER' and 'I-PER' both count as
    'PER'; 'O' is its own type). Each segment is, with probability 0.5,
    shuffled: the tokens are permuted while the labels keep their original
    positions. Sentence headers ('# id ...') and blank lines are copied
    through unchanged and always end the current segment.

    Parameters
    ----------
    file : list of str
        Lines of a .conll file.

    Returns
    -------
    list of str
        The augmented lines; one output line per input line.

    Side effects
    ------------
    Prints how many segments were and were not shuffled.
    """
    separator = ' _ _ '
    shuffle_file = []
    segment = []          # token lines of the segment currently being collected
    segment_type = None   # label type of `segment`, None when no segment is open
    shuffle_count = 0
    non_shuffle_count = 0

    def segment_type_of(line):
        """Return the label type of a token line, or None for any other line."""
        if line.startswith('# id ') or not line.strip():
            return None
        fields = line.split(separator)
        if len(fields) < 2:  # no label column: treat like a non-token line
            return None
        label = fields[1].strip()
        # Parsing the label (instead of searching for '-PER\n' etc.) also
        # covers a last line that has no trailing newline.
        return label[2:] if label[:2] in ('B-', 'I-') else label

    def flush():
        """Emit the open segment, shuffled or not, and reset it."""
        nonlocal shuffle_count, non_shuffle_count
        if not segment:
            return
        if random.binomial(n=1, p=0.5) == 1:  # fair coin flip; 1 means "shuffle"
            parts = [entry.split(separator) for entry in segment]
            tokens = [fields[0] for fields in parts]
            labels = [fields[1] for fields in parts]  # kept in original order (newline included)
            random.shuffle(tokens)
            shuffle_file.extend(token + separator + label for token, label in zip(tokens, labels))
            shuffle_count += 1
        else:
            shuffle_file.extend(segment)
            non_shuffle_count += 1
        segment.clear()

    for line in file:
        current_type = segment_type_of(line)
        if current_type is None:
            # Header, blank or malformed line: close the open segment first so
            # that no token can end up after this line.
            flush()
            shuffle_file.append(line)
            segment_type = None
        else:
            if current_type != segment_type:
                flush()  # label type changed: the previous segment is complete
            segment.append(line)
            segment_type = current_type

    # Without this, a final sentence not followed by a blank line would be lost.
    flush()

    print(f'{shuffle_count} instances of shuffling.')
    print(f'{non_shuffle_count} instances of not shuffling.')
    return shuffle_file

In [23]:
# Apply segment shuffling to `part3`; the result is bound to `all_methods`.
all_methods = shuffle(file = part3)

29500 instances of shuffling.
29590 instances of not shuffling.


In [17]:
# Chinese: segment shuffling on the plain training split, saved as `zh_train_sis.conll`.
# NOTE(review): the variable is called `en_test` although it holds the Chinese
# training lines; the name is kept in case other cells refer to it.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded).
with open('SemEval2022-Task11_Train-Dev/ZH-Chinese/zh_train.conll', encoding='utf-8') as f:
    en_test = f.readlines()
testing = shuffle(file=en_test)

# A dedicated handle name avoids rebinding the notebook-level name `file`.
with open('SemEval2022-Task11_Train-Dev/ZH-Chinese/zh_train_sis.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(testing)

29877 instances of shuffling.
29896 instances of not shuffling.


In [24]:
# English: save the fully augmented lines held in `all_methods`.
# The encoding is pinned because the default text encoding is platform-dependent
# (assumes the corpus is UTF-8 encoded). A dedicated handle name avoids
# rebinding the notebook-level name `file`.
with open('SemEval2022-Task11_Train-Dev/EN-English/en_train_all.conll', 'w', encoding='utf-8') as out_f:
    out_f.writelines(all_methods)