# LwTR - Label-wise Token Replacement

From Dai & Adel (2020), "An Analysis of Simple Data Augmentation for Named Entity Recognition":

Label-wise token replacement (LwTR): For each token, we use a binomial distribution to randomly
decide whether it should be replaced. If yes, we then use a label-wise token distribution, built from the
original training set, to randomly select another token with the same label. Thus, we keep the original label sequence unchanged.

# Bangla

In [201]:
# Load the Bangla training split as a list of raw lines (trailing newlines kept).
# The encoding is given explicitly so that reading the Bangla text does not
# depend on the platform's default encoding.
with open('SemEval2022-Task11_Train-Dev/BN-Bangla/bn_train.conll', encoding='utf-8') as f:
    file = f.readlines()

In [226]:
# Sanity check: every sentence starts with a '# id ' header line, and each
# dataset should have 15,300 training sentences.
original_count = sum(1 for i in file if i.startswith('# id '))
assert original_count == 15300

In [3]:
# Labels are as follows
# PER : Person
# LOC : Location
# GRP : Group
# CORP : Corporation
# PROD : Product
# CW: Creative Work
# O: out-of-mention label (i.e. unrelated to named entity)

# B - beginning of label, I - inside label
# mentions could be any combination of B/I-label class, and can either be 1 word or multi-word mentions

In [4]:
lwtr_file = list(file)  # shallow copy, so replacements never overwrite the original line list

In [5]:
# For Label-wise Token Replacement we need, for every possible label, the list
# of original lines that carry it (the "label-wise token distribution").
# Driving the build from one table replaces thirteen copy-pasted comprehensions;
# the resulting dictionary has the same keys, key order and values as before.
lwtr_labels = ('O',
               'B-PROD', 'B-GRP', 'B-CORP', 'B-CW', 'B-PER', 'B-LOC',
               'I-PROD', 'I-GRP', 'I-CORP', 'I-CW', 'I-PER', 'I-LOC')

# dictionary of labels: label -> every line of `file` containing ' _ _ <label>'
lwtr = {label: [line for line in file if f' _ _ {label}' in line]
        for label in lwtr_labels}

In [6]:
from numpy import random

for index, line in enumerate(file):  # index locates the same position in lwtr_file
    # Sentence-id headers and blank separator lines are never replaced.
    if line.startswith('# id ') or not line.strip():
        continue
    # One Bernoulli(p=0.5) draw per token line: 1 means "replace this token".
    if random.binomial(n=1, p=0.5) == 1:
        curr_label = line.split(' _ _ ')[1].strip()  # label column, used as the key into lwtr
        pool = lwtr[curr_label]  # all original lines carrying the same label
        # Draw a random index instead of calling random.choice(pool): choice()
        # turns the whole list into an array on every call, which is very slow
        # for the large 'O' pool. The result is also a plain str this way.
        lwtr_file[index] = pool[random.randint(len(pool))]

In [None]:
lwtr_file  # bare expression as the cell's only statement: shows the augmented lines for a visual check

In [10]:
# Write the augmented lines to file. Each line already ends with its newline.
# The handle gets its own name so that `file` keeps referring to the original
# line list, and the encoding is explicit so the output is UTF-8 everywhere.
with open('SemEval2022-Task11_Train-Dev/BN-Bangla/bn_train_lwtr.conll', 'w', encoding='utf-8') as out:
    out.writelines(lwtr_file)

# German

In [15]:
# Load the German training split as raw lines; the encoding is explicit so that
# reading does not depend on the platform's default encoding.
with open('SemEval2022-Task11_Train-Dev/DE-German/de_train.conll', encoding='utf-8') as f:
    file = f.readlines()

# each dataset should have 15,300 training sentences (one '# id ' header each)
count = sum(1 for i in file if i.startswith('# id '))
assert count == 15300

lwtr_file = list(file)  # copy, so replacements never overwrite the original line list

# Label-wise token distribution: label -> every original line containing ' _ _ <label>'.
lwtr_labels = ('O',
               'B-PROD', 'B-GRP', 'B-CORP', 'B-CW', 'B-PER', 'B-LOC',
               'I-PROD', 'I-GRP', 'I-CORP', 'I-CW', 'I-PER', 'I-LOC')
lwtr = {label: [line for line in file if f' _ _ {label}' in line]
        for label in lwtr_labels}

from numpy import random

for index, line in enumerate(file):
    # Sentence-id headers and blank separator lines are never replaced.
    if line.startswith('# id ') or not line.strip():
        continue
    # One Bernoulli(p=0.5) draw per token line: 1 means "replace this token".
    if random.binomial(n=1, p=0.5) == 1:
        curr_label = line.split(' _ _ ')[1].strip()  # label column, key into lwtr
        pool = lwtr[curr_label]
        # Random index instead of random.choice(pool), which would turn the
        # whole list into an array again for every single token.
        lwtr_file[index] = pool[random.randint(len(pool))]

# Write to file; a separate handle name keeps `file` bound to the original lines.
with open('SemEval2022-Task11_Train-Dev/DE-German/de_train_lwtr.conll', 'w', encoding='utf-8') as out:
    out.writelines(lwtr_file)

# English

In [17]:
# Load the English training split as raw lines; the encoding is explicit so that
# reading does not depend on the platform's default encoding.
with open('SemEval2022-Task11_Train-Dev/EN-English/en_train.conll', encoding='utf-8') as f:
    file = f.readlines()

# each dataset should have 15,300 training sentences (one '# id ' header each)
count = sum(1 for i in file if i.startswith('# id '))
assert count == 15300

lwtr_file = list(file)  # copy, so replacements never overwrite the original line list

# Label-wise token distribution: label -> every original line containing ' _ _ <label>'.
lwtr_labels = ('O',
               'B-PROD', 'B-GRP', 'B-CORP', 'B-CW', 'B-PER', 'B-LOC',
               'I-PROD', 'I-GRP', 'I-CORP', 'I-CW', 'I-PER', 'I-LOC')
lwtr = {label: [line for line in file if f' _ _ {label}' in line]
        for label in lwtr_labels}

from numpy import random

for index, line in enumerate(file):
    # Sentence-id headers and blank separator lines are never replaced.
    if line.startswith('# id ') or not line.strip():
        continue
    # One Bernoulli(p=0.5) draw per token line: 1 means "replace this token".
    if random.binomial(n=1, p=0.5) == 1:
        curr_label = line.split(' _ _ ')[1].strip()  # label column, key into lwtr
        pool = lwtr[curr_label]
        # Random index instead of random.choice(pool), which would turn the
        # whole list into an array again for every single token.
        lwtr_file[index] = pool[random.randint(len(pool))]

# Write to file; a separate handle name keeps `file` bound to the original lines.
with open('SemEval2022-Task11_Train-Dev/EN-English/en_train_lwtr.conll', 'w', encoding='utf-8') as out:
    out.writelines(lwtr_file)

# Spanish

In [18]:
# Load the Spanish training split as raw lines; the encoding is explicit so that
# reading does not depend on the platform's default encoding.
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train.conll', encoding='utf-8') as f:
    file = f.readlines()

# each dataset should have 15,300 training sentences (one '# id ' header each)
count = sum(1 for i in file if i.startswith('# id '))
assert count == 15300

lwtr_file = list(file)  # copy, so replacements never overwrite the original line list

# Label-wise token distribution: label -> every original line containing ' _ _ <label>'.
lwtr_labels = ('O',
               'B-PROD', 'B-GRP', 'B-CORP', 'B-CW', 'B-PER', 'B-LOC',
               'I-PROD', 'I-GRP', 'I-CORP', 'I-CW', 'I-PER', 'I-LOC')
lwtr = {label: [line for line in file if f' _ _ {label}' in line]
        for label in lwtr_labels}

from numpy import random

for index, line in enumerate(file):
    # Sentence-id headers and blank separator lines are never replaced.
    if line.startswith('# id ') or not line.strip():
        continue
    # One Bernoulli(p=0.5) draw per token line: 1 means "replace this token".
    if random.binomial(n=1, p=0.5) == 1:
        curr_label = line.split(' _ _ ')[1].strip()  # label column, key into lwtr
        pool = lwtr[curr_label]
        # Random index instead of random.choice(pool), which would turn the
        # whole list into an array again for every single token.
        lwtr_file[index] = pool[random.randint(len(pool))]

# Write to file; a separate handle name keeps `file` bound to the original lines.
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train_lwtr.conll', 'w', encoding='utf-8') as out:
    out.writelines(lwtr_file)

# Farsi

In [20]:
# Load the Farsi training split as raw lines; the encoding is explicit so that
# reading does not depend on the platform's default encoding.
with open('SemEval2022-Task11_Train-Dev/FA-Farsi/fa_train.conll', encoding='utf-8') as f:
    file = f.readlines()

# each dataset should have 15,300 training sentences (one '# id ' header each)
count = sum(1 for i in file if i.startswith('# id '))
assert count == 15300

lwtr_file = list(file)  # copy, so replacements never overwrite the original line list

# Label-wise token distribution: label -> every original line containing ' _ _ <label>'.
lwtr_labels = ('O',
               'B-PROD', 'B-GRP', 'B-CORP', 'B-CW', 'B-PER', 'B-LOC',
               'I-PROD', 'I-GRP', 'I-CORP', 'I-CW', 'I-PER', 'I-LOC')
lwtr = {label: [line for line in file if f' _ _ {label}' in line]
        for label in lwtr_labels}

from numpy import random

for index, line in enumerate(file):
    # Sentence-id headers and blank separator lines are never replaced.
    if line.startswith('# id ') or not line.strip():
        continue
    # One Bernoulli(p=0.5) draw per token line: 1 means "replace this token".
    if random.binomial(n=1, p=0.5) == 1:
        curr_label = line.split(' _ _ ')[1].strip()  # label column, key into lwtr
        pool = lwtr[curr_label]
        # Random index instead of random.choice(pool), which would turn the
        # whole list into an array again for every single token.
        lwtr_file[index] = pool[random.randint(len(pool))]

# Write to file; a separate handle name keeps `file` bound to the original lines.
with open('SemEval2022-Task11_Train-Dev/FA-Farsi/fa_train_lwtr.conll', 'w', encoding='utf-8') as out:
    out.writelines(lwtr_file)

# Hindi

In [21]:
# Load the Hindi training split as raw lines; the encoding is explicit so that
# reading does not depend on the platform's default encoding.
with open('SemEval2022-Task11_Train-Dev/HI-Hindi/hi_train.conll', encoding='utf-8') as f:
    file = f.readlines()

# each dataset should have 15,300 training sentences (one '# id ' header each)
count = sum(1 for i in file if i.startswith('# id '))
assert count == 15300

lwtr_file = list(file)  # copy, so replacements never overwrite the original line list

# Label-wise token distribution: label -> every original line containing ' _ _ <label>'.
lwtr_labels = ('O',
               'B-PROD', 'B-GRP', 'B-CORP', 'B-CW', 'B-PER', 'B-LOC',
               'I-PROD', 'I-GRP', 'I-CORP', 'I-CW', 'I-PER', 'I-LOC')
lwtr = {label: [line for line in file if f' _ _ {label}' in line]
        for label in lwtr_labels}

from numpy import random

for index, line in enumerate(file):
    # Sentence-id headers and blank separator lines are never replaced.
    if line.startswith('# id ') or not line.strip():
        continue
    # One Bernoulli(p=0.5) draw per token line: 1 means "replace this token".
    if random.binomial(n=1, p=0.5) == 1:
        curr_label = line.split(' _ _ ')[1].strip()  # label column, key into lwtr
        pool = lwtr[curr_label]
        # Random index instead of random.choice(pool), which would turn the
        # whole list into an array again for every single token.
        lwtr_file[index] = pool[random.randint(len(pool))]

# Write to file; a separate handle name keeps `file` bound to the original lines.
with open('SemEval2022-Task11_Train-Dev/HI-Hindi/hi_train_lwtr.conll', 'w', encoding='utf-8') as out:
    out.writelines(lwtr_file)

# Korean

In [19]:
# Load the Korean training split as raw lines; the encoding is explicit so that
# reading does not depend on the platform's default encoding.
with open('SemEval2022-Task11_Train-Dev/KO-Korean/ko_train.conll', encoding='utf-8') as f:
    file = f.readlines()

# each dataset should have 15,300 training sentences (one '# id ' header each)
count = sum(1 for i in file if i.startswith('# id '))
assert count == 15300

lwtr_file = list(file)  # copy, so replacements never overwrite the original line list

# Label-wise token distribution: label -> every original line containing ' _ _ <label>'.
lwtr_labels = ('O',
               'B-PROD', 'B-GRP', 'B-CORP', 'B-CW', 'B-PER', 'B-LOC',
               'I-PROD', 'I-GRP', 'I-CORP', 'I-CW', 'I-PER', 'I-LOC')
lwtr = {label: [line for line in file if f' _ _ {label}' in line]
        for label in lwtr_labels}

from numpy import random

for index, line in enumerate(file):
    # Sentence-id headers and blank separator lines are never replaced.
    if line.startswith('# id ') or not line.strip():
        continue
    # One Bernoulli(p=0.5) draw per token line: 1 means "replace this token".
    if random.binomial(n=1, p=0.5) == 1:
        curr_label = line.split(' _ _ ')[1].strip()  # label column, key into lwtr
        pool = lwtr[curr_label]
        # Random index instead of random.choice(pool), which would turn the
        # whole list into an array again for every single token.
        lwtr_file[index] = pool[random.randint(len(pool))]

# Write to file; a separate handle name keeps `file` bound to the original lines.
with open('SemEval2022-Task11_Train-Dev/KO-Korean/ko_train_lwtr.conll', 'w', encoding='utf-8') as out:
    out.writelines(lwtr_file)

# Dutch

In [22]:
# Load the Dutch training split as raw lines; the encoding is explicit so that
# reading does not depend on the platform's default encoding.
with open('SemEval2022-Task11_Train-Dev/NL-Dutch/nl_train.conll', encoding='utf-8') as f:
    file = f.readlines()

# each dataset should have 15,300 training sentences (one '# id ' header each)
count = sum(1 for i in file if i.startswith('# id '))
assert count == 15300

lwtr_file = list(file)  # copy, so replacements never overwrite the original line list

# Label-wise token distribution: label -> every original line containing ' _ _ <label>'.
lwtr_labels = ('O',
               'B-PROD', 'B-GRP', 'B-CORP', 'B-CW', 'B-PER', 'B-LOC',
               'I-PROD', 'I-GRP', 'I-CORP', 'I-CW', 'I-PER', 'I-LOC')
lwtr = {label: [line for line in file if f' _ _ {label}' in line]
        for label in lwtr_labels}

from numpy import random

for index, line in enumerate(file):
    # Sentence-id headers and blank separator lines are never replaced.
    if line.startswith('# id ') or not line.strip():
        continue
    # One Bernoulli(p=0.5) draw per token line: 1 means "replace this token".
    if random.binomial(n=1, p=0.5) == 1:
        curr_label = line.split(' _ _ ')[1].strip()  # label column, key into lwtr
        pool = lwtr[curr_label]
        # Random index instead of random.choice(pool), which would turn the
        # whole list into an array again for every single token.
        lwtr_file[index] = pool[random.randint(len(pool))]

# Write to file; a separate handle name keeps `file` bound to the original lines.
with open('SemEval2022-Task11_Train-Dev/NL-Dutch/nl_train_lwtr.conll', 'w', encoding='utf-8') as out:
    out.writelines(lwtr_file)

# Russian

In [23]:
# Load the Russian training split as raw lines; the encoding is explicit so that
# reading does not depend on the platform's default encoding.
with open('SemEval2022-Task11_Train-Dev/RU-Russian/ru_train.conll', encoding='utf-8') as f:
    file = f.readlines()

# each dataset should have 15,300 training sentences (one '# id ' header each)
count = sum(1 for i in file if i.startswith('# id '))
assert count == 15300

lwtr_file = list(file)  # copy, so replacements never overwrite the original line list

# Label-wise token distribution: label -> every original line containing ' _ _ <label>'.
lwtr_labels = ('O',
               'B-PROD', 'B-GRP', 'B-CORP', 'B-CW', 'B-PER', 'B-LOC',
               'I-PROD', 'I-GRP', 'I-CORP', 'I-CW', 'I-PER', 'I-LOC')
lwtr = {label: [line for line in file if f' _ _ {label}' in line]
        for label in lwtr_labels}

from numpy import random

for index, line in enumerate(file):
    # Sentence-id headers and blank separator lines are never replaced.
    if line.startswith('# id ') or not line.strip():
        continue
    # One Bernoulli(p=0.5) draw per token line: 1 means "replace this token".
    if random.binomial(n=1, p=0.5) == 1:
        curr_label = line.split(' _ _ ')[1].strip()  # label column, key into lwtr
        pool = lwtr[curr_label]
        # Random index instead of random.choice(pool), which would turn the
        # whole list into an array again for every single token.
        lwtr_file[index] = pool[random.randint(len(pool))]

# Write to file; a separate handle name keeps `file` bound to the original lines.
with open('SemEval2022-Task11_Train-Dev/RU-Russian/ru_train_lwtr.conll', 'w', encoding='utf-8') as out:
    out.writelines(lwtr_file)

# Turkish

In [24]:
# Load the Turkish training split as raw lines; the encoding is explicit so that
# reading does not depend on the platform's default encoding.
with open('SemEval2022-Task11_Train-Dev/TR-Turkish/tr_train.conll', encoding='utf-8') as f:
    file = f.readlines()

# each dataset should have 15,300 training sentences (one '# id ' header each)
count = sum(1 for i in file if i.startswith('# id '))
assert count == 15300

lwtr_file = list(file)  # copy, so replacements never overwrite the original line list

# Label-wise token distribution: label -> every original line containing ' _ _ <label>'.
lwtr_labels = ('O',
               'B-PROD', 'B-GRP', 'B-CORP', 'B-CW', 'B-PER', 'B-LOC',
               'I-PROD', 'I-GRP', 'I-CORP', 'I-CW', 'I-PER', 'I-LOC')
lwtr = {label: [line for line in file if f' _ _ {label}' in line]
        for label in lwtr_labels}

from numpy import random

for index, line in enumerate(file):
    # Sentence-id headers and blank separator lines are never replaced.
    if line.startswith('# id ') or not line.strip():
        continue
    # One Bernoulli(p=0.5) draw per token line: 1 means "replace this token".
    if random.binomial(n=1, p=0.5) == 1:
        curr_label = line.split(' _ _ ')[1].strip()  # label column, key into lwtr
        pool = lwtr[curr_label]
        # Random index instead of random.choice(pool), which would turn the
        # whole list into an array again for every single token.
        lwtr_file[index] = pool[random.randint(len(pool))]

# Write to file; a separate handle name keeps `file` bound to the original lines.
with open('SemEval2022-Task11_Train-Dev/TR-Turkish/tr_train_lwtr.conll', 'w', encoding='utf-8') as out:
    out.writelines(lwtr_file)

# Chinese

In [25]:
# Load the Chinese training split as raw lines; the encoding is explicit so that
# reading does not depend on the platform's default encoding.
with open('SemEval2022-Task11_Train-Dev/ZH-Chinese/zh_train.conll', encoding='utf-8') as f:
    file = f.readlines()

# each dataset should have 15,300 training sentences (one '# id ' header each)
count = sum(1 for i in file if i.startswith('# id '))
assert count == 15300

lwtr_file = list(file)  # copy, so replacements never overwrite the original line list

# Label-wise token distribution: label -> every original line containing ' _ _ <label>'.
lwtr_labels = ('O',
               'B-PROD', 'B-GRP', 'B-CORP', 'B-CW', 'B-PER', 'B-LOC',
               'I-PROD', 'I-GRP', 'I-CORP', 'I-CW', 'I-PER', 'I-LOC')
lwtr = {label: [line for line in file if f' _ _ {label}' in line]
        for label in lwtr_labels}

from numpy import random

for index, line in enumerate(file):
    # Sentence-id headers and blank separator lines are never replaced.
    if line.startswith('# id ') or not line.strip():
        continue
    # One Bernoulli(p=0.5) draw per token line: 1 means "replace this token".
    if random.binomial(n=1, p=0.5) == 1:
        curr_label = line.split(' _ _ ')[1].strip()  # label column, key into lwtr
        pool = lwtr[curr_label]
        # Random index instead of random.choice(pool), which would turn the
        # whole list into an array again for every single token.
        lwtr_file[index] = pool[random.randint(len(pool))]

# Write to file; a separate handle name keeps `file` bound to the original lines.
with open('SemEval2022-Task11_Train-Dev/ZH-Chinese/zh_train_lwtr.conll', 'w', encoding='utf-8') as out:
    out.writelines(lwtr_file)

# SR - Synonym Replacement

From Dai & Adel (2020), "An Analysis of Simple Data Augmentation for Named Entity Recognition":

Synonym replacement (SR): Our second approach is similar to LwTR, except that we replace the
token with one of its synonyms retrieved from WordNet. Note that the retrieved synonym may consist of
more than one token. However, its BIO-labels can be derived using a simple rule: If the replaced token
is the first token within a mention (i.e., the corresponding label is ‘B-EntityType’), we assign the same
label to the first token of the retrieved multi-word synonym, and ‘I-EntityType’ to the other tokens. If the
replaced token is inside a mention (i.e., the corresponding label is ‘I-EntityType’), we assign its label to
all tokens of the multi-word synonym.


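Note: instead of WordNet, the implementation below uses the nearest neighbour of a token in pretrained fastText word vectors as its "synonym":

https://fasttext.cc/docs/en/pretrained-vectors.html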
    
https://radimrehurek.com/gensim/models/fasttext.html

In [15]:
def syn_replace(train_file, word_vector):
    """Replace roughly half of the tokens in a CoNLL file with their nearest neighbour.

    Every token line ('token _ _ label') gets an independent Bernoulli(p=0.5)
    draw. On success the token is swapped for the most similar word according
    to the word vectors; the label column is left untouched. Sentence-id
    header lines ('# id ...') and blank lines are copied through unchanged.

    Args:
        train_file: iterable of raw lines of the training file, newlines included.
        word_vector: path to a fastText .bin model to load with gensim, or an
            already loaded vectors object exposing ``most_similar(word, topn=...)``
            (lets one loaded model be reused across calls).

    Returns:
        A new list of lines in the same order as *train_file*. The input is
        not modified. The numbers of replaced and kept tokens are printed.
    """
    import os
    from numpy import random

    if isinstance(word_vector, (str, os.PathLike)):
        # gensim is only needed when the model has to be loaded from disk
        from gensim.models.fasttext import load_facebook_vectors
        wv = load_facebook_vectors(word_vector)  # load language word vector
    else:
        wv = word_vector

    sr_file = []  # eventual lines to write into the .conll file
    aug_count = 0
    no_aug_count = 0
    # Iterate over the argument: the previous version read the global `file`
    # here and silently ignored `train_file`.
    for line in train_file:
        if line.startswith('# id ') or not line.strip():
            sr_file.append(line)  # keep '# id' headers and blank separators as they are
            continue

        separate = line.split(' _ _ ')  # [token, label + newline]
        coin = random.binomial(n=1, p=0.5)  # 1 is augment, 0 is no augmentation
        # A line without the ' _ _ ' separator has no isolated token, so it is kept.
        if coin == 1 and len(separate) > 1:
            # Only the single best neighbour is used, so only one is requested.
            separate[0] = wv.most_similar(separate[0], topn=1)[0][0]
            sr_file.append(' _ _ '.join(separate))  # back to 'synonym _ _ label'
            aug_count += 1
        else:
            sr_file.append(line)
            no_aug_count += 1

    print(f'{aug_count} instances of synonym replacement.')
    print(f'{no_aug_count} instances of no synonym replacement.')
    return sr_file

In [None]:
a = 

# Bangla

In [16]:
with open('SemEval2022-Task11_Train-Dev/BN-Bangla/bn_train.conll') as f:
    file = f.readlines() #original bangla training data

sr_file = syn_replace(file, 'SemEval2022-Task11_Train-Dev/BN-Bangla/wiki.bn/wiki.bn.bin')

96233 instances of synonym replacement.
95664 instances of no synonym replacement.


In [17]:
# Write to file. The handle gets its own name so `file` keeps referring to the
# original line list, and the encoding is explicit so the output is UTF-8.
with open('SemEval2022-Task11_Train-Dev/BN-Bangla/bn_train_sr.conll', 'w', encoding='utf-8') as out:
    out.writelines(sr_file)

# German

In [18]:
with open('SemEval2022-Task11_Train-Dev/DE-German/de_train.conll') as f:
    file = f.readlines() #original bangla training data

sr_file = syn_replace(file, 'SemEval2022-Task11_Train-Dev/DE-German/wiki.de/wiki.de.bin')

109267 instances of synonym replacement.
109455 instances of no synonym replacement.


In [19]:
# Write to file. The handle gets its own name so `file` keeps referring to the
# original line list, and the encoding is explicit so the output is UTF-8.
with open('SemEval2022-Task11_Train-Dev/DE-German/de_train_sr.conll', 'w', encoding='utf-8') as out:
    out.writelines(sr_file)

# Korean

In [4]:
with open('SemEval2022-Task11_Train-Dev/KO-Korean/ko_train.conll') as f:
    file = f.readlines() #original bangla training data

sr_file = syn_replace(file, 'SemEval2022-Task11_Train-Dev/KO-Korean/wiki.ko/wiki.ko.bin')


111393 instances of synonym replacement.
111310 instances of no synonym replacement.


In [5]:
# Write to file. The handle gets its own name so `file` keeps referring to the
# original line list, and the encoding is explicit so the output is UTF-8.
with open('SemEval2022-Task11_Train-Dev/KO-Korean/ko_train_sr.conll', 'w', encoding='utf-8') as out:
    out.writelines(sr_file)

In [None]:
sr_file  # bare expression as the cell's only statement: shows the synonym-replaced lines for a visual check

# MR - Mention Replacement

From Dai & Adel (2020), "An Analysis of Simple Data Augmentation for Named Entity Recognition":

Mention replacement (MR): For each mention in the instance, we use a binomial distribution to
randomly decide whether it should be replaced. If yes, we randomly select another mention from the
original training set which has the same entity type as the replacement. The corresponding BIO-label
sequence can be changed accordingly. 

# Bangla

In [223]:
############## DATA STRUCTURES SETUP ###############

# Load the Bangla training split as raw lines; the encoding is explicit so that
# reading does not depend on the platform's default encoding.
with open('SemEval2022-Task11_Train-Dev/BN-Bangla/bn_train.conll', encoding='utf-8') as f:
    file = f.readlines()

from collections import defaultdict

# entity type -> list of mentions; every mention is the list of its lines
# (one 'B-' line followed by zero or more 'I-' lines), in file order
mention_rep = defaultdict(list)
entities = ['PER', 'LOC', 'GRP', 'CORP', 'PROD', 'CW'] #all basic NER labels

for entity in entities: #one pass over the file per entity type
    begin_tag = f'B-{entity}'
    inside_tag = f'I-{entity}'
    temp_list = [] #lines of the mention being collected; empty means "not inside a mention"
    for line in file:
        if temp_list and inside_tag in line:
            temp_list.append(line) #still inside the current mention
            continue
        if temp_list:
            #any other line ends the current mention; defaultdict(list) supports append on a fresh key
            mention_rep[entity].append(temp_list)
            temp_list = []
        #the line that ended a mention may itself start the next one (B- directly after a mention),
        #so the head check runs on it as well instead of skipping it
        if begin_tag in line:
            temp_list = [line]
    if temp_list: #the file ended while a mention was still open: store it too
        mention_rep[entity].append(temp_list)

#assert every mention was stored as a list. this is to ensure consistency later on
for mentions in mention_rep.values():
    for mention in mentions:
        assert isinstance(mention, list)


################################################# PERFORM MENTION REPLACEMENT ########################################################
import numpy as np #random draws below come from np.random

# Build mr_file, the augmented copy of `file`: every mention is, with probability 0.5,
# swapped for a randomly drawn mention of the same entity type from mention_rep.
mr_file = []
labels = tuple(mention_rep.keys()) #set of labels: PER, LOC, GRP, CORP, PROD, CW

capture_inside = False #True only while copying the I- lines of a mention that was kept
for line in file: #traverse through file by line
    _, sep, tag = line.strip().rpartition(' _ _ ') #tag is the last field, e.g. 'B-PER'
    if not (sep and tag.endswith(labels)): #everything that doesn't have a mention label in it
        mr_file.append(line)
        capture_inside = False
        continue

    if tag.startswith('B'): #beginning of a mention: decide whether to replace it
        candidates = mention_rep.get(tag[2:], []) #'2:' removes the 'B-' from the label
        if candidates and np.random.binomial(n = 1, p = 0.5) == 1:
            # Draw by index: no per-draw ndarray is built, and it also works when all
            # mentions of a label have the same length (np.random.choice rejects the
            # resulting 2-D array).
            new_mention = candidates[np.random.randint(len(candidates))]
            mr_file.extend(new_mention) #insert B- line and any I- lines of the new mention
            capture_inside = False #the original I- lines that follow must be dropped
        else: #B-, but no data augmentation
            mr_file.append(line)
            capture_inside = True #keep the original I- lines that follow
    elif capture_inside: #I- line of a mention that was kept
        mr_file.append(line)
    #else: I- line of a replaced mention, intentionally skipped

        
############################################################### CHECKING ############################################################
# Every sentence starts with a '# id ' header line and mention replacement only swaps
# token lines, so the augmented data must hold exactly as many sentences as the input
# lines in `file` (15,300 per training set).
original_count = sum(1 for line in file if line.startswith('# id '))
new_count = sum(1 for line in mr_file if line.startswith('# id '))
assert original_count == new_count == 15300

############################################################ WRITING TO NEW FILE #####################################################
# The handle is named `out` so that `file` keeps referring to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/BN-Bangla/bn_train_mr.conll', 'w') as out:
    out.writelines(mr_file)

# German

In [229]:
############## DATA STRUCTURES SETUP ###############

# Load the German training split; each element of `file` is one raw line (newline
# included) so lines can later be written back out unchanged.
with open('SemEval2022-Task11_Train-Dev/DE-German/de_train.conll') as f:
    file = f.readlines()

from collections import defaultdict

# mention_rep maps an entity type to a list of mentions; every mention is a list of
# raw lines (one B- line followed by its I- lines).
mention_rep = defaultdict(list) #create default dictionary of lists
entities = ['PER', 'LOC', 'GRP', 'CORP', 'PROD', 'CW'] #all basic NER labels

begin = False #True while a mention has been opened by a B- line and not yet stored
temp_list = [] #lines of the mention currently being collected
curr_entity = None #entity type of the mention currently being collected

# Single pass over the file. The tag is read from the field after the last ' _ _ '
# separator instead of being searched for anywhere in the line, so token text can
# never be mistaken for a tag.
for line in file:
    _, sep, tag = line.strip().rpartition(' _ _ ')
    prefix, _, entity = tag.partition('-')
    if not sep or entity not in entities:
        prefix = entity = None #comment, blank and 'O' lines never belong to a mention

    if begin and prefix == 'I' and entity == curr_entity:
        temp_list.append(line) #continuation (I-) of the open mention
        continue

    if begin: #any other line closes the open mention, so store it
        mention_rep[curr_entity].append(temp_list)
        begin = False

    if prefix == 'B': #a B- line opens a new mention, even directly after another mention
        temp_list = [line]
        curr_entity = entity
        begin = True

if begin: #store a mention that runs up to the very last line of the file
    mention_rep[curr_entity].append(temp_list)
    begin = False

#assert every mention was stored as a list
for i in mention_rep: #traverse the dict's keys
    for j in mention_rep[i]: #traverse each value in the value-list of the key
        assert type(j) is list #check if every value is a list. this is to ensure consistency later on


################################################# PERFORM MENTION REPLACEMENT ########################################################
import numpy as np #random draws below come from np.random

# Build mr_file, the augmented copy of `file`: every mention is, with probability 0.5,
# swapped for a randomly drawn mention of the same entity type from mention_rep.
mr_file = []
labels = tuple(mention_rep.keys()) #set of labels: PER, LOC, GRP, CORP, PROD, CW

capture_inside = False #True only while copying the I- lines of a mention that was kept
for line in file: #traverse through file by line
    _, sep, tag = line.strip().rpartition(' _ _ ') #tag is the last field, e.g. 'B-PER'
    if not (sep and tag.endswith(labels)): #everything that doesn't have a mention label in it
        mr_file.append(line)
        capture_inside = False
        continue

    if tag.startswith('B'): #beginning of a mention: decide whether to replace it
        candidates = mention_rep.get(tag[2:], []) #'2:' removes the 'B-' from the label
        if candidates and np.random.binomial(n = 1, p = 0.5) == 1:
            # Draw by index: no per-draw ndarray is built, and it also works when all
            # mentions of a label have the same length (np.random.choice rejects the
            # resulting 2-D array).
            new_mention = candidates[np.random.randint(len(candidates))]
            mr_file.extend(new_mention) #insert B- line and any I- lines of the new mention
            capture_inside = False #the original I- lines that follow must be dropped
        else: #B-, but no data augmentation
            mr_file.append(line)
            capture_inside = True #keep the original I- lines that follow
    elif capture_inside: #I- line of a mention that was kept
        mr_file.append(line)
    #else: I- line of a replaced mention, intentionally skipped

        
############################################################### CHECKING ############################################################
# Every sentence starts with a '# id ' header line and mention replacement only swaps
# token lines, so the augmented data must hold exactly as many sentences as the German
# input in `file` (15,300 per training set). The input count is taken from `file`
# itself rather than from a value computed for another language.
original_count = sum(1 for line in file if line.startswith('# id '))
new_count = sum(1 for line in mr_file if line.startswith('# id '))
assert original_count == new_count == 15300

############################################################ WRITING TO NEW FILE #####################################################
# The handle is named `out` so that `file` keeps referring to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/DE-German/de_train_mr.conll', 'w') as out:
    out.writelines(mr_file)

# English

In [230]:
############## DATA STRUCTURES SETUP ###############

# Load the English training split; each element of `file` is one raw line (newline
# included) so lines can later be written back out unchanged.
with open('SemEval2022-Task11_Train-Dev/EN-English/en_train.conll') as f:
    file = f.readlines()

from collections import defaultdict

# mention_rep maps an entity type to a list of mentions; every mention is a list of
# raw lines (one B- line followed by its I- lines).
mention_rep = defaultdict(list) #create default dictionary of lists
entities = ['PER', 'LOC', 'GRP', 'CORP', 'PROD', 'CW'] #all basic NER labels

begin = False #True while a mention has been opened by a B- line and not yet stored
temp_list = [] #lines of the mention currently being collected
curr_entity = None #entity type of the mention currently being collected

# Single pass over the file. The tag is read from the field after the last ' _ _ '
# separator instead of being searched for anywhere in the line, so token text can
# never be mistaken for a tag.
for line in file:
    _, sep, tag = line.strip().rpartition(' _ _ ')
    prefix, _, entity = tag.partition('-')
    if not sep or entity not in entities:
        prefix = entity = None #comment, blank and 'O' lines never belong to a mention

    if begin and prefix == 'I' and entity == curr_entity:
        temp_list.append(line) #continuation (I-) of the open mention
        continue

    if begin: #any other line closes the open mention, so store it
        mention_rep[curr_entity].append(temp_list)
        begin = False

    if prefix == 'B': #a B- line opens a new mention, even directly after another mention
        temp_list = [line]
        curr_entity = entity
        begin = True

if begin: #store a mention that runs up to the very last line of the file
    mention_rep[curr_entity].append(temp_list)
    begin = False

#assert every mention was stored as a list
for i in mention_rep: #traverse the dict's keys
    for j in mention_rep[i]: #traverse each value in the value-list of the key
        assert type(j) is list #check if every value is a list. this is to ensure consistency later on


################################################# PERFORM MENTION REPLACEMENT ########################################################
import numpy as np #random draws below come from np.random

# Build mr_file, the augmented copy of `file`: every mention is, with probability 0.5,
# swapped for a randomly drawn mention of the same entity type from mention_rep.
mr_file = []
labels = tuple(mention_rep.keys()) #set of labels: PER, LOC, GRP, CORP, PROD, CW

capture_inside = False #True only while copying the I- lines of a mention that was kept
for line in file: #traverse through file by line
    _, sep, tag = line.strip().rpartition(' _ _ ') #tag is the last field, e.g. 'B-PER'
    if not (sep and tag.endswith(labels)): #everything that doesn't have a mention label in it
        mr_file.append(line)
        capture_inside = False
        continue

    if tag.startswith('B'): #beginning of a mention: decide whether to replace it
        candidates = mention_rep.get(tag[2:], []) #'2:' removes the 'B-' from the label
        if candidates and np.random.binomial(n = 1, p = 0.5) == 1:
            # Draw by index: no per-draw ndarray is built, and it also works when all
            # mentions of a label have the same length (np.random.choice rejects the
            # resulting 2-D array).
            new_mention = candidates[np.random.randint(len(candidates))]
            mr_file.extend(new_mention) #insert B- line and any I- lines of the new mention
            capture_inside = False #the original I- lines that follow must be dropped
        else: #B-, but no data augmentation
            mr_file.append(line)
            capture_inside = True #keep the original I- lines that follow
    elif capture_inside: #I- line of a mention that was kept
        mr_file.append(line)
    #else: I- line of a replaced mention, intentionally skipped

        
############################################################### CHECKING ############################################################
# Every sentence starts with a '# id ' header line and mention replacement only swaps
# token lines, so the augmented data must hold exactly as many sentences as the English
# input in `file` (15,300 per training set). The input count is taken from `file`
# itself rather than from a value computed for another language.
original_count = sum(1 for line in file if line.startswith('# id '))
new_count = sum(1 for line in mr_file if line.startswith('# id '))
assert original_count == new_count == 15300

############################################################ WRITING TO NEW FILE #####################################################
# The handle is named `out` so that `file` keeps referring to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/EN-English/en_train_mr.conll', 'w') as out:
    out.writelines(mr_file)

# Spanish

In [231]:
############## DATA STRUCTURES SETUP ###############

# Load the Spanish training split (this cell writes es_train_mr.conll, so its input
# must be the Spanish file, not the German one). Each element of `file` is one raw
# line (newline included) so lines can later be written back out unchanged.
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train.conll') as f:
    file = f.readlines()

from collections import defaultdict

# mention_rep maps an entity type to a list of mentions; every mention is a list of
# raw lines (one B- line followed by its I- lines).
mention_rep = defaultdict(list) #create default dictionary of lists
entities = ['PER', 'LOC', 'GRP', 'CORP', 'PROD', 'CW'] #all basic NER labels

begin = False #True while a mention has been opened by a B- line and not yet stored
temp_list = [] #lines of the mention currently being collected
curr_entity = None #entity type of the mention currently being collected

# Single pass over the file. The tag is read from the field after the last ' _ _ '
# separator instead of being searched for anywhere in the line, so token text can
# never be mistaken for a tag.
for line in file:
    _, sep, tag = line.strip().rpartition(' _ _ ')
    prefix, _, entity = tag.partition('-')
    if not sep or entity not in entities:
        prefix = entity = None #comment, blank and 'O' lines never belong to a mention

    if begin and prefix == 'I' and entity == curr_entity:
        temp_list.append(line) #continuation (I-) of the open mention
        continue

    if begin: #any other line closes the open mention, so store it
        mention_rep[curr_entity].append(temp_list)
        begin = False

    if prefix == 'B': #a B- line opens a new mention, even directly after another mention
        temp_list = [line]
        curr_entity = entity
        begin = True

if begin: #store a mention that runs up to the very last line of the file
    mention_rep[curr_entity].append(temp_list)
    begin = False

#assert every mention was stored as a list
for i in mention_rep: #traverse the dict's keys
    for j in mention_rep[i]: #traverse each value in the value-list of the key
        assert type(j) is list #check if every value is a list. this is to ensure consistency later on


################################################# PERFORM MENTION REPLACEMENT ########################################################
import numpy as np #random draws below come from np.random

# Build mr_file, the augmented copy of `file`: every mention is, with probability 0.5,
# swapped for a randomly drawn mention of the same entity type from mention_rep.
mr_file = []
labels = tuple(mention_rep.keys()) #set of labels: PER, LOC, GRP, CORP, PROD, CW

capture_inside = False #True only while copying the I- lines of a mention that was kept
for line in file: #traverse through file by line
    _, sep, tag = line.strip().rpartition(' _ _ ') #tag is the last field, e.g. 'B-PER'
    if not (sep and tag.endswith(labels)): #everything that doesn't have a mention label in it
        mr_file.append(line)
        capture_inside = False
        continue

    if tag.startswith('B'): #beginning of a mention: decide whether to replace it
        candidates = mention_rep.get(tag[2:], []) #'2:' removes the 'B-' from the label
        if candidates and np.random.binomial(n = 1, p = 0.5) == 1:
            # Draw by index: no per-draw ndarray is built, and it also works when all
            # mentions of a label have the same length (np.random.choice rejects the
            # resulting 2-D array).
            new_mention = candidates[np.random.randint(len(candidates))]
            mr_file.extend(new_mention) #insert B- line and any I- lines of the new mention
            capture_inside = False #the original I- lines that follow must be dropped
        else: #B-, but no data augmentation
            mr_file.append(line)
            capture_inside = True #keep the original I- lines that follow
    elif capture_inside: #I- line of a mention that was kept
        mr_file.append(line)
    #else: I- line of a replaced mention, intentionally skipped

        
############################################################### CHECKING ############################################################
# Every sentence starts with a '# id ' header line and mention replacement only swaps
# token lines, so the augmented data must hold exactly as many sentences as the input
# lines in `file` (15,300 per training set). The input count is taken from `file`
# itself rather than from a value computed for another language.
original_count = sum(1 for line in file if line.startswith('# id '))
new_count = sum(1 for line in mr_file if line.startswith('# id '))
assert original_count == new_count == 15300

############################################################ WRITING TO NEW FILE #####################################################
# The handle is named `out` so that `file` keeps referring to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train_mr.conll', 'w') as out:
    out.writelines(mr_file)

# Farsi

In [232]:
############## DATA STRUCTURES SETUP ###############

# Load the Farsi training split; each element of `file` is one raw line (newline
# included) so lines can later be written back out unchanged.
with open('SemEval2022-Task11_Train-Dev/FA-Farsi/fa_train.conll') as f:
    file = f.readlines()

from collections import defaultdict

# mention_rep maps an entity type to a list of mentions; every mention is a list of
# raw lines (one B- line followed by its I- lines).
mention_rep = defaultdict(list) #create default dictionary of lists
entities = ['PER', 'LOC', 'GRP', 'CORP', 'PROD', 'CW'] #all basic NER labels

begin = False #True while a mention has been opened by a B- line and not yet stored
temp_list = [] #lines of the mention currently being collected
curr_entity = None #entity type of the mention currently being collected

# Single pass over the file. The tag is read from the field after the last ' _ _ '
# separator instead of being searched for anywhere in the line, so token text can
# never be mistaken for a tag.
for line in file:
    _, sep, tag = line.strip().rpartition(' _ _ ')
    prefix, _, entity = tag.partition('-')
    if not sep or entity not in entities:
        prefix = entity = None #comment, blank and 'O' lines never belong to a mention

    if begin and prefix == 'I' and entity == curr_entity:
        temp_list.append(line) #continuation (I-) of the open mention
        continue

    if begin: #any other line closes the open mention, so store it
        mention_rep[curr_entity].append(temp_list)
        begin = False

    if prefix == 'B': #a B- line opens a new mention, even directly after another mention
        temp_list = [line]
        curr_entity = entity
        begin = True

if begin: #store a mention that runs up to the very last line of the file
    mention_rep[curr_entity].append(temp_list)
    begin = False

#assert every mention was stored as a list
for i in mention_rep: #traverse the dict's keys
    for j in mention_rep[i]: #traverse each value in the value-list of the key
        assert type(j) is list #check if every value is a list. this is to ensure consistency later on


################################################# PERFORM MENTION REPLACEMENT ########################################################
import numpy as np #random draws below come from np.random

# Build mr_file, the augmented copy of `file`: every mention is, with probability 0.5,
# swapped for a randomly drawn mention of the same entity type from mention_rep.
mr_file = []
labels = tuple(mention_rep.keys()) #set of labels: PER, LOC, GRP, CORP, PROD, CW

capture_inside = False #True only while copying the I- lines of a mention that was kept
for line in file: #traverse through file by line
    _, sep, tag = line.strip().rpartition(' _ _ ') #tag is the last field, e.g. 'B-PER'
    if not (sep and tag.endswith(labels)): #everything that doesn't have a mention label in it
        mr_file.append(line)
        capture_inside = False
        continue

    if tag.startswith('B'): #beginning of a mention: decide whether to replace it
        candidates = mention_rep.get(tag[2:], []) #'2:' removes the 'B-' from the label
        if candidates and np.random.binomial(n = 1, p = 0.5) == 1:
            # Draw by index: no per-draw ndarray is built, and it also works when all
            # mentions of a label have the same length (np.random.choice rejects the
            # resulting 2-D array).
            new_mention = candidates[np.random.randint(len(candidates))]
            mr_file.extend(new_mention) #insert B- line and any I- lines of the new mention
            capture_inside = False #the original I- lines that follow must be dropped
        else: #B-, but no data augmentation
            mr_file.append(line)
            capture_inside = True #keep the original I- lines that follow
    elif capture_inside: #I- line of a mention that was kept
        mr_file.append(line)
    #else: I- line of a replaced mention, intentionally skipped

        
############################################################### CHECKING ############################################################
# Every sentence starts with a '# id ' header line and mention replacement only swaps
# token lines, so the augmented data must hold exactly as many sentences as the Farsi
# input in `file` (15,300 per training set). The input count is taken from `file`
# itself rather than from a value computed for another language.
original_count = sum(1 for line in file if line.startswith('# id '))
new_count = sum(1 for line in mr_file if line.startswith('# id '))
assert original_count == new_count == 15300

############################################################ WRITING TO NEW FILE #####################################################
# The handle is named `out` so that `file` keeps referring to the list of input lines.
with open('SemEval2022-Task11_Train-Dev/FA-Farsi/fa_train_mr.conll', 'w') as out:
    out.writelines(mr_file)

# Hindi

In [233]:
############## DATA STRUCTURES SETUP ###############

# Load the Hindi training split; each element of `file` is one raw line (newline
# included) so lines can later be written back out unchanged.
with open('SemEval2022-Task11_Train-Dev/HI-Hindi/hi_train.conll') as f:
    file = f.readlines()

from collections import defaultdict

# mention_rep maps an entity type to a list of mentions; every mention is a list of
# raw lines (one B- line followed by its I- lines).
mention_rep = defaultdict(list) #create default dictionary of lists
entities = ['PER', 'LOC', 'GRP', 'CORP', 'PROD', 'CW'] #all basic NER labels

begin = False #True while a mention has been opened by a B- line and not yet stored
temp_list = [] #lines of the mention currently being collected
curr_entity = None #entity type of the mention currently being collected

# Single pass over the file. The tag is read from the field after the last ' _ _ '
# separator instead of being searched for anywhere in the line, so token text can
# never be mistaken for a tag.
for line in file:
    _, sep, tag = line.strip().rpartition(' _ _ ')
    prefix, _, entity = tag.partition('-')
    if not sep or entity not in entities:
        prefix = entity = None #comment, blank and 'O' lines never belong to a mention

    if begin and prefix == 'I' and entity == curr_entity:
        temp_list.append(line) #continuation (I-) of the open mention
        continue

    if begin: #any other line closes the open mention, so store it
        mention_rep[curr_entity].append(temp_list)
        begin = False

    if prefix == 'B': #a B- line opens a new mention, even directly after another mention
        temp_list = [line]
        curr_entity = entity
        begin = True

if begin: #store a mention that runs up to the very last line of the file
    mention_rep[curr_entity].append(temp_list)
    begin = False

#assert every mention was stored as a list
for i in mention_rep: #traverse the dict's keys
    for j in mention_rep[i]: #traverse each value in the value-list of the key
        assert type(j) is list #check if every value is a list. this is to ensure consistency later on


################################################# PERFORM MENTION REPLACEMENT ########################################################
import numpy as np #random draws for the replacement decision and the pick

mr_file = [] #augmented copy of `file`, built line by line
labels = tuple(mention_rep.keys()) #entity types that have at least one stored mention

# True while the I- lines of a mention we decided to KEEP are being copied.
# Initialised here so the loop does not depend on a non-mention line coming first.
capture_inside = False

for line in file:
    stripped = line.strip()

    if not stripped.endswith(labels): #header, blank or O-tagged line: copy unchanged
        mr_file.append(line)
        capture_inside = False
        continue

    tag = stripped.split(' _ _ ')[1] #e.g. 'B-PER' or 'I-PER'

    if not tag.startswith('B'):
        # I- line: keep it only if the mention it belongs to was kept;
        # the I- lines of a replaced mention are dropped.
        if capture_inside:
            mr_file.append(line)
        continue

    # Beginning of a mention: fair coin flip decides whether to replace it.
    x = np.random.binomial(n=1, p=0.5)
    if x == 1:
        curr_label = tag[2:] #drop the 'B-' prefix
        # Draw an index instead of wrapping the list of lists in an ndarray:
        # the object array was rebuilt on every replacement and turns 2-D
        # (rejected by np.random.choice) when all mentions have equal length.
        candidates = mention_rep[curr_label]
        new_mention = candidates[np.random.randint(len(candidates))]
        mr_file.extend(new_mention) #works for single- and multi-token mentions
        # Without this reset, a replaced mention directly after a kept one
        # would leak its original I- lines into the output.
        capture_inside = False
    else:
        mr_file.append(line) #keep the original B- line
        capture_inside = True #and copy its I- lines when they follow

        
############################################################### CHECKING ############################################################
# Every sentence starts with a '# id ' header line, so counting headers counts
# sentences. The input is recounted here rather than reusing a value computed
# for another language, then compared with the augmented output.
original_count = sum(1 for entry in file if entry.startswith('# id '))
new_count = sum(1 for entry in mr_file if entry.startswith('# id '))
assert original_count == new_count == 15300

############################################################ WRITING TO NEW FILE #####################################################
# The handle gets its own name so the `file` list of input lines is not rebound.
with open('SemEval2022-Task11_Train-Dev/HI-Hindi/hi_train_mr.conll', 'w') as out_file:
    out_file.writelines(mr_file)

# Korean

In [234]:
############## DATA STRUCTURES SETUP ###############

# Load the Korean training split as a list of raw lines (newlines kept).
with open('SemEval2022-Task11_Train-Dev/KO-Korean/ko_train.conll') as f:
    file = f.readlines()

from collections import defaultdict

# mention_rep maps an entity type to every mention of that type in the file.
# A mention is a list of raw lines: its B- line followed by its I- lines, so
# single- and multi-token mentions share the same shape.
mention_rep = defaultdict(list)
entities = ['PER', 'LOC', 'GRP', 'CORP', 'PROD', 'CW'] #all basic NER labels

begin = False #True while a mention is being collected (its B- line has been seen)
inside = False #True once an I- line of the current mention has been seen
temp_list = [] #lines of the mention currently being collected
current_entity = None #entity type of that mention

# Single pass over the file. The tag is read from the last whitespace-separated
# field instead of by substring search, so a token whose text merely contains
# e.g. "B-PER" is not mistaken for a mention.
for line in file:
    fields = line.split()
    tag = fields[-1] if fields else ''
    prefix, sep, entity = tag.partition('-')

    if begin and prefix == 'I' and entity == current_entity:
        temp_list.append(line) #continuation of the mention being collected
        inside = True
        continue

    if begin: #any other line closes the open mention
        mention_rep[current_entity].append(temp_list)
        begin = False
        inside = False

    # Checked on the same line that closed the previous mention, so two
    # back-to-back mentions are both captured.
    if prefix == 'B' and entity in entities:
        temp_list = [line]
        current_entity = entity
        begin = True

# Store a mention that runs up to the very last line of the file.
if begin:
    mention_rep[current_entity].append(temp_list)
    begin = False
    inside = False

#assert every mention was stored as a non-empty list of lines
for mentions in mention_rep.values():
    for mention in mentions:
        assert isinstance(mention, list) and mention


################################################# PERFORM MENTION REPLACEMENT ########################################################
import numpy as np #random draws for the replacement decision and the pick

mr_file = [] #augmented copy of `file`, built line by line
labels = tuple(mention_rep.keys()) #entity types that have at least one stored mention

# True while the I- lines of a mention we decided to KEEP are being copied.
# Initialised here so the loop does not depend on a non-mention line coming first.
capture_inside = False

for line in file:
    stripped = line.strip()

    if not stripped.endswith(labels): #header, blank or O-tagged line: copy unchanged
        mr_file.append(line)
        capture_inside = False
        continue

    tag = stripped.split(' _ _ ')[1] #e.g. 'B-PER' or 'I-PER'

    if not tag.startswith('B'):
        # I- line: keep it only if the mention it belongs to was kept;
        # the I- lines of a replaced mention are dropped.
        if capture_inside:
            mr_file.append(line)
        continue

    # Beginning of a mention: fair coin flip decides whether to replace it.
    x = np.random.binomial(n=1, p=0.5)
    if x == 1:
        curr_label = tag[2:] #drop the 'B-' prefix
        # Draw an index instead of wrapping the list of lists in an ndarray:
        # the object array was rebuilt on every replacement and turns 2-D
        # (rejected by np.random.choice) when all mentions have equal length.
        candidates = mention_rep[curr_label]
        new_mention = candidates[np.random.randint(len(candidates))]
        mr_file.extend(new_mention) #works for single- and multi-token mentions
        # Without this reset, a replaced mention directly after a kept one
        # would leak its original I- lines into the output.
        capture_inside = False
    else:
        mr_file.append(line) #keep the original B- line
        capture_inside = True #and copy its I- lines when they follow

        
############################################################### CHECKING ############################################################
# Every sentence starts with a '# id ' header line, so counting headers counts
# sentences. The input is recounted here rather than reusing a value computed
# for another language, then compared with the augmented output.
original_count = sum(1 for entry in file if entry.startswith('# id '))
new_count = sum(1 for entry in mr_file if entry.startswith('# id '))
assert original_count == new_count == 15300

############################################################ WRITING TO NEW FILE #####################################################
# The handle gets its own name so the `file` list of input lines is not rebound.
with open('SemEval2022-Task11_Train-Dev/KO-Korean/ko_train_mr.conll', 'w') as out_file:
    out_file.writelines(mr_file)

# Dutch

In [235]:
############## DATA STRUCTURES SETUP ###############

# Load the Dutch training split as a list of raw lines (newlines kept).
with open('SemEval2022-Task11_Train-Dev/NL-Dutch/nl_train.conll') as f:
    file = f.readlines()

from collections import defaultdict

# mention_rep maps an entity type to every mention of that type in the file.
# A mention is a list of raw lines: its B- line followed by its I- lines, so
# single- and multi-token mentions share the same shape.
mention_rep = defaultdict(list)
entities = ['PER', 'LOC', 'GRP', 'CORP', 'PROD', 'CW'] #all basic NER labels

begin = False #True while a mention is being collected (its B- line has been seen)
inside = False #True once an I- line of the current mention has been seen
temp_list = [] #lines of the mention currently being collected
current_entity = None #entity type of that mention

# Single pass over the file. The tag is read from the last whitespace-separated
# field instead of by substring search, so a token whose text merely contains
# e.g. "B-PER" is not mistaken for a mention.
for line in file:
    fields = line.split()
    tag = fields[-1] if fields else ''
    prefix, sep, entity = tag.partition('-')

    if begin and prefix == 'I' and entity == current_entity:
        temp_list.append(line) #continuation of the mention being collected
        inside = True
        continue

    if begin: #any other line closes the open mention
        mention_rep[current_entity].append(temp_list)
        begin = False
        inside = False

    # Checked on the same line that closed the previous mention, so two
    # back-to-back mentions are both captured.
    if prefix == 'B' and entity in entities:
        temp_list = [line]
        current_entity = entity
        begin = True

# Store a mention that runs up to the very last line of the file.
if begin:
    mention_rep[current_entity].append(temp_list)
    begin = False
    inside = False

#assert every mention was stored as a non-empty list of lines
for mentions in mention_rep.values():
    for mention in mentions:
        assert isinstance(mention, list) and mention


################################################# PERFORM MENTION REPLACEMENT ########################################################
import numpy as np #random draws for the replacement decision and the pick

mr_file = [] #augmented copy of `file`, built line by line
labels = tuple(mention_rep.keys()) #entity types that have at least one stored mention

# True while the I- lines of a mention we decided to KEEP are being copied.
# Initialised here so the loop does not depend on a non-mention line coming first.
capture_inside = False

for line in file:
    stripped = line.strip()

    if not stripped.endswith(labels): #header, blank or O-tagged line: copy unchanged
        mr_file.append(line)
        capture_inside = False
        continue

    tag = stripped.split(' _ _ ')[1] #e.g. 'B-PER' or 'I-PER'

    if not tag.startswith('B'):
        # I- line: keep it only if the mention it belongs to was kept;
        # the I- lines of a replaced mention are dropped.
        if capture_inside:
            mr_file.append(line)
        continue

    # Beginning of a mention: fair coin flip decides whether to replace it.
    x = np.random.binomial(n=1, p=0.5)
    if x == 1:
        curr_label = tag[2:] #drop the 'B-' prefix
        # Draw an index instead of wrapping the list of lists in an ndarray:
        # the object array was rebuilt on every replacement and turns 2-D
        # (rejected by np.random.choice) when all mentions have equal length.
        candidates = mention_rep[curr_label]
        new_mention = candidates[np.random.randint(len(candidates))]
        mr_file.extend(new_mention) #works for single- and multi-token mentions
        # Without this reset, a replaced mention directly after a kept one
        # would leak its original I- lines into the output.
        capture_inside = False
    else:
        mr_file.append(line) #keep the original B- line
        capture_inside = True #and copy its I- lines when they follow

        
############################################################### CHECKING ############################################################
# Every sentence starts with a '# id ' header line, so counting headers counts
# sentences. The input is recounted here rather than reusing a value computed
# for another language, then compared with the augmented output.
original_count = sum(1 for entry in file if entry.startswith('# id '))
new_count = sum(1 for entry in mr_file if entry.startswith('# id '))
assert original_count == new_count == 15300

############################################################ WRITING TO NEW FILE #####################################################
# The handle gets its own name so the `file` list of input lines is not rebound.
with open('SemEval2022-Task11_Train-Dev/NL-Dutch/nl_train_mr.conll', 'w') as out_file:
    out_file.writelines(mr_file)

# Russian

In [236]:
############## DATA STRUCTURES SETUP ###############

# Load the Russian training split as a list of raw lines (newlines kept).
with open('SemEval2022-Task11_Train-Dev/RU-Russian/ru_train.conll') as f:
    file = f.readlines()

from collections import defaultdict

# mention_rep maps an entity type to every mention of that type in the file.
# A mention is a list of raw lines: its B- line followed by its I- lines, so
# single- and multi-token mentions share the same shape.
mention_rep = defaultdict(list)
entities = ['PER', 'LOC', 'GRP', 'CORP', 'PROD', 'CW'] #all basic NER labels

begin = False #True while a mention is being collected (its B- line has been seen)
inside = False #True once an I- line of the current mention has been seen
temp_list = [] #lines of the mention currently being collected
current_entity = None #entity type of that mention

# Single pass over the file. The tag is read from the last whitespace-separated
# field instead of by substring search, so a token whose text merely contains
# e.g. "B-PER" is not mistaken for a mention.
for line in file:
    fields = line.split()
    tag = fields[-1] if fields else ''
    prefix, sep, entity = tag.partition('-')

    if begin and prefix == 'I' and entity == current_entity:
        temp_list.append(line) #continuation of the mention being collected
        inside = True
        continue

    if begin: #any other line closes the open mention
        mention_rep[current_entity].append(temp_list)
        begin = False
        inside = False

    # Checked on the same line that closed the previous mention, so two
    # back-to-back mentions are both captured.
    if prefix == 'B' and entity in entities:
        temp_list = [line]
        current_entity = entity
        begin = True

# Store a mention that runs up to the very last line of the file.
if begin:
    mention_rep[current_entity].append(temp_list)
    begin = False
    inside = False

#assert every mention was stored as a non-empty list of lines
for mentions in mention_rep.values():
    for mention in mentions:
        assert isinstance(mention, list) and mention


################################################# PERFORM MENTION REPLACEMENT ########################################################
import numpy as np #random draws for the replacement decision and the pick

mr_file = [] #augmented copy of `file`, built line by line
labels = tuple(mention_rep.keys()) #entity types that have at least one stored mention

# True while the I- lines of a mention we decided to KEEP are being copied.
# Initialised here so the loop does not depend on a non-mention line coming first.
capture_inside = False

for line in file:
    stripped = line.strip()

    if not stripped.endswith(labels): #header, blank or O-tagged line: copy unchanged
        mr_file.append(line)
        capture_inside = False
        continue

    tag = stripped.split(' _ _ ')[1] #e.g. 'B-PER' or 'I-PER'

    if not tag.startswith('B'):
        # I- line: keep it only if the mention it belongs to was kept;
        # the I- lines of a replaced mention are dropped.
        if capture_inside:
            mr_file.append(line)
        continue

    # Beginning of a mention: fair coin flip decides whether to replace it.
    x = np.random.binomial(n=1, p=0.5)
    if x == 1:
        curr_label = tag[2:] #drop the 'B-' prefix
        # Draw an index instead of wrapping the list of lists in an ndarray:
        # the object array was rebuilt on every replacement and turns 2-D
        # (rejected by np.random.choice) when all mentions have equal length.
        candidates = mention_rep[curr_label]
        new_mention = candidates[np.random.randint(len(candidates))]
        mr_file.extend(new_mention) #works for single- and multi-token mentions
        # Without this reset, a replaced mention directly after a kept one
        # would leak its original I- lines into the output.
        capture_inside = False
    else:
        mr_file.append(line) #keep the original B- line
        capture_inside = True #and copy its I- lines when they follow

        
############################################################### CHECKING ############################################################
# Every sentence starts with a '# id ' header line, so counting headers counts
# sentences. The input is recounted here rather than reusing a value computed
# for another language, then compared with the augmented output.
original_count = sum(1 for entry in file if entry.startswith('# id '))
new_count = sum(1 for entry in mr_file if entry.startswith('# id '))
assert original_count == new_count == 15300

############################################################ WRITING TO NEW FILE #####################################################
# The handle gets its own name so the `file` list of input lines is not rebound.
with open('SemEval2022-Task11_Train-Dev/RU-Russian/ru_train_mr.conll', 'w') as out_file:
    out_file.writelines(mr_file)

# Turkish

In [238]:
############## DATA STRUCTURES SETUP ###############

# Load the Turkish training split as a list of raw lines (newlines kept).
with open('SemEval2022-Task11_Train-Dev/TR-Turkish/tr_train.conll') as f:
    file = f.readlines()

from collections import defaultdict

# mention_rep maps an entity type to every mention of that type in the file.
# A mention is a list of raw lines: its B- line followed by its I- lines, so
# single- and multi-token mentions share the same shape.
mention_rep = defaultdict(list)
entities = ['PER', 'LOC', 'GRP', 'CORP', 'PROD', 'CW'] #all basic NER labels

begin = False #True while a mention is being collected (its B- line has been seen)
inside = False #True once an I- line of the current mention has been seen
temp_list = [] #lines of the mention currently being collected
current_entity = None #entity type of that mention

# Single pass over the file. The tag is read from the last whitespace-separated
# field instead of by substring search, so a token whose text merely contains
# e.g. "B-PER" is not mistaken for a mention.
for line in file:
    fields = line.split()
    tag = fields[-1] if fields else ''
    prefix, sep, entity = tag.partition('-')

    if begin and prefix == 'I' and entity == current_entity:
        temp_list.append(line) #continuation of the mention being collected
        inside = True
        continue

    if begin: #any other line closes the open mention
        mention_rep[current_entity].append(temp_list)
        begin = False
        inside = False

    # Checked on the same line that closed the previous mention, so two
    # back-to-back mentions are both captured.
    if prefix == 'B' and entity in entities:
        temp_list = [line]
        current_entity = entity
        begin = True

# Store a mention that runs up to the very last line of the file.
if begin:
    mention_rep[current_entity].append(temp_list)
    begin = False
    inside = False

#assert every mention was stored as a non-empty list of lines
for mentions in mention_rep.values():
    for mention in mentions:
        assert isinstance(mention, list) and mention


################################################# PERFORM MENTION REPLACEMENT ########################################################
import numpy as np #random draws for the replacement decision and the pick

mr_file = [] #augmented copy of `file`, built line by line
labels = tuple(mention_rep.keys()) #entity types that have at least one stored mention

# True while the I- lines of a mention we decided to KEEP are being copied.
# Initialised here so the loop does not depend on a non-mention line coming first.
capture_inside = False

for line in file:
    stripped = line.strip()

    if not stripped.endswith(labels): #header, blank or O-tagged line: copy unchanged
        mr_file.append(line)
        capture_inside = False
        continue

    tag = stripped.split(' _ _ ')[1] #e.g. 'B-PER' or 'I-PER'

    if not tag.startswith('B'):
        # I- line: keep it only if the mention it belongs to was kept;
        # the I- lines of a replaced mention are dropped.
        if capture_inside:
            mr_file.append(line)
        continue

    # Beginning of a mention: fair coin flip decides whether to replace it.
    x = np.random.binomial(n=1, p=0.5)
    if x == 1:
        curr_label = tag[2:] #drop the 'B-' prefix
        # Draw an index instead of wrapping the list of lists in an ndarray:
        # the object array was rebuilt on every replacement and turns 2-D
        # (rejected by np.random.choice) when all mentions have equal length.
        candidates = mention_rep[curr_label]
        new_mention = candidates[np.random.randint(len(candidates))]
        mr_file.extend(new_mention) #works for single- and multi-token mentions
        # Without this reset, a replaced mention directly after a kept one
        # would leak its original I- lines into the output.
        capture_inside = False
    else:
        mr_file.append(line) #keep the original B- line
        capture_inside = True #and copy its I- lines when they follow

        
############################################################### CHECKING ############################################################
# Every sentence starts with a '# id ' header line, so counting headers counts
# sentences. The input is recounted here rather than reusing a value computed
# for another language, then compared with the augmented output.
original_count = sum(1 for entry in file if entry.startswith('# id '))
new_count = sum(1 for entry in mr_file if entry.startswith('# id '))
assert original_count == new_count == 15300

############################################################ WRITING TO NEW FILE #####################################################
# The handle gets its own name so the `file` list of input lines is not rebound.
with open('SemEval2022-Task11_Train-Dev/TR-Turkish/tr_train_mr.conll', 'w') as out_file:
    out_file.writelines(mr_file)

# Chinese

In [239]:
############## DATA STRUCTURES SETUP ###############

with open('SemEval2022-Task11_Train-Dev/ZH-Chinese/zh_train.conll') as f:
    file = [i for i in f.readlines()]

from collections import defaultdict

mention_rep = defaultdict(list) #create default dictionary of lists
entities = ['PER', 'LOC', 'GRP', 'CORP', 'PROD', 'CW'] #all basic NER labels

begin = False #variable for when you've first encountered and captured the head of a mention (B-)
inside = False #variable for when you're inside the mention (I-). There can be 1+ of these

for entity in entities: #traverse through the entities
    for line in file: #traverse through each line of the language file
        if begin: #check to see if you've already run into the head of the mention
            if f'I-{entity}' in line: #if not, then see if the I- is in this line
                temp_list.append(line) #if so, you must've gone to the 'elif' below and created the temp_list varible. Append current I- onto this list
                inside = True #switch to True to alert that you're inside the mention from now on
            
            else: #if there's no I- in this line that means you've 1) got the head, and 2) either gotten all I-'s already or there's no more I-'s, and you're done trying to capture a mention
                if mention_rep[entity]: #check to see if you're already started storing values into this entity (PER, LOC, etc.)
                    mention_rep[entity].append(temp_list) #if so, simply append the temp_list onto the existing list
                else: #if not that means this is your first entry for an entity's values. defaultdict(list) can't perform append when this happens...
                    mention_rep[entity] = [temp_list] #...so we'll have to store the temp_list as a list to prevent the temp_list's contents from being broken into individual values and stored into the dict's value list
                begin = False #revert begin to False for next loop
                inside = False #revert inside for the same reason
            
        elif f'B-{entity}' in line: #if you haven't already run into the head of the mention (B-), check to see if it's in this current loop/line
            temp_list = [] #create a temporary list that you'll use to collect a mention's beginning (B-) and insides (I-). this will also reset the temp_list for subsequent loops 
            temp_list.append(line) #add the beginning labels (B-) to this list
            begin = True #switch to True to alert that you've found the head label

#assert every mention was stored as a list
#sanity check: every stored mention has to be a list, so later code can treat single- and multi-word mentions alike
for entity_mentions in mention_rep.values(): #all mentions collected for one entity type
    for stored_mention in entity_mentions:
        assert type(stored_mention) is list


################################################# PERFORM MENTION REPLACEMENT ########################################################
import numpy as np #provides the random generator used below

mr_file = [] #output lines after mention replacement
labels = tuple(mention_rep.keys()) #the entity types that were collected as keys of mention_rep

capture_inside = False #True only while we are copying the I- lines of a mention that was NOT replaced
for line in file: #traverse through file by line
    stripped = line.strip()
    if not stripped.endswith(labels): #everything that is not part of a mention: copy it through
        mr_file.append(line)
        capture_inside = False
        continue

    tag = stripped.split(' _ _ ')[-1] #label field, e.g. 'B-PER' or 'I-PER'
    if not tag.startswith('B'): #inside (I-) line: keep it only when the head of its mention was kept
        if capture_inside:
            mr_file.append(line)
        continue

    candidates = mention_rep[tag[2:]] #'2:' removes the 'B-' prefix, leaving the entity type used as key
    if candidates and np.random.binomial(n = 1, p = 0.5) == 1: #coin flip: replace this mention
        #pick by index instead of np.random.choice: an object array built from equally long mentions becomes 2-D, which choice() rejects
        new_mention = candidates[np.random.randint(len(candidates))]
        mr_file.extend(new_mention) #works for single- and multi-word mentions alike
        capture_inside = False #the original I- lines of this mention must be dropped, even if the previous mention was kept
    else: #B-, but no data augmentation
        mr_file.append(line)
        capture_inside = True #keep the original I- lines that follow

        
############################################################### CHECKING ############################################################
#the augmented file must still contain one '# id ' header per original sentence
new_count = sum(1 for mr_line in mr_file if mr_line.startswith('# id '))
assert original_count == new_count == 15300

############################################################ WRITING TO NEW FILE #####################################################
#write the augmented lines; explicit UTF-8 keeps the output independent of the platform default encoding,
#and the handle gets its own name so it does not overwrite the `file` list of input lines
with open('SemEval2022-Task11_Train-Dev/ZH-Chinese/zh_train_mr.conll', 'w', encoding='utf-8') as out_file:
    out_file.writelines(mr_file)

# SiS - Shuffle within Segments

From Adel & Dai:
    
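Shuffle within segments (SiS): We first split the token sequence into segments of the same label. Thus,
each segment corresponds to either a mention or a sequence of out-of-mention tokens. Then, for each segment,
a binomial distribution is used to randomly decide whether it should be shuffled. If yes, the order of the
tokens within the segment is shuffled, while the label order is kept unchanged.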

In [46]:
# Labels are as follows
# PER : Person
# LOC : Location
# GRP : Group
# CORP : Corporation
# PROD : Product
# CW: Creative Work
# O: out-of-mention label (i.e. unrelated to named entity)

# B - beginning of label, I - inside label
from numpy import random

def _flush_segment(segment, output):
    """Append one finished segment to output, shuffling its tokens half of the time.

    segment is a list of (token, rest) pairs, where rest is the remainder of
    the line starting at the ' _ _ ' separator (label and line ending included).
    Only the tokens are permuted; every rest stays at its position, so the
    label sequence of the segment is unchanged. The segment list is emptied.

    Returns True when the tokens were shuffled, False when they were copied as is.
    """
    tokens = [token for token, _ in segment]
    shuffled = bool(random.binomial(n=1, p=0.5) == 1)
    if shuffled:
        random.shuffle(tokens)
    output.extend(token + rest for token, (_, rest) in zip(tokens, segment))
    segment.clear()
    return shuffled


def shuffle(file):
    """Apply Shuffle-within-Segments (SiS) augmentation to CoNLL lines.

    A segment is a maximal run of consecutive token lines that belong to the
    same mention (a B- line plus the I- lines of the same entity type that
    follow it) or a run of consecutive O lines. For every segment a coin is
    flipped; on success the tokens inside the segment are shuffled while the
    labels keep their original order.

    Lines starting with '# id ', blank lines and lines without a ' _ _ '
    separator are copied unchanged and always end the current segment. A
    segment still open at the end of the input is emitted as well.

    Parameters:
        file: iterable of lines of the form 'token _ _ LABEL' (not modified).

    Returns:
        A new list of lines with the same length as the input.

    Prints how many segments were shuffled and how many were not.
    """
    separator = ' _ _ '
    shuffle_file = []
    segment = []  # (token, rest-of-line) pairs of the segment being collected
    prev_type = None  # entity type ('PER', ..., or 'O') of the open segment
    shuffle_count = 0
    non_shuffle_count = 0

    def close_segment():
        nonlocal shuffle_count, non_shuffle_count
        if not segment:
            return
        if _flush_segment(segment, shuffle_file):
            shuffle_count += 1
        else:
            non_shuffle_count += 1

    for line in file:
        token, sep, tail = line.rpartition(separator)
        is_token_line = bool(sep) and bool(line.strip()) and not line.startswith('# id ')
        if not is_token_line:
            # sentence header, sentence end or unparsable line: never part of a segment
            close_segment()
            shuffle_file.append(line)
            prev_type = None
            continue

        tag = tail.strip()
        entity_type = tag[2:] if tag[:2] in ('B-', 'I-') else tag
        # a B- tag always opens a new mention, even right after one of the same type
        if segment and (entity_type != prev_type or tag.startswith('B-')):
            close_segment()
        segment.append((token, sep + tail))
        prev_type = entity_type

    close_segment()  # input may end without a trailing blank line

    print(f'{shuffle_count} instances of shuffling.')
    print(f'{non_shuffle_count} instances of not shuffling.')
    return shuffle_file

# Representative Languages: BN, KO, DE

# Bangla

In [49]:
# Bangla: read the training set, apply SiS and write the augmented copy.
# UTF-8 is given explicitly so the result does not depend on the platform default encoding.
with open('SemEval2022-Task11_Train-Dev/BN-Bangla/bn_train.conll', encoding='utf-8') as f:
    file = f.readlines()
bn_shuffle_file = shuffle(file)

# the output handle has its own name so `file` keeps referring to the input lines
with open('SemEval2022-Task11_Train-Dev/BN-Bangla/bn_train_sis.conll', 'w', encoding='utf-8') as out_file:
    out_file.writelines(bn_shuffle_file)

21890 instances of shuffling.
21737 instances of not shuffling.


# Korean

In [50]:
# Korean: read the training set, apply SiS and write the augmented copy.
# UTF-8 is given explicitly so the result does not depend on the platform default encoding.
with open('SemEval2022-Task11_Train-Dev/KO-Korean/ko_train.conll', encoding='utf-8') as f:
    file = f.readlines()
ko_shuffle_file = shuffle(file)

# the output handle has its own name so `file` keeps referring to the input lines
with open('SemEval2022-Task11_Train-Dev/KO-Korean/ko_train_sis.conll', 'w', encoding='utf-8') as out_file:
    out_file.writelines(ko_shuffle_file)

27721 instances of shuffling.
27416 instances of not shuffling.


# German

In [51]:
# German: read the training set, apply SiS and write the augmented copy.
# UTF-8 is given explicitly so the result does not depend on the platform default encoding.
with open('SemEval2022-Task11_Train-Dev/DE-German/de_train.conll', encoding='utf-8') as f:
    file = f.readlines()
de_shuffle_file = shuffle(file)

# the output handle has its own name so `file` keeps referring to the input lines
with open('SemEval2022-Task11_Train-Dev/DE-German/de_train_sis.conll', 'w', encoding='utf-8') as out_file:
    out_file.writelines(de_shuffle_file)

28571 instances of shuffling.
28287 instances of not shuffling.


# Other Languages

Turkish

In [66]:
# Turkish: read the training set, apply SiS and write the augmented copy.
# UTF-8 is given explicitly so the result does not depend on the platform default encoding.
with open('SemEval2022-Task11_Train-Dev/TR-Turkish/tr_train.conll', encoding='utf-8') as f:
    file = f.readlines()
tr_shuffle_file = shuffle(file)

# the output handle has its own name so `file` keeps referring to the input lines
with open('SemEval2022-Task11_Train-Dev/TR-Turkish/tr_train_sis.conll', 'w', encoding='utf-8') as out_file:
    out_file.writelines(tr_shuffle_file)

28752 instances of shuffling.
29116 instances of not shuffling.


Chinese

In [67]:
# Chinese: read the training set, apply SiS and write the augmented copy.
# UTF-8 is given explicitly so the result does not depend on the platform default encoding.
with open('SemEval2022-Task11_Train-Dev/ZH-Chinese/zh_train.conll', encoding='utf-8') as f:
    file = f.readlines()
zh_shuffle_file = shuffle(file)

# the output handle has its own name so `file` keeps referring to the input lines
with open('SemEval2022-Task11_Train-Dev/ZH-Chinese/zh_train_sis.conll', 'w', encoding='utf-8') as out_file:
    out_file.writelines(zh_shuffle_file)

29806 instances of shuffling.
29967 instances of not shuffling.


Hindi

In [68]:
# Hindi: read the training set, apply SiS and write the augmented copy.
# UTF-8 is given explicitly so the result does not depend on the platform default encoding.
with open('SemEval2022-Task11_Train-Dev/HI-Hindi/hi_train.conll', encoding='utf-8') as f:
    file = f.readlines()
hi_shuffle_file = shuffle(file)

# the output handle has its own name so `file` keeps referring to the input lines
with open('SemEval2022-Task11_Train-Dev/HI-Hindi/hi_train_sis.conll', 'w', encoding='utf-8') as out_file:
    out_file.writelines(hi_shuffle_file)

22100 instances of shuffling.
21787 instances of not shuffling.


Farsi

In [69]:
# Farsi: read the training set, apply SiS and write the augmented copy.
# UTF-8 is given explicitly so the result does not depend on the platform default encoding.
with open('SemEval2022-Task11_Train-Dev/FA-Farsi/fa_train.conll', encoding='utf-8') as f:
    file = f.readlines()
fa_shuffle_file = shuffle(file)

# the output handle has its own name so `file` keeps referring to the input lines
with open('SemEval2022-Task11_Train-Dev/FA-Farsi/fa_train_sis.conll', 'w', encoding='utf-8') as out_file:
    out_file.writelines(fa_shuffle_file)

27936 instances of shuffling.
27531 instances of not shuffling.


Russian

In [70]:
# Russian: read the training set, apply SiS and write the augmented copy.
# UTF-8 is given explicitly so the result does not depend on the platform default encoding.
with open('SemEval2022-Task11_Train-Dev/RU-Russian/ru_train.conll', encoding='utf-8') as f:
    file = f.readlines()
ru_shuffle_file = shuffle(file)

# the output handle has its own name so `file` keeps referring to the input lines
with open('SemEval2022-Task11_Train-Dev/RU-Russian/ru_train_sis.conll', 'w', encoding='utf-8') as out_file:
    out_file.writelines(ru_shuffle_file)

26226 instances of shuffling.
25948 instances of not shuffling.


Dutch

In [71]:
# Dutch: read the training set, apply SiS and write the augmented copy.
# UTF-8 is given explicitly so the result does not depend on the platform default encoding.
with open('SemEval2022-Task11_Train-Dev/NL-Dutch/nl_train.conll', encoding='utf-8') as f:
    file = f.readlines()
nl_shuffle_file = shuffle(file)

# the output handle has its own name so `file` keeps referring to the input lines
with open('SemEval2022-Task11_Train-Dev/NL-Dutch/nl_train_sis.conll', 'w', encoding='utf-8') as out_file:
    out_file.writelines(nl_shuffle_file)

28834 instances of shuffling.
29065 instances of not shuffling.


Spanish

In [72]:
# Spanish: read the training set, apply SiS and write the augmented copy.
# UTF-8 is given explicitly so the result does not depend on the platform default encoding.
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train.conll', encoding='utf-8') as f:
    file = f.readlines()
es_shuffle_file = shuffle(file)

# the output handle has its own name so `file` keeps referring to the input lines
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train_sis.conll', 'w', encoding='utf-8') as out_file:
    out_file.writelines(es_shuffle_file)

28962 instances of shuffling.
29003 instances of not shuffling.


English

In [73]:
# English: read the training set, apply SiS and write the augmented copy.
# UTF-8 is given explicitly so the result does not depend on the platform default encoding.
with open('SemEval2022-Task11_Train-Dev/EN-English/en_train.conll', encoding='utf-8') as f:
    file = f.readlines()
en_shuffle_file = shuffle(file)

# the output handle has its own name so `file` keeps referring to the input lines
with open('SemEval2022-Task11_Train-Dev/EN-English/en_train_sis.conll', 'w', encoding='utf-8') as out_file:
    out_file.writelines(en_shuffle_file)

29220 instances of shuffling.
29870 instances of not shuffling.


In [None]:
# Trial run: shuffle the English training set into a separate test file.
# UTF-8 is given explicitly so the result does not depend on the platform default encoding.
with open('SemEval2022-Task11_Train-Dev/EN-English/en_train.conll', encoding='utf-8') as f:
    file = f.readlines()
test_shuffle = shuffle(file)

# the output handle has its own name so `file` keeps referring to the input lines
with open('SemEval2022-Task11_Train-Dev/EN-English/test_sis.conll', 'w', encoding='utf-8') as out_file:
    out_file.writelines(test_shuffle)

# All

All: We also explore to augment the training set using all aforementioned augmentation methods. That
is, for each training instance, we create multiple augmented instances, one per augmentation method.


In [19]:
# folder code -> (language name used in the folder, lowercase file prefix)
a = dict(
    TR=('Turkish', 'tr'),
    ZH=('Chinese', 'zh'),
    HI=('Hindi', 'hi'),
    FA=('Farsi', 'fa'),
    DE=('German', 'de'),
    RU=('Russian', 'ru'),
    NL=('Dutch', 'nl'),
    ES=('Spanish', 'es'),
    EN=('English', 'en'),
)

# show each folder code next to its file prefix
for code, names in a.items():
    print(code, names[1])

TR tr
ZH zh
HI hi
FA fa
DE de
RU ru
NL nl
ES es
EN en


# Combinations

In [52]:
# folder code -> (language name used in the folder, lowercase file prefix)
a = {'TR':('Turkish', 'tr'),
    'ZH':('Chinese', 'zh'),
    'HI':('Hindi', 'hi'),
    'FA':('Farsi', 'fa'),
    'DE':('German', 'de'),
    'RU':('Russian', 'ru'),
    'NL':('Dutch', 'nl'),
    'ES':('Spanish', 'es'),
    'EN':('English', 'en')}

# For every language: concatenate the original training file with its four
# augmented variants (LwTR, MR, SR, SiS) and write the result next to them.
for i, j in a.items():
    prefix = f'SemEval2022-Task11_Train-Dev/{i}-{j[0]}/{j[1]}_train'
    with open(f'{prefix}.conll', encoding='utf-8') as f:
        og_file = f.readlines()
    with open(f'{prefix}_lwtr.conll', encoding='utf-8') as f:
        lwtr_file = f.readlines()
    with open(f'{prefix}_mr.conll', encoding='utf-8') as f:
        mr_file = f.readlines()
    with open(f'{prefix}_sr.conll', encoding='utf-8') as f:
        sr_file = f.readlines()
    with open(f'{prefix}_sis.conll', encoding='utf-8') as f:
        sis_file = f.readlines()

    combined_file = og_file + lwtr_file + mr_file + sr_file + sis_file

    # the target is derived from the current language; a fixed path here would
    # make every iteration overwrite the same file
    with open(f'{prefix}_combined.conll', 'w', encoding='utf-8') as f:
        f.writelines(combined_file)

In [53]:
# Spanish only: concatenate the original training file with its four augmented
# variants. UTF-8 is explicit so the result does not depend on the platform default.
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train.conll', encoding='utf-8') as f:
    og_file = f.readlines()
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train_lwtr.conll', encoding='utf-8') as f:
    lwtr_file = f.readlines()
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train_mr.conll', encoding='utf-8') as f:
    mr_file = f.readlines()
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train_sr.conll', encoding='utf-8') as f:
    sr_file = f.readlines()
with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train_sis.conll', encoding='utf-8') as f:
    sis_file = f.readlines()

combined_file = og_file + lwtr_file + mr_file + sr_file + sis_file

with open('SemEval2022-Task11_Train-Dev/ES-Spanish/es_train_combined.conll', 'w', encoding='utf-8') as f:
    f.writelines(combined_file)

Fix SR

In [41]:
# load the combined English file for cleanup; explicit UTF-8 avoids depending on the platform default encoding
with open('SemEval2022-Task11_Train-Dev/EN-English/en_train_combined.conll', encoding='utf-8') as f:
    file = f.readlines()

In [48]:
# sample line whose token is a single NARROW NO-BREAK SPACE (U+202F); written as an escape so the character is visible in the source
a = '\u202f _ _ O'
a

'\u202f _ _ O'

In [42]:
# Print every token line whose token field (the text before ' _ _ ') contains
# whitespace of any kind (no-break space, narrow no-break space, ideographic space, ...).
for line in file:
    if line.startswith('# id '):
        continue  # sentence headers are not token lines
    token, sep, _ = line.partition(' _ _ ')
    if sep and any(ch.isspace() for ch in token):
        print(line)

In [38]:
# Remove a dash that is glued to a no-break space. Only the first marker found
# in a line is removed; the second one is tried only when the first is absent.
for index, line in enumerate(file):
    for marker in ('\xa0—', '—\xa0'):
        if marker in line:
            file[index] = line.replace(marker, '')
            break

In [39]:
# write the cleaned lines back; explicit UTF-8 avoids depending on the platform default encoding
with open('SemEval2022-Task11_Train-Dev/EN-English/en_train_combined.conll', 'w', encoding='utf-8') as f:
    f.writelines(file)

In [6]:
# Clean up no-break spaces (\xa0) and dashes in the lines of `file`, in place.
# Exactly one of the three branches runs per line; the first matching test wins.
for index, line in enumerate(file):
    if '\xa0— ' in line:
        # drop the last two characters of the first ' _ _ '-separated field
        # NOTE(review): presumably the no-break space and dash sit at the end of the token - confirm
        temp = line.split(' _ _ ')
        temp[0] = temp[0][:-2]
        file[index] = ' _ _ '.join(temp)
    elif '—\xa0' in line:
        # drop the first two characters of the first ' _ _ '-separated field
        # NOTE(review): presumably the dash and no-break space sit at the start of the token - confirm
        temp = line.split(' _ _ ')
        temp[0] = temp[0][2:]
        file[index] = ' _ _ '.join(temp)
    elif '\xa0' in line.split(' _ _ ')[0]:
        # the first field contains a no-break space: remove every \xa0 from the whole line
        file[index] = line.replace("\xa0", "")

In [7]:
# A token must not contain whitespace, because fields are whitespace-separated.
# When the token field (text before ' _ _ ') contains whitespace of any kind
# (no-break space, narrow no-break space, ideographic space, ...), keep only its
# first whitespace-separated piece. Headers, blank lines, lines without a
# separator and whitespace-only tokens are left untouched.
for index, line in enumerate(file):
    if not line.strip() or line.startswith("# id "):
        continue
    token, sep, rest = line.partition(' _ _ ')
    if not sep:
        continue
    pieces = token.split()
    if pieces and pieces[0] != token:
        file[index] = pieces[0] + sep + rest
            #print('a[index]:', a[index])
    
            

In [8]:
# write the cleaned lines; explicit UTF-8 avoids depending on the platform default encoding
with open('SemEval2022-Task11_Train-Dev/DE-German/de_train_all.conll', 'w', encoding='utf-8') as f:
    f.writelines(file)

In [79]:
# Scratch data: a handful of sample lines (token lines, a '# id' header, a blank line).
# NOTE(review): presumably used to try out the whitespace cleanup on multi-word tokens - confirm
a = ['records ile _ _ I-CORP\n',
     'slkfd _ _ O\n',
     '# id sdf',
     '\n',
     'and the _ _ B-GRP']

In [198]:
# sample token containing a NO-BREAK SPACE (U+00A0) after the dash; written as an escape so the character is visible in the source
b = '–\xa0darunter _ _ O'
b

'–\xa0darunter _ _ O'

In [36]:
# Scratch check: cut a token at its first IDEOGRAPHIC SPACE (U+3000) and keep the label.
# The space is written as an escape so it is visible in the source.
a = ['日本代表\u3000試合別出場記録 _ _ O\n', 'slkfd _ _ O\n']
for index, line in enumerate(a):
    token, sep, rest = line.partition(' _ _ ')
    if '\u3000' in token:
        a[index] = token.split('\u3000')[0] + sep + rest

yes


In [38]:
# Take the token field (text before ' _ _ ') and keep the part before the first ideographic space (U+3000).
# NOTE(review): `.split` needs `a` to be a single string here, not a list - confirm which value `a` held when this ran
a.split(' _ _ ')[0].split('\u3000')[0]

'日本代表'

In [None]:
class NERBaseAnnotator(pl.LightningModule):
    """Sequence tagger made of a pretrained encoder, a linear layer and a CRF.

    The constructor loads the encoder with ``AutoModel.from_pretrained``,
    adds a linear layer mapping the encoder's hidden size to the number of
    tags, and a ``ConditionalRandomField`` restricted to BIO transitions.
    """

    def __init__(
        self,
        train_data=None,
        dev_data=None,
        lr=1e-5,
        dropout_rate=0.1,
        batch_size=16,
        tag_to_id=None,
        stage='fit',
        pad_token_id=1,
        encoder_model='xlm-roberta-large',
        num_gpus=1,
    ):
        """Store the configuration and build the layers.

        ``tag_to_id`` maps tag names to ids; its ``.items()`` is read here, so
        it has to be a mapping even though the default is ``None``.
        """
        super().__init__()

        # datasets
        self.train_data = train_data
        self.dev_data = dev_data

        # tag vocabulary in both directions
        self.id_to_tag = {tag_id: tag for tag, tag_id in tag_to_id.items()}
        self.tag_to_id = tag_to_id
        self.batch_size = batch_size

        self.stage = stage
        self.num_gpus = num_gpus
        self.target_size = len(self.id_to_tag)

        # set the default baseline model here
        self.pad_token_id = pad_token_id

        self.encoder_model = encoder_model
        self.encoder = AutoModel.from_pretrained(encoder_model, return_dict=True)

        # per-token scores over the tag set, decoded by a BIO-constrained CRF
        self.feedforward = nn.Linear(
            in_features=self.encoder.config.hidden_size,
            out_features=self.target_size,
        )
        bio_constraints = allowed_transitions(constraint_type="BIO", labels=self.id_to_tag)
        self.crf_layer = ConditionalRandomField(num_tags=self.target_size, constraints=bio_constraints)

        self.lr = lr
        self.dropout = nn.Dropout(dropout_rate)

        self.span_f1 = SpanF1()
        self.setup_model(self.stage)
        self.save_hyperparameters('pad_token_id', 'encoder_model')

    def setup_model(self, stage_name):
        """Compute ``total_steps`` and ``warmup_steps`` when fitting with training data.

        Does nothing for any other stage or when no training data was given.
        """
        if stage_name != 'fit' or self.train_data is None:
            return

        # Calculate total steps: batches per pass times a fixed factor of 50
        train_batches = len(self.train_data) // (self.batch_size * self.num_gpus)
        self.total_steps = 50 * train_batches

        # warm up during the first 1% of all steps
        self.warmup_steps = int(self.total_steps * 0.01)


In [62]:
def blah(number):
    """Return a new Example created from number."""
    return Example(number)


class Example(object):
    """Toy class for trying out constructor arguments and method calls."""

    def __init__(self, number=None):
        """Store number (optional) and the fixed itsProblem text."""
        self.number = number
        self.itsProblem = "problem"

    def the_example(self, loss=None):
        """Return the string "5"; the loss argument is accepted but ignored."""
        loss = "5"
        return loss

In [63]:
# Scratch: create an Example and print the method object itself.
# The method is not called here (no parentheses), so the bound method is what gets printed.
a = Example(5)
print(a.the_example)

<bound method Example.the_example of <__main__.Example object at 0x00000205D2766880>>


In [65]:
# Scratch: call the method without arguments; the recorded output of this cell is '5'.
a.the_example()

'5'

In [None]:
import time

from utils.utils import get_reader, train_model, create_model, save_model, parse_args, get_tagset

if __name__ == '__main__':
    # Grid search: train and save one model per combination of epoch count,
    # learning rate, dropout rate and batch size.
    from itertools import product

    timestamp = time.time()
    sg = parse_args()
    out_dir_path = sg.out_dir + '/' + sg.model_name

    # load the dataset first
    train_data = get_reader(file_path=sg.train, target_vocab=get_tagset(sg.iob_tagging), encoder_model=sg.encoder_model, max_instances=sg.max_instances, max_length=sg.max_length)
    dev_data = get_reader(file_path=sg.dev, target_vocab=get_tagset(sg.iob_tagging), encoder_model=sg.encoder_model, max_instances=sg.max_instances, max_length=sg.max_length)

    #epochs = 10, dropout [0.5, 0.6, 0.7, 0.8], batch_size [32, 64], learning_rate [0.01, 0.001, 0.0001]
    epochs = [2, 10]
    learning_rate = [0.01, 0.0001]
    dropout = [0.5, 0.8]
    batch_size = [32, 64]

    # product() visits the combinations in the same order as four nested loops
    for epoch, lr, drop, batch in product(epochs, learning_rate, dropout, batch_size):
        model = create_model(train_data=train_data, dev_data=dev_data, tag_to_id=train_data.get_target_vocab(),
                             dropout_rate=drop, batch_size=batch, stage=sg.stage, lr=lr,
                             encoder_model=sg.encoder_model, num_gpus=sg.gpus)

        # pass the epoch count of this combination, not the whole list of candidates
        trainer = train_model(model=model, out_dir=out_dir_path, epochs=epoch)

        # use pytorch lightnings saver here.
        out_model_path = save_model(trainer=trainer, out_dir=out_dir_path, model_name=f'{sg.model_name}_e{epoch}_lr{lr}_d{drop}_bs{batch}', timestamp=timestamp)



In [24]:
# scratch: build a file stem by gluing a number onto a name
a = 'ko_train'
b = f'{10}'
c = ''.join((a, b))

In [25]:
c

'ko_train10'