## Goals
Write a program that easily adjusts the labels of the Gold Standard Dataset.

### Importing files to modify tags

In [1]:
import csv
from tqdm import tqdm

In [2]:
# Load the training split: one "token<TAB>label" pair per line, blank line between sentences.
epi_train_tokens, epi_train_labels = [], []
skipped_rows = 0  # rows that are neither blank nor an even number of columns
with open('epi_gold_train.tsv', 'r', encoding="utf-8") as f:
    # NOTE(review): csv.reader uses the default quote handling, so a token beginning with '"'
    # may be read as a quoted field -- consider quoting=csv.QUOTE_NONE after checking the data.
    reader = csv.reader(f, delimiter="\t")
    sentence_tokens, sentences_tags = [], []
    for row in reader:
        if len(row) == 0:
            # A blank line closes the current sentence.
            epi_train_tokens.append(sentence_tokens)
            epi_train_labels.append(sentences_tags)
            sentence_tokens, sentences_tags = [], []
        elif len(row) % 2 == 0:
            # Only the first two columns (token, label) are used.
            sentence_tokens.append(row[0])
            sentences_tags.append(row[1])
        else:
            skipped_rows += 1
    if sentence_tokens:
        # The file did not end with a blank line: keep the last sentence instead of dropping it.
        epi_train_tokens.append(sentence_tokens)
        epi_train_labels.append(sentences_tags)
if skipped_rows:
    print('skipped {} malformed rows in epi_gold_train.tsv'.format(skipped_rows))
print(len(epi_train_tokens),len(epi_train_labels))

4426 4426


In [3]:
# Load the validation split: one "token<TAB>label" pair per line, blank line between sentences.
epi_val_tokens, epi_val_labels = [], []
skipped_rows = 0  # rows that are neither blank nor an even number of columns
with open('epi_gold_val.tsv', 'r', encoding="utf-8") as f:
    # NOTE(review): csv.reader uses the default quote handling, so a token beginning with '"'
    # may be read as a quoted field -- consider quoting=csv.QUOTE_NONE after checking the data.
    reader = csv.reader(f, delimiter="\t")
    sentence_tokens, sentences_tags = [], []
    for row in reader:
        if len(row) == 0:
            # A blank line closes the current sentence.
            epi_val_tokens.append(sentence_tokens)
            epi_val_labels.append(sentences_tags)
            sentence_tokens, sentences_tags = [], []
        elif len(row) % 2 == 0:
            # Only the first two columns (token, label) are used.
            sentence_tokens.append(row[0])
            sentences_tags.append(row[1])
        else:
            skipped_rows += 1
    if sentence_tokens:
        # The file did not end with a blank line: keep the last sentence instead of dropping it.
        epi_val_tokens.append(sentence_tokens)
        epi_val_labels.append(sentences_tags)
if skipped_rows:
    print('skipped {} malformed rows in epi_gold_val.tsv'.format(skipped_rows))
print(len(epi_val_tokens),len(epi_val_labels))

1206 1206


In [4]:
#Need to modify test to calculate labeling accuracy
# Load the test split: one "token<TAB>label" pair per line, blank line between sentences.
epi_test_tokens, epi_test_labels = [], []
skipped_rows = 0  # rows that are neither blank nor an even number of columns
with open('epi_gold_test.tsv', 'r', encoding="utf-8") as f:
    # NOTE(review): csv.reader uses the default quote handling, so a token beginning with '"'
    # may be read as a quoted field -- consider quoting=csv.QUOTE_NONE after checking the data.
    reader = csv.reader(f, delimiter="\t")
    sentence_tokens, sentences_tags = [], []
    for row in reader:
        if len(row) == 0:
            # A blank line closes the current sentence.
            epi_test_tokens.append(sentence_tokens)
            epi_test_labels.append(sentences_tags)
            sentence_tokens, sentences_tags = [], []
        elif len(row) % 2 == 0:
            # Only the first two columns (token, label) are used.
            sentence_tokens.append(row[0])
            sentences_tags.append(row[1])
        else:
            skipped_rows += 1
    if sentence_tokens:
        # The file did not end with a blank line: keep the last sentence instead of dropping it.
        epi_test_tokens.append(sentence_tokens)
        epi_test_labels.append(sentences_tags)
if skipped_rows:
    print('skipped {} malformed rows in epi_gold_test.tsv'.format(skipped_rows))
print(len(epi_test_tokens),len(epi_test_labels))

537 537


### Removing ABRV (ABRV -> O)

In [5]:
def replace_abrv_labels(tokens,labels):
    """Return the dataset with every ABRV tag ('B-ABRV' / 'I-ABRV') relabelled as 'O'.

    Args:
        tokens: list of sentences, each a list of tokens.
        labels: list of per-sentence tag lists, parallel to ``tokens``.

    Returns:
        ``(tokens, new_labels)``. ``tokens`` is the object that was passed in;
        ``new_labels`` is a newly built list of lists, so ``labels`` itself is
        left unmodified and can be reused by later cells.

    Raises:
        IndexError: if the number of sentences differs between ``tokens`` and
            ``labels``, or if any sentence and its tag list differ in length.
    """
    if len(tokens)!=len(labels):
        raise IndexError("Num Sentences {} and Num Sentence Labels {} Mismatch".format(len(tokens),len(labels)))

    new_labels = []
    for i in tqdm(range(len(tokens))):
        if len(tokens[i])!=len(labels[i]):
            raise IndexError("Sentence Length {} and Label Length {} Mismatch".format(len(tokens[i]),len(labels[i])))
        # Build the relabelled sentence explicitly; a bare `labels[i][j]=='O'` would only
        # compare and leave the tag unchanged.
        new_labels.append(['O' if tag in ('B-ABRV', 'I-ABRV') else tag for tag in labels[i]])

    return tokens, new_labels

In [6]:
# Run replace_abrv_labels over the training split and keep the returned (tokens, labels) pair.
mod_train_tokens, mod_train_labels = replace_abrv_labels(epi_train_tokens,epi_train_labels)

100%|██████████| 4426/4426 [00:00<00:00, 148086.61it/s]


In [7]:
# Run replace_abrv_labels over the validation split and keep the returned (tokens, labels) pair.
mod_val_tokens, mod_val_labels = replace_abrv_labels(epi_val_tokens,epi_val_labels)

100%|██████████| 1206/1206 [00:00<00:00, 150250.42it/s]


In [8]:
# Run replace_abrv_labels over the test split and keep the returned (tokens, labels) pair.
mod_test_tokens, mod_test_labels = replace_abrv_labels(epi_test_tokens,epi_test_labels)

100%|██████████| 537/537 [00:00<00:00, 149309.99it/s]


#### Saving the labels

In [9]:
# Write "token<TAB>label" per line with a blank line after each sentence.
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_train-abrv.tsv', "w", encoding="utf-8") as f:
    for sent_tokens, sent_labels in zip(mod_train_tokens, mod_train_labels):
        for token, label in zip(sent_tokens, sent_labels):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')

In [10]:
# Write "token<TAB>label" per line with a blank line after each sentence.
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_val-abrv.tsv', "w", encoding="utf-8") as f:
    for sent_tokens, sent_labels in zip(mod_val_tokens, mod_val_labels):
        for token, label in zip(sent_tokens, sent_labels):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')

In [11]:
# Write "token<TAB>label" per line with a blank line after each sentence.
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_test-abrv.tsv', "w", encoding="utf-8") as f:
    for sent_tokens, sent_labels in zip(mod_test_tokens, mod_test_labels):
        for token, label in zip(sent_tokens, sent_labels):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')

### Replacing ABRV -> DIS

In [12]:
def replace_abrv_labels(tokens,labels):
    """Return the dataset with ABRV tags folded into DIS ('B-ABRV'->'B-DIS', 'I-ABRV'->'I-DIS').

    NOTE: this name is also defined earlier in the notebook; whichever
    definition ran last is the one in effect.

    Args:
        tokens: list of sentences, each a list of tokens.
        labels: list of per-sentence tag lists, parallel to ``tokens``.

    Returns:
        ``(tokens, new_labels)``. ``tokens`` is the object that was passed in;
        ``new_labels`` is a newly built list of lists, so ``labels`` itself is
        left unmodified and can be reused by later cells.

    Raises:
        IndexError: if the number of sentences differs between ``tokens`` and
            ``labels``, or if any sentence and its tag list differ in length.
    """
    if len(tokens)!=len(labels):
        raise IndexError("Num Sentences {} and Num Sentence Labels {} Mismatch".format(len(tokens),len(labels)))

    abrv_to_dis = {'B-ABRV': 'B-DIS', 'I-ABRV': 'I-DIS'}
    new_labels = []
    for i in tqdm(range(len(tokens))):
        if len(tokens[i])!=len(labels[i]):
            raise IndexError("Sentence Length {} and Label Length {} Mismatch".format(len(tokens[i]),len(labels[i])))
        # Build the relabelled sentence explicitly; a bare `labels[i][j]=='B-DIS'` would only
        # compare and leave the tag unchanged.
        new_labels.append([abrv_to_dis[tag] if tag in ('B-ABRV', 'I-ABRV') else tag for tag in labels[i]])

    return tokens, new_labels

In [13]:
# Rebind the mod_train_* names to the result of the replace_abrv_labels definition directly above.
mod_train_tokens, mod_train_labels = replace_abrv_labels(epi_train_tokens,epi_train_labels)

100%|██████████| 4426/4426 [00:00<00:00, 145529.15it/s]


In [14]:
# Rebind the mod_val_* names to the result of the replace_abrv_labels definition directly above.
mod_val_tokens, mod_val_labels = replace_abrv_labels(epi_val_tokens,epi_val_labels)

100%|██████████| 1206/1206 [00:00<00:00, 145224.96it/s]


In [15]:
# Rebind the mod_test_* names to the result of the replace_abrv_labels definition directly above.
mod_test_tokens, mod_test_labels = replace_abrv_labels(epi_test_tokens,epi_test_labels)

100%|██████████| 537/537 [00:00<00:00, 142094.58it/s]


#### Saving the labels

In [16]:
# Write "token<TAB>label" per line with a blank line after each sentence.
# The encoding is explicit so the output matches the UTF-8 input on every platform.
# NOTE(review): '>' is not a legal file-name character on Windows; the name is kept as-is
# because other code may expect it.
with open('epi_gold_train_abrv>dis.tsv', "w", encoding="utf-8") as f:
    for sent_tokens, sent_labels in zip(mod_train_tokens, mod_train_labels):
        for token, label in zip(sent_tokens, sent_labels):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')

In [17]:
# Write "token<TAB>label" per line with a blank line after each sentence.
# The encoding is explicit so the output matches the UTF-8 input on every platform.
# NOTE(review): '>' is not a legal file-name character on Windows; the name is kept as-is
# because other code may expect it.
with open('epi_gold_val_abrv>dis.tsv', "w", encoding="utf-8") as f:
    for sent_tokens, sent_labels in zip(mod_val_tokens, mod_val_labels):
        for token, label in zip(sent_tokens, sent_labels):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')

In [18]:
# Write "token<TAB>label" per line with a blank line after each sentence.
# The encoding is explicit so the output matches the UTF-8 input on every platform.
# NOTE(review): '>' is not a legal file-name character on Windows; the name is kept as-is
# because other code may expect it.
with open('epi_gold_test_abrv>dis.tsv', "w", encoding="utf-8") as f:
    for sent_tokens, sent_labels in zip(mod_test_tokens, mod_test_labels):
        for token, label in zip(sent_tokens, sent_labels):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')

### Removing DIS & ABRV 

In [19]:
def remove_disease_labels(tokens,labels):
    """Return the dataset with all DIS and ABRV tags relabelled as 'O'.

    Args:
        tokens: list of sentences, each a list of tokens.
        labels: list of per-sentence tag lists, parallel to ``tokens``.

    Returns:
        ``(tokens, new_labels)``. ``tokens`` is the object that was passed in;
        ``new_labels`` is a newly built list of lists. ``labels`` is NOT
        modified, so the gold labels stay available to later cells.

    Raises:
        IndexError: if the number of sentences differs between ``tokens`` and
            ``labels``, or if any sentence and its tag list differ in length.
    """
    if len(tokens)!=len(labels):
        raise IndexError("Num Sentences {} and Num Sentence Labels {} Mismatch".format(len(tokens),len(labels)))

    dropped = {'B-ABRV','I-ABRV','B-DIS','I-DIS'}
    new_labels = []
    for i in tqdm(range(len(tokens))):
        if len(tokens[i])!=len(labels[i]):
            raise IndexError("Sentence Length {} and Label Length {} Mismatch".format(len(tokens[i]),len(labels[i])))
        # Copy rather than assign into labels[i]: writing in place would erase these tags
        # from the caller's lists for every cell that runs afterwards.
        new_labels.append(['O' if tag in dropped else tag for tag in labels[i]])

    return tokens, new_labels

In [20]:
# Rebind the mod_train_* names to the output of remove_disease_labels for the training split.
mod_train_tokens, mod_train_labels = remove_disease_labels(epi_train_tokens,epi_train_labels)

100%|██████████| 4426/4426 [00:00<00:00, 227399.55it/s]


In [21]:
# Rebind the mod_val_* names to the output of remove_disease_labels for the validation split.
mod_val_tokens, mod_val_labels = remove_disease_labels(epi_val_tokens,epi_val_labels)

100%|██████████| 1206/1206 [00:00<00:00, 208290.33it/s]


In [22]:
# Rebind the mod_test_* names to the output of remove_disease_labels for the test split.
mod_test_tokens, mod_test_labels = remove_disease_labels(epi_test_tokens,epi_test_labels)

100%|██████████| 537/537 [00:00<00:00, 230442.12it/s]


#### Saving the labels

In [23]:
# Write "token<TAB>label" per line with a blank line after each sentence.
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_train-dz.tsv', "w", encoding="utf-8") as f:
    for sent_tokens, sent_labels in zip(mod_train_tokens, mod_train_labels):
        for token, label in zip(sent_tokens, sent_labels):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')

In [24]:
# Write "token<TAB>label" per line with a blank line after each sentence.
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_val-dz.tsv', "w", encoding="utf-8") as f:
    for sent_tokens, sent_labels in zip(mod_val_tokens, mod_val_labels):
        for token, label in zip(sent_tokens, sent_labels):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')

In [25]:
# Write "token<TAB>label" per line with a blank line after each sentence.
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_test-dz.tsv', "w", encoding="utf-8") as f:
    for sent_tokens, sent_labels in zip(mod_test_tokens, mod_test_labels):
        for token, label in zip(sent_tokens, sent_labels):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')

### Removing DIS, ABRV, & ETHN 

In [26]:
def remove_disease_ethnicity(tokens,labels):
    """Return the dataset with all DIS, ABRV and ETHN tags relabelled as 'O'.

    Args:
        tokens: list of sentences, each a list of tokens.
        labels: list of per-sentence tag lists, parallel to ``tokens``.

    Returns:
        ``(tokens, new_labels)``. ``tokens`` is the object that was passed in;
        ``new_labels`` is a newly built list of lists. ``labels`` is NOT
        modified, so the gold labels stay available to later cells.

    Raises:
        IndexError: if the number of sentences differs between ``tokens`` and
            ``labels``, or if any sentence and its tag list differ in length.
    """
    if len(tokens)!=len(labels):
        raise IndexError("Num Sentences {} and Num Sentence Labels {} Mismatch".format(len(tokens),len(labels)))

    dropped = {'B-ABRV','I-ABRV','B-DIS','I-DIS','B-ETHN','I-ETHN'}
    new_labels = []
    for i in tqdm(range(len(tokens))):
        if len(tokens[i])!=len(labels[i]):
            raise IndexError("Sentence Length {} and Label Length {} Mismatch".format(len(tokens[i]),len(labels[i])))
        # Copy rather than assign into labels[i]: writing in place would erase these tags
        # from the caller's lists for every cell that runs afterwards.
        new_labels.append(['O' if tag in dropped else tag for tag in labels[i]])

    return tokens, new_labels

In [27]:
# Rebind the mod_train_* names to the output of remove_disease_ethnicity for the training split.
mod_train_tokens, mod_train_labels = remove_disease_ethnicity(epi_train_tokens,epi_train_labels)

100%|██████████| 4426/4426 [00:00<00:00, 236216.13it/s]


In [28]:
# Rebind the mod_val_* names to the output of remove_disease_ethnicity for the validation split.
mod_val_tokens, mod_val_labels = remove_disease_ethnicity(epi_val_tokens,epi_val_labels)

100%|██████████| 1206/1206 [00:00<00:00, 234409.87it/s]


In [29]:
# Rebind the mod_test_* names to the output of remove_disease_ethnicity for the test split.
mod_test_tokens, mod_test_labels = remove_disease_ethnicity(epi_test_tokens,epi_test_labels)

100%|██████████| 537/537 [00:00<00:00, 231579.40it/s]


In [30]:
# Concatenate train and val (train first) and write them as one file.
# NOTE(review): mod_train_labels / mod_val_labels were last assigned from
# remove_disease_ethnicity in the cells just above, so the labels written here have ETHN
# stripped as well, although the file name says only '-dz'. Confirm which label set this
# file is meant to hold.
mod_train_val_tokens = mod_train_tokens+mod_val_tokens
mod_train_val_labels = mod_train_labels+mod_val_labels

# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_train_val-dz.tsv', "w", encoding="utf-8") as f:
    for sent_tokens, sent_labels in zip(mod_train_val_tokens, mod_train_val_labels):
        for token, label in zip(sent_tokens, sent_labels):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')

#### Saving the labels

In [31]:
# Write "token<TAB>label" per line with a blank line after each sentence.
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_train-dz-ethn.tsv', "w", encoding="utf-8") as f:
    for sent_tokens, sent_labels in zip(mod_train_tokens, mod_train_labels):
        for token, label in zip(sent_tokens, sent_labels):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')

In [32]:
# Write "token<TAB>label" per line with a blank line after each sentence.
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_val-dz-ethn.tsv', "w", encoding="utf-8") as f:
    for sent_tokens, sent_labels in zip(mod_val_tokens, mod_val_labels):
        for token, label in zip(sent_tokens, sent_labels):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')

In [33]:
# Write "token<TAB>label" per line with a blank line after each sentence.
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_test-dz-ethn.tsv', "w", encoding="utf-8") as f:
    for sent_tokens, sent_labels in zip(mod_test_tokens, mod_test_labels):
        for token, label in zip(sent_tokens, sent_labels):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')

In [34]:
# Concatenate train and val (train first) and write them as one file.
mod_train_val_tokens = mod_train_tokens+mod_val_tokens
mod_train_val_labels = mod_train_labels+mod_val_labels

# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_train_val-dz-ethn.tsv', "w", encoding="utf-8") as f:
    for sent_tokens, sent_labels in zip(mod_train_val_tokens, mod_train_val_labels):
        for token, label in zip(sent_tokens, sent_labels):
            f.write(str(token) + '\t' + str(label) + '\n')
        f.write('\n')

### Adding Location (CoNLL++)

In [35]:
#!pip install datasets
# Load the "conllpp" dataset by name; the bare `coNLL` on the last line displays
# the returned DatasetDict (train / validation / test splits) as the cell output.
from datasets import load_dataset
coNLL = load_dataset("conllpp")
coNLL

Reusing dataset conllpp (/home/wzkariampuzha/.cache/huggingface/datasets/conllpp/conllpp/1.0.0/04f15f257dff3fe0fb36e049b73d51ecdf382698682f5e590b7fb13898206ba2)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

In [36]:
#NER_tag '5' is B-LOC, '6' is I-LOC
#Get numbers on the amount of location 

def read_loc_dataset(dataset):
    """Collect the sentences of ``dataset`` that contain a location tag, keeping only LOC labels.

    Args:
        dataset: iterable of records exposing ``'tokens'`` and ``'ner_tags'``,
            where tag id 5 stands for B-LOC and 6 for I-LOC.

    Returns:
        ``(token_docs, tag_docs)``: two parallel lists. ``token_docs`` holds the
        ``'tokens'`` value of every record with at least one 5 or 6 among its
        ``'ner_tags'``; ``tag_docs`` holds the matching string tags, with every
        id other than 5/6 mapped to ``'O'``.
    """
    kept_tokens, kept_tags = [], []
    for record in dataset:
        ner_ids = record['ner_tags']
        # Sentences without any location tag are not meaningfully annotated for this task.
        if not (5 in ner_ids or 6 in ner_ids):
            continue

        # Keep location tags only; everything else collapses to 'O'.
        loc_tags = ['I-LOC' if tag_id == 6 else ('B-LOC' if tag_id == 5 else 'O') for tag_id in ner_ids]

        # A length mismatch is only reported, not raised; the sentence is still kept.
        words = record['tokens']
        if len(words) != len(loc_tags):
            print('mismatch')
            print(words)
            print(loc_tags)

        kept_tokens.append(words)
        kept_tags.append(loc_tags)

    return kept_tokens, kept_tags

In [37]:
# Extract the location-bearing sentences (tokens + LOC-only tags) from each CoNLL++ split.
train_texts_loc, train_tags_loc = read_loc_dataset(coNLL["train"])
val_texts_loc, val_tags_loc = read_loc_dataset(coNLL["validation"])
test_texts_loc, test_tags_loc = read_loc_dataset(coNLL["test"])

In [38]:
#Show the data
# Preview: print "token tag" for the first two extracted sentences, blank line after each.
for sent_idx in range(2):
    for tok_idx, token in enumerate(train_texts_loc[sent_idx]):
        print(token, train_tags_loc[sent_idx][tok_idx])
    print('')

BRUSSELS B-LOC
1996-08-22 O

Germany B-LOC
's O
representative O
to O
the O
European O
Union O
's O
veterinary O
committee O
Werner O
Zwingmann O
said O
on O
Wednesday O
consumers O
should O
buy O
sheepmeat O
from O
countries O
other O
than O
Britain B-LOC
until O
the O
scientific O
advice O
was O
clearer O
. O



#### Saving the modified train set + location

In [39]:
#Add extra location data to the modified (gold-dz-ethn) data
train_texts = mod_train_tokens + train_texts_loc + val_texts_loc + test_texts_loc
train_tags = mod_train_labels + train_tags_loc + val_tags_loc + test_tags_loc
print(len(train_texts),len(train_tags))

12135 12135


In [40]:
# Write "token<TAB>label" per line with a blank line after each sentence.
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_train++loc-dz-ethn.tsv', "w", encoding="utf-8") as f:
    for i in tqdm(range(len(train_texts))):
        for token, tag in zip(train_texts[i], train_tags[i]):
            f.write(str(token) + '\t' + str(tag) + '\n')
        f.write('\n')

100%|██████████| 12135/12135 [00:00<00:00, 41823.51it/s]


#### Saving the modified train+val set + location

In [41]:
# Build train+location+val as NEW lists. Extending train_texts in place with `+=` would
# append the validation sentences again every time this cell is re-run.
train_val_texts = train_texts + mod_val_tokens
train_val_tags = train_tags + mod_val_labels
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_train_val++loc-dz-ethn.tsv', "w", encoding="utf-8") as f:
    for i in tqdm(range(len(train_val_texts))):
        for token, tag in zip(train_val_texts[i], train_val_tags[i]):
            f.write(str(token) + '\t' + str(tag) + '\n')
        f.write('\n')

100%|██████████| 13341/13341 [00:00<00:00, 40978.85it/s]


### Converting the inputs to identify sentences (multilabel sentence classification)
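Sentence-level multilabel classification: each sentence gets one 0/1 flag per entity type (STAT, EPI, DIS, ABRV, LOC, ETHN, DATE, SEX), set to 1 when any token in the sentence carries the corresponding B- or I- tag.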

In [42]:
#Input will be 2 lists of lists, Output will be list of tuple size 9.
def tokens2sents(tokens,sentence_labels):
    """Turn token-level tags into one row of sentence-level 0/1 flags per sentence.

    Args:
        tokens: list of sentences, each a list of token strings.
        sentence_labels: list of per-sentence tag lists, parallel to ``tokens``.

    Returns:
        A list with one 9-tuple of strings per sentence: the tokens joined by
        single spaces, followed by '1'/'0' flags for STAT, EPI, DIS, ABRV, LOC,
        ETHN, DATE and SEX (in that order). A flag is '1' when the sentence's
        tags contain the B- or I- form of that entity.

    Raises:
        IndexError: if the number of sentences differs between the two inputs,
            or if any sentence and its tag list differ in length.
    """
    n_sentences = len(tokens)
    if n_sentences != len(sentence_labels):
        raise IndexError("Num Sentences {} and Num Sentence sentence_labels {} Mismatch".format(n_sentences,len(sentence_labels)))

    # Column order of the flags that follow the sentence text.
    entity_order = ('STAT', 'EPI', 'DIS', 'ABRV', 'LOC', 'ETHN', 'DATE', 'SEX')
    rows = []

    for idx in tqdm(range(n_sentences)):
        words, tags = tokens[idx], sentence_labels[idx]
        if len(words) != len(tags):
            raise IndexError("Sentence Length {} and Label Length {} Mismatch".format(len(words),len(tags)))
        # int(bool) gives 1/0; everything is stored as str so rows can be tab-joined directly.
        flags = [str(int(('B-' + ent) in tags or ('I-' + ent) in tags)) for ent in entity_order]
        rows.append((str(' '.join(words)), *flags))

    return rows

In [43]:
# Sentence-level rows for the training split.
# NOTE(review): if epi_train_labels was modified in place by an earlier cell, the flags
# reflect those modified labels -- confirm these are still the original gold labels.
train_output = tokens2sents(epi_train_tokens,epi_train_labels)

100%|██████████| 4426/4426 [00:00<00:00, 75913.30it/s]


In [44]:
# Sentence-level rows for the validation split.
# NOTE(review): if epi_val_labels was modified in place by an earlier cell, the flags
# reflect those modified labels -- confirm these are still the original gold labels.
val_output = tokens2sents(epi_val_tokens,epi_val_labels)

100%|██████████| 1206/1206 [00:00<00:00, 77887.57it/s]


In [45]:
# Sentence-level rows for the test split.
# NOTE(review): if epi_test_labels was modified in place by an earlier cell, the flags
# reflect those modified labels -- confirm these are still the original gold labels.
test_output = tokens2sents(epi_test_tokens,epi_test_labels)

100%|██████████| 537/537 [00:00<00:00, 74538.88it/s]


#### Saving the labels

In [46]:
# One line per sentence: the tuple's fields joined by tabs (sentence text, then the flags).
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_train_sents.tsv', "w", encoding="utf-8") as f:
    for row in train_output:
        f.write('\t'.join(row) + '\n')

In [47]:
# One line per sentence: the tuple's fields joined by tabs (sentence text, then the flags).
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_val_sents.tsv', "w", encoding="utf-8") as f:
    for row in val_output:
        f.write('\t'.join(row) + '\n')

In [48]:
# One line per sentence: the tuple's fields joined by tabs (sentence text, then the flags).
# The encoding is explicit so the output matches the UTF-8 input on every platform.
with open('epi_gold_test_sents.tsv', "w", encoding="utf-8") as f:
    for row in test_output:
        f.write('\t'.join(row) + '\n')