This notebook parses the i2b2 training data and annotates it with the CoNLL BIO scheme. The output has one token per line with four space-separated columns: `word POS_tag chunk_tag NER_tag`.

In [1]:
__author__ = "Maximilian Hofer"
__copyright__ = "Copyright 2018"
__license__ = "MIT"

In [1]:
import os
from nltk import pos_tag, RegexpParser
import pandas as pd
import numpy as np

In [2]:
# Collect the numeric document ids found in both data folders.
# File names are expected to be plain integers; hidden files (leading ".")
# are ignored.
a_ids = tuple(sorted(
    int(name) for name in os.listdir("./data/annotations") if name[0] != "."
))
e_ids = tuple(sorted(
    int(name) for name in os.listdir("./data/entries") if name[0] != "."
))

# Every annotation file needs an entry file with the same id, because the
# corpora are loaded by iterating over a_ids.
intersection = list(set(a_ids) & set(e_ids))
if len(intersection) == len(a_ids):
    print("Success: all anotations have a corresponding entry.", len(intersection))
else:
    # Report the mismatch instead of staying silent, so that a later
    # file-not-found error can be traced back to the missing ids.
    missing_entries = sorted(set(a_ids) - set(e_ids))
    print("Warning:", len(missing_entries),
          "annotation file(s) have no corresponding entry:", missing_entries)

Success: all anotations have a corresponding entry. 261


## Build corpora

In [3]:
# Build the annotation and entry corpora: one list of lines per document.

a_corpus = []
e_corpus = []

# Only ids that have an annotation file are loaded; the entry file with the
# same id is read right after it, so both corpora share the order of a_ids.
for doc_id in a_ids:
    for folder, corpus in (("./data/annotations/", a_corpus),
                           ("./data/entries/", e_corpus)):
        with open(folder + str(doc_id)) as handle:
            # splitlines() drops the trailing newline of every line
            corpus.append(handle.read().splitlines())
    

## Set up dataframe

In [5]:
# Candidate full column layout, kept for reference:
#  ["id", "row", "offset", "word", "POS", "chunk", "NER"]
# Columns actually used for the token table: document id, line number,
# token offset within the line, and the token text.
entries_cols = ["id", "row", "offset", "word"]
# Empty placeholder frame; it is rebuilt from the parsed rows in the
# "Build entries data frame" section.
entries_df = pd.DataFrame(columns=entries_cols)

In [6]:
entries_df.head()

Unnamed: 0,id,row,offset,word


In [7]:
# Columns of the annotation table: document id, BIO tag, line number,
# token offset within the line, and the span length in tokens.
annotations_cols = ["id", "NER_tag", "row", "offset", "length"]
# Empty placeholder frame; it is rebuilt from the parsed rows in the
# "Build annotations data frame" section.
annotations_df = pd.DataFrame(columns=annotations_cols)

In [8]:
annotations_df.head()

Unnamed: 0,id,NER_tag,row,offset,length


## Number of annotations

In [36]:
# Flatten the corpus once: every element is one annotation line.
all_annotation_lines = [line for document in a_corpus for line in document]


def _count_lines_without(marker):
    """Return how many annotation lines do not contain `marker`."""
    return sum(1 for line in all_annotation_lines if marker not in line)


# A field is counted as annotated when its `label="nm"` marker is absent
# from the line (presumably "nm" means "not mentioned" - TODO confirm).
med_count = _count_lines_without("m=\"nm\"")
dosage_count = _count_lines_without("do=\"nm\"")
mode_count = _count_lines_without("mo=\"nm\"")
freq_count = _count_lines_without("f=\"nm\"")
dur_count = _count_lines_without("du=\"nm\"")
reason_count = _count_lines_without("r=\"nm\"")

for caption, total in (
    ("Medication annotations: ", med_count),
    ("Dosage annotations: ", dosage_count),
    ("Mode annotations: ", mode_count),
    ("Frequency annotations: ", freq_count),
    ("Duration annotations: ", dur_count),
    ("Reason annotations: ", reason_count),
):
    print(caption, total)


Medication annotations:  9318
Dosage annotations:  4666
Mode annotations:  3513
Frequency annotations:  4229
Duration annotations:  571
Reason annotations:  1694


## Build annotations data frame

In [37]:
annotations_df = pd.DataFrame(columns=annotations_cols)  # reset df
tmp_list = []
skipped_spans = 0  # fields that carry a span which could not be parsed

# Turn every annotated field into one row per tagged token.
# As parsed below, a field has the shape `label=... ROW:OFFSET ROW:OFFSET`,
# where the last two space-separated items are the first and last token.
for i, document in enumerate(a_corpus):
    for annotation_line in document:
        for field in annotation_line.split("||"):
            # Split on the first "=" only, so an "=" inside the annotated
            # text cannot truncate the value. Fields without "=" are ignored
            # instead of raising an IndexError.
            tag_label, separator, value = field.partition("=")
            # No ":" means no ROW:OFFSET span, i.e. nothing to tag.
            if not separator or ":" not in value:
                continue
            tag_label = tag_label.lstrip(" ")

            # Some annotations have non-standard formatting. They are counted
            # and reported below rather than swallowed by a bare `except`.
            try:
                start, end = value.split(" ")[-2:]
                tag_row_a = start.split(":")[0]
                tag_row_b = end.split(":")[0]
                tag_offset_a = int(start.split(":")[1])
                tag_offset_b = int(end.split(":")[1])
            except (ValueError, IndexError):
                skipped_spans += 1
                continue

            length = tag_offset_b - tag_offset_a + 1

            # 1 row = 1 token with a tag
            if length > 1 and tag_row_a == tag_row_b:
                # First token of the span gets "B-", all following get "I-".
                for position, offset in enumerate(range(tag_offset_a, tag_offset_b + 1)):
                    prefix = "B-" if position == 0 else "I-"
                    tmp_list.append([a_ids[i], prefix + tag_label, tag_row_a, offset, 1])
            else:
                # Single-token span, or a span that crosses a line break.
                # TODO: tags over line breaks (only the first token is tagged)
                tmp_list.append([a_ids[i], "B-" + tag_label, tag_row_a, tag_offset_a, length])

print(skipped_spans, "annotation fields skipped (span could not be parsed)")

annotations_df = pd.DataFrame(tmp_list, columns=annotations_cols)
# Keeps the old index as an "index" column; the next cell drops it again.
annotations_df = annotations_df.reset_index()
                        

In [38]:
# Drop the helper columns. errors="ignore" keeps this cell re-runnable:
# on a second execution the columns are already gone and a plain drop
# would raise a KeyError.
annotations_df = annotations_df.drop(columns=["index", "length"], errors="ignore")
annotations_df.shape

(37071, 4)

In [39]:
annotations_df.head()

Unnamed: 0,id,NER_tag,row,offset
0,661,B-m,16,0
1,661,I-m,16,1
2,661,B-do,16,2
3,661,I-do,16,3
4,661,B-mo,16,4


In [40]:
entries_df.head()

Unnamed: 0,id,row,offset,word
0,0,0,0,-DOCSTART-
1,0,0,0,-EMPTYLINE-
2,661,1,0,RECORD
3,661,1,1,#661
4,661,2,0,753455514


In [41]:
e_corpus[0][0].split(" ")

['RECORD', '#661']

## Build entries data frame

In [42]:
entries_df = pd.DataFrame(columns=entries_cols)  # reset df
tmp_list = []

# Sentinel rows (id, row and offset all 0) frame every document:
# -DOCSTART-, -EMPTYLINE-, <tokens...>, -EMPTYLINE-
for doc_i, document in enumerate(e_corpus):
    doc_id = a_ids[doc_i]  # e_corpus was built in the order of a_ids

    tmp_list.extend([[0, 0, 0, "-DOCSTART-"],
                     [0, 0, 0, "-EMPTYLINE-"]])

    # rows are numbered from 1, token offsets from 0
    for row_number, line in enumerate(document, start=1):
        for word_offset, raw_token in enumerate(line.split(" ")):
            # strip trailing "." characters and remove tabs
            word = raw_token.rstrip(".").replace("\t", "")

            # Empty tokens and tokens containing "|" are left out; the
            # offset counter still advances past them.
            if not word or "|" in word:
                continue
            tmp_list.append([doc_id, row_number, word_offset, word])

    tmp_list.append([0, 0, 0, "-EMPTYLINE-"])

entries_df = pd.DataFrame(tmp_list, columns=entries_cols)


In [43]:
entries_df.head()

Unnamed: 0,id,row,offset,word
0,0,0,0,-DOCSTART-
1,0,0,0,-EMPTYLINE-
2,661,1,0,RECORD
3,661,1,1,#661
4,661,2,0,753455514


In [44]:
annotations_df.head()

Unnamed: 0,id,NER_tag,row,offset
0,661,B-m,16,0
1,661,I-m,16,1
2,661,B-do,16,2
3,661,I-do,16,3
4,661,B-mo,16,4


In [45]:
# Every entity starts with exactly one "B-" tag, so counting the tags that
# contain "B-" counts the entities.
is_entity_start = annotations_df["NER_tag"].str.contains("B-", regex=False)
ner_counter = [1] * int(is_entity_start.sum())
print(len(ner_counter), "named entities")

23926 named entities


## Joining entries and annotations

In [46]:
# Ensure matching dtypes: the join keys must be numeric in both frames,
# and the text columns plain strings.
for frame in (annotations_df, entries_df):
    for key_col in ('id', 'row', 'offset'):
        frame[key_col] = pd.to_numeric(frame[key_col])

annotations_df['NER_tag'] = annotations_df["NER_tag"].astype(str)
entries_df["word"] = entries_df["word"].astype(str)


In [47]:
# Left join on (id, row, offset): every entry token is kept, and tokens
# without a matching annotation get NaN in NER_tag.
# NOTE(review): if annotations_df holds several rows with the same
# (id, row, offset) key, the join emits one output row per match and so
# duplicates that token - TODO confirm whether such duplicate keys occur.
result_df = pd.merge(entries_df, annotations_df, how="left", on=['id', 'row', 'offset'])

In [48]:
# Replace NaNs with "O" (token outside any entity).
# Only NER_tag is filled: a blanket fillna("O") would also paper over
# unexpected gaps in the other columns.
print("columns with missing data:\n", result_df.isna().any())
result_df = result_df.fillna({"NER_tag": "O"})

columns with missing data:
 id         False
row        False
offset     False
word       False
NER_tag     True
dtype: bool


In [49]:
print("columns with missing data:\n", result_df.isna().any())

columns with missing data:
 id         False
row        False
offset     False
word       False
NER_tag    False
dtype: bool


In [50]:
# The join keys are no longer needed. errors="ignore" keeps this cell
# re-runnable: on a second execution the columns are already gone and a
# plain drop would raise a KeyError.
result_df = result_df.drop(columns=["id", "row", "offset"], errors="ignore")
result_df.head()

Unnamed: 0,word,NER_tag
0,-DOCSTART-,O
1,-EMPTYLINE-,O
2,RECORD,O
3,#661,O
4,753455514,O


In [51]:
result_df.shape

(295530, 2)

In [53]:
# In the recorded run this is 71 below the 23,991 annotated fields counted
# in "Number of annotations": 65 fields were dropped while parsing their
# spans (23,926 "B-" tags remained), and the left join leaves a net 6 fewer.
# Spans that cross a line break still contribute their first token as "B-".
is_entity_start = result_df["NER_tag"].str.contains("B-", regex=False)
ner_counter = [1] * int(is_entity_start.sum())
print(len(ner_counter), "named entities")

23920 named entities


# POS tagger

In [None]:
from nltk.chunk.regexp import RegexpChunkParser, ChunkRule, RegexpParser
from nltk.tree import Tree

In [54]:
# Tag the whole corpus in one call; the sentinel tokens
# (-DOCSTART-, -EMPTYLINE-) are tagged along with the real words.
text = result_df["word"].tolist()
text_pos = pos_tag(text)  # list of (token, POS tag) pairs
text_pos_list = [pos for _token, pos in text_pos]

In [55]:
len(text_pos_list)

295530

In [56]:
result_df.columns

Index(['word', 'NER_tag'], dtype='object')

In [57]:
result_df["POS_tag"] = text_pos_list

In [58]:
result_df.head()

Unnamed: 0,word,NER_tag,POS_tag
0,-DOCSTART-,O,JJ
1,-EMPTYLINE-,O,NN
2,RECORD,O,NNP
3,#661,O,VBZ
4,753455514,O,CD


# CoNLL chunk tagger

In [60]:
text_test = "EU rejects German call to boycott British lamb.".split(" ")
text_pos_test = pos_tag(text_test)

In [61]:
text_pos_test

[('EU', 'NNP'),
 ('rejects', 'VBZ'),
 ('German', 'JJ'),
 ('call', 'NN'),
 ('to', 'TO'),
 ('boycott', 'VB'),
 ('British', 'JJ'),
 ('lamb.', 'NN')]

### Noun phrases

In [62]:
# Noun-phrase rule: an optional determiner (<DT>?), any number of
# adjective tags (<JJ.*>*), then one or more noun tags (<NN.*>+).
rule_0 = ChunkRule("<DT>?<JJ.*>*<NN.*>+", "More complete chunk NP sequences")

# Parser that labels every match of the rule as an "NP" chunk.
chunk_parser_np = RegexpChunkParser([rule_0],chunk_label='NP')

# Parse the POS-tagged corpus (text_pos holds (token, tag) pairs); the
# result is iterated in the next cell, where chunks appear as Tree nodes.
chunk_result_tree_np = chunk_parser_np.parse(text_pos)


In [63]:
# Flatten the NP chunk tree into one BIO tag per token.
chunk_tag_np = []

for node in chunk_result_tree_np:
    if not isinstance(node, Tree):
        # a plain (token, tag) pair outside any chunk
        chunk_tag_np.append("O")
        continue
    # a chunk: its first token opens it ("B-"), the rest continue it ("I-")
    chunk_label = node.label()
    chunk_tag_np.extend(
        ("B-" if position == 0 else "I-") + chunk_label
        for position in range(len(node))
    )


In [64]:
len(chunk_tag_np) == result_df.shape[0]  # check that chunk col has same length

True

### Verb phrases

In [65]:
# Chunk rule matching a single token tagged VBD, IN or ".".
# The pattern is a raw string: "\." in a normal string literal is an
# invalid escape sequence, which newer Python versions warn about.
# The resulting value is unchanged.
# NOTE(review): the chunks are labelled "VP" although the rule also matches
# IN and "." - TODO confirm this is intended.
rule_1 = ChunkRule(r"<VBD|IN|\.>", "Verb phrases")

# Parser that labels every match of the rule as a "VP" chunk.
chunk_parser_vp = RegexpChunkParser([rule_1], chunk_label='VP')

chunk_result_tree_vp = chunk_parser_vp.parse(text_pos)

In [66]:
# Flatten the VP chunk tree into one BIO tag per token.
chunk_tag_vp = []

for node in chunk_result_tree_vp:
    if isinstance(node, Tree):
        # first token of a chunk is "B-<label>", every further one "I-<label>"
        prefixes = ["B-"] * min(1, len(node)) + ["I-"] * max(0, len(node) - 1)
        chunk_tag_vp += [prefix + node.label() for prefix in prefixes]
    else:
        # token outside any chunk
        chunk_tag_vp.append("O")


In [67]:
len(chunk_tag_np) == result_df.shape[0] == len(chunk_tag_vp)

True

In [68]:
# Augment the chunk tags with the verb-phrase tags: wherever the NP tagger
# left a token outside any chunk ("O"), take the VP tag for that token.
# NP tags always win. chunk_tag_np is updated in place.
for position in range(len(chunk_tag_np)):
    if chunk_tag_np[position] != "O":
        continue
    chunk_tag_np[position] = chunk_tag_vp[position]

There are no prepositional phrases.

In [69]:
result_df["chunk_tag"] = chunk_tag_np

In [70]:
result_df = result_df[['word', 'POS_tag', 'chunk_tag', 'NER_tag']]  # order columns

In [71]:
result_df.shape

(295530, 4)

In [72]:
result_df[['word', 'POS_tag', 'chunk_tag', 'NER_tag']] = result_df[['word', 'POS_tag', 'chunk_tag', 'NER_tag']].astype(str)
result_df.dtypes

word         object
POS_tag      object
chunk_tag    object
NER_tag      object
dtype: object

### Data split

In [73]:
result_df.shape

(295530, 4)

In [74]:
# reindex() without arguments leaves the frame unchanged. reset_index
# guarantees a clean 0..n-1 index, so the index values listed in the next
# cell are row positions that can be used with .iloc for the split.
result_df = result_df.reset_index(drop=True)

In [75]:
# find indices of new documents
result_df[result_df["word"] == "-DOCSTART-"].index.values.tolist()

[0,
 1276,
 2643,
 3501,
 4340,
 6074,
 7605,
 8338,
 9487,
 10492,
 11741,
 13190,
 13931,
 15348,
 17511,
 18295,
 19684,
 20642,
 21666,
 22511,
 23821,
 24967,
 25764,
 26824,
 27569,
 28615,
 29809,
 31305,
 32560,
 33630,
 34563,
 35506,
 36434,
 37709,
 38972,
 40162,
 41597,
 42686,
 43633,
 46140,
 46842,
 48292,
 48972,
 51051,
 51837,
 52981,
 54031,
 54675,
 55827,
 56924,
 57645,
 58901,
 61094,
 62032,
 63227,
 64828,
 65691,
 66136,
 66757,
 67576,
 68389,
 69526,
 70216,
 71449,
 72154,
 74099,
 74623,
 75956,
 76659,
 77380,
 79531,
 80166,
 80972,
 82251,
 83738,
 84245,
 84563,
 85644,
 86514,
 87226,
 87789,
 91627,
 92746,
 93458,
 93789,
 94769,
 95618,
 96868,
 98917,
 99644,
 100381,
 101563,
 103006,
 103894,
 104587,
 105233,
 106952,
 107650,
 108486,
 109462,
 110366,
 110989,
 112770,
 114223,
 115632,
 116943,
 118646,
 120498,
 121307,
 123082,
 124532,
 125961,
 126684,
 127701,
 129822,
 130768,
 131468,
 132054,
 133185,
 134006,
 134851,
 137453,
 138

In [76]:
# Row positions at which the corpus is cut into train / dev / test.
# NOTE(review): these look like values picked from the "-DOCSTART-" index
# list printed above - TODO confirm.
train = 202062
dev = 247618

# Warn, without aborting, when a cut would not fall on a document start.
# This happens as soon as the upstream data or parsing changes the row count.
for split_name, boundary in (("train", train), ("dev", dev)):
    on_doc_start = (
        boundary < len(result_df)
        and result_df["word"].iloc[boundary] == "-DOCSTART-"
    )
    if not on_doc_start:
        print("Warning:", split_name, "boundary", boundary,
              "is not a -DOCSTART- row; the split cuts through a document.")

result_train_df = result_df.iloc[:train]
result_dev_df = result_df.iloc[train:dev]
result_test_df = result_df.iloc[dev:]

In [77]:
result_test_df.tail()

Unnamed: 0,word,POS_tag,chunk_tag,NER_tag
295525,6/10,CD,O,O
295526,T:,NNP,B-NP,O
295527,1/22,CD,O,O
295528,[report_end],NNP,B-NP,O
295529,-EMPTYLINE-,NN,I-NP,O


In [78]:
print("train shape ", result_train_df.shape)
print("dev shape ", result_dev_df.shape)
print("test shape ", result_test_df.shape)

train shape  (202062, 4)
dev shape  (45556, 4)
test shape  (47912, 4)


In [79]:
result_df.to_csv("result_df_NER_POS_chunk.csv")

# Write to txt

In [80]:
# Write each split as plain text: one row per token, columns formatted
# with "%s" and separated by np.savetxt's default delimiter.
for out_path, split_df in (
    ("train.txt", result_train_df),
    ("valid.txt", result_dev_df),
    ("test.txt", result_test_df),
):
    np.savetxt(out_path, split_df.values, fmt="%s")