In [28]:
from collections import Counter
import glob
import html
import os
import re

import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

In [122]:
pd.set_option('max_colwidth', None)
pd.set_option('display.max_rows', 300)

The i2b2 challenges do not provide data in the IOB format.  In this notebook we convert them to that format so they can be fed into our models and do some cleanup of the data, for example only including docs with annotations and filling in any empty labels with 'O' (for outside.)

# Convert i2b2 2010 Data to IOB Format

The code in this section is modified from the Medium article [Named Entity Recognition for Clinical Text](https://medium.com/atlas-research/ner-for-clinical-text-7c73caddd180).  It has been modified to work with the LLMs in the primary notebook. Also, it was constructed to handle the i2b2 2011 task (co-reference resolution).  Unfortunately, the formats are not the same but close.

Interestingly, that article is not in fact about NER, it's just about getting the i2b2 data into IOB format!

In [94]:
def find_files(rootdir, suffix):
    files = []
    for dirname, _, filenames in os.walk(rootdir):
        for filename in filenames:
            if filename.endswith(suffix):
                files.append(os.path.join(dirname, filename))
    return files

In [95]:
annotation_files = find_files('data/i2b2/2010/', '.con')
txt_files = find_files('data/i2b2/2010/', '.txt')

It looks like we have some files that have not been annotated.

In [96]:
len(annotation_files), len(txt_files)

(426, 693)

Only include those text files that have been annotated.

In [97]:
filename = lambda fn,suf: fn.split('/')[-1].replace(suf, '')

In [98]:
annotated_file_set = set([filename(fn, '.con') for fn in annotation_files])

In [99]:
filterd_txt_files = [fn for fn in txt_files if filename(fn, '.txt') in annotated_file_set]
txt_files = filterd_txt_files

Good - it looks like we have 426 annotated files.

In [100]:
len(annotation_files), len(txt_files)

(426, 426)

Reorder these so the ids match

In [101]:
afiles_by_id = {filename(fn, '.con'): fn for fn in annotation_files}
tfiles_by_id = {filename(fn, '.txt'): fn for fn in txt_files}

In [102]:
a_ids, annotation_files, txt_files = zip(*[(id_, afn, tfiles_by_id[id_]) for id_, afn in afiles_by_id.items()])

What are the entity types and their frequency?

In [103]:
entity_type_ctr = Counter()
for fn in annotation_files:
    with open(fn) as f:
        for l in f:
            entity_type_ctr.update([l.rstrip().split('||')[-1].split('=')[-1].replace('"', '')])

In [104]:
entity_type_ctr.most_common()

[('problem', 19665), ('treatment', 14188), ('test', 13833)]

In [105]:
def read_file(path):
    with open(path) as f:
        content = f.read().splitlines()
    return content

In [106]:
# Build annotation and entry corpora
annotation_corpus = [read_file(fn) for fn in annotation_files]
txt_corpus = [read_file(fn) for fn in txt_files]

In [107]:
entries_cols = ["docid", "row", "offset", "word"]
entries_df = pd.DataFrame(columns=entries_cols)

annotations_cols = ["docid", "NER_tag", "row", "offset", "length"]
annotations_df = pd.DataFrame(columns=annotations_cols)

In [108]:
annotations_df = pd.DataFrame(columns=annotations_cols)  # Reset df
tmp_list = []  # Set up variable to hold row info

for i, document in enumerate(annotation_corpus):
    
    for row in document:
        text_info, type_info = row.split("||")
        
        text = text_info.split('"')[1]
        
        offset_start = text_info.split(' ')[-2]
        offset_end = text_info.split(' ')[-1]
        
        line = offset_start.split(':')[0] # Given one sentence to line, 
                                          # line number will be the same for offset_start and offset_end
        
        word_offset_start = int(offset_start.split(':')[1])
        word_offset_end = int(offset_end.split(':')[1])
        length = word_offset_end-word_offset_start +1
        
        a_type = type_info.split('"')[-2]
        
        # Split text into tokens with IOB tags
        first = True  # Set up flag to id start of text
        BIO_tag = "B-"
        if length > 1:  # Isolate text with multiple tokens 
            for offset in range(word_offset_start, word_offset_end+1):
                if first:
                    tag_label = BIO_tag + a_type # Set tag for first word to start with B-
                    first = False  # Change flag
                else:
                    tag_label = tag_label.replace("B-", "I-")
                tmp_list.append([a_ids[i], tag_label, line, offset, 1])                
        else:
            tmp_list.append([a_ids[i], BIO_tag + a_type, line, word_offset_start, length])
        
annotations_df = pd.DataFrame(tmp_list, columns=annotations_cols)
annotations_df = annotations_df.drop(columns=["length"])

In [123]:
annotations_df.head()

Unnamed: 0,docid,NER_tag,row,offset
0,405507617,B-problem,61,18
1,405507617,I-problem,61,19
2,405507617,I-problem,61,20
3,405507617,I-problem,61,21
4,405507617,I-problem,61,22


In [110]:
entries_df = pd.DataFrame(columns=entries_cols)  # Reset df
tmp_list = []

for doc_i, document in enumerate(txt_corpus):
    for row_i, row in enumerate(document):
        row_split = row.split(" ")
        for word_i, word in enumerate(row_split):
            word = word.replace("\t", "")
            word_id = a_ids[doc_i]
            word_row = row_i+1  # 1-based indexing 
            word_offset = word_i # 0-based indexing
            if len(word) > 0 and "|" not in word:
                tmp_list.append([word_id, word_row, word_offset, word])

entries_df = pd.DataFrame(tmp_list, columns=entries_cols)

ner_counter = [1 for i in annotations_df["NER_tag"] if "B-" in i]
print(len(ner_counter), "named entities")

47686 named entities


In [111]:
entries_df

Unnamed: 0,docid,row,offset,word
0,405507617,1,0,405507617
1,405507617,2,0,FIH
2,405507617,3,0,2887168
3,405507617,4,0,132052
4,405507617,5,0,543394
...,...,...,...,...
416776,0390,118,0,Dr.
416777,0390,118,1,Thorebreutz
416778,0390,118,2,","
416779,0390,118,3,E


In [112]:
# Ensure correct dtypes

annotations_df[['row', 'offset']] = annotations_df[['row', 'offset']].apply(pd.to_numeric)
annotations_df['NER_tag'] = annotations_df["NER_tag"].astype(str)
entries_df[['row', 'offset']] = entries_df[['row', 'offset']].apply(pd.to_numeric)
entries_df["word"] = entries_df["word"].astype(str)

result_df = pd.merge(entries_df, annotations_df, how="left", on=['docid', 'row', 'offset'])

# Check for NaNs (should be only in NER_tag, where NaNs will be replaced with "O" (outside))
print("Columns with missing data:\n", result_df.isna().any())

Columns with missing data:
 docid      False
row        False
offset     False
word       False
NER_tag     True
dtype: bool


In [113]:
result_df = result_df.fillna("O")
result_df = result_df.drop(columns=["row", "offset"])

ner_counter = [1 for i in result_df["NER_tag"] if "B-" in i]
print(len(ner_counter), "named entities and", result_df.shape[0], "tokens")

47678 named entities and 416789 tokens


In [114]:
result_df.head(10)

Unnamed: 0,docid,word,NER_tag
0,405507617,405507617,O
1,405507617,FIH,O
2,405507617,2887168,O
3,405507617,132052,O
4,405507617,543394,O
5,405507617,11/12/2002,O
6,405507617,12:00:00,O
7,405507617,AM,O
8,405507617,Discharge,O
9,405507617,Summary,O


In [115]:
n_docs = result_df.docid.nunique()
n_words = result_df.word.count()

In [116]:
n_docs, n_words, int(np.round(n_words/n_docs))

(426, 416789, 978)

In [117]:
frequencies = result_df.NER_tag.value_counts()
frequencies

NER_tag
O              317223
I-problem       27936
B-problem       19663
B-treatment     14187
B-test          13828
I-treatment     12054
I-test          11898
Name: count, dtype: int64

In [118]:
tags = {}
for tag, count in zip(frequencies.index, frequencies):
    if tag != "O":
        if tag[2:] not in tags.keys():
            tags[tag[2:]] = count
        else:
            tags[tag[2:]] += count
    continue

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('problem', 47599), ('treatment', 26241), ('test', 25726)]


In [119]:
print("Columns with missing data:\n", result_df.isna().any())

Columns with missing data:
 docid      False
word       False
NER_tag    False
dtype: bool


In [120]:
result_df.to_pickle('data/i2b2/2010/i2b2_dataset_df.pkl')