## Ingestion, Summary Exploration, and Processing of Notes Events records from MIMIC-III

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 3.3 MB/s eta 0:00:01
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.5 MB/s eta 0:00:01
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp39-cp39-macosx_10_11_x86_64.whl (3.6 MB)
[K     |████████████████████████████████| 3.6 MB 11.5 MB/s eta 0:00:01
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 8.6 MB/s eta 0:00:01
Installing collected packages: tokenizers, sacremoses, huggingface-hub, transformers
Successfully installed huggingface-hub-0.4.0 sacremoses-0.0.47 tokenizers-0.11.6 transformers-4.17.0


In [3]:
import pandas as pd
import re
import transformers as ppb
import numpy as np
from scipy import sparse

In [None]:
DATA_DIR = "./data"
notes_file = DATA_DIR + '/NOTEEVENTS.csv'

In [None]:
full_notes = pd.read_csv(notes_file)

In [None]:
print(full_notes.columns)
full_notes.head()

Examine single note:

In [9]:
print('Characters in first note:',len(full_notes['TEXT'][0]))

# partially truncating to avoid sharing full note due to confidentiality concerns
full_notes['TEXT'][0][50:600]

Characters in first note: 889


'ate:  [**2151-8-4**]\n\n\nService:\nADDENDUM:\n\nRADIOLOGIC STUDIES:  Radiologic studies also included a chest\nCT, which confirmed cavitary lesions in the left lung apex\nconsistent with infectious process/tuberculosis.  This also\nmoderate-sized left pleural effusion.\n\nHEAD CT:  Head CT showed no intracranial hemorrhage or mass\neffect, but old infarction consistent with past medical\nhistory.\n\nABDOMINAL CT:  Abdominal CT showed lesions of\nT10 and sacrum most likely secondary to osteoporosis. These can\nbe followed by repeat imaging as an outpatient.\n\n\n\n'

### Using RoBERTa 

In [4]:
#For RoBERTA:
from transformers import RobertaConfig, RobertaModel
from transformers import RobertaTokenizer

#Tokenizer:
tokenizer_r = RobertaTokenizer.from_pretrained("roberta-base")


# Initializing a RoBERTa configuration
configuration = RobertaConfig()

# Initializing a model from the configuration
model_r = RobertaModel(configuration)

In [5]:
tokenized_r = full_notes['TEXT'].apply((lambda x: tokenizer_r.encode(x, truncation=True, add_special_tokens=True)))
max_len = 0
for i in tokenized_r.values:
    if len(i) > max_len:
        max_len = len(i)

padded_r = np.array([i + [0]*(max_len-len(i)) for i in tokenized_r.values])


In [17]:
print(padded_r.shape)
padded_r.head()

(2083180, 512)

In [None]:
# crashed kernel
sparse_padded_r = sparse.csr_matrix(padded_r)

In [None]:
padded_r_file = DATA_DIR + '/sparse_padded_r'
sparse.save_npz(padded_r_file, sparse_padded_r)

### Using BioClinicalBERT

In [4]:
#from transformers import AutoTokenizer, AutoModel
tokenizer = ppb.AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = ppb.AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
#Tokenize: 
tokenized_bc = full_notes['TEXT'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
print('BioClinicalBERT tokenized shape:',tokenized_bc.shape)
tokenized_bc.head()

BioClinicalBERT tokenized shape: (2083180,)


0    [101, 10296, 2236, 131, 164, 115, 115, 18615, ...
1    [101, 10296, 2236, 131, 164, 115, 115, 20915, ...
2    [101, 10296, 2236, 131, 164, 115, 115, 20915, ...
3    [101, 10296, 2236, 131, 164, 115, 115, 19538, ...
4    [101, 10296, 2236, 131, 164, 115, 115, 22148, ...
Name: TEXT, dtype: object

In [5]:
#Tokenize and pad: 
tokenized_bc = full_notes['TEXT'].apply((lambda x: tokenizer.encode(x, truncation=True, max_length=512, \
    padding="max_length", add_special_tokens=True)))

print('BioClinicalBERT tokenized shape:',tokenized_bc.shape)
tokenized_bc.head()

BioClinicalBERT tokenized shape: (2083180,)


0    [101, 10296, 2236, 131, 164, 115, 115, 18615, ...
1    [101, 10296, 2236, 131, 164, 115, 115, 20915, ...
2    [101, 10296, 2236, 131, 164, 115, 115, 20915, ...
3    [101, 10296, 2236, 131, 164, 115, 115, 19538, ...
4    [101, 10296, 2236, 131, 164, 115, 115, 22148, ...
Name: TEXT, dtype: object

In [6]:
len(tokenized_bc[0])

512

In [7]:
# didn't work
tokenized_bc_np = np.array(tokenized_bc, dtype=int)

In [14]:
# didn't work
sparse_bc = sparse.csr_matrix(np.array(tokenized_bc, dtype=int))

In [None]:
from numpy import asarray
from numpy import savetxt

padded_bc_ar = asarray(padded_bc)
padded_r_file = DATA_DIR + '/padded_bc'
savetxt(padded_r_file, padded_bc_ar, delimiter=",")

### Using NLTK to tokenize and manually produce vocab

In [4]:
from collections import Counter
import json
from pathlib import Path

from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
counter = Counter()
counter.update(['<pad>', '<unk>'])
notes_text_list = list(full_notes['TEXT'])
print(f"Number of documents: {len(notes_text_list)}")

Number of documents: 2083180


In [7]:
for i, line in enumerate(notes_text_list):
    tokens = tokenizer.tokenize(line)
    tokens = [t.lower() for t in tokens]
    counter.update(tokens)
    #if i < 3:
    #    print(f"String of line {i}: {line.strip()}")
    #    print(f"Tokens of line {i}: {tokens}")
counter = dict(counter)

#print(f"Vocab size before frequency filtering: {len(counter)}")

vocab = {}
for word, freq in counter.items():
    #if freq < 3 and word not in ['<pad>', '<unk>']:
    #    continue
    vocab[word] = len(vocab)

print(f"Vocab size after frequency filtering: {len(vocab)}")
output_filepath = DATA_DIR + '/unigram_vocab.json'
json.dump(vocab, open(output_filepath, mode='w'))

Vocab size after frequency filtering: 552755


In [15]:
print('Number of documents:',full_notes.shape[0])

Number of documents: 2083180


In [9]:
print('Number of tokens:',sum(counter.values()))

Number of tokens: 753498364


Below is code for manually tokenizing; so far has been killing kernel so moved on to nltk above

In [14]:
# split on both spaces and the "\n" character for more accurate word count
re.split("\s+|\n",full_notes['TEXT'][0])

['Admission',
 'Date:',
 '[**2151-7-16**]',
 'Discharge',
 'Date:',
 '[**2151-8-4**]',
 'Service:',
 'ADDENDUM:',
 'RADIOLOGIC',
 'STUDIES:',
 'Radiologic',
 'studies',
 'also',
 'included',
 'a',
 'chest',
 'CT,',
 'which',
 'confirmed',
 'cavitary',
 'lesions',
 'in',
 'the',
 'left',
 'lung',
 'apex',
 'consistent',
 'with',
 'infectious',
 'process/tuberculosis.',
 'This',
 'also',
 'moderate-sized',
 'left',
 'pleural',
 'effusion.',
 'HEAD',
 'CT:',
 'Head',
 'CT',
 'showed',
 'no',
 'intracranial',
 'hemorrhage',
 'or',
 'mass',
 'effect,',
 'but',
 'old',
 'infarction',
 'consistent',
 'with',
 'past',
 'medical',
 'history.',
 'ABDOMINAL',
 'CT:',
 'Abdominal',
 'CT',
 'showed',
 'lesions',
 'of',
 'T10',
 'and',
 'sacrum',
 'most',
 'likely',
 'secondary',
 'to',
 'osteoporosis.',
 'These',
 'can',
 'be',
 'followed',
 'by',
 'repeat',
 'imaging',
 'as',
 'an',
 'outpatient.',
 '[**First',
 'Name8',
 '(NamePattern2)',
 '**]',
 '[**First',
 'Name4',
 '(NamePattern1)',
 '1775**

In [15]:
len(re.split("\s+|\n",full_notes['TEXT'][0]))

111

Test splitting into words on small sample (killed kernel after 12 min on first attempt to run the full dataset)

In [5]:
sample_notes = full_notes.sample(n = 200, random_state=53)

In [6]:
sample_notes['WORDS'] = sample_notes['TEXT'].str.split("\s+|\n")
sample_notes.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,WORDS
956690,961313,14734,,2127-05-21,2127-05-21 09:01:00,,Radiology,ABDOMEN (SUPINE ONLY),,,[**2127-5-21**] 9:01 AM\n ABDOMEN (SUPINE ONLY...,"[[**2127-5-21**], 9:01, AM, ABDOMEN, (SUPINE, ..."
1712826,1713696,2576,110661.0,2169-11-15,2169-11-15 19:49:00,2169-11-15 19:50:00,Nursing/other,Report,21232.0,,Agree with above note by [**Initials (NamePatt...,"[Agree, with, above, note, by, [**Initials, (N..."
1033727,1043826,63616,175657.0,2113-11-21,2113-11-21 08:28:00,,Radiology,CHEST (PA & LAT),,,"[**Last Name (LF) **],[**First Name3 (LF) **] ...","[[**Last, Name, (LF), **],[**First, Name3, (LF..."
1004537,1007150,28502,183617.0,2199-02-26,2199-02-26 03:15:00,,Radiology,CHEST (PORTABLE AP),,,[**2199-2-26**] 3:15 AM\n CHEST (PORTABLE AP) ...,"[[**2199-2-26**], 3:15, AM, CHEST, (PORTABLE, ..."
1044022,1055606,96629,,2118-02-17,2118-02-17 11:00:00,,Radiology,CT C-SPINE W/O CONTRAST,,,[**2118-2-17**] 11:00 AM\n CT C-SPINE W/O CONT...,"[[**2118-2-17**], 11:00, AM, CT, C-SPINE, W/O,..."


In [4]:
n_rows = full_notes.shape[0]
split_size = round(n_rows/3)
print('split size for easier processing:', split_size)

notes_1 = full_notes.iloc[:split_size]
notes_2 = full_notes.iloc[split_size:split_size*2]
# splitting the last third into 2 due to some extremely large notes in final third (killed kernel mutliple times)
notes_3 = full_notes.iloc[split_size*2:round(split_size*2.5)]
notes_4 = full_notes.iloc[round(split_size*2.5):]
print('notes shapes:',notes_1.shape, notes_2.shape, notes_3.shape, notes_4.shape)

split size for easier processing: 694393
notes shapes: (694393, 11) (694393, 11) (347196, 11) (347198, 11)


In [5]:
# takes approx 1.5 min to run
notes_1['WORDS'] = notes_1['TEXT'].str.split("\s+|\n")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes_1['WORDS'] = notes_1['TEXT'].str.split("\s+|\n")


In [None]:
full_notes_toks = pd.concat([notes_1, notes_2, notes_3, notes_4])
full_notes_toks.shape

### References

[Analytics Vidhya: Fine-tune a RoBERTa Encoder-Decoder model trained on MLM for Text Generation](https://medium.com/analytics-vidhya/fine-tune-a-roberta-encoder-decoder-model-trained-on-mlm-for-text-generation-23da5f3c1858)  
[StackOverflow: nlp - How to use Bert for long text classification?](https://stackoverflow.com/questions/58636587/how-to-use-bert-for-long-text-classification)  
