## Ingestion, Summary Exploration, and Processing of Notes Events records from MIMIC-III

In [1]:
import pandas as pd
import re
import transformers as ppb
import numpy as np
from scipy import sparse

In [2]:
DATA_DIR = "./data"
notes_file = DATA_DIR + '/NOTEEVENTS.csv'

### Creating Dataframe of Preferred Note per Hospital Visit

In [32]:
full_notes = pd.read_csv(notes_file)

  exec(code_obj, self.user_global_ns, self.user_ns)


Confirm some notes don't have hospital stays associated (HADM_ID) and some don't have discharge or nursing notes:

In [33]:
print(full_notes.shape)
full_notes['CATEGORY'].unique()

(2083180, 11)


array(['Discharge summary', 'Echo', 'ECG', 'Nursing', 'Physician ',
       'Rehab Services', 'Case Management ', 'Respiratory ', 'Nutrition',
       'General', 'Social Work', 'Pharmacy', 'Consult', 'Radiology',
       'Nursing/other'], dtype=object)

In [34]:
# only consider hospital stays
notes_hospital = full_notes[full_notes['HADM_ID'].notna()]
notes_hospital.shape

(1851344, 11)

In [35]:
# compare all notes with nursing and discharge
num_records = notes_hospital.loc[:,('HADM_ID','SUBJECT_ID','TEXT')].groupby(['HADM_ID','SUBJECT_ID']).count()
num_discharge_summaries = notes_hospital[notes_hospital['CATEGORY']=='Discharge summary'].loc[:,('HADM_ID','SUBJECT_ID','TEXT')].groupby(['HADM_ID','SUBJECT_ID']).count()
num_nursing_summaries = notes_hospital[(notes_hospital['CATEGORY']=='Nursing/other') | \
    (notes_hospital['CATEGORY']=='Nursing') ].loc[:,('HADM_ID','SUBJECT_ID','TEXT')].\
        groupby(['HADM_ID','SUBJECT_ID']).count()

joined = num_records.join(num_discharge_summaries,lsuffix='_full',rsuffix='_discharge')
joined = joined.join(num_nursing_summaries)
print(joined.shape)
# records without discharge summaries
joined[joined['TEXT_discharge'].isna()]

(58361, 3)


Unnamed: 0_level_0,Unnamed: 1_level_0,TEXT_full,TEXT_discharge,TEXT
HADM_ID,SUBJECT_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100017.0,16229,7,,5.0
100023.0,1189,2,,2.0
100033.0,26773,8,,4.0
100052.0,17261,6,,6.0
100055.0,1549,11,,9.0
...,...,...,...,...
199810.0,2977,6,,1.0
199832.0,23992,2,,2.0
199924.0,31368,5,,3.0
199954.0,13446,2,,2.0


In [39]:
# records without discharge or nursing notes
joined[(joined['TEXT'].isna()) & joined['TEXT_discharge'].isna()].sort_values('TEXT_full',ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,TEXT_full,TEXT_discharge,TEXT
HADM_ID,SUBJECT_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
135336.0,23503,58,,
179559.0,98481,46,,
115146.0,60104,36,,
122807.0,68426,28,,
183620.0,71206,15,,
...,...,...,...,...
147875.0,30913,1,,
151247.0,21125,1,,
151545.0,40464,1,,
151720.0,87144,1,,


In [38]:
# example visit with no nursing or discharge notes
notes_hospital[notes_hospital['SUBJECT_ID']==63762]

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
298974,310484,63762,198006.0,2181-12-04,,,ECG,Report,,,The patient is now in atrial fibrillation at a...
299190,310485,63762,198006.0,2181-12-03,,,ECG,Report,,,"Sinus tachycardia, rate 135. Marked left axis..."
490158,496561,63762,198006.0,2181-12-04,2181-12-04 07:23:00,2181-12-04 09:24:13,General,Generic Note,20818.0,,TITLE: Nursing Admit-Event note/Expiration.\n...
490159,496562,63762,198006.0,2181-12-04,2181-12-04 07:23:00,2181-12-04 09:38:41,General,Generic Note,20818.0,,TITLE: Nursing Admit-Event note/Expiration.\n...
490306,496550,63762,198006.0,2181-12-04,2181-12-04 08:41:00,2181-12-04 08:41:29,Physician,Physician Resident Admission Note,17640.0,,"Chief Complaint: s/p PEA arrest, Beta Blocker..."
490307,496551,63762,198006.0,2181-12-04,2181-12-04 08:52:00,2181-12-04 08:52:26,General,ICU Event Note,17640.0,,Clinician: Resident\n With family at bedsid...
490447,496538,63762,198006.0,2181-12-04,2181-12-04 07:23:00,2181-12-04 07:23:38,General,Generic Note,15029.0,,TITLE: Nursing Admit-Event note/Expiration.\n...
490547,496523,63762,198006.0,2181-12-04,2181-12-04 06:29:00,2181-12-04 06:29:15,General,ICU Event Note,17640.0,,Clinician: Resident\n I spoke with patient'...
490987,496507,63762,198006.0,2181-12-04,2181-12-04 05:34:00,2181-12-04 05:34:37,Physician,Cardiology Fellow Admission Note,21130.0,,TITLE: CARDIOLOGY FELLOW CCU ADMISSION NOTE\nP...
498774,496554,63762,198006.0,2181-12-04,2181-12-04 07:23:00,2181-12-04 09:04:35,General,Generic Note,20818.0,,TITLE: Nursing Admit-Event note/Expiration.\n...


In [40]:
# confirm some individuals have more than one discharge note
joined[joined['TEXT_discharge'] > 1]

Unnamed: 0_level_0,Unnamed: 1_level_0,TEXT_full,TEXT_discharge,TEXT
HADM_ID,SUBJECT_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100006.0,9895,22,2.0,14.0
100031.0,6892,22,2.0,14.0
100036.0,30078,21,2.0,5.0
100041.0,9588,9,2.0,4.0
100112.0,25418,9,2.0,3.0
...,...,...,...,...
199952.0,22711,50,2.0,3.0
199961.0,20620,19,2.0,11.0
199962.0,53283,6,4.0,
199964.0,18356,26,2.0,20.0


In [42]:
# example person with more than one discharge
notes_hospital[notes_hospital['HADM_ID']==199962]

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT
29215,40795,53283,199962.0,2118-07-24,,,Discharge summary,Report,,,Admission Date: [**2118-7-16**] ...
29216,40796,53283,199962.0,2118-07-28,,,Discharge summary,Report,,,Admission Date: [**2118-7-26**] ...
58214,58400,53283,199962.0,2118-07-24,,,Discharge summary,Addendum,,,"Name: [**Known lastname 14110**],[**Known fir..."
58268,58391,53283,199962.0,2118-07-24,,,Discharge summary,Addendum,,,"Name: [**Known lastname 14110**],[**Known fir..."
246064,247577,53283,199962.0,2118-07-27,,,ECG,Report,,,Normal sinus rhythm. Tracing is within normal ...
1173180,1201157,53283,199962.0,2118-07-26,2118-07-26 16:21:00,,Radiology,R WRIST(3 + VIEWS) RIGHT,,,[**2118-7-26**] 4:21 PM\n FOREARM (AP & LAT) R...


Select a single note per HADM_ID and SUBJECT_ID to use in predictive model. Choose according to the logic: Choose reports over addendums; if multiple reports, choose report with later CHARTDATE (or CHARTTIME or ROW_ID).

In [46]:
notes_hospital['CHARTDATE'] = pd.to_datetime(notes_hospital['CHARTDATE'])
notes_hospital['CHARTTIME'] = pd.to_datetime(notes_hospital['CHARTTIME'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes_hospital['CHARTDATE'] = pd.to_datetime(notes_hospital['CHARTDATE'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes_hospital['CHARTTIME'] = pd.to_datetime(notes_hospital['CHARTTIME'])


In [87]:
# for each stay, choose a single note
hadms = notes_hospital['HADM_ID'].unique()
#best_note_df = pd.DataFrame(columns=['SUBJECT_ID','HADM_ID','TEXT','IS_DISCHARGE','IS_NURSING','IS_OTHER'])
best_notes = []

for hadm in hadms:
    hadm_df = notes_hospital[notes_hospital['HADM_ID']==hadm]
    
    # starting with discharges, the priorit note
    discharges = hadm_df[hadm_df['CATEGORY'] == 'Discharge summary']
    # if only a single discharge, use it
    if discharges.shape[0] == 1:
        best_notes.append([int(discharges['SUBJECT_ID'].values),int(discharges['HADM_ID'].values),\
            str(discharges['TEXT'].values),1,0,0])
    # if multiple discharges...
    elif discharges.shape[0] > 1:
        discharges = discharges[discharges['DESCRIPTION'] == 'Report']
        # if only a single Report discharge summary, use this
        if discharges.shape[0] == 1:
            best_notes.append([int(discharges['SUBJECT_ID'].values),int(discharges['HADM_ID'].values),\
            str(discharges['TEXT'].values),1,0,0])
        elif discharges.shape[0] == 0:
            print('No Report Discharge Summaries - problem?')
            print('HADM_ID missing report discharge:',hadm)
            print('Adding addendum.. ')
            nonreports = hadm_df[hadm_df['CATEGORY'] == 'Discharge summary']
            if nonreports.shape[0] == 1:
                best_notes.append([int(nonreports['SUBJECT_ID'].values),int(nonreports['HADM_ID'].values),\
                    str(nonreports['TEXT'].values),1,0,0])
            else:
                max_row_id = max(nonreports['ROW_ID'])
                max_nonreports = nonreports[nonreports['ROW_ID'] == max_row_id]
                #print(max_nonreports)
                #print('max_discharge',max_date)
                best_notes.append([int(max_nonreports['SUBJECT_ID'].values),int(max_nonreports['HADM_ID'].values),\
                    str(max_nonreports['TEXT'].values),1,0,0])                 
        else:
            # choose max ROW_ID discharge summary (note discharge summaries don't have
            # CHARTTIME entries, and CHARTDATEs can be the same)
            max_row_id = max(discharges['ROW_ID'])
            max_discharge = discharges[discharges['ROW_ID'] == max_row_id]
            #print('max_discharge',max_date)
            best_notes.append([int(max_discharge['SUBJECT_ID'].values),int(max_discharge['HADM_ID'].values),\
                str(max_discharge['TEXT'].values),1,0,0])   
    
    else:
        # moving on to nursing notes, our second-preferred kind
        nursing = hadm_df[(hadm_df['CATEGORY'] == 'Nursing') | \
            (hadm_df['CATEGORY'] == 'Nursing/other')]
        # if only a single nursing note, choose that
        if nursing.shape[0] == 1:
            best_notes.append([int(nursing['SUBJECT_ID'].values),int(nursing['HADM_ID'].values),\
                str(nursing['TEXT'].values),0,1,0])
        # if multiple nursing notes, choose the latest one according to STORETIME 
        # (CHARTTIME can have duplicates)
        elif nursing.shape[0] > 1:
            max_row_id = max(nursing['ROW_ID'])    
            max_nursing = nursing[nursing['ROW_ID']==max_row_id] 
            #print('max nursing',max_nursing)
            #print('max nursing',max_date)
            best_notes.append([int(max_nursing['SUBJECT_ID'].values),int(max_nursing['HADM_ID'].values),\
                str(max_nursing['TEXT'].values),0,1,0])
        else:
            # if no nursing or discharge, choose the latest STORETIME note
            max_row_id = max(hadm_df['ROW_ID'])    
            max_hadm = hadm_df[hadm_df['ROW_ID']==max_row_id] 
            #print('max other notes',max_hadm)
            best_notes.append([int(max_hadm['SUBJECT_ID'].values),int(max_hadm['HADM_ID'].values),\
                str(max_hadm['TEXT'].values),0,0,1])       

best_note_df = pd.DataFrame.from_records(best_notes,\
    columns=['SUBJECT_ID','HADM_ID','TEXT','IS_DISCHARGE','IS_NURSING','IS_OTHER'])

print(best_note_df.shape)
best_note_df.head()

No Report Discharge Summaries - problem?
HADM_ID missing report discharge: 180183.0
Adding addendum.. 
No Report Discharge Summaries - problem?
HADM_ID missing report discharge: 133672.0
Adding addendum.. 
No Report Discharge Summaries - problem?
HADM_ID missing report discharge: 174094.0
Adding addendum.. 
No Report Discharge Summaries - problem?
HADM_ID missing report discharge: 129192.0
Adding addendum.. 
(58361, 6)


Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,IS_DISCHARGE,IS_NURSING,IS_OTHER
0,22532,167853,['Admission Date: [**2151-7-16**] Disch...,1,0,0
1,13702,107527,['Admission Date: [**2118-6-2**] Discha...,1,0,0
2,13702,167118,['Admission Date: [**2119-5-4**] ...,1,0,0
3,13702,196489,"[""Admission Date: [**2124-7-21**] ...",1,0,0
4,26880,135453,['Admission Date: [**2162-3-3**] ...,1,0,0


In [91]:
# pickle df to save
best_note_df.to_pickle('./data/best_note_df.pkl')

In [77]:
# (potential reason for choosing earlier notes over later - NOT essential)

# this example and idx 55069 are both from same day, Discharge Summary Addendum texts, but in this case
# seems we would actually want the first one (as identified by ROW_ID), as the second says it's
# additional information to add to this first.. but may be someone who doesn't know how to file 
# records?
notes_hospital['TEXT'][55068]

"Name:  [**Known lastname 2797**], [**Known firstname 739**]                     Unit No:  [**Numeric Identifier 4340**]\n\nAdmission Date:  [**2101-6-28**]     Discharge Date:  [**2101-7-17**]\n\nDate of Birth:   [**2051-7-5**]     Sex:  F\n\nService:  [**Hospital1 248**] MED\n\nADDENDUM:\n\nHOSPITAL COURSE:   The [**Hospital 1325**] hospital course in the\nMedical Intensive Care Unit was previously dictated.\n\nMiss [**Known lastname **] is a 50 year old female with diabetes mellitus,\nend-stage renal disease status post kidney transplant, and\nstatus post cardiac arrest of unknown etiology, who presented\nwith mental status changes and a right humeral fracture.\n\nHer hospital course is as follows:\n\n1.  INFECTIOUS DISEASE:  On a previous admission in [**2101-10-22**], urine cultures were positive for enterococci and the\npatient was treated with Ampicillin 500 mg p.o. q. eight\nhours for two weeks.  She was discharged to a nursing home\nfor further care.  While at this nursing hom

### Using RoBERTa to create embeddings from notes

In [3]:
# unpickle if not running from scratch
best_note_df = pd.read_pickle('./data/best_note_df.pkl')

In [5]:
#For RoBERTA:
from transformers import RobertaConfig, RobertaModel
from transformers import RobertaTokenizer

#Tokenizer:
tokenizer_r = RobertaTokenizer.from_pretrained("roberta-base")


# Initializing a RoBERTa configuration
configuration = RobertaConfig()

# Initializing a model from the configuration
model_r = RobertaModel(configuration)

In [6]:
tokenized_r = best_note_df['TEXT'].apply((lambda x: tokenizer_r.encode(x, truncation=True, \
    padding="max_length", add_special_tokens=True)))
#max_len = 0
#for i in tokenized_r.values:
#    if len(i) > max_len:
#        max_len = len(i)

#padded_r = np.array([i + [0]*(max_len-len(i)) for i in tokenized_r.values])
tokenized_np = np.array(tokenized_r, dtype=int)

In [None]:
sparse_roberta = sparse.csr_matrix(tokenized_np)
sparse_roberta_file = DATA_DIR + '/sparse_roberta'
sparse.save_npz(sparse_roberta_file, sparse_roberta)

### Using BioClinicalBERT

In [4]:
#from transformers import AutoTokenizer, AutoModel
tokenizer = ppb.AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = ppb.AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
#Tokenize: 
tokenized_bc = full_notes['TEXT'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
print('BioClinicalBERT tokenized shape:',tokenized_bc.shape)
tokenized_bc.head()

BioClinicalBERT tokenized shape: (2083180,)


0    [101, 10296, 2236, 131, 164, 115, 115, 18615, ...
1    [101, 10296, 2236, 131, 164, 115, 115, 20915, ...
2    [101, 10296, 2236, 131, 164, 115, 115, 20915, ...
3    [101, 10296, 2236, 131, 164, 115, 115, 19538, ...
4    [101, 10296, 2236, 131, 164, 115, 115, 22148, ...
Name: TEXT, dtype: object

In [5]:
#Tokenize and pad: 
tokenized_bc = full_notes['TEXT'].apply((lambda x: tokenizer.encode(x, truncation=True, max_length=512, \
    padding="max_length", add_special_tokens=True)))

print('BioClinicalBERT tokenized shape:',tokenized_bc.shape)
tokenized_bc.head()

BioClinicalBERT tokenized shape: (2083180,)


0    [101, 10296, 2236, 131, 164, 115, 115, 18615, ...
1    [101, 10296, 2236, 131, 164, 115, 115, 20915, ...
2    [101, 10296, 2236, 131, 164, 115, 115, 20915, ...
3    [101, 10296, 2236, 131, 164, 115, 115, 19538, ...
4    [101, 10296, 2236, 131, 164, 115, 115, 22148, ...
Name: TEXT, dtype: object

In [6]:
len(tokenized_bc[0])

512

In [7]:
# didn't work
tokenized_bc_np = np.array(tokenized_bc, dtype=int)

In [14]:
# didn't work
sparse_bc = sparse.csr_matrix(np.array(tokenized_bc, dtype=int))

In [None]:
from numpy import asarray
from numpy import savetxt

padded_bc_ar = asarray(padded_bc)
padded_r_file = DATA_DIR + '/padded_bc'
savetxt(padded_r_file, padded_bc_ar, delimiter=",")

### Using NLTK to tokenize and manually produce vocab

In [4]:
from collections import Counter
import json
from pathlib import Path

from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
counter = Counter()
counter.update(['<pad>', '<unk>'])
notes_text_list = list(full_notes['TEXT'])
print(f"Number of documents: {len(notes_text_list)}")

Number of documents: 2083180


In [7]:
for i, line in enumerate(notes_text_list):
    tokens = tokenizer.tokenize(line)
    tokens = [t.lower() for t in tokens]
    counter.update(tokens)
    #if i < 3:
    #    print(f"String of line {i}: {line.strip()}")
    #    print(f"Tokens of line {i}: {tokens}")
counter = dict(counter)

#print(f"Vocab size before frequency filtering: {len(counter)}")

vocab = {}
for word, freq in counter.items():
    #if freq < 3 and word not in ['<pad>', '<unk>']:
    #    continue
    vocab[word] = len(vocab)

print(f"Vocab size after frequency filtering: {len(vocab)}")
output_filepath = DATA_DIR + '/unigram_vocab.json'
json.dump(vocab, open(output_filepath, mode='w'))

Vocab size after frequency filtering: 552755


In [15]:
print('Number of documents:',full_notes.shape[0])

Number of documents: 2083180


In [9]:
print('Number of tokens:',sum(counter.values()))

Number of tokens: 753498364


Below is code for manually tokenizing; so far has been killing kernel so moved on to nltk above

In [14]:
# split on both spaces and the "\n" character for more accurate word count
re.split("\s+|\n",full_notes['TEXT'][0])

['Admission',
 'Date:',
 '[**2151-7-16**]',
 'Discharge',
 'Date:',
 '[**2151-8-4**]',
 'Service:',
 'ADDENDUM:',
 'RADIOLOGIC',
 'STUDIES:',
 'Radiologic',
 'studies',
 'also',
 'included',
 'a',
 'chest',
 'CT,',
 'which',
 'confirmed',
 'cavitary',
 'lesions',
 'in',
 'the',
 'left',
 'lung',
 'apex',
 'consistent',
 'with',
 'infectious',
 'process/tuberculosis.',
 'This',
 'also',
 'moderate-sized',
 'left',
 'pleural',
 'effusion.',
 'HEAD',
 'CT:',
 'Head',
 'CT',
 'showed',
 'no',
 'intracranial',
 'hemorrhage',
 'or',
 'mass',
 'effect,',
 'but',
 'old',
 'infarction',
 'consistent',
 'with',
 'past',
 'medical',
 'history.',
 'ABDOMINAL',
 'CT:',
 'Abdominal',
 'CT',
 'showed',
 'lesions',
 'of',
 'T10',
 'and',
 'sacrum',
 'most',
 'likely',
 'secondary',
 'to',
 'osteoporosis.',
 'These',
 'can',
 'be',
 'followed',
 'by',
 'repeat',
 'imaging',
 'as',
 'an',
 'outpatient.',
 '[**First',
 'Name8',
 '(NamePattern2)',
 '**]',
 '[**First',
 'Name4',
 '(NamePattern1)',
 '1775**

In [15]:
len(re.split("\s+|\n",full_notes['TEXT'][0]))

111

Test splitting into words on small sample (killed kernel after 12 min on first attempt to run the full dataset)

In [5]:
sample_notes = full_notes.sample(n = 200, random_state=53)

In [6]:
sample_notes['WORDS'] = sample_notes['TEXT'].str.split("\s+|\n")
sample_notes.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,CHARTDATE,CHARTTIME,STORETIME,CATEGORY,DESCRIPTION,CGID,ISERROR,TEXT,WORDS
956690,961313,14734,,2127-05-21,2127-05-21 09:01:00,,Radiology,ABDOMEN (SUPINE ONLY),,,[**2127-5-21**] 9:01 AM\n ABDOMEN (SUPINE ONLY...,"[[**2127-5-21**], 9:01, AM, ABDOMEN, (SUPINE, ..."
1712826,1713696,2576,110661.0,2169-11-15,2169-11-15 19:49:00,2169-11-15 19:50:00,Nursing/other,Report,21232.0,,Agree with above note by [**Initials (NamePatt...,"[Agree, with, above, note, by, [**Initials, (N..."
1033727,1043826,63616,175657.0,2113-11-21,2113-11-21 08:28:00,,Radiology,CHEST (PA & LAT),,,"[**Last Name (LF) **],[**First Name3 (LF) **] ...","[[**Last, Name, (LF), **],[**First, Name3, (LF..."
1004537,1007150,28502,183617.0,2199-02-26,2199-02-26 03:15:00,,Radiology,CHEST (PORTABLE AP),,,[**2199-2-26**] 3:15 AM\n CHEST (PORTABLE AP) ...,"[[**2199-2-26**], 3:15, AM, CHEST, (PORTABLE, ..."
1044022,1055606,96629,,2118-02-17,2118-02-17 11:00:00,,Radiology,CT C-SPINE W/O CONTRAST,,,[**2118-2-17**] 11:00 AM\n CT C-SPINE W/O CONT...,"[[**2118-2-17**], 11:00, AM, CT, C-SPINE, W/O,..."


In [4]:
n_rows = full_notes.shape[0]
split_size = round(n_rows/3)
print('split size for easier processing:', split_size)

notes_1 = full_notes.iloc[:split_size]
notes_2 = full_notes.iloc[split_size:split_size*2]
# splitting the last third into 2 due to some extremely large notes in final third (killed kernel mutliple times)
notes_3 = full_notes.iloc[split_size*2:round(split_size*2.5)]
notes_4 = full_notes.iloc[round(split_size*2.5):]
print('notes shapes:',notes_1.shape, notes_2.shape, notes_3.shape, notes_4.shape)

split size for easier processing: 694393
notes shapes: (694393, 11) (694393, 11) (347196, 11) (347198, 11)


In [5]:
# takes approx 1.5 min to run
notes_1['WORDS'] = notes_1['TEXT'].str.split("\s+|\n")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  notes_1['WORDS'] = notes_1['TEXT'].str.split("\s+|\n")


In [None]:
full_notes_toks = pd.concat([notes_1, notes_2, notes_3, notes_4])
full_notes_toks.shape

### References

[Analytics Vidhya: Fine-tune a RoBERTa Encoder-Decoder model trained on MLM for Text Generation](https://medium.com/analytics-vidhya/fine-tune-a-roberta-encoder-decoder-model-trained-on-mlm-for-text-generation-23da5f3c1858)  
[StackOverflow: nlp - How to use Bert for long text classification?](https://stackoverflow.com/questions/58636587/how-to-use-bert-for-long-text-classification)  
[StackExchange: Convert a list of lists into a Pandas Dataframe](https://datascience.stackexchange.com/questions/26333/convert-a-list-of-lists-into-a-pandas-dataframe)    
