In [1]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
tqdm.pandas()
import re
import json

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.core.schema import MetadataMode



In [2]:
def clean_text(text):
    """Normalise punctuation and stray line breaks in ``text`` and lowercase it.

    The steps run in this order:

    1. Runs of the same punctuation mark (``, : ; ' "``) collapse to one mark.
    2. A newline (optionally followed by one space) directly before one of
       those marks is removed; a newline (optionally preceded by one space)
       directly after a mark is replaced by a single space.
    3. A newline (optionally followed by one space) directly before a
       lowercase ASCII letter is replaced by a single space.
    4. The result is lowercased.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    punc_list = [",", ":", ";", "'", "\""]

    for punc in punc_list:
        # Escape before interpolating so the mark is always matched literally.
        text = re.sub(rf"{re.escape(punc)}+", punc, text)

    for punc in punc_list:
        mark = re.escape(punc)
        text = re.sub(rf"\n[{mark}]", punc, text)
        text = re.sub(rf"\n [{mark}]", punc, text)
        text = re.sub(rf"[{mark}]\n", f"{punc} ", text)
        text = re.sub(rf"[{mark}] \n", f"{punc} ", text)

    # One pass per pattern with a capture group. The previous version ran a
    # full-text re.sub for every match returned by re.findall (duplicates
    # included), which is quadratic on long notes and gives the same result.
    text = re.sub(r"\n([a-z])", r" \1", text)
    text = re.sub(r"\n ([a-z])", r" \1", text)

    return text.lower()

In [3]:
# Directory holding the MIMIC-IV-Note files; discharge.csv.gz is read from here.
MIMIC_PATH = "/mnt/scratch/shared_data/MIMIC-IV-NOTE/note/"

In [4]:
# Load the gzipped discharge-note table into a DataFrame.
df = pd.read_csv(f"{MIMIC_PATH}discharge.csv.gz")

In [5]:
# Number of distinct subject_id values among the discharge notes.
len(set(df['subject_id']))

145914

In [10]:
# Draw two disjoint patient cohorts (5000 train, 100 test) with a fixed seed.
random.seed(42)
all_subjects = set(df['subject_id'])
train_subject = random.sample(list(all_subjects), 5000)
# Test patients are sampled only from those not already chosen for training.
remaining_subjects = all_subjects - set(train_subject)
test_subject = random.sample(list(remaining_subjects), 100)
assert set(train_subject).isdisjoint(test_subject)

In [11]:
# Keep every note belonging to a sampled patient, re-indexed from zero.
is_train_note = df['subject_id'].isin(train_subject)
is_test_note = df['subject_id'].isin(test_subject)
df_train = df.loc[is_train_note].reset_index(drop=True)
df_test = df.loc[is_test_note].reset_index(drop=True)
print(len(df_train), len(df_test))

11440 207


In [12]:
# Add a cleaned copy of each note's text, train split first, then test.
for frame in (df_train, df_test):
    frame['clean_text'] = frame['text'].progress_apply(clean_text)


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11440/11440 [00:12<00:00, 923.89it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 207/207 [00:00<00:00, 905.88it/s]


In [119]:
import pandas as pd
import numpy as np
from collections import defaultdict
import json
from tqdm import tqdm
tqdm.pandas()
# pd.set_option('max_colwidth', 1000)

import transformers
from transformers import AutoModel, AutoTokenizer
from tokenizers import AddedToken

def note_to_chunk(tokenizer, text, max_length, doc_stride):
    '''
    Split a note into overlapping token windows and return them as text chunks.

    The note is tokenized with overflow enabled (windows of ``max_length``
    tokens, ``stride=doc_stride``, padded to ``max_length``). Each window is
    decoded back to text, the ``[CLS]``/``[SEP]``/``[PAD]`` markers are
    stripped, and the spacing around ``___``, ``\\``, ``/``, ``&``, ``:`` and
    after newlines is tightened (presumably undoing spacing added by the
    tokenize/decode round trip).

    Args:
        tokenizer: Callable tokenizer exposing ``batch_decode``; its result
            must expose ``input_ids``.
        text: The note text to chunk.
        max_length: Window size passed to the tokenizer as ``max_length``.
        doc_stride: Value passed to the tokenizer as ``stride``.

    Returns:
        A list of ``{'text': chunk}`` dicts, one per window. The number of
        chunks is also printed.
    '''
    # tokenize the note into overlapping windows
    encoding = tokenizer(text, truncation="only_first", padding='max_length',
                         return_overflowing_tokens=True, stride=doc_stride,
                         return_offsets_mapping=True, max_length=max_length)

    chunked_text = tokenizer.batch_decode(encoding.input_ids)

    # Literal (old, new) replacements, applied in this order to every chunk.
    # " \\ " is written with an escaped backslash: the former " \ " literal
    # relied on an invalid escape sequence, which newer Pythons warn about.
    replacements = [
        ("[CLS]", ""),
        ("[SEP]", ""),
        ("[PAD]", ""),
        ("_ _ _", "___"),
        (" \\ ", "\\"),
        (" / ", "/"),
        (" & ", "&"),
        (" : ", ":"),
    ]

    chunk_list = []
    for s in chunked_text:
        for old, new in replacements:
            s = s.replace(old, new)
        # Drop the single space between a newline and a lowercase letter in
        # one pass (previously one full re.sub per findall match).
        s = re.sub(r"\n ([a-z])", r"\n\1", s)
        chunk_list.append({'text': s})

    print(f"Number of chunks: {len(chunk_list)}")
    return chunk_list

# Chunk the first cleaned training note as a check of note_to_chunk.
print("Started Chunking . . . .")
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
# Register "\n" as an additional special token on the tokenizer.
newline_token = AddedToken("\n")
tokenizer.add_special_tokens({"additional_special_tokens": [newline_token]})

max_length, doc_stride = 512, 128
text = df_train['clean_text'].iloc[0]
x = note_to_chunk(tokenizer, text, max_length, doc_stride)
print("Chunking Completed . . . .")

Started Chunking . . . .
Number of chunks: 3
Chunking Completed . . . .


In [120]:
# Display the chunks produced for the first cleaned training note.
x

[{'text': ' \nname:___ unit no:___ \n \nadmission date:___ discharge date:___ \n \ndate of birth:___ sex:f \n \nservice:surgery \n \nallergies:___ \n \nattending:___. \n \nchief complaint:nausea/vomiting \n \nmajor surgical or invasive procedure:band adjustment \n \n \nhistory of present illness:ms. ___ is a ___ s/p lap band in ___ who prsents with a 1 week history of nausea, non - bilious non - bloody emesis of undigested food after eating, intolerance to solids/softs, hypersalivation, and moderate post - prandial epigastric discomfort. she denies fever, chills, hematemesis, brbpr, melena, diarrhea, or sympotoms of dehydration, but was recently evaluated for dizziness in an ed with a diagnosis given of bppv. of note, the patient underwent an unfill of her band from 5. 8 to 3. 8ml on \n ___ for similar symptoms, the band was subseqently been filled to 4. 8 on ___, 5. 2 on ___, and most recently to 5. 6ml on \n ___. \n \n \npast medical history:pmhx:hyperlipidemia and with elevated trig

In [121]:
def _build_final_notes(frame):
    """Map each note_id in ``frame`` to its cleaned text prefixed with a header.

    The header states the note_id, charttime and subject_id of the note.
    """
    return {
        row['note_id']: f"""The following note with note_id {row['note_id']} was charted on {row['charttime']} for patient {row['subject_id']}.\n
Note with note_id {row['note_id']}: {row['clean_text']}"""
        for _, row in frame.iterrows()
    }


noteid_to_finalnote_train = _build_final_notes(df_train)
noteid_to_finalnote_test = _build_final_notes(df_test)

# Attach the assembled note to each row by looking it up via note_id.
df_train['final_note'] = df_train['note_id'].map(noteid_to_finalnote_train)
df_test['final_note'] = df_test['note_id'].map(noteid_to_finalnote_test)


In [13]:
DATA_PATH = "/home/75y/data_ragMimic/data/"


def _write_notes(frame, prefix):
    """Write each ``final_note`` in ``frame`` to ``DATA_PATH/<prefix>_<position>.txt``."""
    for i, note in enumerate(frame['final_note']):
        # Write UTF-8 explicitly instead of the locale-dependent default, so
        # the files do not depend on the machine's locale when read back.
        # NOTE(review): the llama_index reader is believed to default to
        # UTF-8 — confirm.
        with open(DATA_PATH+f"{prefix}_{i}.txt", "w", encoding="utf-8") as f:
            f.write(note)


_write_notes(df_train, "train")
_write_notes(df_test, "test")


In [14]:
def load_corpus(files, verbose=False):
    """Read ``files`` and return their parsed nodes as an id -> text mapping.

    The files are loaded with ``SimpleDirectoryReader`` and split into nodes
    by a default ``SimpleNodeParser``.

    Args:
        files: Paths handed to ``SimpleDirectoryReader(input_files=...)``.
        verbose: When true, print progress messages and show parser progress.

    Returns:
        A dict mapping each node's ``node_id`` to its content, requested
        with ``MetadataMode.NONE``.
    """
    if verbose:
        print(f"Loading files {files}")

    docs = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f'Loaded {len(docs)} docs')

    node_parser = SimpleNodeParser.from_defaults()
    nodes = node_parser.get_nodes_from_documents(docs, show_progress=verbose)
    if verbose:
        print(f'Parsed {len(nodes)} nodes')

    corpus = {}
    for node in nodes:
        corpus[node.node_id] = node.get_content(metadata_mode=MetadataMode.NONE)
    return corpus

In [17]:
# Paths of the per-note text files written earlier, one per DataFrame row.
TRAIN_FILES = [f"{DATA_PATH}train_{idx}.txt" for idx in range(len(df_train))]
VAL_FILES = [f"{DATA_PATH}test_{idx}.txt" for idx in range(len(df_test))]

# Parse both file sets into node_id -> text corpora (non-verbose).
train_corpus = load_corpus(TRAIN_FILES)
val_corpus = load_corpus(VAL_FILES)

In [18]:
DATA_PATH = "/home/75y/data_ragMimic/data/"

TRAIN_CORPUS_FPATH = f"{DATA_PATH}train_corpus.json"
VAL_CORPUS_FPATH = f"{DATA_PATH}val_corpus.json"

# Serialise each corpus dict to its JSON file, train first, then validation.
for corpus_fpath, corpus in (
    (TRAIN_CORPUS_FPATH, train_corpus),
    (VAL_CORPUS_FPATH, val_corpus),
):
    with open(corpus_fpath, 'w+') as f:
        json.dump(corpus, f)

### Trial code below (commented-out experiments)

In [None]:
# # # !mkdir -p 'data/10k/'
# # # !wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/uber_2021.pdf' -O 'data/10k/uber_2021.pdf'
# # # !wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/10k/lyft_2021.pdf' -O 'data/10k/lyft_2021.pdf'

# # import json

# from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
# # from llama_index.core.node_parser import SimpleNodeParser
# from llama_index.core.node_parser import TokenTextSplitter
# from llama_index.core.schema import MetadataMode
# # TRAIN_FILES = ['data/10k/lyft_2021.pdf']
# # VAL_FILES = ['data/10k/uber_2021.pdf']


In [None]:
# def load_corpus(files, verbose=False):
#     if verbose:
#         print(f"Loading files {files}")

#     reader = SimpleDirectoryReader(input_files=files)
#     docs = reader.load_data()
#     if verbose:
#         print(f'Loaded {len(docs)} docs')
    
#     parser = SimpleNodeParser.from_defaults()
#     nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)

#     nodes = splitter.get_nodes_from_documents(documents)

#     if verbose:
#         print(f'Parsed {len(nodes)} nodes')

#     corpus = {node.node_id: node.get_content(metadata_mode=MetadataMode.NONE) for node in nodes}
#     return corpus

In [None]:
# def load_corpus(docs, verbose=False):
#     if verbose:
#         print(f'Loaded {len(docs)} docs')
    
#     # parser = SimpleNodeParser.from_defaults()
#     # nodes = parser.get_nodes_from_documents(docs, show_progress=verbose)

#     splitter = TokenTextSplitter(
#         chunk_size=1024,
#         chunk_overlap=256,
#         separator=" ",
#     )
#     nodes = splitter.get_nodes_from_documents(documents)

#     if verbose:
#         print(f'Parsed {len(nodes)} nodes')

#     corpus = {node.node_id: node.get_content(metadata_mode=MetadataMode.NONE) for node in nodes}
#     return corpus

In [None]:
# train_corpus = load_corpus(TRAIN_FILES, verbose=True)
# val_corpus = load_corpus(VAL_FILES, verbose=True)

# # train_corpus = load_corpus(df_train, verbose=True)
# # val_corpus = load_corpus(df_test, verbose=True)
