In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()
# pd.set_option('max_colwidth', 1000)
from collections import defaultdict
import random
import re
import json

import transformers
from transformers import AutoModel, AutoTokenizer
from tokenizers import AddedToken




In [2]:
def clean_text(text):
    """Normalize punctuation and line breaks in a note, then lowercase it.

    Steps, in order:
      1. Collapse a run of the same punctuation mark (, : ; ' ") into one mark.
      2. Drop a newline (optionally followed by one space) that sits directly
         before one of those marks, and turn a newline (optionally preceded by
         one space) that sits directly after a mark into a single space.
      3. Replace a newline, and then a newline plus one space, that is
         immediately followed by a lowercase ASCII letter with a single space.
         Newlines followed by anything else are kept.
      4. Lowercase the whole string.

    Args:
        text: The raw note text (str).

    Returns:
        The cleaned, lowercased text.
    """
    punc_list = [",", ":", ";", "'", "\""]

    # Collapse repeated punctuation (e.g. ",,," -> ",").
    for punc in punc_list:
        text = re.sub(rf"{punc}+", punc, text)

    # Re-attach punctuation that was separated from its neighbours by a line
    # break. The four substitutions are kept separate and in this order
    # because merging them into one pattern changes which newline is consumed
    # when several line breaks are adjacent.
    for punc in punc_list:
        text = re.sub(rf"\n[{punc}]", punc, text)
        text = re.sub(rf"\n [{punc}]", punc, text)
        text = re.sub(rf"[{punc}]\n", f"{punc} ", text)
        text = re.sub(rf"[{punc}] \n", f"{punc} ", text)

    # A line break followed by a lowercase letter is treated as a wrapped
    # line and joined with a space. A single backreference substitution per
    # pattern does this in one pass over the text; the replacement contains
    # no newline, so it can never create a new match.
    text = re.sub(r"\n([a-z])", r" \1", text)
    text = re.sub(r"\n ([a-z])", r" \1", text)

    return text.lower()

In [15]:
def note_to_chunk(tokenizer, text, max_length, doc_stride, save_to_json, OUTPUT_PATH, FILENAME):
    '''
    Split text into overlapping token windows and return them as decoded text.

    The tokenizer is called with truncation, padding to max_length and
    return_overflowing_tokens, so each input yields one or more windows. Every
    window is decoded back to a string, the literal markers [CLS], [SEP] and
    [PAD] are removed, and a few spacing artefacts of decoding are undone
    (spaced-out underscores, spaces around backslash, slash, ampersand and
    colon, and a space between a newline and a following lowercase letter).

    NOTE(review): special-token markers are stripped by hand rather than via
    skip_special_tokens; presumably this is so that a newline registered as a
    special token by the caller survives decoding -- confirm.

    Args:
        tokenizer: Callable tokenizer whose result exposes "input_ids" and
            which provides batch_decode (the caller passes a Hugging Face
            tokenizer).
        text: Input forwarded unchanged to the tokenizer (the caller passes a
            list of strings).
        max_length: Forwarded to the tokenizer as max_length.
        doc_stride: Forwarded to the tokenizer as stride.
        save_to_json: When truthy, also write the chunks to disk as JSON
            Lines (one object per line).
        OUTPUT_PATH: Directory of the output file (used only when saving).
        FILENAME: Name of the output file (used only when saving).

    Returns:
        A list of dicts of the form {'text': chunk}, one per window. The
        number of chunks is also printed.
    '''
    import os  # function-scope import: keeps this cell runnable on its own

    encoded = tokenizer(text, truncation="only_first", padding='max_length',
                        return_overflowing_tokens=True, stride=doc_stride,
                        return_offsets_mapping=True, max_length=max_length)

    decoded_chunks = tokenizer.batch_decode(encoded["input_ids"])

    # Literal (old, new) replacements, applied to every chunk in this order.
    replacements = [
        ("[CLS]", ""),
        ("[SEP]", ""),
        ("[PAD]", ""),
        ("_ _ _", "___"),
        (" \\ ", "\\"),
        (" / ", "/"),
        (" & ", "&"),
        (" : ", ":"),
    ]

    chunk_list = []
    for chunk in decoded_chunks:
        for old, new in replacements:
            chunk = chunk.replace(old, new)
        # Remove the space the decoder leaves between a newline and the
        # lowercase letter that follows it.
        chunk = re.sub(r"\n ([a-z])", r"\n\1", chunk)
        chunk_list.append({'text': chunk})

    print(f"Number of chunks: {len(chunk_list)}")

    if save_to_json:
        # Join directory and file name as a path; formatting them with a
        # literal "+" in between would put the "+" into the file name.
        out_file = os.path.join(OUTPUT_PATH, FILENAME)
        with open(out_file, 'w', encoding='utf-8') as outfile:
            for entry in chunk_list:
                json.dump(entry, outfile)
                outfile.write('\n')

    return chunk_list

In [7]:
# Load the first 10 rows of discharge.csv.gz and add a cleaned copy of each
# note's text. `nrows` makes pandas stop parsing after the sample instead of
# decompressing and parsing the whole file only to keep 10 rows.
MIMIC_PATH = "/mnt/scratch/shared_data/MIMIC-IV-NOTE/note/"
df = pd.read_csv(MIMIC_PATH+"discharge.csv.gz", nrows=10)
df['clean_text'] = df['text'].progress_apply(clean_text)


100%|█████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 1255.70it/s]


In [16]:
# Chunking settings; they are forwarded unchanged to note_to_chunk below.
max_length = 512   # passed on as the tokenizer's max_length
doc_stride = 128   # passed on as the tokenizer's stride
save_to_json = True
OUTPUT_PATH = "../"
FILENAME = "pretrain_data.jsonl"

print("Started Chunking . . . .")

# Load the tokenizer and register "\n" as an additional special token.
tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-large-en-v1.5")
tokenizer.add_special_tokens({"additional_special_tokens": [AddedToken("\n")]})

# Cleaned notes as a plain Python list.
text = df['clean_text'].values.tolist()

note_chunks = note_to_chunk(
    tokenizer=tokenizer,
    text=text,
    max_length=max_length,
    doc_stride=doc_stride,
    save_to_json=save_to_json,
    OUTPUT_PATH=OUTPUT_PATH,
    FILENAME=FILENAME,
)

print("Chunking Completed . . . .")

Started Chunking . . . .
Number of chunks: 64
Chunking Completed . . . .


In [None]:
# Preview only the first few chunks; displaying the whole list would write
# every chunk's full text into the notebook output.
note_chunks[:3]