Import statements and loading the original CSV files into dataframes

In [1]:
import random
import re
from datetime import datetime, timedelta
from functools import lru_cache

import pandas as pd
from faker import Faker
from transformers import AutoTokenizer, BioGptForCausalLM
from transformers import BartForConditionalGeneration, BartTokenizer

In [2]:
# Adjust DATA_DIR to the folder that holds your copies of the dataset files.
# Both file locations are derived from it, so only one value needs editing.
DATA_DIR = 'C:/3163dataset'
file_location_discharge = f'{DATA_DIR}/discharge.csv.gz'
file_location_radiology = f'{DATA_DIR}/radiology.csv.gz'

In [3]:
# Load the two gzip-compressed CSV files into pandas dataframes.
# Every later cell reads from and adds columns to these two frames.
discharge_df = pd.read_csv(file_location_discharge, compression='gzip')
radiology_df = pd.read_csv(file_location_radiology, compression='gzip')

Name and DOB generation for DISCHARGE

In [4]:
# Name generation: one synthetic name per unique subject_id in discharge_df, so
# every discharge note of the same patient carries the same name.
# Only discharge_df receives the 'name' column; radiology_df is not modified here.
NAME_SEED = 3163  # arbitrary fixed value so the same names are generated after a kernel restart
fake = Faker()
fake.seed_instance(NAME_SEED)  # seeds this Faker instance only, not Faker's shared generator
subject_ids = discharge_df['subject_id'].unique()
name_map = {subject_id: fake.name() for subject_id in subject_ids}
discharge_df['name'] = discharge_df['subject_id'].map(name_map)

In [5]:
# DOB generation for every record in discharge and radiology
def generate_random_dob(start_year=1930, end_year=2020):
    """Return a random date of birth within the given range of years.

    The result is drawn uniformly, using the module-level ``random``
    generator, from every calendar day between 1 January of ``start_year``
    and 31 December of ``end_year``, both ends included.

    Args:
        start_year: First year a date of birth may fall in.
        end_year: Last year a date of birth may fall in.

    Returns:
        A ``datetime`` at midnight of the chosen day.

    Raises:
        ValueError: If ``end_year`` is earlier than ``start_year`` (the range
            of days is empty) or a year is outside what ``datetime`` accepts.
    """
    first_day = datetime(year=start_year, month=1, day=1)
    last_day = datetime(year=end_year, month=12, day=31)
    span_days = (last_day - first_day).days
    # randrange() excludes its upper bound; the +1 makes last_day itself reachable.
    offset_days = random.randrange(span_days + 1)
    return first_day + timedelta(days=offset_days)
# One synthetic DOB per unique subject_id in discharge_df, so every discharge
# note of the same patient carries the same date of birth.
# Only discharge_df receives the 'dob' column; radiology_df is not modified here.
random.seed(3163)  # arbitrary fixed seed so re-running this cell reproduces the same DOBs
subject_ids = discharge_df['subject_id'].unique()
dob_map = {subject_id: generate_random_dob() for subject_id in subject_ids}
discharge_df['dob'] = discharge_df['subject_id'].map(dob_map)

Complaint and patient sex regex for DISCHARGE

In [6]:
# Extract chief complaint for every record
def extract_complaint(note):
    """Pull the chief-complaint text out of a discharge note.

    Searches case-insensitively for a "Complaint" heading (optionally prefixed
    by "Chief", "Present" or "Main") and captures everything after it up to
    the first "Major Surgical" or "History of Present Illness:" marker, or up
    to the end of the note when neither marker follows.

    Args:
        note: Full text of one discharge note.

    Returns:
        The captured complaint with surrounding whitespace removed, or 'N/A'
        when the note has no complaint heading or nothing is written under it.
    """
    complaint_match = re.search(r'(?:Chief|Present|Main)?\s*Complaint[:\s]*(.*?)(?:Major Surgical|History of Present Illness:|$)', note, re.IGNORECASE | re.DOTALL)
    if complaint_match is None:
        return 'N/A'
    complaint = complaint_match.group(1).strip()
    # A heading with nothing under it is reported like a missing heading,
    # so callers only have one "no complaint" value to deal with.
    return complaint or 'N/A'
# Add a 'complaint' column by running extract_complaint on the text of every discharge note.
discharge_df['complaint'] = discharge_df['text'].apply(extract_complaint)

In [7]:
# Extract patient sex for every record
def extract_patient_sex(note):
    """Read the patient's sex from the "Sex:" field of a discharge note.

    The first word character after a case-insensitive "Sex:" label is taken
    as the sex code.

    Args:
        note: Full text of one discharge note.

    Returns:
        'Female' for the code F/f, 'Male' for M/m, the raw character for any
        other code, or 'N/A' when the note has no "Sex:" label followed by a
        word character.
    """
    sex_match = re.search(r'Sex:\s*(\w)', note, re.IGNORECASE)
    if not sex_match:
        return 'N/A'
    code_labels = {'F': 'Female', 'f': 'Female', 'M': 'Male', 'm': 'Male'}
    sex_code = sex_match.group(1)
    # Unknown codes are passed through unchanged.
    return code_labels.get(sex_code, sex_code)
# Add a 'patient_sex' column by running extract_patient_sex on the text of every discharge note.
discharge_df['patient_sex'] = discharge_df['text'].apply(extract_patient_sex)

Examination regex for RADIOLOGY

In [8]:
# Extract examination for every record
def extract_radiology_examination(note):
    """Pull the examination name out of a radiology report.

    Captures the text between an "EXAMINATION:" label and the following
    "INDICATION:" label (both matched case-insensitively, across line breaks).

    Args:
        note: Full text of one radiology report.

    Returns:
        The captured text with surrounding whitespace removed, or 'N/A' when
        the two labels are not both present in that order or nothing is
        written between them.
    """
    examination_match = re.search(r'EXAMINATION:(.*?)INDICATION:', note, re.IGNORECASE | re.DOTALL)
    if examination_match is None:
        return 'N/A'
    examination = examination_match.group(1).strip()
    # An empty section is reported like a missing one, so callers only have
    # one "no examination" value to deal with.
    return examination or 'N/A'
# Add an 'examination' column by running extract_radiology_examination on the text of every radiology report.
radiology_df['examination'] = radiology_df['text'].apply(extract_radiology_examination)

Summarisation using Hugging Face Transformers

In [9]:
# BioGPT model summariser
# min_length and max_length by default is 200 and 1025 respectively
@lru_cache(maxsize=1)
def _load_biogpt(model_name="microsoft/biogpt"):
    """Load the BioGPT model and tokenizer once; later calls reuse the cached pair."""
    model = BioGptForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


def biogpt(note, min_length=200, max_length=1025):
    """Generate text from a note with the "microsoft/biogpt" causal language model.

    The note is prefixed with "summarize: ", tokenized (truncated to 1024
    tokens) and passed to beam-search generation with 4 beams.

    Args:
        note: Text to summarise.
        min_length: ``min_length`` passed to ``model.generate``.
        max_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    # The model is loaded on first use only. Loading it inside every call made
    # each row of a DataFrame.apply pay the full load cost again.
    model, tokenizer = _load_biogpt()
    inputs = tokenizer.encode("summarize: " + note, return_tensors="pt", max_length=1024, truncation=True)
    # NOTE(review): BioGPT is a decoder-only model; for such models generate()'s
    # max_length presumably counts the prompt tokens too, so a 1024-token prompt
    # would leave room for almost no new text and the decoded output would start
    # with the prompt itself - confirm, and consider max_new_tokens instead.
    summary_ids = model.generate(inputs, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# BART model summariser
# min_length and max_length by default is 200 and 500 respectively
@lru_cache(maxsize=1)
def _load_bart(model_name="facebook/bart-large-cnn"):
    """Load the BART model and tokenizer once; later calls reuse the cached pair."""
    model = BartForConditionalGeneration.from_pretrained(model_name)
    tokenizer = BartTokenizer.from_pretrained(model_name)
    return model, tokenizer


def bart(note, min_length=200, max_length=500):
    """Summarise a note with the "facebook/bart-large-cnn" model.

    The note is prefixed with "summarize: ", tokenized (truncated to 1024
    tokens) and passed to beam-search generation with 4 beams.

    Args:
        note: Text to summarise.
        min_length: ``min_length`` passed to ``model.generate``.
        max_length: ``max_length`` passed to ``model.generate``.

    Returns:
        The first generated sequence decoded to a string, special tokens removed.
    """
    # The model is loaded on first use only. Loading it inside every call made
    # each row of a DataFrame.apply pay the full load cost again.
    model, tokenizer = _load_bart()
    # NOTE(review): the "summarize: " prefix becomes part of the text the model
    # reads - confirm it is wanted for this checkpoint.
    inputs = tokenizer.encode("summarize: " + note, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Discharge notes: the BioGPT output is fed straight into BART
def summarise_discharge(note):
    """Summarise a discharge note by chaining biogpt and bart with their default length limits."""
    return bart(biogpt(note))

# Radiology notes: same biogpt -> bart chain, with smaller min_length/max_length limits
def summarise_radiology(note):
    """Summarise a radiology note by chaining biogpt (200-500) and bart (100-200)."""
    biogpt_output = biogpt(note, min_length=200, max_length=500)
    return bart(biogpt_output, min_length=100, max_length=200)

Summarise notes for RADIOLOGY

In [10]:
# TODO: summarise "text" column for radiology_df into new column called "summarised"
# first, pick the rows to summarise
# make the radiology_df dataframe consist of only those rows, remove the rest
# next, run summarise_radiology like this:
# radiology_df['summarised'] = radiology_df['text'].apply(summarise_radiology)

Data Cleaning for DISCHARGE - from Alex's data cleaning code

In [11]:
# TODO: pick the rows to keep for discharge_df dataframe, remove the rest
# then run the following cells:

In [None]:
def clean_text(text):
    """Strip markup symbols from a note and flatten it onto one line.

    Every '#' and every literal '[]' is deleted, trimming leading and trailing
    whitespace after each deletion; afterwards each newline becomes a single
    space.

    Args:
        text: Raw note text.

    Returns:
        The cleaned text.
    """
    without_hashes = text.replace('#', '').strip()
    without_brackets = without_hashes.replace('[]', '').strip()
    # Newlines are handled last and are swapped for a space, not deleted,
    # so words on adjacent lines stay separated.
    return without_brackets.replace('\n', ' ')
# Overwrite the 'text' column with its cleaned version; the raw text is no longer
# available in discharge_df after this cell has run.
discharge_df['text'] = discharge_df['text'].apply(clean_text)

In [None]:
# Each pattern captures the text that follows a section heading, up to (not
# including) the next "Capitalisedword:" label or the end of the text.
his_of_pres_ill_pattern = r'History of Present Illness:(.*?)(?=(?:[A-Z][a-z]*\s*:|$))'
family_hist_pattern = r'Family History:(.*?)(?=(?:[A-Z][a-z]*\s*:|$))'
past_med_hist_pattern = r'Past Medical History:(.*?)(?=(?:[A-Z][a-z]*\s*:|$))'

desired_titles = [his_of_pres_ill_pattern, family_hist_pattern, past_med_hist_pattern]

def extract_normalize_titles(text):
    """Normalise whitespace and the letter case of repeated section text in a note.

    For every pattern in ``desired_titles`` the text is trimmed, every run of
    whitespace (newlines included) is collapsed to one space, and each distinct
    capture of the pattern is used to rewrite all case-insensitive, word-bounded
    occurrences of that capture to the spelling of the capture itself.

    NOTE(review): this returns the whole note as one string; it does not split
    the sections into separate values or columns.

    Args:
        text: Note text.

    Returns:
        The normalised note text.
    """
    for desired_title in desired_titles:
        # strip() followed by collapsing \s+ also covers the newline-to-space step.
        text = re.sub(r'\s+', ' ', text.strip())
        titles = re.findall(desired_title, text)
        # dict.fromkeys keeps one entry per distinct capture, in first-seen order.
        for title in dict.fromkeys(titles):
            if not title:
                # An empty capture can only replace empty matches with nothing.
                continue
            pattern = re.compile(r'\b' + re.escape(title) + r'\b', re.IGNORECASE)
            # A function replacement inserts the capture literally. Passing it
            # as a replacement string makes re parse its backslashes as escapes,
            # which raises re.error for note text such as "and\or".
            text = pattern.sub(lambda _match, literal=title: literal, text)
    return text
# Overwrite the 'text' column with the output of extract_normalize_titles for every discharge note.
discharge_df['text'] = discharge_df['text'].apply(extract_normalize_titles)

In [None]:
# TODO: check discharge_df first whether it has the new 3 columns (History of Present Illness, Family History, Past Medical History)
# run summarise_discharge on each of the 3 columns; if the output looks wrong, adjust the min_length and max_length passed to biogpt and bart
# you can read it in the function above
# discharge_df['History of Present Illness'] = discharge_df['History of Present Illness'].apply(summarise_discharge)

DISCHARGE data csv transformation

In [13]:
# Show discharge_df as the cell output to inspect the columns added above.
discharge_df

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text,name,dob,complaint,patient_sex
0,10000032-DS-21,10000032,22595853,DS,21,2180-05-07 00:00:00,2180-05-09 15:26:00,\nName: ___ Unit No: _...,Jonathan Jones,1995-04-14,Worsening ABD distension and pain,Female
1,10000032-DS-22,10000032,22841357,DS,22,2180-06-27 00:00:00,2180-07-01 10:15:00,\nName: ___ Unit No: _...,Jonathan Jones,1995-04-14,abdominal fullness and discomfort,Female
2,10000032-DS-23,10000032,29079034,DS,23,2180-07-25 00:00:00,2180-07-25 21:42:00,\nName: ___ Unit No: _...,Jonathan Jones,1995-04-14,altered mental status,Female
3,10000032-DS-24,10000032,25742920,DS,24,2180-08-07 00:00:00,2180-08-10 05:43:00,\nName: ___ Unit No: _...,Jonathan Jones,1995-04-14,Abdominal pain,Female
4,10000084-DS-17,10000084,23052089,DS,17,2160-11-25 00:00:00,2160-11-25 15:09:00,\nName: ___ Unit No: __...,Erin Meyer,1959-08-30,Visual hallucinations,Male
...,...,...,...,...,...,...,...,...,...,...,...,...
331789,19999828-DS-6,19999828,29734428,DS,6,2147-08-04 00:00:00,2147-08-12 15:36:00,\nName: ___ Unit No: ___...,Maria Johnson,1941-03-03,Enterocutaneous/enteroatmospheric fistula,Female
331790,19999828-DS-7,19999828,25744818,DS,7,2149-01-18 00:00:00,2149-01-19 07:03:00,\nName: ___ Unit No: ___...,Maria Johnson,1941-03-03,,Female
331791,19999840-DS-20,19999840,26071774,DS,20,2164-07-28 00:00:00,2164-07-29 14:52:00,\nName: ___ Unit No: ___\...,Ashlee Benton,1941-05-26,seizure,Male
331792,19999840-DS-21,19999840,21033226,DS,21,2164-09-17 00:00:00,2164-09-18 01:36:00,\nName: ___ Unit No: ___\...,Ashlee Benton,1941-05-26,seizures,Male


RADIOLOGY data csv transformation

In [14]:
# Show radiology_df as the cell output to inspect the 'examination' column added above.
radiology_df

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text,examination
0,10000032-RR-14,10000032,22595853.0,RR,14,2180-05-06 21:19:00,2180-05-06 23:32:00,EXAMINATION: CHEST (PA AND LAT)\n\nINDICATION...,CHEST (PA AND LAT)
1,10000032-RR-15,10000032,22595853.0,RR,15,2180-05-06 23:00:00,2180-05-06 23:26:00,EXAMINATION: LIVER OR GALLBLADDER US (SINGLE ...,LIVER OR GALLBLADDER US (SINGLE ORGAN)
2,10000032-RR-16,10000032,22595853.0,RR,16,2180-05-07 09:55:00,2180-05-07 11:15:00,"INDICATION: ___ HCV cirrhosis c/b ascites, hi...",
3,10000032-RR-18,10000032,,RR,18,2180-06-03 12:46:00,2180-06-03 14:01:00,EXAMINATION: Ultrasound-guided paracentesis.\...,Ultrasound-guided paracentesis.
4,10000032-RR-20,10000032,,RR,20,2180-07-08 13:18:00,2180-07-08 14:15:00,EXAMINATION: Paracentesis\n\nINDICATION: ___...,Paracentesis
...,...,...,...,...,...,...,...,...,...
2321350,19999987-RR-17,19999987,23865745.0,RR,17,2145-11-02 22:37:00,2145-11-03 18:55:00,"HISTORY: ___, with left occipital bleeding. ...",
2321351,19999987-RR-18,19999987,23865745.0,RR,18,2145-11-03 04:35:00,2145-11-03 10:46:00,INDICATION: ___ female intubated for head ble...,
2321352,19999987-RR-19,19999987,23865745.0,RR,19,2145-11-03 16:40:00,2145-11-04 08:36:00,HISTORY: ___ woman with left occipital hemorr...,
2321353,19999987-RR-20,19999987,23865745.0,RR,20,2145-11-04 05:10:00,2145-11-04 08:58:00,PORTABLE CHEST OF ___\n\nCOMPARISON: ___ radi...,
