Import statements and loading the original CSV files into DataFrames

In [1]:
import pandas as pd
import re
from faker import Faker
import random
from datetime import datetime, timedelta
from transformers import AutoTokenizer, BioGptForCausalLM
from transformers import BartForConditionalGeneration, BartTokenizer

In [2]:
# Adjust DATA_DIR to the folder that holds your copies of the two csv.gz files;
# both file locations below are derived from it, so it only has to change here
DATA_DIR = 'C:/3163dataset'
file_location_discharge = f'{DATA_DIR}/discharge.csv.gz'
file_location_radiology = f'{DATA_DIR}/radiology.csv.gz'

In [3]:
# Read both gzip-compressed CSV files, discharge first and radiology second
discharge_df, radiology_df = (
    pd.read_csv(csv_path, compression='gzip')
    for csv_path in (file_location_discharge, file_location_radiology)
)

Summarisation using Hugging Face Transformers

In [4]:
# BioGPT model summariser
# min_length and max_length by default is 200 and 1025 respectively
_BIOGPT_MODEL_NAME = "microsoft/biogpt"
# Holds the loaded model/tokenizer so that they are only loaded once per kernel
_biogpt_cache = {}

def _load_biogpt():
    """Return the BioGPT (model, tokenizer) pair, loading it on first use only."""
    if not _biogpt_cache:
        model = BioGptForCausalLM.from_pretrained(_BIOGPT_MODEL_NAME)
        tokenizer = AutoTokenizer.from_pretrained(_BIOGPT_MODEL_NAME)
        # Store both together so a failed load never leaves a half-filled cache
        _biogpt_cache.update(model=model, tokenizer=tokenizer)
    return _biogpt_cache["model"], _biogpt_cache["tokenizer"]

def biogpt(note, min_length=200, max_length=1025):
    """Run a note through the "microsoft/biogpt" model and return the decoded text.

    The model and tokenizer are loaded on the first call and reused afterwards,
    instead of being re-created for every note.

    Args:
        note: Text of the note (str). "summarize: " is prepended to it and the
            encoded input is truncated to 1024 tokens.
        min_length: Forwarded to ``model.generate`` as ``min_length``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to a string with special tokens
        skipped.
    """
    model, tokenizer = _load_biogpt()
    inputs = tokenizer.encode("summarize: " + note, return_tensors="pt", max_length=1024, truncation=True)
    # NOTE(review): BioGptForCausalLM is a decoder-only model, so the generated
    # sequence presumably starts with the prompt tokens and max_length/min_length
    # count them as well - confirm against the transformers generation docs.
    summary_ids = model.generate(inputs, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# BART model summariser
# min_length and max_length by default is 200 and 500 respectively
_BART_MODEL_NAME = "facebook/bart-large-cnn"
# Holds the loaded model/tokenizer so that they are only loaded once per kernel
_bart_cache = {}

def _load_bart():
    """Return the BART (model, tokenizer) pair, loading it on first use only."""
    if not _bart_cache:
        model = BartForConditionalGeneration.from_pretrained(_BART_MODEL_NAME)
        tokenizer = BartTokenizer.from_pretrained(_BART_MODEL_NAME)
        # Store both together so a failed load never leaves a half-filled cache
        _bart_cache.update(model=model, tokenizer=tokenizer)
    return _bart_cache["model"], _bart_cache["tokenizer"]

def bart(note, min_length=200, max_length=500):
    """Summarise a note with the "facebook/bart-large-cnn" model.

    The model and tokenizer are loaded on the first call and reused afterwards,
    instead of being re-created for every note.

    Args:
        note: Text to summarise (str). "summarize: " is prepended to it and the
            encoded input is truncated to 1024 tokens.
        min_length: Forwarded to ``model.generate`` as ``min_length``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to a string with special tokens
        skipped.
    """
    model, tokenizer = _load_bart()
    # NOTE(review): the "summarize: " prefix is kept exactly as before; confirm
    # that this checkpoint actually benefits from it.
    inputs = tokenizer.encode("summarize: " + note, return_tensors="pt", max_length=1024, truncation=True)
    summary_ids = model.generate(inputs, max_length=max_length, min_length=min_length, length_penalty=2.0, num_beams=4, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Discharge pipeline: the biogpt output is fed straight into bart
def summarise_discharge(note):
    """Summarise a discharge note by running biogpt and then bart, both with their default lengths."""
    return bart(biogpt(note))

# Radiology pipeline: biogpt then bart, with smaller min_length/max_length limits
def summarise_radiology(note):
    """Summarise a radiology note by running biogpt and then bart with reduced length limits."""
    biogpt_limits = {"min_length": 200, "max_length": 500}
    bart_limits = {"min_length": 100, "max_length": 200}
    first_pass = biogpt(note, **biogpt_limits)
    return bart(first_pass, **bart_limits)

In [5]:
# TODO: Integrate discharge and radiology sections
#discharge_df['summarised'] = discharge_df['text'].apply(summarise_discharge)
#radiology_df['summarised'] = radiology_df['text'].head(1).apply(summarise_radiology)

Name and DOB generation

In [6]:
# Name generation for every record in discharge and radiology
NAME_SEED = 3163
fake = Faker()
# Seed Faker so that re-running the notebook assigns the same name to each patient
Faker.seed(NAME_SEED)
# Take the ids from BOTH frames: a patient that only appears in radiology_df
# would otherwise be missing from name_map and be left with a NaN name
subject_ids = pd.concat([discharge_df['subject_id'], radiology_df['subject_id']]).unique()
name_map = {subject_id: fake.name() for subject_id in subject_ids}
discharge_df['name'] = discharge_df['subject_id'].map(name_map)
radiology_df['name'] = radiology_df['subject_id'].map(name_map)

In [7]:
# DOB generation for every record in discharge and radiology
def generate_random_dob(start_year=1930, end_year=2020, rng=None):
    """Draw a random date of birth between 1 Jan start_year and 31 Dec end_year, inclusive.

    Args:
        start_year: First year of the range (1 January of that year is the
            earliest possible result).
        end_year: Last year of the range (31 December of that year is the
            latest possible result).
        rng: Optional object with a ``randrange(n)`` method, e.g. a seeded
            ``random.Random``; the module-level ``random`` is used when omitted.

    Returns:
        A ``datetime`` at midnight of the chosen day.
    """
    start_date = datetime(year=start_year, month=1, day=1)
    end_date = datetime(year=end_year, month=12, day=31)
    days_between_dates = (end_date - start_date).days
    picker = random if rng is None else rng
    # randrange excludes its upper bound, so +1 is needed for end_date itself
    # to be a possible outcome
    random_number_of_days = picker.randrange(days_between_dates + 1)
    return start_date + timedelta(days=random_number_of_days)
# Seed the random module so that re-running the notebook assigns the same DOBs
DOB_SEED = 3163
random.seed(DOB_SEED)
# Take the ids from BOTH frames: a patient that only appears in radiology_df
# would otherwise be missing from dob_map and be left without a DOB
subject_ids = pd.concat([discharge_df['subject_id'], radiology_df['subject_id']]).unique()
dob_map = {subject_id: generate_random_dob() for subject_id in subject_ids}
discharge_df['dob'] = discharge_df['subject_id'].map(dob_map)
radiology_df['dob'] = radiology_df['subject_id'].map(dob_map)

Complaint and patient sex regex for DISCHARGE

In [8]:
# Extract chief complaint for every record
def extract_complaint(note):
    """Return the chief-complaint text found in a discharge note, or None.

    The captured text is whatever follows the first "Complaint" (optionally
    preceded by "Chief", "Present" or "Main"; matching is case-insensitive) up
    to the next "Major Surgical" or "History of Present Illness:" marker, or up
    to the end of the note when neither marker follows.

    Args:
        note: Free text of one note. Anything that is not a str (for example
            the NaN pandas uses for a missing cell) yields None instead of
            making re.search raise TypeError.

    Returns:
        The captured text with surrounding whitespace stripped (possibly an
        empty string), or None when nothing matches or the input is not a str.
    """
    if not isinstance(note, str):
        return None
    chief_complaint_match = re.search(r'(?:Chief|Present|Main)?\s*Complaint[:\s]*(.*?)(?:Major Surgical|History of Present Illness:|$)', note, re.IGNORECASE | re.DOTALL)
    if chief_complaint_match is None:
        return None
    return chief_complaint_match.group(1).strip()
# Store the result of extract_complaint for each discharge note text in a new 'complaint' column
discharge_df['complaint'] = discharge_df['text'].apply(extract_complaint)

In [9]:
# Extract patient sex for every record
def extract_patient_sex(note):
    """Return the patient sex stated after "Sex:" in a note.

    Args:
        note: Free text of one note. Anything that is not a str (for example
            the NaN pandas uses for a missing cell) yields None instead of
            making re.search raise TypeError.

    Returns:
        'Female' for F/f, 'Male' for M/m, the matched character unchanged for
        any other word character, or None when there is no "Sex:" field or the
        input is not a str.
    """
    if not isinstance(note, str):
        return None
    sex_match = re.search(r'Sex:\s*(\w)', note, re.IGNORECASE)
    if sex_match is None:
        return None
    letter = sex_match.group(1)
    # Any other character is passed through unchanged
    return {'F': 'Female', 'M': 'Male'}.get(letter.upper(), letter)
# Store the result of extract_patient_sex for each discharge note text in a new 'patient_sex' column
discharge_df['patient_sex'] = discharge_df['text'].apply(extract_patient_sex)

Examination regex for RADIOLOGY

In [10]:
# Extract examination for every record
def extract_radiology_examination(note):
    """Return the text between "EXAMINATION:" and "INDICATION:" in a radiology note.

    Matching is case-insensitive and may span several lines.

    Args:
        note: Free text of one note. Anything that is not a str (for example
            the NaN pandas uses for a missing cell) yields None instead of
            making re.search raise TypeError.

    Returns:
        The text between the two markers with surrounding whitespace stripped,
        or None when the markers are not both present in that order or the
        input is not a str.
    """
    if not isinstance(note, str):
        return None
    examination = re.search(r'EXAMINATION:(.*?)INDICATION:', note, re.IGNORECASE | re.DOTALL)
    if examination is None:
        return None
    return examination.group(1).strip()
# Store the result of extract_radiology_examination for each radiology note text in a new 'examination' column
radiology_df['examination'] = radiology_df['text'].apply(extract_radiology_examination)