### MIMIC-IV CTPA PE Preprocessing
---
#### Author: `Omid Jafari (Ang Li Lab - Baylor College of Medicine)`
#### Created on: `May 8, 2024`
#### License: `This Jupyter Notebook is licensed under the GNU General Public License version 3 (GPLv3).`
---

#### Step 0: import packages

In [1]:
import pandas as pd
import re
from tqdm.auto import tqdm
import spacy

In [2]:
tqdm.pandas()
nlp = spacy.load("en_core_web_lg")

#### Step 1: import data files

In [3]:
section_inc = []
section_exc = []
keyword_inc = []
keyword_exc = []

with open("../data/ctpa/section_header_inc.txt", "r") as fp:
    section_inc = [line.strip() for line in fp]

with open("../data/ctpa/section_header_exc.txt", "r") as fp:
    section_exc = [line.strip() for line in fp]

with open("../data/ctpa/keyword_inc.txt", "r") as fp:
    keyword_inc = [line.strip() for line in fp]

with open("../data/ctpa/keyword_exc.txt", "r") as fp:
    keyword_exc = [line.strip() for line in fp]

print(section_inc)
print(section_exc)
print(keyword_inc)
print(keyword_exc)

['impression', 'impressions', 'plan', 'suggestion', 'discussion', 'conclusion', 'conclusions', 'recommendation', 'rec', 'recs', 'FINDINGS', 'FINDING']
['review of system', 'review of systems', 'system review', 'systems review', 'hematologic history', 'hematological history', 'heme history', 'oncological history', 'oncologic history', 'onc history', 'History', 'hematologic hx', 'hematological hx', 'heme hx', 'oncological hx', 'oncologic hx', 'onc hx', 'past h', 'past med', 'past surg', 'medical h', 'social h', 'family h', 'medication', 'allerg', 'physical exam', 'vital', 'ros', 'pmh:', 'pmhx', 'psh:', 'pshx', 'sh:', 'shx', 'fh:', 'fhx', 'med:', 'meds', 'general', 'gen:', 'pe:', 'v/s', 'data', 'lab', 'labs', 'laboratory', 'image', 'imaging', 'radiology', 'pathology', 'path', 'problem', 'problems', 'objective', 'obj', 'o:', 'diagnosis', 'discharge condition', 'disposition', 'discharge medication', 'medication list', 'consults', 'procedures']
['PE', 'VTE', 'pulmonary embolus', 'pulmonary e

In [4]:
df = pd.read_csv("../data/ctpa/ctpa_input.csv")
df

Unnamed: 0,note_id,subject_id,hadm_id,note_seq,charttime,storetime,text
0,15724651-RR-113,15724651,22574666.0,113,2174-07-19 13:56:00,2174-07-19 17:19:00,EXAMINATION: CTA TORSO\n\nINDICATION: ___ ye...
1,13467921-RR-97,13467921,21937296.0,97,2160-11-23 00:38:00,2160-11-23 01:47:00,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...
2,18389073-RR-169,18389073,20599400.0,169,2154-11-01 15:55:00,2154-11-01 18:37:00,INDICATION: ___ woman with history of pulmona...
3,18371997-RR-89,18371997,23840260.0,89,2170-11-26 02:11:00,2170-11-26 03:50:00,EXAMINATION: CTA CHEST\n\nINDICATION: ___ ye...
4,13294123-RR-174,13294123,24526753.0,174,2143-08-20 15:56:00,2143-08-20 17:20:00,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...
...,...,...,...,...,...,...,...
19937,13962878-RR-11,13962878,21152405.0,11,1967-10-31 04:39:00,1967-10-31 09:07:00,INDICATION: ___ man with multiple shots to th...
19938,15700387-RR-15,15700387,,15,1991-10-18 09:48:00,1991-10-18 17:24:00,CT OF THE TORSO\n\nHISTORY: Thoracic aneurysm...
19939,16608260-RR-21,16608260,,21,1974-05-18 11:14:00,1974-05-18 17:11:00,"HISTORY: Type B aortic dissection, evaluate f..."
19940,16833001-RR-78,16833001,20987299.0,78,1974-02-11 09:33:00,1974-02-11 12:02:00,INDICATION: ___ man with metastatic esophagea...


#### Step 2: mask exclusion keywords

In [5]:
def mask_keyword(text, keyword_exc):
    for word in keyword_exc:
        text = re.sub(r"\b" + re.escape(word) + r"\b", "*" * len(word), text, flags=re.IGNORECASE)
    return text

df["note_text_masked"] = df["text"].progress_apply(lambda x: mask_keyword(x, keyword_exc))

  0%|          | 0/19942 [00:00<?, ?it/s]

#### Step 3: restrict to notes containing at least 1 inclusion keyword

In [6]:
def keyword(element):
    if any(re.search(r"\b" + word + r"\b", element, re.IGNORECASE) for word in keyword_inc):
        return 1

df["key_status"] = df["note_text_masked"].progress_apply(keyword)
df

  0%|          | 0/19942 [00:00<?, ?it/s]

Unnamed: 0,note_id,subject_id,hadm_id,note_seq,charttime,storetime,text,note_text_masked,key_status
0,15724651-RR-113,15724651,22574666.0,113,2174-07-19 13:56:00,2174-07-19 17:19:00,EXAMINATION: CTA TORSO\n\nINDICATION: ___ ye...,EXAMINATION: CTA TORSO\n\nINDICATION: ___ ye...,1.0
1,13467921-RR-97,13467921,21937296.0,97,2160-11-23 00:38:00,2160-11-23 01:47:00,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,1.0
2,18389073-RR-169,18389073,20599400.0,169,2154-11-01 15:55:00,2154-11-01 18:37:00,INDICATION: ___ woman with history of pulmona...,INDICATION: ___ woman with history of pulmona...,1.0
3,18371997-RR-89,18371997,23840260.0,89,2170-11-26 02:11:00,2170-11-26 03:50:00,EXAMINATION: CTA CHEST\n\nINDICATION: ___ ye...,EXAMINATION: CTA CHEST\n\nINDICATION: ___ ye...,1.0
4,13294123-RR-174,13294123,24526753.0,174,2143-08-20 15:56:00,2143-08-20 17:20:00,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,1.0
...,...,...,...,...,...,...,...,...,...
19937,13962878-RR-11,13962878,21152405.0,11,1967-10-31 04:39:00,1967-10-31 09:07:00,INDICATION: ___ man with multiple shots to th...,INDICATION: ___ man with multiple shots to th...,
19938,15700387-RR-15,15700387,,15,1991-10-18 09:48:00,1991-10-18 17:24:00,CT OF THE TORSO\n\nHISTORY: Thoracic aneurysm...,CT OF THE TORSO\n\nHISTORY: Thoracic aneurysm...,
19939,16608260-RR-21,16608260,,21,1974-05-18 11:14:00,1974-05-18 17:11:00,"HISTORY: Type B aortic dissection, evaluate f...","HISTORY: Type B aortic dissection, evaluate f...",
19940,16833001-RR-78,16833001,20987299.0,78,1974-02-11 09:33:00,1974-02-11 12:02:00,INDICATION: ___ man with metastatic esophagea...,INDICATION: ___ man with metastatic esophagea...,


In [7]:
df = df[df["key_status"] == 1]
df.shape

(18748, 9)

#### Step 4: define section of interest

In [8]:
def section_interests(element):
    sections = [section.strip() for section in re.split(r"\n \n|\n\n", element)]
    output = []
    i = 0
    
    while i < len(sections):
        if any(re.search(r"^\b" + word + r"\b.*", sections[i], re.IGNORECASE) for word in section_inc):
            j = i
            tmp = []
            
            while j < len(sections) and not any(re.search(r"^\b" + word, sections[j], re.IGNORECASE) for word in section_exc):
                tmp.append(sections[j])
                j += 1
                
            output.append("  ".join(tmp))
            i = j
            
        i += 1
        
    return " ".join(output)

df["section_interests"] = df["note_text_masked"].progress_apply(section_interests)
df

  0%|          | 0/18748 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["section_interests"] = df["note_text_masked"].progress_apply(section_interests)


Unnamed: 0,note_id,subject_id,hadm_id,note_seq,charttime,storetime,text,note_text_masked,key_status,section_interests
0,15724651-RR-113,15724651,22574666.0,113,2174-07-19 13:56:00,2174-07-19 17:19:00,EXAMINATION: CTA TORSO\n\nINDICATION: ___ ye...,EXAMINATION: CTA TORSO\n\nINDICATION: ___ ye...,1.0,FINDINGS: VASCULAR:\nARTERIES: The ascending ...
1,13467921-RR-97,13467921,21937296.0,97,2160-11-23 00:38:00,2160-11-23 01:47:00,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,1.0,FINDINGS: HEART AND VASCULATURE: Evaluation o...
2,18389073-RR-169,18389073,20599400.0,169,2154-11-01 15:55:00,2154-11-01 18:37:00,INDICATION: ___ woman with history of pulmona...,INDICATION: ___ woman with history of pulmona...,1.0,FINDINGS: The pulmonary arteries are opacifie...
3,18371997-RR-89,18371997,23840260.0,89,2170-11-26 02:11:00,2170-11-26 03:50:00,EXAMINATION: CTA CHEST\n\nINDICATION: ___ ye...,EXAMINATION: CTA CHEST\n\nINDICATION: ___ ye...,1.0,FINDINGS: The aorta and its major branch vess...
4,13294123-RR-174,13294123,24526753.0,174,2143-08-20 15:56:00,2143-08-20 17:20:00,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,1.0,"FINDINGS: THYROID, LYMPH NODES & MEDIASTINUM..."
...,...,...,...,...,...,...,...,...,...,...
18743,15695493-RR-51,15695493,21922166.0,51,1964-05-12 18:04:00,1964-05-12 20:52:00,HISTORY: ___ female with metastatic lung carc...,HISTORY: ___ female with metastatic lung carc...,1.0,IMPRESSION:\n1. Progressive atelectasis of th...
18744,10203225-RR-22,10203225,24837382.0,22,1974-05-07 12:59:00,1974-05-07 15:09:00,HISTORY: Recently diagnosed massive pulmonary...,HISTORY: Recently diagnosed massive pulmonary...,1.0,FINDINGS: CTA chest: The pulmonary arterial ...
18745,13050816-RR-169,13050816,,169,1973-12-14 20:13:00,1973-12-14 22:32:00,"CLINICAL HISTORY: Prior history of PE, substa...","CLINICAL HISTORY: Prior history of PE, substa...",1.0,FINDINGS: There are no filling defects in the...
18746,10923536-RR-82,10923536,22275082.0,82,1953-05-05 20:48:00,1953-05-06 11:38:00,HISTORY: ___ male with known T11 compression ...,HISTORY: ___ male with known T11 compression ...,1.0,IMPRESSION: 1. Nonocclusive thrombus in the ...


Keep whole note if no section of interest is found:

In [9]:
df["section_interests"] = df["section_interests"].replace("", pd.NA).fillna(df["note_text_masked"])
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["section_interests"] = df["section_interests"].replace("", pd.NA).fillna(df["note_text_masked"])


Unnamed: 0,note_id,subject_id,hadm_id,note_seq,charttime,storetime,text,note_text_masked,key_status,section_interests
0,15724651-RR-113,15724651,22574666.0,113,2174-07-19 13:56:00,2174-07-19 17:19:00,EXAMINATION: CTA TORSO\n\nINDICATION: ___ ye...,EXAMINATION: CTA TORSO\n\nINDICATION: ___ ye...,1.0,FINDINGS: VASCULAR:\nARTERIES: The ascending ...
1,13467921-RR-97,13467921,21937296.0,97,2160-11-23 00:38:00,2160-11-23 01:47:00,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,1.0,FINDINGS: HEART AND VASCULATURE: Evaluation o...
2,18389073-RR-169,18389073,20599400.0,169,2154-11-01 15:55:00,2154-11-01 18:37:00,INDICATION: ___ woman with history of pulmona...,INDICATION: ___ woman with history of pulmona...,1.0,FINDINGS: The pulmonary arteries are opacifie...
3,18371997-RR-89,18371997,23840260.0,89,2170-11-26 02:11:00,2170-11-26 03:50:00,EXAMINATION: CTA CHEST\n\nINDICATION: ___ ye...,EXAMINATION: CTA CHEST\n\nINDICATION: ___ ye...,1.0,FINDINGS: The aorta and its major branch vess...
4,13294123-RR-174,13294123,24526753.0,174,2143-08-20 15:56:00,2143-08-20 17:20:00,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,1.0,"FINDINGS: THYROID, LYMPH NODES & MEDIASTINUM..."
...,...,...,...,...,...,...,...,...,...,...
18743,15695493-RR-51,15695493,21922166.0,51,1964-05-12 18:04:00,1964-05-12 20:52:00,HISTORY: ___ female with metastatic lung carc...,HISTORY: ___ female with metastatic lung carc...,1.0,IMPRESSION:\n1. Progressive atelectasis of th...
18744,10203225-RR-22,10203225,24837382.0,22,1974-05-07 12:59:00,1974-05-07 15:09:00,HISTORY: Recently diagnosed massive pulmonary...,HISTORY: Recently diagnosed massive pulmonary...,1.0,FINDINGS: CTA chest: The pulmonary arterial ...
18745,13050816-RR-169,13050816,,169,1973-12-14 20:13:00,1973-12-14 22:32:00,"CLINICAL HISTORY: Prior history of PE, substa...","CLINICAL HISTORY: Prior history of PE, substa...",1.0,FINDINGS: There are no filling defects in the...
18746,10923536-RR-82,10923536,22275082.0,82,1953-05-05 20:48:00,1953-05-06 11:38:00,HISTORY: ___ male with known T11 compression ...,HISTORY: ___ male with known T11 compression ...,1.0,IMPRESSION: 1. Nonocclusive thrombus in the ...


#### Step 5: filter sentences using inclusion keywords

In [10]:
def sentence_interests(element, keyword_inc):
    sentences = list(nlp(element.replace("\n", "")).sents)
    processed_sentences = []

    for sent in sentences:
        sent_text = sent.text.strip()
        if any(re.search(r"\b" + re.escape(word) + r"\b", sent_text, re.IGNORECASE) for word in keyword_inc):
            processed_sentences.append(sent_text)

    return " ".join(processed_sentences)

df["sentence_interests"] = df["section_interests"].progress_apply(lambda x: sentence_interests(x, keyword_inc))
df

  0%|          | 0/18748 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sentence_interests"] = df["section_interests"].progress_apply(lambda x: sentence_interests(x, keyword_inc))


Unnamed: 0,note_id,subject_id,hadm_id,note_seq,charttime,storetime,text,note_text_masked,key_status,section_interests,sentence_interests
0,15724651-RR-113,15724651,22574666.0,113,2174-07-19 13:56:00,2174-07-19 17:19:00,EXAMINATION: CTA TORSO\n\nINDICATION: ___ ye...,EXAMINATION: CTA TORSO\n\nINDICATION: ___ ye...,1.0,FINDINGS: VASCULAR:\nARTERIES: The ascending ...,Pulmonary vasculature is well opacified to the...
1,13467921-RR-97,13467921,21937296.0,97,2160-11-23 00:38:00,2160-11-23 01:47:00,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,1.0,FINDINGS: HEART AND VASCULATURE: Evaluation o...,Pulmonary vasculature is otherwise well opacif...
2,18389073-RR-169,18389073,20599400.0,169,2154-11-01 15:55:00,2154-11-01 18:37:00,INDICATION: ___ woman with history of pulmona...,INDICATION: ___ woman with history of pulmona...,1.0,FINDINGS: The pulmonary arteries are opacifie...,The pulmonary arteries are opacified to subseg...
3,18371997-RR-89,18371997,23840260.0,89,2170-11-26 02:11:00,2170-11-26 03:50:00,EXAMINATION: CTA CHEST\n\nINDICATION: ___ ye...,EXAMINATION: CTA CHEST\n\nINDICATION: ___ ye...,1.0,FINDINGS: The aorta and its major branch vess...,The pulmonary arteries are well opacified to t...
4,13294123-RR-174,13294123,24526753.0,174,2143-08-20 15:56:00,2143-08-20 17:20:00,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,1.0,"FINDINGS: THYROID, LYMPH NODES & MEDIASTINUM...",There is no evidence of pulmonary embolism to ...
...,...,...,...,...,...,...,...,...,...,...,...
18743,15695493-RR-51,15695493,21922166.0,51,1964-05-12 18:04:00,1964-05-12 20:52:00,HISTORY: ___ female with metastatic lung carc...,HISTORY: ___ female with metastatic lung carc...,1.0,IMPRESSION:\n1. Progressive atelectasis of th...,
18744,10203225-RR-22,10203225,24837382.0,22,1974-05-07 12:59:00,1974-05-07 15:09:00,HISTORY: Recently diagnosed massive pulmonary...,HISTORY: Recently diagnosed massive pulmonary...,1.0,FINDINGS: CTA chest: The pulmonary arterial ...,"There is an irregular 2.4 cm, polypoid filling..."
18745,13050816-RR-169,13050816,,169,1973-12-14 20:13:00,1973-12-14 22:32:00,"CLINICAL HISTORY: Prior history of PE, substa...","CLINICAL HISTORY: Prior history of PE, substa...",1.0,FINDINGS: There are no filling defects in the...,There are no filling defects in the pulmonary ...
18746,10923536-RR-82,10923536,22275082.0,82,1953-05-05 20:48:00,1953-05-06 11:38:00,HISTORY: ___ male with known T11 compression ...,HISTORY: ___ male with known T11 compression ...,1.0,IMPRESSION: 1. Nonocclusive thrombus in the ...,


In [11]:
df = df[df["sentence_interests"] != ""]
df

Unnamed: 0,note_id,subject_id,hadm_id,note_seq,charttime,storetime,text,note_text_masked,key_status,section_interests,sentence_interests
0,15724651-RR-113,15724651,22574666.0,113,2174-07-19 13:56:00,2174-07-19 17:19:00,EXAMINATION: CTA TORSO\n\nINDICATION: ___ ye...,EXAMINATION: CTA TORSO\n\nINDICATION: ___ ye...,1.0,FINDINGS: VASCULAR:\nARTERIES: The ascending ...,Pulmonary vasculature is well opacified to the...
1,13467921-RR-97,13467921,21937296.0,97,2160-11-23 00:38:00,2160-11-23 01:47:00,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,1.0,FINDINGS: HEART AND VASCULATURE: Evaluation o...,Pulmonary vasculature is otherwise well opacif...
2,18389073-RR-169,18389073,20599400.0,169,2154-11-01 15:55:00,2154-11-01 18:37:00,INDICATION: ___ woman with history of pulmona...,INDICATION: ___ woman with history of pulmona...,1.0,FINDINGS: The pulmonary arteries are opacifie...,The pulmonary arteries are opacified to subseg...
3,18371997-RR-89,18371997,23840260.0,89,2170-11-26 02:11:00,2170-11-26 03:50:00,EXAMINATION: CTA CHEST\n\nINDICATION: ___ ye...,EXAMINATION: CTA CHEST\n\nINDICATION: ___ ye...,1.0,FINDINGS: The aorta and its major branch vess...,The pulmonary arteries are well opacified to t...
4,13294123-RR-174,13294123,24526753.0,174,2143-08-20 15:56:00,2143-08-20 17:20:00,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,EXAMINATION: CTA CHEST WITH CONTRAST\n\nINDIC...,1.0,"FINDINGS: THYROID, LYMPH NODES & MEDIASTINUM...",There is no evidence of pulmonary embolism to ...
...,...,...,...,...,...,...,...,...,...,...,...
18740,17889152-RR-28,17889152,25398136.0,28,2017-12-29 19:09:00,2017-12-30 09:46:00,HISTORY: Desaturation suspect pulmonary embol...,HISTORY: Desaturation suspect pulmonary embol...,1.0,FINDINGS: An aorta is intact. There are no f...,There are no filling defects in the centralpul...
18741,11922120-RR-73,11922120,23113183.0,73,1985-03-03 16:38:00,1985-03-03 18:34:00,HISTORY: Hypoxia. Evaluate for pulmonary emb...,HISTORY: Hypoxia. Evaluate for pulmonary emb...,1.0,FINDINGS: CHEST CTA: The thoracic aorta is n...,"The main, lobar, segmental, and subsegmental p..."
18744,10203225-RR-22,10203225,24837382.0,22,1974-05-07 12:59:00,1974-05-07 15:09:00,HISTORY: Recently diagnosed massive pulmonary...,HISTORY: Recently diagnosed massive pulmonary...,1.0,FINDINGS: CTA chest: The pulmonary arterial ...,"There is an irregular 2.4 cm, polypoid filling..."
18745,13050816-RR-169,13050816,,169,1973-12-14 20:13:00,1973-12-14 22:32:00,"CLINICAL HISTORY: Prior history of PE, substa...","CLINICAL HISTORY: Prior history of PE, substa...",1.0,FINDINGS: There are no filling defects in the...,There are no filling defects in the pulmonary ...


#### Step 6: output results

In [12]:
df.to_csv("../data/ctpa/ctpa_prep_out.csv", index=False)