In [1]:
from datasketch import MinHash, MinHashLSH
from multiprocessing import Pool, cpu_count
import pandas as pd
from tqdm import tqdm  # For progress tracking
import re

tqdm.pandas()

In [2]:
rad_reports = pd.read_csv("/scratch/baj321/MIMIC-Note/physionet.org/files/mimic-iv-note/2.2/note/radiology.csv")

In [4]:
discharge_notes = pd.read_csv("/scratch/baj321/MIMIC-Note/physionet.org/files/mimic-iv-note/2.2/note/discharge.csv")

In [6]:
rad_reports.columns

Index(['note_id', 'subject_id', 'hadm_id', 'note_type', 'note_seq',
       'charttime', 'storetime', 'text'],
      dtype='object')

In [7]:
discharge_notes.columns

Index(['note_id', 'subject_id', 'hadm_id', 'note_type', 'note_seq',
       'charttime', 'storetime', 'text'],
      dtype='object')

In [8]:
for name, df in {"Radiology": rad_reports, "Discharge": discharge_notes}.items():
    print(f"{name:<10}: {df.shape[0]:,} rows  |  {df['subject_id'].nunique():,} patients")

Radiology : 2,321,355 rows  |  237,427 patients
Discharge : 331,793 rows  |  145,914 patients


In [9]:
print("\n--- Note types present ---")
display(rad_reports['note_type'].value_counts())

print("\n--- Date ranges ---")
for label, df in [("Radiology", rad_reports), ("Discharge", discharge_notes)]:
    print(f"{label:<10}: {pd.to_datetime(df.charttime).min()} ⟶ {pd.to_datetime(df.charttime).max()}")


--- Note types present ---


RR    2295635
AR      25720
Name: note_type, dtype: int64


--- Date ranges ---
Radiology : 2105-02-14 14:24:00 ⟶ 2212-12-19 18:42:00
Discharge : 2105-10-12 00:00:00 ⟶ 2212-04-12 00:00:00


In [4]:
print(rad_reports.text[10])

EXAMINATION:  ULTRASOUND PARACENTESIS

INDICATION:  ___ year old woman with Cirrhosis and ascites // bi weekly large
volume paracentesis with albumim post per protocol

TECHNIQUE:  Ultrasound guided therapeutic paracentesis

COMPARISON:  ___

FINDINGS: 

Limited grayscale ultrasound imaging of the abdomen demonstrated
moderateascites. A suitable target in the deepest pocket in the right lower
quadrant was selected for paracentesis.

PROCEDURE:  The procedure, risks, benefits and alternatives were discussed
with the patient and existing annual signed consent was reviewed.

A preprocedure time-out was performed discussing the planned procedure,
confirming the patient's identity with 3 identifiers, and reviewing a
checklist per ___ protocol.

Under ultrasound guidance, an entrance site was selected and the skin was
prepped and draped in the usual sterile fashion. 1% lidocaine was instilled
for local anesthesia.

A 5 ___ catheter was advanced into the largest fluid pocket in the right
lowe

In [20]:
print(discharge_notes.text[100])

 
Name:  ___            Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: SURGERY
 
Allergies: 
Ciprofloxacin Hcl
 
Attending: ___.
 
Chief Complaint:
RUQ and epigastric pain
 
Major Surgical or Invasive Procedure:
Laparoscopic cholecystectomy.

 
History of Present Illness:
___ is an ___ year old female who presents
with a one day history of RUQ and epigastric pain.  The pain has
been intermittent and associated with nausea. She reports that
the pain is somewhat improved now, but not completely 
alleviated.
She denies any emesis. She denies any fevers or chills. She has
had a couple episodes of pain that was similar in the past. She
continues to have flatus and bowel movements. 

She has not had any po intake since the pain began, so she is 
not
sure if the pain is increased with po intake.  She reports
decreased appetite today.  Last po intake was this morning.
 
Past Medical History:
PMH: Multinodular goite

In [14]:
import re
from collections import Counter
from tqdm.auto import tqdm   # auto picks the right UI (notebook/terminal)

# 1. compile the section-header pattern
section_pat = re.compile(r'(?:^|\n)\s*([^\n:]+?):')

# 2. gather counts across the whole column, showing progress
section_counts = Counter()

for txt in tqdm(discharge_notes['text'].dropna(),
                desc="Parsing discharge notes",
                unit="note"):
    labels = section_pat.findall(txt)
    section_counts.update(label.strip() for label in labels)

# 3. pretty-print the results, most frequent first
for label, n in section_counts.most_common():
    print(f'{label}: {n}')


  from .autonotebook import tqdm as notebook_tqdm
Parsing discharge notes: 100%|██████████| 331793/331793 [03:21<00:00, 1642.77note/s]


___ 06: 542718
___ 05: 432886
___ 07: 411156
Past Medical History: 336535
Allergies: 333610
Name: 333594
Date of Birth: 331977
Admission Date: 331781
Service: 331752
Discharge Instructions: 331598
Discharge Condition: 331349
Attending: 331125
Major Surgical or Invasive Procedure: 330468
Discharge Disposition: 327488
History of Present Illness: 327034
Followup Instructions: 325782
Physical Exam: 325418
Family History: 324725
Discharge Medications: 324620
Social History: 323006
Chief Complaint: 321434
Pertinent Results: 319488
Discharge Diagnosis: 318222
Medications on Admission: 312748
HEENT: 304666
Brief Hospital Course: 292373
___ 04: 275275
Mental Status: 258246
Disp: 257126
___ 08: 253728
Level of Consciousness: 253059
Activity Status: 252484
___ 12: 250310
___ 09: 241670
IMPRESSION: 240482
___ 10: 223616
___ 03: 216273
___ 11: 208797
CV: 207408
___ 01: 207307
___ 02: 193036
___: 173630
Facility: 151578
General: 127647
Neuro: 127102
VS: 120179
GENERAL: 118229
Ext: 115962
Abdomen: 11

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)




6. betamethasone, augmented 0.05 % Gel Sig: 1
___ LABS ___: 1
G Tube Check: 1
# Urinary tract infection, catheter-based: 1
# G-Tube malfunction: 1
# Question of Bilateral pneumonia: 1
# Acute toxic/metobolic encephalopathy: 1
15. Ondansetron 4 mg PO/NG Q8H: 1
-G-tube study (___): 1
-G-tube replacement (___): 1
#Mal-positioned G-tube, possible cellulitis: 1
UPDATED ALLERGIES: 1
TUBEFEEDING: 1
Pelvic Ultrasound, ___: 1
Her course was as follows: 1
# Post Operative Care: 1
increase by 1U: 1
20.  Meclizine 25 mg PO Q8H: 1
Arterial Duplex Unilateral Left Lower Extremity ___: 1
Peak systolic velocities of the native vessels are as follows: 1
MRI Thigh w & w/o Contrast ___: 1
femoral diaphysis (03: 1
Plain films of hip, 2 views, ___: 1
opacification of the left mastoid air cells.   IMPRESSION: 1
# Superior pubic ramus fracture: 1
Floor Physical Exam: 1
#. Dyspnea/fatigue: 1
9. Lorazepam 2 mg/mL Concentrate Sig: 1
10. Atropine 1 % Drops Sig: 1
24, 156/80, O2 saturation 99% on room air. HEENT:

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)




directed Disp #*27 Tablet Refills: 1
which decreased her tachycardia (Note: 1
1.Influenza: 1
1. Exposure prophylaxis: 1
1. MetronidAZOLE Topical 1 % Gel 1 Appl TP BID: 1
VS   HR 100  BP  104/64   Resp: 1
#MAT vs sinus arrhythmia: 1
#Pulmonary HTN c/b R heart failure: 1
#CAD s/p PCI to LCx: 1
hours) as needed for right hip pain: 1
9. Bosentan 125 mg Tablet Sig: 1
16. Hydroxychloroquine 200 mg Tablet Sig: 1
pain. She had 1L NS. VS on transfer: 1
- R-sided heart failure: 1
___ hip 2 view: 1
2. ASPIRATION PNEUMONITIS: 1
3. PULMONARY HTN: 1
4. RIGHT CHF: 1
5. PULMONARY EMBOLISM: 1
8. DELIRIUM: 1
waning status for the 2 months prior to admission: 1
9. CAD s/p PCI to LCx: 1
10. PAROXYSMAL AFIB: 1
11. SYSTEMIC SCLEROSIS: 1
12. RIGHT HIP FRACTURE: 1
13. NUTRITION: 1
14. C DIFFICILE COLLITIS: 1
FAMILY CONTACT INFORMATION: 1
6. Bosentan 125 mg Tablet Sig: 1
(___) for 3 weeks: 1
Per ___ Discharge Letter to ___: 1
#. HYPOXIA: 1
-Another sister: 1
Sjogren's ab: 1
Cu: 1
Vit B6: 1
Bcx/Ucx: 1
Pend: 1


In [19]:
for label, n in section_counts.most_common():
    if n>0.1*len(discharge_notes):
        print(f'{label}: {n}')

___ 06: 542718
___ 05: 432886
___ 07: 411156
Past Medical History: 336535
Allergies: 333610
Name: 333594
Date of Birth: 331977
Admission Date: 331781
Service: 331752
Discharge Instructions: 331598
Discharge Condition: 331349
Attending: 331125
Major Surgical or Invasive Procedure: 330468
Discharge Disposition: 327488
History of Present Illness: 327034
Followup Instructions: 325782
Physical Exam: 325418
Family History: 324725
Discharge Medications: 324620
Social History: 323006
Chief Complaint: 321434
Pertinent Results: 319488
Discharge Diagnosis: 318222
Medications on Admission: 312748
HEENT: 304666
Brief Hospital Course: 292373
___ 04: 275275
Mental Status: 258246
Disp: 257126
___ 08: 253728
Level of Consciousness: 253059
Activity Status: 252484
___ 12: 250310
___ 09: 241670
IMPRESSION: 240482
___ 10: 223616
___ 03: 216273
___ 11: 208797
CV: 207408
___ 01: 207307
___ 02: 193036
___: 173630
Facility: 151578
General: 127647
Neuro: 127102
VS: 120179
GENERAL: 118229
Ext: 115962
Abdomen: 11

In [5]:
import re
from collections import Counter
from tqdm.auto import tqdm   # auto picks the right UI (notebook/terminal)

# 1. compile the section-header pattern
section_pat = re.compile(r'(?:^|\n)\s*([^\n:]+?):')

# 2. gather counts across the whole column, showing progress
section_counts = Counter()

for txt in tqdm(rad_reports['text'].dropna(),
                desc="Parsing radiology reports",
                unit="note"):
    labels = section_pat.findall(txt)
    section_counts.update(label.strip() for label in labels)

# 3. pretty-print the results, most frequent first
for label, n in section_counts.most_common():
    print(f'{label}: {n}')


  from .autonotebook import tqdm as notebook_tqdm
Parsing radiology reports: 100%|██████████| 2321355/2321355 [02:32<00:00, 15185.70note/s]


IMPRESSION: 2029571
FINDINGS: 1660158
INDICATION: 1624079
COMPARISON: 1591222
TECHNIQUE: 1285986
EXAMINATION: 937250
HISTORY: 433551
DOSE: 239665
NOTIFICATION: 108830
PANCREAS: 99824
SPLEEN: 97278
ABDOMEN: 96001
PELVIS: 86412
BONES: 78908
RECOMMENDATION(S): 77851
ADRENALS: 70143
VASCULAR: 69858
GASTROINTESTINAL: 68984
HEPATOBILIARY: 68079
URINARY: 67881
COMPARISONS: 66155
CLINICAL HISTORY: 63746
PROCEDURE: 62838
LOWER CHEST: 62606
STUDY: 61796
LYMPH NODES: 60969
SOFT TISSUES: 60181
REPRODUCTIVE ORGANS: 57933
CLINICAL INFORMATION: 39477
BI-RADS: 38746
OSSEOUS STRUCTURES: 36909
RETROPERITONEUM: 36510
Tissue density: 34707
LIVER: 32452
GALLBLADDER: 30104
REASON FOR EXAM: 29995
ANESTHESIA: 29910
KIDNEYS: 29590
BILE DUCTS: 29312
EXAM: 27127
DLP: 26444
REASON FOR EXAMINATION: 24873
CLINICAL INDICATION: 24569
BONE WINDOWS: 23679
LMP: 23284
CONTRAST: 22632
MEDICATIONS: 20382
PROCEDURE DETAILS: 20184
FLUOROSCOPY TIME AND DOSE: 19805
CHEST: 18045
CONCLUSION: 17603
UPPER ABDOMEN: 16910
OPERATORS:

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [9]:
# Function to extract fields
def extract_field(text, field):
    pattern = rf"\n{field}:(.*?)(?=\n[A-Z ]+:|\Z)"  # Match content up to next section or end
    match = re.search(pattern, text, re.DOTALL)
    return match.group(1).strip() if match else None

# Apply with progress bar
rad_reports['EXAMINATION'] = rad_reports['text'].progress_apply(lambda x: extract_field(x, 'EXAMINATION'))
rad_reports['TECHNIQUE'] = rad_reports['text'].progress_apply(lambda x: extract_field(x, 'TECHNIQUE'))

# Count and display
examination_counts = rad_reports['EXAMINATION'].value_counts(dropna=True)
technique_counts = rad_reports['TECHNIQUE'].value_counts(dropna=True)

100%|██████████| 2321355/2321355 [00:08<00:00, 287820.90it/s]
100%|██████████| 2321355/2321355 [00:17<00:00, 132763.29it/s]


ModuleNotFoundError: No module named 'ace_tools'

In [13]:
for label, n in examination_counts.items():
    print(f'{label}: {n}')

PA and lateral chest radiographs.: 1302
Non-contrast head CT.: 779
Single frontal chest radiograph.: 516
CHEST (PORTABLE AP): 473
Ultrasound-guided paracentesis.: 427
CHEST (PA AND LAT): 263
CT of the abdomen and pelvis with intravenous contrast.: 233
Single portable chest radiograph.: 182
Non-obstetric pelvic ultrasound.: 173
BILATERAL DIGITAL SCREENING MAMMOGRAPHY INTERPRETED WITH CAD AND
TOMOSYNTHESIS: 168
BILATERAL DIGITAL SCREENING MAMMOGRAPHY INTERPRETED WITH CAD: 163
CHEST PORT. LINE PLACEMENT: 162
CT of the torso with intravenous contrast.: 145
Supine and erect abdominal radiographs.: 131
CT of the cervical spine without intravenous contrast.: 120
CT of the cervical spine without contrast.: 116
Complete abdominal ultrasound.: 113
Renal ultrasound.: 106
CTA of the chest with and without intravenous contrast.: 91
Bilateral lower extremity DVT study.: 79
PA AND LATERAL CHEST RADIOGRAPHS.: 78
CT of the head without intravenous contrast.: 73
PA and lateral chest radiograph.: 72
Sing

In [1]:
for label, n in technique_counts.head(100).items():
    print(f'{label}: {n}')

NameError: name 'technique_counts' is not defined