In [3]:
from datasketch import MinHash, MinHashLSH
from multiprocessing import Pool, cpu_count
import pandas as pd
from tqdm import tqdm  # For progress tracking
import re

tqdm.pandas()

In [4]:
 cxr_reports = pd.read_csv("/scratch/baj321/cxr_reports.csv")
mimic_notes = pd.read_csv("/scratch/baj321/MIMIC-Note/physionet.org/files/mimic-iv-note/2.2/note/radiology.csv")

In [5]:
# Finding subject_ids in mimic_notes but not in cxr_reports
mimic_only_subject_ids = set(mimic_notes['subject_id'].unique()) - set(cxr_reports['subject_id'].unique())

# Finding subject_ids in cxr_reports but not in mimic_notes
cxr_only_subject_ids = set(cxr_reports['subject_id'].unique()) - set(mimic_notes['subject_id'].unique())

mimic_only_subject_ids = list(mimic_only_subject_ids)
cxr_only_subject_ids = list(cxr_only_subject_ids)

print("Number of subject_ids in mimic_notes but not in cxr_reports:", len(mimic_only_subject_ids))
print("Number of subject_ids in cxr_reports but not in mimic_notes:", len(cxr_only_subject_ids))

Number of subject_ids in mimic_notes but not in cxr_reports: 175710
Number of subject_ids in cxr_reports but not in mimic_notes: 3662


In [6]:
def preprocess(text):
    if pd.isnull(text):
        print('text is', text)
        return set()
    
    # Step 1: Remove everything up to the literal string "FINAL REPORT\n"
    match = re.search(r"FINAL REPORT\n", text, re.IGNORECASE)
    if match:
        text = text[match.end():]  # Keep only text after "FINAL REPORT\n"
    text = re.sub(r"___M|___F|___|[.,!?/:;\\]", "", text)
    # Step 3: Remove all formatting: no spaces, literal "\n" strings, or newlines
    text = re.sub(r"\s+|\\n", "", text)  # Remove spaces and literal "\n" strings
    return text
    
def preprocess_mimic_notes(text):
    if pd.isnull(text):
        print('text is', text)
        return set()
        
    text = re.sub(r"___M|___F|___|[.,!?/:;\\]", "", text)
    
    # Step 2: Remove all formatting: no spaces, literal "\n" strings, or newlines
    text = re.sub(r"\s+|\\n", "", text)  # Remove spaces and literal "\n" strings
    return text

In [7]:
common_subject_mimic_notes = mimic_notes[~mimic_notes['subject_id'].isin(mimic_only_subject_ids)]
common_subject_cxr_reports = cxr_reports[~cxr_reports['subject_id'].isin(cxr_only_subject_ids)]

In [8]:
common_subject_mimic_notes['processed_text'] = common_subject_mimic_notes['text'].progress_apply(preprocess_mimic_notes)
common_subject_cxr_reports['processed_text'] = common_subject_cxr_reports['cxr_report_text'].progress_apply(preprocess)

100%|██████████| 1097916/1097916 [02:10<00:00, 8383.73it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
100%|██████████| 215822/215822 [00:13<00:00, 15803.38it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
from tqdm import tqdm

# Initialize tqdm for progress tracking
tqdm.pandas()

# Filter `common_subject_mimic_notes` to keep rows where there's no match in `common_subject_cxr_reports`
filtered_mimic_notes = common_subject_mimic_notes[
    ~common_subject_mimic_notes.progress_apply(
        lambda row: ((common_subject_cxr_reports['subject_id'] == row['subject_id']) &
                     (common_subject_cxr_reports['processed_text'] == row['processed_text'])).any(),
        axis=1
    )
]

# Filter `common_subject_cxr_reports` to keep rows where there's no match in `common_subject_mimic_notes`
filtered_cxr_reports = common_subject_cxr_reports[
    ~common_subject_cxr_reports.progress_apply(
        lambda row: ((common_subject_mimic_notes['subject_id'] == row['subject_id']) &
                     (common_subject_mimic_notes['processed_text'] == row['processed_text'])).any(),
        axis=1
    )
]


 22%|██▏       | 237871/1097916 [1:29:54<5:25:02, 44.10it/s]


KeyboardInterrupt: 

In [None]:
import swifter
from tqdm import tqdm

# Initialize tqdm for progress tracking
tqdm.pandas()

# Filter `common_subject_mimic_notes` to keep rows where there's no match in `common_subject_cxr_reports`
filtered_mimic_notes = common_subject_mimic_notes[
    ~common_subject_mimic_notes.swifter.progress_bar(True).apply(
        lambda row: ((common_subject_cxr_reports['subject_id'] == row['subject_id']) &
                     (common_subject_cxr_reports['processed_text'] == row['processed_text'])).any(),
        axis=1
    )
]

# Filter `common_subject_cxr_reports` to keep rows where there's no match in `common_subject_mimic_notes`
filtered_cxr_reports = common_subject_cxr_reports[
    ~common_subject_cxr_reports.swifter.progress_bar(True).apply(
        lambda row: ((common_subject_mimic_notes['subject_id'] == row['subject_id']) &
                     (common_subject_mimic_notes['processed_text'] == row['processed_text'])).any(),
        axis=1
    )
]


  from .autonotebook import tqdm as notebook_tqdm
Pandas Apply:   1%|          | 7380/1097916 [02:36<6:22:15, 47.55it/s]

In [2]:
pip install swifter

Collecting swifter
  Using cached swifter-1.4.0.tar.gz (1.2 MB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting dask[dataframe]>=2.10.0
  Downloading dask-2022.2.0-py3-none-any.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m75.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:02[0mm
Collecting cloudpickle>=1.1.1
  Using cached cloudpickle-2.2.1-py3-none-any.whl (25 kB)
Collecting pyyaml>=5.3.1
  Using cached PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (670 kB)
Collecting fsspec>=0.6.0
  Using cached fsspec-2023.1.0-py3-none-any.whl (143 kB)
Collecting toolz>=0.8.2
  Downloading toolz-0.12.1-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.1/56.1 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting partd>=0.3.10
  Downloading partd-1.4.1-py3-none-any.whl (18 kB)
Collecting locket
  Using cached locket-1.0.0-py2.py3-none-any.whl (4.4 kB)
Building wheels fo