In [1]:
from datasketch import MinHash, MinHashLSH
from multiprocessing import Pool, cpu_count
import pandas as pd
from tqdm import tqdm  # For progress tracking
tqdm.pandas() 
cxr_reports = pd.read_csv("/scratch/baj321/cxr_reports.csv")
mimic_notes = pd.read_csv("/scratch/baj321/MIMIC-Note/physionet.org/files/mimic-iv-note/2.2/note/radiology.csv")

In [112]:
len(mimic_notes)

2321355

In [2]:
from datasketch import MinHash, MinHashLSH
from multiprocessing import Pool, cpu_count
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
import re
def preprocess(text):
    if pd.isnull(text):
        return set()
    
    # Step 1: Remove everything up to the literal string "FINAL REPORT\n"
    match = re.search(r"FINAL REPORT\n", text, re.IGNORECASE)
    if match:
        text = text[match.end():]  # Keep only text after "FINAL REPORT\n"
    # else:
    #     return set()  # If "FINAL REPORT\n" is missing, return an empty set
    
    # Step 2: Remove unwanted special codes
    #text = re.sub(r"[^A-Za-z0-9]", "", text)
    text = re.sub(r"___M|___F|___|[.,!?/:;\\]", "", text)
    # Step 3: Remove all formatting: no spaces, literal "\n" strings, or newlines
    text = re.sub(r"\s+|\\n", "", text)  # Remove spaces and literal "\n" strings
    return text
def preprocess_mimic_notes(text):
    if pd.isnull(text):
        return set()
    
    # Step 1: Remove unwanted special codes
    #text = re.sub(r"[^A-Za-z0-9]", "", text)
    text = re.sub(r"___M|___F|___|[.,!?/:;\\]", "", text)
    
    # Step 2: Remove all formatting: no spaces, literal "\n" strings, or newlines
    text = re.sub(r"\s+|\\n", "", text)  # Remove spaces and literal "\n" strings
    return text

In [3]:
cxr_reports['new_text'] = cxr_reports['cxr_report_text'].progress_apply(preprocess)

100%|██████████| 227835/227835 [00:15<00:00, 15168.57it/s]


In [4]:
mimic_notes['new_mimic_text'] = mimic_notes['text'].progress_apply(preprocess_mimic_notes)

100%|██████████| 2321355/2321355 [04:37<00:00, 8373.94it/s]


In [5]:
unique_mimic_notes = mimic_notes[~mimic_notes['new_mimic_text'].isin(cxr_reports['new_text'])]
len(unique_mimic_notes)

2157600

In [6]:
# Find unique notes from mimic_cxr that are not found in mimic_notes
unique_mimic_cxr_notes = cxr_reports[~cxr_reports['new_text'].isin(mimic_notes['new_mimic_text'])]
len(unique_mimic_cxr_notes)

64266

In [9]:
num = 0
print(unique_mimic_cxr_notes.iloc[num].subject_id)
print(unique_mimic_cxr_notes.iloc[num].new_text)
print(unique_mimic_cxr_notes.iloc[num].cxr_report_text)

14591045
EXAMINATIONChestradiographINDICATIONyearoldmanwithlowgradetempsandcoughhypoxiaalsometastaticcancerunknownprimarywlungnodulesevalforpneumoniaTECHNIQUEChestPAandlateralCOMPARISONRadiographFINDINGSAcardiacconductiondeviceiscontiguouswithleadswhichappeartoterminateinrightventricleandbilateralatriaMediansternotomywiresappearintactSurgicalappearunchangedInnumerableroundedpulmonarynodulesareunchangedandconsistentwithmetastaticdiseaseAssessmentforaconsolidationislimitedbynumerousmetastaseshoweverthereappearstobealossofthecardiacsilhouetteandincreasedopacityatthecardiacapexconcerningforpneumoniaVertebralbodyheightsarepreservedIMPRESSIONAssessmentforpneumoniaissomewhatlimitedgiventhepatient'swidespreadmetastaticdiseasehoweverincreasedconsolidationatthecardiacapexandlossofcardiacsilhouetteisconcerningforpneumonia
                                 FINAL REPORT
 EXAMINATION:  Chest radiograph
 
 INDICATION:  ___ year old man with low grade temps and cough/hypoxia, also
 metastatic cancer un

In [23]:
print(unique_mimic_notes[unique_mimic_notes["subject_id"] == 14591045 ].new_mimic_text.iloc[12])

EXAMINATIONChestradiographINDICATIONyearoldmanwithlowgradetempsandcoughhypoxiaalsometastaticcancerunknownprimarywlungnodulesevalforpneumoniaTECHNIQUEChestPAandlateralCOMPARISONRadiographFINDINGSAcardiacconductiondeviceiscontiguouswithleadswhichappeartoterminateinrightventricleandbilateralatriaMediansternotomywiresappearintactSurgicalstaplesappearunchangedInnumerableroundedpulmonarynodulesareunchangedandconsistentwithmetastaticdiseaseAssessmentforaconsolidationislimitedbynumerousmetastaseshoweverthereappearstobealossofthecardiacsilhouetteandincreasedopacityatthecardiacapexconcerningforpneumoniaVertebralbodyheightsarepreservedIMPRESSIONAssessmentforpneumoniaissomewhatlimitedgiventhepatient'swidespreadmetastaticdiseasehoweverincreasedconsolidationatthecardiacapexandlossofcardiacsilhouetteisconcerningforpneumonia


In [13]:
def find_differences(string1, string2):
    # Find the minimum length to avoid index errors
    min_length = min(len(string1), len(string2))
    
    # List to store indices where characters differ
    differing_indices = []
    
    # Compare characters up to the length of the shorter string
    for i in range(min_length):
        if string1[i] != string2[i]:
            differing_indices.append(i)
    
    # If one string is longer, record the remaining indices as differing
    if len(string1) != len(string2):
        differing_indices.extend(range(min_length, max(len(string1), len(string2))))
    
    return differing_indices

# Example usage
string1 = unique_mimic_notes[unique_mimic_notes["subject_id"] == 14591045 ].new_mimic_text.iloc[12]
string2 = unique_mimic_cxr_notes.iloc[0].new_text
differences = find_differences(string1, string2)
print(string1, '\n', string2)
print("Differing indices:", differences)


EXAMINATIONChestradiographINDICATIONyearoldmanwithlowgradetempsandcoughhypoxiaalsometastaticcancerunknownprimarywlungnodulesevalforpneumoniaTECHNIQUEChestPAandlateralCOMPARISONRadiographFINDINGSAcardiacconductiondeviceiscontiguouswithleadswhichappeartoterminateinrightventricleandbilateralatriaMediansternotomywiresappearintactSurgicalstaplesappearunchangedInnumerableroundedpulmonarynodulesareunchangedandconsistentwithmetastaticdiseaseAssessmentforaconsolidationislimitedbynumerousmetastaseshoweverthereappearstobealossofthecardiacsilhouetteandincreasedopacityatthecardiacapexconcerningforpneumoniaVertebralbodyheightsarepreservedIMPRESSIONAssessmentforpneumoniaissomewhatlimitedgiventhepatient'swidespreadmetastaticdiseasehoweverincreasedconsolidationatthecardiacapexandlossofcardiacsilhouetteisconcerningforpneumonia 
 EXAMINATIONChestradiographINDICATIONyearoldmanwithlowgradetempsandcoughhypoxiaalsometastaticcancerunknownprimarywlungnodulesevalforpneumoniaTECHNIQUEChestPAandlateralCOMPARISONR

In [28]:
print(string1[0:335])

EXAMINATIONChestradiographINDICATIONyearoldmanwithlowgradetempsandcoughhypoxiaalsometastaticcancerunknownprimarywlungnodulesevalforpneumoniaTECHNIQUEChestPAandlateralCOMPARISONRadiographFINDINGSAcardiacconductiondeviceiscontiguouswithleadswhichappeartoterminateinrightventricleandbilateralatriaMediansternotomywiresappearintactSurgical


In [29]:
print(string2[0:335])

EXAMINATIONChestradiographINDICATIONyearoldmanwithlowgradetempsandcoughhypoxiaalsometastaticcancerunknownprimarywlungnodulesevalforpneumoniaTECHNIQUEChestPAandlateralCOMPARISONRadiographFINDINGSAcardiacconductiondeviceiscontiguouswithleadswhichappeartoterminateinrightventricleandbilateralatriaMediansternotomywiresappearintactSurgical


In [35]:
print(string1[336:])

taplesappearunchangedInnumerableroundedpulmonarynodulesareunchangedandconsistentwithmetastaticdiseaseAssessmentforaconsolidationislimitedbynumerousmetastaseshoweverthereappearstobealossofthecardiacsilhouetteandincreasedopacityatthecardiacapexconcerningforpneumoniaVertebralbodyheightsarepreservedIMPRESSIONAssessmentforpneumoniaissomewhatlimitedgiventhepatient'swidespreadmetastaticdiseasehoweverincreasedconsolidationatthecardiacapexandlossofcardiacsilhouetteisconcerningforpneumonia


In [34]:
print(string2[336:])

ppearunchangedInnumerableroundedpulmonarynodulesareunchangedandconsistentwithmetastaticdiseaseAssessmentforaconsolidationislimitedbynumerousmetastaseshoweverthereappearstobealossofthecardiacsilhouetteandincreasedopacityatthecardiacapexconcerningforpneumoniaVertebralbodyheightsarepreservedIMPRESSIONAssessmentforpneumoniaissomewhatlimitedgiventhepatient'swidespreadmetastaticdiseasehoweverincreasedconsolidationatthecardiacapexandlossofcardiacsilhouetteisconcerningforpneumonia


In [36]:
unique_mimic_notes[unique_mimic_notes["subject_id"] == 14591045 ].text.iloc[12]

"EXAMINATION:  Chest radiograph\n\nINDICATION:  ___ year old man with low grade temps and cough/hypoxia, also\nmetastatic cancer unknown primary w/ lung nodules  // eval for pneumonia\n\nTECHNIQUE:  Chest PA and lateral\n\nCOMPARISON:  Radiograph ___\n\nFINDINGS: \n\nA cardiac conduction device is contiguous with leads which appear to terminate\nin right ventricle and bilateral atria.  Median sternotomy wires appear\nintact.  Surgical staples appear unchanged.  Innumerable rounded pulmonary\nnodules are unchanged and consistent with metastatic disease.  Assessment for\na consolidation is limited by numerous metastases, however there appears to be\na loss of the cardiac silhouette and increased opacity at the cardiac apex,\nconcerning for pneumonia.    Vertebral body heights are preserved.\n\nIMPRESSION: \n\nAssessment for pneumonia is somewhat limited given the patient's widespread\nmetastatic disease, however increased consolidation at the cardiac apex and\nloss of cardiac silhouette 

In [37]:
unique_mimic_cxr_notes.iloc[0].cxr_report_text

"                                 FINAL REPORT\n EXAMINATION:  Chest radiograph\n \n INDICATION:  ___ year old man with low grade temps and cough/hypoxia, also\n metastatic cancer unknown primary w/ lung nodules  // eval for pneumonia\n \n TECHNIQUE:  Chest PA and lateral\n \n COMPARISON:  Radiograph ___\n \n FINDINGS: \n \n A cardiac conduction device is contiguous with leads which appear to terminate\n in right ventricle and bilateral atria.  Median sternotomy wires appear\n intact.  Surgical ___ appear unchanged.  Innumerable rounded pulmonary\n nodules are unchanged and consistent with metastatic disease.  Assessment for\n a consolidation is limited by numerous metastases, however there appears to be\n a loss of the cardiac silhouette and increased opacity at the cardiac apex,\n concerning for pneumonia.    Vertebral body heights are preserved.\n \n IMPRESSION: \n \n Assessment for pneumonia is somewhat limited given the patient's widespread\n metastatic disease, however increased 

In [10]:
len(unique_cxr_mimic_notes_filtered)

64266

In [7]:
from datasketch import MinHash, MinHashLSH
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
import pandas as pd

# Parameters
similarity_threshold = 0.9  # 90% similarity threshold for MinHashing
num_perm = 128  # Number of permutations for MinHashing

# Function to generate MinHash for a given string
def get_minhash(text, num_perm=num_perm):
    m = MinHash(num_perm=num_perm)
    for i in range(len(text) - 2):  # Using 3-character shingles
        shingle = text[i:i + 3]
        m.update(shingle.encode('utf8'))
    return m

# Initialize LSH for efficient similarity search
lsh = MinHashLSH(threshold=similarity_threshold, num_perm=num_perm)

In [11]:
# Parallel function to insert MinHash into LSH
def insert_to_lsh(i, cxr_text):
    minhash = get_minhash(cxr_text)
    lsh.insert(f"cxr_{i}", minhash)

# Add entries from unique_cxr_mimic_notes to LSH with MinHash signatures in parallel
with ProcessPoolExecutor() as executor:
    futures = [executor.submit(insert_to_lsh, i, cxr_text) 
               for i, cxr_text in unique_mimic_cxr_notes['new_text'].iteritems()]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Inserting into LSH"):
        pass  # Wait for all tasks to complete
print("Keys in LSH:", list(lsh.keys))

Inserting into LSH: 100%|██████████| 64266/64266 [00:25<00:00, 2555.78it/s]


Keys in LSH: []


In [None]:
from concurrent.futures import ThreadPoolExecutor

with ThreadPoolExecutor() as executor:
    futures = [executor.submit(insert_to_lsh, i, cxr_text) 
               for i, cxr_text in unique_mimic_cxr_notes['new_text'].iteritems()]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Inserting into LSH"):
        pass

print("Keys in LSH:", len(list(lsh.keys)))


In [13]:
print("Original DataFrame length:", len(unique_mimic_cxr_notes))
print("Keys in LSH:", len(list(lsh.keys)))

Original DataFrame length: 64266
Keys in LSH: 64266


In [14]:
# Lists to store indices to remove
remove_indices_mimic_notes = []
remove_indices_cxr_mimic_notes = set()

# Parallel function to find similar entries
def find_similar_entries(i, mimic_text):
    minhash = get_minhash(mimic_text)
    similar_cxr_ids = lsh.query(minhash)
    return i, similar_cxr_ids

# Use parallel processing to find similar entries and track indices to remove
with ProcessPoolExecutor() as executor:
    futures = [executor.submit(find_similar_entries, i, mimic_text) 
               for i, mimic_text in enumerate(unique_mimic_notes['new_mimic_text'])]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Finding similar entries"):
        i, similar_cxr_ids = future.result()
        if similar_cxr_ids:
            remove_indices_mimic_notes.append(i)
            for cxr_id in similar_cxr_ids:
                remove_indices_cxr_mimic_notes.add(int(cxr_id.split('_')[1]))

Finding similar entries: 100%|██████████| 2157600/2157600 [24:57<00:00, 1440.74it/s]


In [26]:
# Drop the marked rows from each DataFrame
print(len(unique_mimic_notes))
unique_mimic_notes = unique_mimic_notes.reset_index(drop=True)
unique_mimic_notes_filtered = unique_mimic_notes.drop(index=remove_indices_mimic_notes).reset_index(drop=True)
unique_cxr_mimic_notes_filtered = unique_mimic_cxr_notes.drop(index=remove_indices_cxr_mimic_notes).reset_index(drop=True)

# Display the filtered DataFrames
unique_mimic_notes_filtered.reset_index(drop=True, inplace=True)
unique_cxr_mimic_notes_filtered.reset_index(drop=True, inplace=True)

2157600


In [27]:
len(unique_cxr_mimic_notes_filtered)

19394

In [62]:
num = 9
print(unique_cxr_mimic_notes_filtered.iloc[num].subject_id)
print(unique_cxr_mimic_notes_filtered.iloc[num].new_text)
print(unique_cxr_mimic_notes_filtered.iloc[num].cxr_report_text)

14797982
HISTORY-year-oldfemalewithcoughandwheezingEvaluateforpneumoniaCOMPARISONRadiographsofthechestdatedonandFINDINGSFrontalandlateralradiographsofthechestdemonstratewellexpandedclearlungsThecardiomediastinalandhilarcontoursareunremarkableThereisnopleuraleffusionorpneumothoraxIMPRESSIONNoevidenceofpneumonia
                                 FINAL REPORT
 HISTORY:  ___-year-old female with cough and wheezing.  Evaluate for pneumonia.
 
 COMPARISON:  Radiographs of the chest dated on ___, ___,
 and ___.
 
 FINDINGS:
 
 Frontal and lateral radiographs of the chest demonstrate well expanded, clear
 lungs.  The cardiomediastinal and hilar contours are unremarkable.  There is
 no pleural effusion or pneumothorax.
 
 IMPRESSION:
 
 No evidence of pneumonia.



In [110]:
print(unique_mimic_notes_filtered[unique_mimic_notes_filtered["subject_id"] == 14797982 ].new_mimic_text.iloc[39])

IndexError: single positional indexer is out-of-bounds

In [25]:
# Check the type and range of the index in unique_mimic_notes
print("Index type:", unique_mimic_notes.index.dtype)
print("First 10 index values:", unique_mimic_notes.index[:10])
print("Last 10 index values:", unique_mimic_notes.index[-10:])
print("Is index monotonically increasing?", unique_mimic_notes.index.is_monotonic_increasing)

# Convert remove_indices_mimic_notes to a set for faster comparison
remove_indices_set = set(remove_indices_mimic_notes)
# Find any indices in remove_indices_mimic_notes that are missing in unique_mimic_notes.index
missing_indices = [idx for idx in remove_indices_set if idx not in unique_mimic_notes.index]
print("Missing indices (if any):", len(missing_indices))


Index type: int64
First 10 index values: Int64Index([1, 2, 3, 4, 5, 6, 8, 9, 10, 11], dtype='int64')
Last 10 index values: Int64Index([2321343, 2321344, 2321345, 2321346, 2321347, 2321348, 2321350,
            2321351, 2321352, 2321354],
           dtype='int64')
Is index monotonically increasing? True
Missing indices (if any): 3308


In [14]:
# Calculate Jaccard similarity for the provided strings using 3-character shingles
from datasketch import MinHash
from sklearn.metrics import jaccard_score
import numpy as np

# Provided strings
text1 = ("EXAMINATIONChestradiographINDICATIONyearoldmanwithlowgradetempsandcoughhypoxiaalsometastaticcancer"
         "unknownprimarywlungnodulesevalforpneumoniaTECHNIQUEChestPAandlateralCOMPARISONRadiographFINDINGS"
         "Acardiacconductiondeviceiscontiguouswithleadswhichappeartoterminateinrightventricleandbilateralatria"
         "MediansternotomywiresappearintactSurgicalstaplesappearunchangedInnumerableroundedpulmonarynodules"
         "areunchangedandconsistentwithmetastaticdiseaseAssessmentforaconsolidationislimitedbynumerous"
         "metastaseshoweverthereappearstobealossofthecardiacsilhouetteandincreasedopacityatthecardiacapex"
         "concerningforpneumoniaVertebralbodyheightsarepreservedIMPRESSIONAssessmentforpneumoniaissomewhat"
         "limitedgiventhepatient'swidespreadmetastaticdiseasehoweverincreasedconsolidationatthecardiacapex"
         "andlossofcardiacsilhouetteisconcerningforpneumonia")

text2 = ("EXAMINATIONChestradiographINDICATIONyearoldmanwithlowgradetempsandcoughhypoxiaalsometastaticcancer"
         "unknownprimarywlungnodulesevalforpneumoniaTECHNIQUEChestPAandlateralCOMPARISONRadiographFINDINGS"
         "Acardiacconductiondeviceiscontiguouswithleadswhichappeartoterminateinrightventricleandbilateralatria"
         "MediansternotomywiresappearintactSurgicalappearunchangedInnumerableroundedpulmonarynodulesareunchanged"
         "andconsistentwithmetastaticdiseaseAssessmentforaconsolidationislimitedbynumerousmetastaseshowever"
         "thereappearstobealossofthecardiacsilhouetteandincreasedopacityatthecardiacapexconcerningforpneumonia"
         "VertebralbodyheightsarepreservedIMPRESSIONAssessmentforpneumoniaissomewhatlimitedgiventhepatient's"
         "widespreadmetastaticdiseasehoweverincreasedconsolidationatthecardiacapexandlossofcardiacsilhouette"
         "isconcerningforpneumonia")

# Function to create shingles and MinHash for a text
def create_minhash(text, num_perm=128, shingle_len=3):
    m = MinHash(num_perm=num_perm)
    for i in range(len(text) - shingle_len + 1):  # Generate shingles of length 3
        shingle = text[i:i + shingle_len]
        m.update(shingle.encode('utf8'))
    return m

# Create MinHashes for both texts
minhash1 = create_minhash(string1)
minhash2 = create_minhash(string2)

# Estimate Jaccard similarity using MinHash similarity
jaccard_similarity = minhash1.jaccard(minhash2)
jaccard_similarity


1.0

In [20]:
print(len(remove_indices_mimic_notes))
print(len(set(remove_indices_mimic_notes)))

46657
46657


In [21]:
print(len(remove_indices_cxr_mimic_notes))
print(len(set(remove_indices_cxr_mimic_notes)))

44872
44872


In [9]:
print("Keys in LSH:", list(lsh.keys))


Keys in LSH: []


In [None]:
def insert_to_lsh(i, cxr_text):
    minhash = get_minhash(cxr_text)
    lsh.insert(f"cxr_{i}", minhash)
    print(f"Inserted cxr_{i} with MinHash values: {minhash.hashvalues}")
with ProcessPoolExecutor() as executor:
    futures = [executor.submit(insert_to_lsh, i, cxr_text) 
               for i, cxr_text in unique_mimic_cxr_notes['new_text'].iteritems()]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Inserting into LSH"):
        pass  # Wait for all tasks to complete