In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# Location of the MIMIC-IV-Note radiology notes CSV
file_path = '/scratch/baj321/MIMIC-Note/physionet.org/files/mimic-iv-note/2.2/note/radiology.csv'
# Read the complete file into memory (no row limit or sampling is applied)
mimic_note_df = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows of the loaded table
print("First few rows of the dataframe (sample):")
display(mimic_note_df.head(5))

First few rows of the dataframe (sample):


Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text
0,10000032-RR-14,10000032,22595853.0,RR,14,2180-05-06 21:19:00,2180-05-06 23:32:00,EXAMINATION: CHEST (PA AND LAT)\n\nINDICATION...
1,10000032-RR-15,10000032,22595853.0,RR,15,2180-05-06 23:00:00,2180-05-06 23:26:00,EXAMINATION: LIVER OR GALLBLADDER US (SINGLE ...
2,10000032-RR-16,10000032,22595853.0,RR,16,2180-05-07 09:55:00,2180-05-07 11:15:00,"INDICATION: ___ HCV cirrhosis c/b ascites, hi..."
3,10000032-RR-18,10000032,,RR,18,2180-06-03 12:46:00,2180-06-03 14:01:00,EXAMINATION: Ultrasound-guided paracentesis.\...
4,10000032-RR-20,10000032,,RR,20,2180-07-08 13:18:00,2180-07-08 14:15:00,EXAMINATION: Paracentesis\n\nINDICATION: ___...


In [2]:
# Location of the CXR report CSV (reuses the name file_path from the previous cell)
file_path = '/scratch/baj321/cxr_reports.csv'
# Read the complete file into memory (no row limit or sampling is applied)
mimic_cxr_reports = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first five rows of the loaded table
print("First few rows of the dataframe (sample):")
display(mimic_cxr_reports.head(5))

First few rows of the dataframe (sample):


Unnamed: 0,subject_id,study_id,cxr_report_text
0,14591045,57685699,FINAL REPORT\...
1,14591045,52357049,FINAL REPORT\...
2,14591045,51433247,FINAL REPORT\...
3,14591045,57096030,WET READ: ___ ___ ___ 8:11 AM\n \n Tiny lef...
4,14591045,56519652,FINAL REPORT\...


In [3]:
# Number of rows in the radiology notes table
mimic_note_df.shape[0]

2321355

In [None]:
import dask.dataframe as dd
from dask import delayed
from Levenshtein import distance

# Reference texts: every distinct, non-null CXR report, held in a set
cxr_report_texts_set = set(mimic_cxr_reports['cxr_report_text'].dropna().unique())
# Minimum normalized similarity at which a note counts as matching a report
similarity_threshold = 0.9
# Split the in-memory notes table into 20 Dask partitions
mimic_note_df_dd = dd.from_pandas(mimic_note_df, npartitions=20)

print("Loaded")

# Lazy (Dask-delayed) normalized Levenshtein similarity between two strings
@delayed
def levenshtein_similarity(text1, text2):
    """Return ``1 - distance / max(len(text1), len(text2))``.

    When both inputs have length 0 the result is 1. Because of the
    ``@delayed`` decorator, calling this function only builds a Dask task;
    the numeric value is obtained by calling ``.compute()`` on the result.
    """
    edit_count = distance(text1, text2)
    longest = max(len(text1), len(text2))
    if longest > 0:
        return 1 - (edit_count / longest)
    return 1

# Define a delayed function to check if a note is unique based on the similarity threshold
@delayed
def is_unique_note_delayed(note_text, cxr_report_texts_set, threshold):
    """Return True when no report in the set is at least ``threshold`` similar.

    Similarity is ``1 - levenshtein_distance / max(len(a), len(b))`` and is
    defined as 1 when both strings are empty.

    The similarity is computed directly with ``distance`` rather than through
    another delayed call, so no nested Dask graph is built and computed for
    every note/report comparison inside this task.

    Args:
        note_text: The note to test (a string).
        cxr_report_texts_set: Iterable of reference report strings.
        threshold: Similarity at or above which a report counts as a match.

    Returns:
        False as soon as one sufficiently similar report is found, else True.
        The value is wrapped by ``@delayed``; call ``.compute()`` to get it.
    """
    note_len = len(note_text)
    for report_text in cxr_report_texts_set:
        report_len = len(report_text)
        max_len = max(note_len, report_len)
        if max_len == 0:
            similarity = 1
        else:
            # The edit distance is never smaller than the length difference,
            # so this is an upper bound on the similarity. If even the bound
            # is below the threshold, the expensive distance call is skipped
            # without changing the outcome.
            if 1 - (abs(note_len - report_len) / max_len) < threshold:
                continue
            similarity = 1 - (distance(note_text, report_text) / max_len)
        if similarity >= threshold:
            return False  # Not unique if any report is similar enough
    return True  # Unique if no similar report found

# Partition-level filter intended for use with Dask's map_partitions
def check_uniqueness_partition(partition, cxr_report_texts_set, threshold):
    """Return the rows of ``partition`` whose 'text' value is flagged unique."""
    def _note_is_unique(note):
        # is_unique_note_delayed is lazy, so each call is computed to get a bool
        return is_unique_note_delayed(note, cxr_report_texts_set, threshold).compute()

    keep_mask = partition['text'].apply(_note_is_unique)
    return partition[keep_mask]

# Build the lazy, per-partition filtering step (nothing is executed yet)
mimic_notes_unique_dd = mimic_note_df_dd.map_partitions(
    check_uniqueness_partition,
    cxr_report_texts_set=cxr_report_texts_set,
    threshold=similarity_threshold,
)

# Execute the graph, collect the result as a pandas DataFrame and show its row count
mimic_notes_unique = mimic_notes_unique_dd.compute()
len(mimic_notes_unique)


Loaded


In [None]:
# Destination CSV for the filtered notes
output_path = '/scratch/baj321/other_reports.csv'  # Replace with your desired file path
# index=False keeps the DataFrame's row index out of the written file
mimic_notes_unique.to_csv(path_or_buf=output_path, index=False)

print(f"DataFrame has been saved to {output_path}")


In [4]:
pip install Levenshtein

Collecting Levenshtein
  Downloading Levenshtein-0.21.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (171 kB)
[K     |████████████████████████████████| 171 kB 8.9 MB/s eta 0:00:01
[?25hCollecting rapidfuzz<4.0.0,>=2.3.0
  Downloading rapidfuzz-2.11.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 52.2 MB/s eta 0:00:01
[?25hInstalling collected packages: rapidfuzz, Levenshtein
Successfully installed Levenshtein-0.21.1 rapidfuzz-2.11.1
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
from Levenshtein import distance

# Assume mimic_cxr_reports and mimic_note_df are your DataFrames
# cxr_report_texts_set = set(mimic_cxr_reports['cxr_report_text'].dropna().unique())
# mimic_note_df['text'] contains the notes to compare

# Normalized Levenshtein similarity helper
def levenshtein_similarity(text1, text2):
    """Return ``1 - distance / max(len(text1), len(text2))``.

    Identical strings give 1. When both strings have length 0 the result
    is 1 as well, which avoids a division by zero.
    """
    edits = distance(text1, text2)
    denominator = max(len(text1), len(text2))
    if denominator == 0:
        return 1
    return 1 - (edits / denominator)

# Minimum normalized similarity (0.8 = 80%) at which a note is treated as
# matching a CXR report; read by the filtering expression later in this cell.
# NOTE(review): the Dask cell earlier in this notebook assigns 0.9 to the same
# name, so whichever cell ran last decides the value in the kernel.
similarity_threshold = 0.8

# Uniqueness test for one note against the whole reference set
def is_unique_note(note_text, cxr_report_texts_set, threshold):
    """Return True when no report reaches ``threshold`` similarity to the note.

    Reports are examined one at a time and the scan stops at the first
    report whose similarity is greater than or equal to ``threshold``.
    """
    has_close_match = any(
        levenshtein_similarity(note_text, candidate) >= threshold
        for candidate in cxr_report_texts_set
    )
    return not has_close_match

# Build the reference set in this cell so it does not depend on another cell
# having defined cxr_report_texts_set in the kernel beforehand
cxr_report_texts_set = set(mimic_cxr_reports['cxr_report_text'].dropna().unique())

# Keep only the notes for which no CXR report reaches the similarity threshold
mimic_notes_unique = mimic_note_df[mimic_note_df['text'].apply(lambda note: is_unique_note(note, cxr_report_texts_set, similarity_threshold))]

# Display the number of notes that were kept
len(mimic_notes_unique)

In [None]:
from datasketch import MinHash, MinHashLSH
from multiprocessing import Pool, cpu_count
import pandas as pd

# Parameters
NUM_PERMUTATIONS = 64  # num_perm used for every MinHash and for the LSH index; reduced for speed
JACCARD_THRESHOLD = 0.95  # threshold given to MinHashLSH (high similarity required)
BATCH_SIZE = 100000  # passed to Pool.map as its third positional argument, i.e. the chunksize

# Build the MinHash signature for one collection of tokens
def create_minhash_signature(tokens):
    """Return a MinHash (``num_perm=NUM_PERMUTATIONS``) updated with each token.

    Every token is encoded to bytes with the 'utf8' codec before being fed
    to the sketch, so ``tokens`` must be an iterable of strings.
    """
    signature = MinHash(num_perm=NUM_PERMUTATIONS)
    for tok in tokens:
        signature.update(tok.encode('utf8'))
    return signature

# Turn one text into its set of whitespace-separated tokens
def preprocess_and_tokenize(text):
    """Return the set of distinct whitespace-delimited tokens in ``text``.

    A null value (as judged by ``pd.isnull``) yields an empty set. No
    lowercasing or punctuation stripping is applied: tokens are kept exactly
    as they appear, so 'Report' and 'report' are different tokens.
    """
    return set() if pd.isnull(text) else set(text.split())

# Batch helper: one MinHash signature per element
def process_batch_min_hash(data):
    """Apply ``create_minhash_signature`` to every element of ``data``.

    ``data`` must expose ``.apply`` (presumably a pandas Series of token
    sets; this helper is not called in the visible cells, so confirm).
    """
    signatures = data.apply(create_minhash_signature)
    return signatures

# Read both tables from disk
print("Loading datasets...")
cxr_reports = pd.read_csv("/scratch/baj321/cxr_reports.csv")  # Adjust path to file
mimic_notes = pd.read_csv("/scratch/baj321/MIMIC-Note/physionet.org/files/mimic-iv-note/2.2/note/radiology.csv")  # Adjust path to file

# Add a 'tokens' column (set of whitespace-separated tokens) to each table
print("Tokenizing text in cxr_reports and mimic_notes...")
cxr_reports['tokens'] = cxr_reports['cxr_report_text'].apply(preprocess_and_tokenize)
mimic_notes['tokens'] = mimic_notes['text'].apply(preprocess_and_tokenize)

# Sign the CXR reports in a pool with one worker per CPU core.
# BATCH_SIZE is the chunksize: each task sent to a worker holds up to that
# many rows, so the pool receives about len(rows) / BATCH_SIZE tasks.
print("Generating MinHash signatures for cxr_reports...")
cpu_cores = cpu_count()
with Pool(processes=cpu_cores) as pool:
    cxr_reports['minhash'] = pool.map(create_minhash_signature, cxr_reports['tokens'], chunksize=BATCH_SIZE)
print("Completed MinHash signature generation for cxr_reports.")

# Index every CXR signature under a key derived from its position in the column
print("Building the LSH index for cxr_reports...")
lsh = MinHashLSH(threshold=JACCARD_THRESHOLD, num_perm=NUM_PERMUTATIONS)
for idx, minhash in enumerate(cxr_reports['minhash']):
    lsh.insert(f"cxr_{idx}", minhash)
print("LSH index built for cxr_reports.")

# Sign the notes the same way as the reports
print("Generating MinHash signatures for mimic_notes...")
with Pool(processes=cpu_cores) as pool:
    mimic_notes['minhash'] = pool.map(create_minhash_signature, mimic_notes['tokens'], chunksize=BATCH_SIZE)
print("Completed MinHash signature generation for mimic_notes.")

# LSH membership test for one note signature
def is_similar(mimic_report_minhash):
    """Return True when the module-level ``lsh`` index returns at least one
    candidate key for the given MinHash signature, otherwise False."""
    candidate_keys = lsh.query(mimic_report_minhash)
    return bool(candidate_keys)

# Query the LSH index for every note signature, in parallel chunks of BATCH_SIZE
print("Checking similarity of mimic_notes against cxr_reports in batches...")
with Pool(cpu_cores) as pool:
    mimic_notes['is_similar'] = pool.map(is_similar, mimic_notes['minhash'], BATCH_SIZE)
print("Similarity check completed.")

# Keep only the notes for which the index returned no candidate report
print("Filtering unique mimic_notes that are not in cxr_reports...")
unique_mimic_notes = mimic_notes[~mimic_notes['is_similar']]

# Display and save results
print("Displaying unique mimic_notes...")
print(unique_mimic_notes.head())  # Display the first few rows
# One variable feeds both the write and the log line, so the reported path
# is always the path that was actually written.
unique_notes_path = "/scratch/baj321/unique_mimic_notes.csv"
# NOTE(review): the helper columns 'tokens', 'minhash' and 'is_similar' are
# still part of unique_mimic_notes here and are written to the CSV too; drop
# them before saving if only the original note columns are wanted.
unique_mimic_notes.to_csv(unique_notes_path, index=False)  # Save to a CSV file for further analysis
print(f"Unique mimic_notes saved to '{unique_notes_path}'.")


Loading datasets...
Tokenizing text in cxr_reports and mimic_notes...
Generating MinHash signatures for cxr_reports...
Completed MinHash signature generation for cxr_reports.
Building the LSH index for cxr_reports...
LSH index built for cxr_reports.
Generating MinHash signatures for mimic_notes...


In [2]:
pip install datasketch

Collecting datasketch
  Downloading datasketch-1.6.5-py3-none-any.whl (89 kB)
[K     |████████████████████████████████| 89 kB 5.0 MB/s eta 0:00:011
Installing collected packages: datasketch
Successfully installed datasketch-1.6.5
Note: you may need to restart the kernel to use updated packages.
