In [1]:
pip install tqdm

Collecting tqdm
  Downloading tqdm-4.66.6-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.3/78.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.66.6
Note: you may need to restart the kernel to use updated packages.


In [None]:
from datasketch import MinHash, MinHashLSH
from multiprocessing import Pool, cpu_count
import pandas as pd
from tqdm import tqdm  # For progress tracking

# Parameters
NUM_PERMUTATIONS = 64  # Permutations per MinHash signature; reduced for speed
JACCARD_THRESHOLD = 0.95  # High similarity threshold passed to MinHashLSH
# Chunk size handed to Pool.imap. Each chunk is one task for one worker, and its
# results only reach the progress bar once the whole chunk has finished. The
# previous value of 100000 split the 227,835 CXR rows into just three tasks (so
# at most three workers did any work) and left the MIMIC progress bar at 0% until
# the first 100,000 rows were done. A smaller chunk keeps all workers busy and
# the progress bars moving; the computed results are unchanged.
BATCH_SIZE = 1000

# MinHash signature creation function
def create_minhash_signature(tokens):
    """Build a MinHash signature from an iterable of string tokens.

    Every token is UTF-8 encoded and fed into a MinHash created with
    NUM_PERMUTATIONS permutations. The populated MinHash object is returned.
    """
    signature = MinHash(num_perm=NUM_PERMUTATIONS)
    encoded_tokens = (tok.encode('utf8') for tok in tokens)
    for raw in encoded_tokens:
        signature.update(raw)
    return signature

# Preprocess and tokenize text
def preprocess_and_tokenize(text):
    """Split text on whitespace and return its distinct tokens as a set.

    Values that pd.isnull reports as missing (e.g. None or NaN) yield an
    empty set instead of being split.
    """
    return set() if pd.isnull(text) else set(text.split())

# Helper function to process a batch of data
def process_batch_min_hash(data):
    """Apply create_minhash_signature to every element of `data`.

    `data` must provide an `.apply` method; presumably it is a pandas Series
    of token sets (confirm against callers). Whatever `.apply` returns is
    passed back unchanged.
    """
    signatures = data.apply(create_minhash_signature)
    return signatures

# ---------------------------------------------------------------------------
# Load both report tables and tokenize their free-text columns
# ---------------------------------------------------------------------------
print("Loading datasets...")
cxr_reports = pd.read_csv("/scratch/baj321/cxr_reports.csv")  # Adjust path to file
mimic_notes = pd.read_csv("/scratch/baj321/MIMIC-Note/physionet.org/files/mimic-iv-note/2.2/note/radiology.csv")  # Adjust path to file

print("Tokenizing text in cxr_reports and mimic_notes...")
for frame, text_column in ((cxr_reports, 'cxr_report_text'), (mimic_notes, 'text')):
    frame['tokens'] = frame[text_column].apply(preprocess_and_tokenize)


def _minhash_column(token_sets, workers, label):
    """Compute one MinHash per token set in a worker pool, in input order.

    Results are collected through Pool.imap (ordered) with BATCH_SIZE as the
    chunksize, wrapped in a tqdm bar titled `label`.
    """
    with Pool(workers) as pool:
        lazy_signatures = pool.imap(create_minhash_signature, token_sets, BATCH_SIZE)
        return list(tqdm(lazy_signatures, total=len(token_sets), desc=label))


cpu_cores = cpu_count()

# ---------------------------------------------------------------------------
# MinHash signatures for cxr_reports, then the LSH index built from them
# ---------------------------------------------------------------------------
print("Generating MinHash signatures for cxr_reports...")
cxr_reports['minhash'] = _minhash_column(cxr_reports['tokens'], cpu_cores, "CXR MinHash Generation")
print("Completed MinHash signature generation for cxr_reports.")

print("Building the LSH index for cxr_reports...")
lsh = MinHashLSH(threshold=JACCARD_THRESHOLD, num_perm=NUM_PERMUTATIONS)
index_progress = tqdm(enumerate(cxr_reports['minhash']), total=len(cxr_reports), desc="Building LSH Index")
for position, signature in index_progress:
    # Keys encode the positional row number within cxr_reports.
    lsh.insert(f"cxr_{position}", signature)
print("LSH index built for cxr_reports.")

# ---------------------------------------------------------------------------
# MinHash signatures for mimic_notes (same parallel routine as above)
# ---------------------------------------------------------------------------
print("Generating MinHash signatures for mimic_notes...")
mimic_notes['minhash'] = _minhash_column(mimic_notes['tokens'], cpu_cores, "MIMIC MinHash Generation")
print("Completed MinHash signature generation for mimic_notes.")
# Function to check similarity for a single MinHash signature
def is_similar(mimic_report_minhash):
    """Tell whether the LSH index holds at least one candidate for this signature.

    Queries the module-level `lsh` index with the given MinHash and returns
    True when the query result is non-empty, False otherwise.
    """
    matches = lsh.query(mimic_report_minhash)
    return len(matches) != 0

# Check for similarity in batches with progress tracking
print("Checking similarity of mimic_notes against cxr_reports in batches...")
with Pool(cpu_cores) as pool:
    # Ordered imap keeps every flag aligned with its row in mimic_notes.
    mimic_notes['is_similar'] = list(tqdm(pool.imap(is_similar, mimic_notes['minhash'], BATCH_SIZE),
                                          total=len(mimic_notes),
                                          desc="Similarity Check"))
print("Similarity check completed.")

# Filter out similar reports to get unique reports in mimic_notes
print("Filtering unique mimic_notes that are not in cxr_reports...")
unique_mimic_notes = mimic_notes[~mimic_notes['is_similar']]

# Display or save results
print("Displaying unique mimic_notes...")
print(unique_mimic_notes.head())  # Display the first few rows

# Keep the destination in one variable so the confirmation message always names
# the file that was actually written (the old message reported a different
# filename than the one passed to to_csv).
unique_notes_path = "/scratch/baj321/unique_mimic_notes.csv"
# NOTE(review): the saved frame still carries the helper columns 'tokens',
# 'minhash' and 'is_similar'; consider dropping them before export if
# downstream readers do not need them.
unique_mimic_notes.to_csv(unique_notes_path, index=False)  # Save to a CSV file for further analysis
print(f"Unique mimic_notes saved to '{unique_notes_path}'.")


Loading datasets...
Tokenizing text in cxr_reports and mimic_notes...
Generating MinHash signatures for cxr_reports...


CXR MinHash Generation: 100%|██████████| 227835/227835 [02:43<00:00, 1396.16it/s]


Completed MinHash signature generation for cxr_reports.
Building the LSH index for cxr_reports...


Building LSH Index: 100%|██████████| 227835/227835 [00:02<00:00, 111734.43it/s]

LSH index built for cxr_reports.
Generating MinHash signatures for mimic_notes...



MIMIC MinHash Generation:   0%|          | 0/2321355 [00:00<?, ?it/s]

In [None]:
# Re-run of the signature and similarity steps with finer-grained progress.
#
# Every result list below is written back into a DataFrame column by position,
# so results must come back in the same order as the rows. Pool.imap_unordered
# yields results in completion order, which would attach signatures and
# similarity flags to the wrong rows. Pool.imap preserves input order.
#
# A moderate chunksize avoids one inter-process round trip per row while still
# updating the progress bar frequently.
IMAP_CHUNK_SIZE = 1000

# Generate MinHash signatures for cxr_reports with progress tracking
print("Generating MinHash signatures for cxr_reports...")
cpu_cores = cpu_count()
with Pool(cpu_cores) as pool:
    cxr_reports['minhash'] = list(tqdm(pool.imap(create_minhash_signature, cxr_reports['tokens'], IMAP_CHUNK_SIZE),
                                       total=len(cxr_reports),
                                       desc="CXR MinHash Generation"))
print("Completed MinHash signature generation for cxr_reports.")

# Generate MinHash signatures for mimic_notes in parallel with progress tracking
print("Generating MinHash signatures for mimic_notes...")
with Pool(cpu_cores) as pool:
    mimic_notes['minhash'] = list(tqdm(pool.imap(create_minhash_signature, mimic_notes['tokens'], IMAP_CHUNK_SIZE),
                                       total=len(mimic_notes),
                                       desc="MIMIC MinHash Generation"))
print("Completed MinHash signature generation for mimic_notes.")

# Check for similarity in batches with progress tracking
# NOTE(review): is_similar queries the module-level `lsh` index; this cell does
# not rebuild it, so it relies on the index created in the earlier cell.
print("Checking similarity of mimic_notes against cxr_reports in batches...")
with Pool(cpu_cores) as pool:
    mimic_notes['is_similar'] = list(tqdm(pool.imap(is_similar, mimic_notes['minhash'], IMAP_CHUNK_SIZE),
                                          total=len(mimic_notes),
                                          desc="Similarity Check"))
print("Similarity check completed.")
