In [1]:
from datasketch import MinHash, MinHashLSH
from multiprocessing import Pool, cpu_count
import pandas as pd
from tqdm import tqdm  # For progress tracking
tqdm.pandas() 

# Parameters
NUM_PERMUTATIONS = 128 
JACCARD_THRESHOLD = 0.75  
BATCH_SIZE = 100000  # Adjust batch size based on available memory and CPU

# MinHash signature creation function
def create_minhash_signature(tokens):
    m = MinHash(num_perm=NUM_PERMUTATIONS)
    for token in tokens:
        m.update(token.encode('utf8'))
    return m

# Preprocess and tokenize text
def preprocess_and_tokenize(text):
    if pd.isnull(text):
        return set()
    text = text.lower()
    text = text.replace("\n", " ").replace("final report", "")
    text = ' '.join(text.split())  # Removes extra spaces
    tokens = text.split()
    return set(tokens)

# Helper function to process a batch of data
def process_batch_min_hash(data):
    return data.apply(create_minhash_signature)

# Load datasets and apply tokenization
print("Loading datasets...")
cxr_reports = pd.read_csv("/scratch/baj321/cxr_reports.csv")  # Adjust path to file
mimic_notes = pd.read_csv("/scratch/baj321/MIMIC-Note/physionet.org/files/mimic-iv-note/2.2/note/radiology.csv")  # Adjust path to file

print("Tokenizing text in cxr_reports and mimic_notes...")
cxr_reports['tokens'] = cxr_reports['cxr_report_text'].progress_apply(preprocess_and_tokenize)
mimic_notes['tokens'] = mimic_notes['text'].progress_apply(preprocess_and_tokenize)

# Generate MinHash signatures for cxr_reports with progress tracking
print("Generating MinHash signatures for cxr_reports...")
cpu_cores = cpu_count()
with Pool(cpu_cores) as pool:
    cxr_reports['minhash'] = list(tqdm(pool.imap_unordered(create_minhash_signature, cxr_reports['tokens']), 
                                       total=len(cxr_reports), 
                                       desc="CXR MinHash Generation"))
print("Completed MinHash signature generation for cxr_reports.")

# Build the LSH index for cxr_reports
print("Building the LSH index for cxr_reports...")
lsh = MinHashLSH(threshold=JACCARD_THRESHOLD, num_perm=NUM_PERMUTATIONS)
for idx, minhash in tqdm(enumerate(cxr_reports['minhash']), total=len(cxr_reports), desc="Building LSH Index"):
    lsh.insert(f"cxr_{idx}", minhash)
print("LSH index built for cxr_reports.")

# Generate MinHash signatures for mimic_notes in parallel with progress tracking
print("Generating MinHash signatures for mimic_notes...")
with Pool(cpu_cores) as pool:
    mimic_notes['minhash'] = list(tqdm(pool.imap_unordered(create_minhash_signature, mimic_notes['tokens']), 
                                       total=len(mimic_notes), 
                                       desc="MIMIC MinHash Generation"))
print("Completed MinHash signature generation for mimic_notes.")

# Function to check similarity for a single MinHash signature
def is_similar(mimic_report_minhash):
    return len(lsh.query(mimic_report_minhash)) > 0

# Check for similarity in batches with progress tracking
print("Checking similarity of mimic_notes against cxr_reports in batches...")
with Pool(cpu_cores) as pool:
    mimic_notes['is_similar'] = list(tqdm(pool.imap_unordered(is_similar, mimic_notes['minhash']), 
                                          total=len(mimic_notes), 
                                          desc="Similarity Check"))
print("Similarity check completed.")

# Filter out similar reports to get unique reports in mimic_notes
print("Filtering unique mimic_notes that are not in cxr_reports...")
unique_mimic_notes = mimic_notes[~mimic_notes['is_similar']]

# Display or save results
print("Displaying unique mimic_notes...")
print(unique_mimic_notes.head())  # Display the first few rows
len(unique_mimic_notes)
unique_mimic_notes.to_csv("/scratch/baj321/unique_mimic_notes.csv", index=False)  # Save to a CSV file for further analysis
print("Unique mimic_notes saved to '/scratch/baj321/unique_mimic_radiology_notes.csv'.")


Loading datasets...
Tokenizing text in cxr_reports and mimic_notes...


100%|██████████| 227835/227835 [00:09<00:00, 24824.52it/s]
100%|██████████| 2321355/2321355 [03:11<00:00, 12094.39it/s]


Generating MinHash signatures for cxr_reports...


CXR MinHash Generation: 100%|██████████| 227835/227835 [00:40<00:00, 5634.72it/s]


Completed MinHash signature generation for cxr_reports.
Building the LSH index for cxr_reports...


Building LSH Index: 100%|██████████| 227835/227835 [00:35<00:00, 6495.70it/s] 

LSH index built for cxr_reports.
Generating MinHash signatures for mimic_notes...



MIMIC MinHash Generation: 100%|██████████| 2321355/2321355 [06:27<00:00, 5995.37it/s] 


Completed MinHash signature generation for mimic_notes.
Checking similarity of mimic_notes against cxr_reports in batches...


Similarity Check: 100%|██████████| 2321355/2321355 [02:33<00:00, 15143.60it/s]


Similarity check completed.
Filtering unique mimic_notes that are not in cxr_reports...
Displaying unique mimic_notes...
          note_id  subject_id     hadm_id note_type  note_seq  \
2  10000032-RR-16    10000032  22595853.0        RR        16   
3  10000032-RR-18    10000032         NaN        RR        18   
5  10000032-RR-21    10000032         NaN        RR        21   
6  10000032-RR-22    10000032  22841357.0        RR        22   
7  10000032-RR-23    10000032  22841357.0        RR        23   

             charttime            storetime  \
2  2180-05-07 09:55:00  2180-05-07 11:15:00   
3  2180-06-03 12:46:00  2180-06-03 14:01:00   
5  2180-06-30 08:16:00  2180-06-30 12:58:00   
6  2180-06-26 17:15:00  2180-06-26 19:28:00   
7  2180-06-26 17:17:00  2180-06-26 17:28:00   

                                                text  \
2  INDICATION:  ___ HCV cirrhosis c/b ascites, hi...   
3  EXAMINATION:  Ultrasound-guided paracentesis.\...   
5  EXAMINATION:  ULTRASOUND INTERVENT

In [None]:
# Build the LSH index for cxr_reports
print("Building the LSH index for cxr_reports...")
lsh = MinHashLSH(threshold=JACCARD_THRESHOLD, num_perm=NUM_PERMUTATIONS)
for idx, minhash in tqdm(enumerate(cxr_reports['minhash']), total=len(cxr_reports), desc="Building LSH Index"):
    lsh.insert(f"cxr_{idx}", minhash)
print("LSH index built for cxr_reports.")

In [3]:
from datasketch import MinHash, MinHashLSH
from multiprocessing import Pool, cpu_count
import pandas as pd
from tqdm import tqdm  # For progress tracking
tqdm.pandas() 
unique_mimic_notes = pd.read_csv("/scratch/baj321/unique_mimic_notes.csv")
cxr_reports = pd.read_csv("/scratch/baj321/cxr_reports.csv")

In [4]:
len(unique_mimic_notes)

2081804

In [87]:
#239551 duplicates detected
len(mimic_notes)

2321355

In [5]:
len(cxr_reports)

227835

In [14]:
num = 5
print(unique_mimic_notes.iloc[num].subject_id)
unique_mimic_notes.iloc[num].text

10000032


'EXAMINATION:  CHEST (PA AND LAT)\n\nINDICATION:  History: ___ with shortness of breath\n\nTECHNIQUE:  Chest PA and lateral\n\nCOMPARISON:  ___\n\nFINDINGS: \n\nThe cardiac, mediastinal and hilar contours are normal. Pulmonary vasculature\nis normal.  Lungs are clear. No pleural effusion or pneumothorax is present.\nMultiple clips are again seen projecting over the left breast.  Remote\nleft-sided rib fractures are also re- demonstrated.\n\nIMPRESSION: \n\nNo acute cardiopulmonary abnormality.\n'

In [64]:
cxr_reports[cxr_reports["subject_id"] == 10000032 ].new_text.iloc[0]

'EXAMINATION:CHEST(PAANDLAT)INDICATION:History:withshortnessofbreathTECHNIQUE:ChestPAandlateralCOMPARISON:___FINDINGS:Thecardiac,mediastinalandhilarcontoursarenormal.Pulmonaryvasculatureisnormal.Lungsareclear.Nopleuraleffusionorpneumothoraxispresent.Multipleclipsareagainseenprojectingovertheleftbreast.Remoteleft-sidedribfracturesarealsore-demonstrated.IMPRESSION:Noacutecardiopulmonaryabnormality.'

In [33]:
#4
num = 6
print(cxr_reports.iloc[num].subject_id)
cxr_reports.iloc[num].cxr_report_text

14591045


' WET READ: ___ ___ 11:10 PM\n  No pneumothorax.  Multiple pulmonary lesions most consistent with metastatic\n  disease.\n ______________________________________________________________________________\n                                 FINAL REPORT\n EXAMINATION:  CHEST (PORTABLE AP)\n \n INDICATION:  ___ year old man with small L PTX after lung biopsy today, has\n chest tube in place, will switch to water seal at 20:00  // Any pneumothorax?\n chest tube placement?*** PATIENT IS BEING TURNED TO WATER SEAL AT 20:00.\n PLEASE DO CXR AROUND 20:30 ***\n \n TECHNIQUE:  Portable chest\n \n COMPARISON:  ___\n \n FINDINGS: \n \n Compared to the prior study there is no significant interval change.  There is\n a tiny left lateral pneumothorax with left chest tube in place.  Again seen\n are bilateral pulmonary lesions compatible with metastatic disease. \n Mediastinal clips and sternal wires are unchanged.  Left-sided dual lead\n pacemaker is unchanged.\n \n IMPRESSION: \n \n No change.\n'

In [40]:
unique_mimic_notes[unique_mimic_notes["subject_id"] == 14591045 ].text.iloc[6]

'EXAMINATION:  CHEST (PORTABLE AP)\n\nINDICATION:  ___ year old man with small L PTX after lung biopsy today, has\nchest tube in place, will switch to water seal at 20:00  // Any pneumothorax?\nchest tube placement?*** PATIENT IS BEING TURNED TO WATER SEAL AT 20:00.\nPLEASE DO CXR AROUND 20:30 ***\n\nTECHNIQUE:  Portable chest\n\nCOMPARISON:  ___\n\nFINDINGS: \n\nCompared to the prior study there is no significant interval change.  There is\na tiny left lateral pneumothorax with left chest tube in place.  Again seen\nare bilateral pulmonary lesions compatible with metastatic disease. \nMediastinal clips and sternal wires are unchanged.  Left-sided dual lead\npacemaker is unchanged.\n\nIMPRESSION: \n\nNo change.\n'

In [126]:
num = 2
print(cxr_reports.iloc[num].subject_id)
print(cxr_reports.iloc[num].new_text)
print(cxr_reports.iloc[num].cxr_report_text)

14591045
EXAMINATIONCHEST(PORTABLEAP)INDICATIONyearoldmanwithrecentchesttubeforpneumothorax//PleaseassessLpneumothoraxchesttubeanyintervalchange?PleaseassessLpneumothoraxchesttubeanyintervalchange?IMPRESSIONIncomparisonwithstudyof,theleftchesttuberemainsinplaceandthereisnodefinitepneumothoraxMultiplepulmonarymetastasesareagainseenbilaterally
                                 FINAL REPORT
 EXAMINATION:  CHEST (PORTABLE AP)
 
 INDICATION:  ___ year old man with recent chest tube for pneumothorax  //
 Please assess L pneumothorax   chest tube. any interval change?      Please
 assess L pneumothorax   chest tube. any interval change?
 
 IMPRESSION: 
 
 In comparison with study of ___, the left chest tube remains in place
 and there is no definite pneumothorax.  Multiple pulmonary metastases are
 again seen bilaterally.



In [139]:
from datasketch import MinHash, MinHashLSH
from multiprocessing import Pool, cpu_count
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
import re
def preprocess(text):
    if pd.isnull(text):
        return set()
    
    # Step 1: Remove everything up to the literal string "FINAL REPORT\n"
    match = re.search(r"FINAL REPORT\n", text, re.IGNORECASE)
    if match:
        text = text[match.end():]  # Keep only text after "FINAL REPORT\n"
    # else:
    #     return set()  # If "FINAL REPORT\n" is missing, return an empty set
    
    # Step 2: Remove unwanted special codes
    #text = re.sub(r"[^A-Za-z0-9]", "", text)
    text = re.sub(r"___M|___F|___|[.,!?/:;\\]", "", text)
    # Step 3: Remove all formatting: no spaces, literal "\n" strings, or newlines
    text = re.sub(r"\s+|\\n", "", text)  # Remove spaces and literal "\n" strings
    return text
def preprocess_mimic_notes(text):
    if pd.isnull(text):
        return set()
    
    # Step 1: Remove unwanted special codes
    #text = re.sub(r"[^A-Za-z0-9]", "", text)
    text = re.sub(r"___M|___F|___|[.,!?/:;\\]", "", text)
    
    # Step 2: Remove all formatting: no spaces, literal "\n" strings, or newlines
    text = re.sub(r"\s+|\\n", "", text)  # Remove spaces and literal "\n" strings
    return text

In [140]:
cxr_reports['new_text'] = cxr_reports['cxr_report_text'].progress_apply(preprocess)

100%|██████████| 227835/227835 [00:14<00:00, 15804.19it/s]


In [135]:
mimic_notes = pd.read_csv("/scratch/baj321/MIMIC-Note/physionet.org/files/mimic-iv-note/2.2/note/radiology.csv")


In [141]:
mimic_notes['new_mimic_text'] = mimic_notes['text'].progress_apply(preprocess_mimic_notes)

100%|██████████| 2321355/2321355 [04:38<00:00, 8339.93it/s]


In [137]:
cxr_reports[cxr_reports["subject_id"] == 10000032 ].new_text.iloc[1]

'EXAMINATIONCHESTPORTABLEAPINDICATIONFwithcoughacuteprocessCOMPARISONChestradiographFINDINGSSinglefrontalviewofthechestprovidedThereisnofocalconsolidationeffusionorpneumothoraxThecardiomediastinalsilhouetteisnormalAgainseenaremultipleclipsprojectingovertheleftbreastandremoteleftsidedribfracturesNofreeairbelowtherighthemidiaphragmisseenIMPRESSIONNoacuteintrathoracicprocess'

In [121]:
num = 7
print(mimic_notes.subject_id.iloc[num])
print(mimic_notes.new_mimic_text.iloc[num])

10000032
EXAMINATIONCHEST(PAANDLAT)INDICATIONHistorywithshortnessofbreathTECHNIQUEChestPAandlateralCOMPARISONFINDINGSThecardiac,mediastinalandhilarcontoursarenormalPulmonaryvasculatureisnormalLungsareclearNopleuraleffusionorpneumothoraxispresentMultipleclipsareagainseenprojectingovertheleftbreastRemoteleft-sidedribfracturesarealsore-demonstratedIMPRESSIONNoacutecardiopulmonaryabnormality


In [142]:
# Remove entries in mimic_notes where new_mimic_text matches any new_text in cxr_reports
unique_mimic_notes = mimic_notes[~mimic_notes['new_mimic_text'].isin(cxr_reports['new_text'])]
len(unique_mimic_notes)
#163755 are removed, 64080 unmatched

2157600

In [156]:
num = 400
print(unique_mimic_notes.subject_id.iloc[num])
print(unique_mimic_notes.new_mimic_text.iloc[num])

10002013
EXAMINATIONCTCHESTWOCONTRASTINDICATIONyearoldwomanwithpriorabdCTshowing3nodularpulmonarydensitiesintheleftbasilarregionmeasuringupto8x8mmThesefindingsmaymayrepresentareasofroundedatelectasishowevershort-termfollowupwithnonemergentCTchestisrecommendedfupulmonarynodulesTECHNIQUEMultidetectorhelicalscanningofthechestwasperformedwithoutintravenouscontrastagentreconstructedascontiguous5-and125-mmthickaxial25-mmthickcoronalandparasagittaland8x8mmMIPsaxialimagesDOSEAcquisitionsequence1)SpiralAcquisition43s335cmCTDIvol=232mGy(Body)DLP=7777mGy-cmTotalDLP(Body)=778mGy-cmCOMPARISONFINDINGSThethyroidisnormalSupraclavicularaxillarymediastinalandhilarlymphnodesarenotenlargedAortaandpulmonaryarteriesarenormalsizeCardiacconfigurationisnormalPatientisstatuspostCABGthereappearstobealsocoronarystentsThereisnopleuralorpericardialeffusionTherearescatteredtinycalcifiedgranulomasThereisminimalparaseptalemphysemaThereisa3mmrightperifissuralnodule(4106)representsanintrapulmonarylymphnode3mmsubpleuraln

In [149]:
num = 229
print(cxr_reports.subject_id.iloc[num])
print(cxr_reports.new_text.iloc[num])

14841168
INDICATION-year-oldfemalewithhypercapnicrespiratoryfailureCOMPARISONPORTABLECHESTEndotrachealtubeterminates43cmabovethecarinaNasogastrictubepassesintothestomachThereisaleft-sidedhemodialysiscatheterwhichextendstothelowSVCLungvolumesremainlowTherearebilateralpleuraleffusionsincreasedThereisdiffusehazyparenchymalopacitymostcompatiblewithpulmonaryedemainthesettingofcentralvascularcongestionandcardiomegalyThereisnonewfocalopacitytosuggestpneumoniaThereisnopneumothoraxIMPRESSIONIntervalworseningofpulmonaryedemacomparedtoonedaypriorPleuraleffusionsalsoincreasedNopneumonia


In [157]:
unique_mimic_notes_with_chest = unique_mimic_notes[unique_mimic_notes['new_mimic_text'].str.contains(r'\bchest\b', case=False, na=False)]

In [154]:
cxr_reports[cxr_reports["subject_id"] == 14841168 ].new_text.iloc[4]

'EXAMINATIONCHEST(PORTABLEAP)INDICATIONyearoldwomanwithdobhoffadvancementdobhoffplacementIMPRESSIONPortableradiographobtainedforthepurposeofDobbhofftubeassessmentdemonstratesthetipofaDobbhofftubeterminatinginthedistalstomach'

In [162]:
print(cxr_reports[cxr_reports["subject_id"] == 19958251 ].new_text.iloc[4])
num = 1
print(unique_mimic_notes_with_chest.subject_id.iloc[num])
print(unique_mimic_notes_with_chest.new_mimic_text.iloc[num])

IndexError: single positional indexer is out-of-bounds

In [161]:
unique_mimic_notes_with_chest
unique_mimic_notes_with_chest.tail()

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text,new_mimic_text
2311537,19958251-RR-30,19958251,20526133.0,RR,30,2178-10-29 07:53:00,2178-10-29 11:00:00,EXAMINATION: CHEST (PORTABLE AP)CHEST (PORTAB...,EXAMINATIONCHEST(PORTABLEAP)CHEST(PORTABLEAP)I...
2311538,19958251-RR-31,19958251,,RR,31,2178-11-06 09:53:00,2178-11-06 12:06:00,EXAMINATION: CHEST (PA AND LAT)CHEST (PA AND ...,EXAMINATIONCHEST(PAANDLAT)CHEST(PAANDLAT)INDIC...
2314553,19970470-RR-8,19970470,23848017.0,RR,8,2146-12-06 03:41:00,2146-12-06 09:28:00,EXAMINATION: CHEST (PORTABLE AP)CHEST (PORTAB...,EXAMINATIONCHEST(PORTABLEAP)CHEST(PORTABLEAP)i...
2316005,19975898-RR-18,19975898,26447601.0,RR,18,2158-08-24 10:46:00,2158-08-24 15:20:00,EXAMINATION: CHEST (PORTABLE AP)CHEST (PORTAB...,EXAMINATIONCHEST(PORTABLEAP)CHEST(PORTABLEAP)i...
2319758,19992875-RR-115,19992875,23327989.0,RR,115,2165-06-09 05:45:00,2165-06-09 09:35:00,EXAMINATION:\n-CHEST (PORTABLE AP)\n\nINDICATI...,EXAMINATION-CHEST(PORTABLEAP)INDICATIONyearold...


In [2]:
len(mimic_notes[mimic_notes['is_similar']])

249598

In [None]:
from datasketch import MinHash, MinHashLSH
from multiprocessing import Pool, cpu_count
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
import re

tqdm.pandas() 

# Parameters
NUM_PERMUTATIONS = 120
JACCARD_THRESHOLD = 0.7  # Lowered for improved recall
BATCH_SIZE = 80000

# Enhanced Preprocessing
def preprocess_and_tokenize(text):
    if pd.isnull(text):
        return set()
    
    text = text.lower()
    text = re.sub(r"\b(final report|examination|indication|technique|comparison|findings|impression)\b", "", text)  # Remove common headers
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = ' '.join(text.split())  # Remove extra spaces

    # Use n-grams (e.g., bigrams) for more contextual similarity
    vectorizer = CountVectorizer(ngram_range=(1, 2), analyzer='word').build_analyzer()
    tokens = vectorizer(text)
    return set(tokens)

# MinHash signature creation function
def create_minhash_signature(tokens):
    m = MinHash(num_perm=NUM_PERMUTATIONS)
    for token in tokens:
        m.update(token.encode('utf8'))
    return m

# Load datasets and apply tokenization
print("Loading datasets...")
cxr_reports = pd.read_csv("/scratch/baj321/cxr_reports.csv")
mimic_notes = pd.read_csv("/scratch/baj321/MIMIC-Note/physionet.org/files/mimic-iv-note/2.2/note/radiology.csv")

print("Tokenizing text in cxr_reports and mimic_notes...")
cxr_reports['tokens'] = cxr_reports['cxr_report_text'].progress_apply(preprocess_and_tokenize)
mimic_notes['tokens'] = mimic_notes['text'].progress_apply(preprocess_and_tokenize)

# Generate MinHash signatures for cxr_reports
print("Generating MinHash signatures for cxr_reports...")
cpu_cores = cpu_count()
with Pool(cpu_cores) as pool:
    cxr_reports['minhash'] = list(tqdm(pool.imap_unordered(create_minhash_signature, cxr_reports['tokens']), 
                                       total=len(cxr_reports), 
                                       desc="CXR MinHash Generation"))
print("Completed MinHash signature generation for cxr_reports.")

# Build the LSH index for cxr_reports
print("Building the LSH index for cxr_reports...")
lsh = MinHashLSH(threshold=JACCARD_THRESHOLD, num_perm=NUM_PERMUTATIONS)
for idx, minhash in tqdm(enumerate(cxr_reports['minhash']), total=len(cxr_reports), desc="Building LSH Index"):
    lsh.insert(f"cxr_{idx}", minhash)
print("LSH index built for cxr_reports.")

# Generate MinHash signatures for mimic_notes in parallel with progress tracking
print("Generating MinHash signatures for mimic_notes...")
with Pool(cpu_cores) as pool:
    mimic_notes['minhash'] = list(tqdm(pool.imap_unordered(create_minhash_signature, mimic_notes['tokens']), 
                                       total=len(mimic_notes), 
                                       desc="MIMIC MinHash Generation"))
print("Completed MinHash signature generation for mimic_notes.")

# Check for similarity in batches with progress tracking
def is_similar(mimic_report_minhash):
    return len(lsh.query(mimic_report_minhash)) > 0

print("Checking similarity of mimic_notes against cxr_reports in batches...")
with Pool(cpu_cores) as pool:
    mimic_notes['is_similar'] = list(tqdm(pool.imap_unordered(is_similar, mimic_notes['minhash']), 
                                          total=len(mimic_notes), 
                                          desc="Similarity Check"))
print("Similarity check completed.")

# Filter out similar reports to get unique reports in mimic_notes
print("Filtering unique mimic_notes that are not in cxr_reports...")
unique_mimic_notes = mimic_notes[~mimic_notes['is_similar']]

# Display or save results
print('Num of Unique Notes:',len(mimic_notes[mimic_notes['is_similar']]) )
print("Displaying unique mimic_notes...")
print(unique_mimic_notes.head())
unique_mimic_notes.to_csv("/scratch/baj321/unique_mimic_notes.csv", index=False)
print("Unique mimic_notes saved to '/scratch/baj321/unique_mimic_radiology_notes.csv'.")


Loading datasets...
Tokenizing text in cxr_reports and mimic_notes...


100%|██████████| 227835/227835 [00:35<00:00, 6330.10it/s]
100%|██████████| 2321355/2321355 [12:01<00:00, 3218.64it/s] 


Generating MinHash signatures for cxr_reports...


In [3]:
pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (24.8 MB)
Collecting joblib>=0.11
  Using cached joblib-1.3.2-py3-none-any.whl (302 kB)
Collecting threadpoolctl>=2.0.0
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.3.2 scikit-learn-1.0.2 threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated packages.
