In [1]:
import os
work_dir='/home/mding3/local_scratch/fl_summer'
os.chdir(work_dir)

In [2]:
from transformers import pipeline
from transformers import CLIPProcessor, CLIPModel
from transformers import CLIPTokenizer


import torch
from transformers import CLIPProcessor, CLIPModel
import numpy as np
import pandas as pd
import os

import re
from pathlib import Path

  from .autonotebook import tqdm as notebook_tqdm


In [3]:


def extract_findings_and_impression(text: str) -> str:
   """
   Extract and concatenate the 'Findings' and 'Impression' sections from a radiology report.

   Sections are located by their headings (case-insensitive), and all text between each heading
   and the next ALL-CAPS heading (or end-of-text) is captured.
   """
   combined = []
   for section in ("FINDINGS", "IMPRESSION"):
       # Regex: match 'SECTION:' then lazily up to next all-caps heading ending with ':' or end of text
       pattern = rf"(?is){section}:(.*?)(?=\n[A-Z ]+?:|\Z)"
       match = re.search(pattern, text)
       if match:
           combined.append(match.group(1).strip())
   if len(combined) > 0:
    
       return " ".join(combined)
   if len(combined) == 0:
       return text.strip()


def split_and_filter_sentences(text: str, min_words: int = 3) -> list[str]:
   """
   Split text into sentences on ., !, or ? and remove any sentence with fewer than min_words words.
   """
   # Split on punctuation followed by whitespace
   sentences = re.split(r"(?<=[\.!?])\s+", text)
   # Filter out short sentences
   return [s.strip() for s in sentences if len(s.split()) >= min_words]

In [4]:

# 1) Load CLIP model & processor
MODEL_NAME = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(MODEL_NAME)
model = CLIPModel.from_pretrained(MODEL_NAME).eval()





def get_text_embedding(text: str) -> torch.Tensor:
   """
   Encode a single string (or sentence) into a 512-dim CLIP text embedding.

   Args:
       text (str): The input text to encode.

   Returns:
       torch.Tensor: A (1, 512) L2-normalized text embedding tensor.
   """
   # 2) Tokenize and create tensors
   inputs = processor(text=text,
                      return_tensors="pt",
                      padding=True,
                      truncation=True,
                      max_length=77)  # CLIP's max token length

   # 3) Extract text features
   with torch.no_grad():
       text_embeds = model.get_text_features(**inputs)  # (1, 512)
       text_embeds = text_embeds / text_embeds.norm(dim=-1, keepdim=True)

   return text_embeds




def count_clip_tokens(text: str, model_name: str = "openai/clip-vit-base-patch32") -> int:
   """
   Count the number of CLIP subword tokens for a given text string.
   """
   tokenizer = CLIPTokenizer.from_pretrained(model_name)
   # Tokenize without truncation to get the full token list
   tokens = tokenizer.tokenize(text)
   return len(tokens)

def is_over_token_limit(text: str, limit: int = 77, model_name: str = "openai/clip-vit-base-patch32") -> bool:
   """
   Return True if the text exceeds the specified CLIP token limit.
   """
   num_tokens = count_clip_tokens(text, model_name)
   print(f"Token count: {num_tokens} (limit = {limit})")
   return num_tokens > limit



Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [14]:
import os


summarizer = pipeline(
    "summarization",
    model="facebook/bart-large-cnn",
    device=0)



def extract_summary(text: str, token_limit: int = 77) -> str:
    """
    If `text` exceeds `token_limit` CLIP tokens, summarize it with BART;
    otherwise return the original text (stripped of leading/trailing whitespace).

    Args:
        text (str): Input report text.
        token_limit (int): Maximum allowed CLIP token count.

    Returns:
        str: Either the original text or a compressed summary.
    """
    # Check if text is too long for CLIP
    if is_over_token_limit(text, limit=token_limit):
        # Summarize down to fit under the limit
        summary = summarizer(
            text,
            max_length=token_limit,
            min_length=20,
            do_sample=False
        )[0]["summary_text"]
        return summary.strip()
    else:
        # Already within token budget
        return text.strip()


def get_pool_embedding_from_summary(file_path):
       # 1) Path to your report
   report_path = Path(file_path)

   # 2) Read the report text
   report_text = report_path.read_text(encoding="utf-8")

   # 3) Extract and combine Findings & Impression
   combined_text = extract_findings_and_impression(report_text)

   # 4) Split into sentences and filter
   sentences = split_and_filter_sentences(combined_text, min_words=3)
   sentences_whole = " ".join(sentences)
   sentences_whole = extract_summary(sentences_whole)

   text_embeds= get_text_embedding(sentences)
   txt_embed = text_embeds.mean(dim=0, keepdim=True)  # (1,512)
   txt_embed = txt_embed / txt_embed.norm(dim=-1, keepdim=True)
   return txt_embed,sentences_whole



Device set to use cuda:0


In [7]:
patient_file='mimic-cxr_lung_opacity_2000sample.csv'
df_patient = pd.read_csv(patient_file)
df_patient.columns

Index(['subject_id', 'study_id', 'Atelectasis', 'Cardiomegaly',
       'Consolidation', 'Edema', 'Enlarged Cardiomediastinum', 'Fracture',
       'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion',
       'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices'],
      dtype='object')

In [8]:
folder='/home/mding3/local_scratch/fl_summer/files'
for indx,row in df_patient.iterrows():
   subject_id ='p'+ str(int(row['subject_id']))
 # Remove 'p' prefix
  
   study_id = 's'+str(int(row['study_id']))
 
   subfolders = os.listdir(folder)

   for subfolder in subfolders:
       subfolder_path = os.path.join(folder, subfolder)
       files = os.listdir(subfolder_path)
       if subject_id in files:
           file_path = os.path.join(subfolder_path, subject_id, study_id + '.txt')
           if os.path.exists(file_path):
               print(f"File found: {file_path}")
           df_patient.at[indx, 'file_path'] = file_path
           break

File found: /home/mding3/local_scratch/fl_summer/files/p12/p12779994/s55965863.txt
File found: /home/mding3/local_scratch/fl_summer/files/p11/p11494296/s54944489.txt
File found: /home/mding3/local_scratch/fl_summer/files/p17/p17251996/s57007998.txt
File found: /home/mding3/local_scratch/fl_summer/files/p14/p14130788/s58955766.txt
File found: /home/mding3/local_scratch/fl_summer/files/p13/p13293910/s56789738.txt
File found: /home/mding3/local_scratch/fl_summer/files/p11/p11690358/s56825148.txt
File found: /home/mding3/local_scratch/fl_summer/files/p13/p13350579/s58780206.txt
File found: /home/mding3/local_scratch/fl_summer/files/p16/p16783577/s50730772.txt
File found: /home/mding3/local_scratch/fl_summer/files/p10/p10440642/s56122537.txt
File found: /home/mding3/local_scratch/fl_summer/files/p14/p14306557/s58579716.txt
File found: /home/mding3/local_scratch/fl_summer/files/p15/p15390826/s53461741.txt
File found: /home/mding3/local_scratch/fl_summer/files/p14/p14783057/s50769888.txt
File

In [11]:
df_patient

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices,file_path
0,12779994,55965863,,,,-1.0,,,,1.0,,,,,,,/home/mding3/local_scratch/fl_summer/files/p12...
1,11494296,54944489,,,,,,,1.0,1.0,,,,,,,/home/mding3/local_scratch/fl_summer/files/p11...
2,17251996,57007998,,,,,-1.0,,,1.0,,1.0,,0.0,,,/home/mding3/local_scratch/fl_summer/files/p17...
3,14130788,58955766,,,,,0.0,,,1.0,,0.0,,-1.0,0.0,,/home/mding3/local_scratch/fl_summer/files/p14...
4,13293910,56789738,,,,,,,,1.0,,,,-1.0,,,/home/mding3/local_scratch/fl_summer/files/p13...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,13973055,55847178,,,,,,,,0.0,1.0,,,,0.0,1.0,/home/mding3/local_scratch/fl_summer/files/p13...
1996,18550032,54642925,,1.0,,,,,,0.0,,1.0,,1.0,,1.0,/home/mding3/local_scratch/fl_summer/files/p18...
1997,15311611,52627144,-1.0,,,,,,,0.0,,1.0,,0.0,0.0,1.0,/home/mding3/local_scratch/fl_summer/files/p15...
1998,11486239,51256738,,1.0,,0.0,,,,0.0,,1.0,0.0,,,,/home/mding3/local_scratch/fl_summer/files/p11...


In [15]:


for indx, row in df_patient.iterrows():
   print(f'processing {indx+1}/{df_patient.shape[0]}: {row["subject_id"]}, {row["study_id"]}')
   file_path = row['file_path']
   txt_embed,sentence_whole= get_pool_embedding_from_summary(file_path)
   output_dir='./embeddings/bart_summary'
   os.makedirs(output_dir, exist_ok=True)

   subject_id = row['subject_id']
# Remove 'p' prefix
  
   study_id = row['study_id']

   output_txt_dir = './embeddings/bart_summary_txt'
   os.makedirs(output_txt_dir, exist_ok=True)
   output_txt_path = os.path.join(output_txt_dir, f"{subject_id}_{study_id}.txt")
   with open(output_txt_path, "w", encoding="utf-8") as f:
      f.write(sentence_whole)


   output_path = os.path.join(output_dir, f"{subject_id}_{study_id}.npy")
   np.save(output_path,txt_embed)







processing 1/2000: 12779994, 55965863
Token count: 142 (limit = 77)
processing 2/2000: 11494296, 54944489
Token count: 132 (limit = 77)
processing 3/2000: 17251996, 57007998
Token count: 94 (limit = 77)
processing 4/2000: 14130788, 58955766
Token count: 76 (limit = 77)
processing 5/2000: 13293910, 56789738
Token count: 105 (limit = 77)
processing 6/2000: 11690358, 56825148
Token count: 112 (limit = 77)
processing 7/2000: 13350579, 58780206
Token count: 93 (limit = 77)
processing 8/2000: 16783577, 50730772
Token count: 47 (limit = 77)
processing 9/2000: 10440642, 56122537
Token count: 65 (limit = 77)
processing 10/2000: 14306557, 58579716
Token count: 104 (limit = 77)
processing 11/2000: 15390826, 53461741
Token count: 46 (limit = 77)
processing 12/2000: 14783057, 50769888
Token count: 83 (limit = 77)
processing 13/2000: 14470386, 58832223
Token count: 89 (limit = 77)
processing 14/2000: 19299068, 59551981
Token count: 108 (limit = 77)
processing 15/2000: 12806204, 55096288
Token count:

HTTP Error 429 thrown while requesting HEAD https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


processing 1120/2000: 12809207, 55401770


HTTP Error 429 thrown while requesting HEAD https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/tokenizer_config.json
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/tokenizer_config.json
Retrying in 4s [Retry 3/5].


Token count: 32 (limit = 77)
processing 1121/2000: 15379716, 54274298
Token count: 95 (limit = 77)
processing 1122/2000: 12315463, 56434306
Token count: 70 (limit = 77)
processing 1123/2000: 17964176, 58103470
Token count: 83 (limit = 77)
processing 1124/2000: 16426507, 59819523
Token count: 46 (limit = 77)
processing 1125/2000: 19955348, 56227479
Token count: 130 (limit = 77)
processing 1126/2000: 19975635, 53619606
Token count: 96 (limit = 77)
processing 1127/2000: 10122392, 59596554
Token count: 125 (limit = 77)
processing 1128/2000: 16086391, 55949610
Token count: 62 (limit = 77)
processing 1129/2000: 18098720, 55987116
Token count: 48 (limit = 77)
processing 1130/2000: 10048522, 52414596
Token count: 85 (limit = 77)
processing 1131/2000: 10316648, 59892014
Token count: 93 (limit = 77)
processing 1132/2000: 13707812, 57009548
Token count: 200 (limit = 77)
processing 1133/2000: 11204646, 54351633
Token count: 101 (limit = 77)
processing 1134/2000: 13513122, 54661785
Token count: 72 

HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 1s [Retry 1/5].


Token count: 48 (limit = 77)
processing 1270/2000: 13593545, 53206152


HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 4s [Retry 3/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 8s [Retry 4/5].


Token count: 46 (limit = 77)
processing 1271/2000: 18257244, 54467807
Token count: 138 (limit = 77)
processing 1272/2000: 14845249, 54278665
Token count: 239 (limit = 77)
processing 1273/2000: 19787095, 50273602
Token count: 203 (limit = 77)
processing 1274/2000: 16137431, 50235269
Token count: 128 (limit = 77)
processing 1275/2000: 16950272, 59090747
Token count: 73 (limit = 77)
processing 1276/2000: 13074701, 50306231
Token count: 8 (limit = 77)
processing 1277/2000: 14817419, 50795339
Token count: 73 (limit = 77)
processing 1278/2000: 10433353, 56471403
Token count: 89 (limit = 77)
processing 1279/2000: 18195430, 51048173
Token count: 69 (limit = 77)
processing 1280/2000: 12535940, 55982934
Token count: 63 (limit = 77)
processing 1281/2000: 18126438, 52734090
Token count: 183 (limit = 77)
processing 1282/2000: 14662246, 55514325
Token count: 34 (limit = 77)
processing 1283/2000: 12450697, 51156240
Token count: 54 (limit = 77)
processing 1284/2000: 17420936, 53437387
Token count: 76 

HTTP Error 429 thrown while requesting HEAD https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


processing 1409/2000: 15944472, 51098567


HTTP Error 429 thrown while requesting HEAD https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/tokenizer_config.json
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/openai/clip-vit-base-patch32/resolve/main/tokenizer_config.json
Retrying in 4s [Retry 3/5].


Token count: 69 (limit = 77)
processing 1410/2000: 16421457, 59992511
Token count: 29 (limit = 77)
processing 1411/2000: 15319814, 53587949
Token count: 43 (limit = 77)
processing 1412/2000: 19526366, 51028969
Token count: 135 (limit = 77)
processing 1413/2000: 12809280, 55099534
Token count: 35 (limit = 77)
processing 1414/2000: 14035383, 52805991
Token count: 90 (limit = 77)
processing 1415/2000: 17249901, 56123196
Token count: 44 (limit = 77)
processing 1416/2000: 19158091, 55626752
Token count: 54 (limit = 77)
processing 1417/2000: 15170034, 59267660
Token count: 146 (limit = 77)
processing 1418/2000: 12741592, 57537411
Token count: 102 (limit = 77)
processing 1419/2000: 16609088, 53794290
Token count: 99 (limit = 77)
processing 1420/2000: 17486231, 51204935
Token count: 43 (limit = 77)
processing 1421/2000: 14211105, 58313546
Token count: 45 (limit = 77)
processing 1422/2000: 16411820, 50311465
Token count: 37 (limit = 77)
processing 1423/2000: 13365915, 56820651
Token count: 83 (

HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 1s [Retry 1/5].


processing 1541/2000: 10115182, 51406842


HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 4s [Retry 3/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 8s [Retry 4/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 8s [Retry 5/5].


Token count: 83 (limit = 77)
processing 1542/2000: 18785003, 55364712
Token count: 136 (limit = 77)
processing 1543/2000: 10082014, 55731615
Token count: 133 (limit = 77)
processing 1544/2000: 16527913, 52931559
Token count: 152 (limit = 77)
processing 1545/2000: 17740852, 58273274
Token count: 188 (limit = 77)
processing 1546/2000: 13095294, 51659735
Token count: 103 (limit = 77)
processing 1547/2000: 11416492, 53552513
Token count: 130 (limit = 77)
processing 1548/2000: 14873669, 51884387
Token count: 84 (limit = 77)
processing 1549/2000: 11545313, 51108810
Token count: 142 (limit = 77)
processing 1550/2000: 10176514, 59083641
Token count: 114 (limit = 77)
processing 1551/2000: 10291098, 57722714
Token count: 55 (limit = 77)
processing 1552/2000: 14260832, 56973182
Token count: 61 (limit = 77)
processing 1553/2000: 11810761, 55618395
Token count: 56 (limit = 77)
processing 1554/2000: 10977414, 51998900
Token count: 211 (limit = 77)
processing 1555/2000: 19427173, 51888342
Token count

HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 1s [Retry 1/5].


processing 1686/2000: 19736108, 55514987


HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 4s [Retry 3/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 8s [Retry 4/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 8s [Retry 5/5].


Token count: 61 (limit = 77)
processing 1687/2000: 14546998, 55953785
Token count: 197 (limit = 77)
processing 1688/2000: 18140944, 58000070
Token count: 31 (limit = 77)
processing 1689/2000: 11549602, 51903328
Token count: 98 (limit = 77)
processing 1690/2000: 18421301, 59930197
Token count: 56 (limit = 77)
processing 1691/2000: 12485364, 59165419
Token count: 60 (limit = 77)
processing 1692/2000: 17261345, 51949463
Token count: 360 (limit = 77)
processing 1693/2000: 14530732, 55327406
Token count: 108 (limit = 77)
processing 1694/2000: 19138636, 54084583
Token count: 45 (limit = 77)
processing 1695/2000: 15689762, 57693050
Token count: 51 (limit = 77)
processing 1696/2000: 16471016, 55602837
Token count: 126 (limit = 77)
processing 1697/2000: 18731624, 56065088
Token count: 52 (limit = 77)
processing 1698/2000: 10763729, 55532301
Token count: 90 (limit = 77)
processing 1699/2000: 10046922, 59373859
Token count: 53 (limit = 77)
processing 1700/2000: 19723798, 51698810
Token count: 82 

HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 1s [Retry 1/5].


Token count: 58 (limit = 77)
processing 1981/2000: 15182529, 57527174


HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 2s [Retry 2/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 4s [Retry 3/5].
HTTP Error 429 thrown while requesting HEAD https://huggingface.co/api/resolve-cache/models/openai/clip-vit-base-patch32/3d74acf9a28c67741b2f4f2ea7635f0aaf6f0268/tokenizer_config.json
Retrying in 8s [Retry 4/5].


Token count: 105 (limit = 77)
processing 1982/2000: 15674609, 56484500
Token count: 169 (limit = 77)
processing 1983/2000: 17735335, 59659087
Token count: 130 (limit = 77)
processing 1984/2000: 12156452, 55779890
Token count: 140 (limit = 77)
processing 1985/2000: 14876557, 53286274
Token count: 55 (limit = 77)
processing 1986/2000: 15349002, 52497681
Token count: 94 (limit = 77)
processing 1987/2000: 14260773, 51136034
Token count: 57 (limit = 77)
processing 1988/2000: 13269046, 52257317
Token count: 33 (limit = 77)
processing 1989/2000: 15677773, 53299407
Token count: 60 (limit = 77)
processing 1990/2000: 12042749, 54130884
Token count: 233 (limit = 77)
processing 1991/2000: 14479847, 55843096
Token count: 81 (limit = 77)
processing 1992/2000: 15250428, 57395745
Token count: 212 (limit = 77)
processing 1993/2000: 15750196, 58376582
Token count: 76 (limit = 77)
processing 1994/2000: 11593651, 57631034
Token count: 23 (limit = 77)
processing 1995/2000: 16577068, 56325958
Token count: 2