In [65]:
import torch
from transformers import AutoModel, AutoImageProcessor
from PIL import Image
import numpy as np
import glob
import pandas as pd
from tqdm import tqdm

In [None]:
# Locations of the three CheXpert split tables (train / val / test).
TRAIN_CSV = '/home/lfay/MedImageInsights/data/CheXpert-v1.0-512/train.csv'
VAL_CSV = '/home/lfay/MedImageInsights/data/CheXpert-v1.0-512/val.csv'
TEST_CSV = '/home/lfay/MedImageInsights/data/CheXpert-v1.0-512/test.csv'

# Read each split into its own DataFrame, in the order train, val, test.
df_train = pd.read_csv(TRAIN_CSV)
df_val = pd.read_csv(VAL_CSV)
df_test = pd.read_csv(TEST_CSV)

In [None]:
# Strip the trailing block of columns from every split.
# NOTE(review): df_train and df_test are reassigned to their own trimmed copies,
# so running this cell a second time removes a further 1024 columns from them.
# The validation split is written to a new name (df_valid); df_val itself keeps
# all of its columns.
N_TRAILING_COLS = 1024

df_train = df_train.drop(columns=df_train.columns[-N_TRAILING_COLS:])
df_valid = df_val.drop(columns=df_val.columns[-N_TRAILING_COLS:])
df_test = df_test.drop(columns=df_test.columns[-N_TRAILING_COLS:])

# Number of rows in the validation split (dropping columns does not change it).
len(df_val)

6464

In [None]:
# For each split: embed every image listed in the 'Path' column with RAD-DINO,
# join the embedding columns onto the split's metadata, and write one CSV.
datasets = [("train", df_train), ("test", df_test), ("valid", df_valid)]

# Load the model and its image processor once. Neither depends on the split,
# so loading them inside the loop only repeated identical work per dataset.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = AutoModel.from_pretrained("microsoft/rad-dino").to(device)
processor = AutoImageProcessor.from_pretrained("microsoft/rad-dino")

for name, df in datasets:
    print(f"Processing {name} dataset...")

    image_paths = df['Path'].values

    # Maps each 'Path' value to its flattened (1-D) embedding vector.
    embeddings = {}
    for img_path in tqdm(image_paths):
        # Plain string concatenation on purpose: the 'Path' values shown in the
        # saved output start with '/', and os.path.join would discard the prefix
        # for such a component.
        img_full_path = f"/home/lfay/MedImageInsights/data/{img_path}"

        # convert() returns a new in-memory image, so the file handle can be
        # released as soon as the conversion is done.
        with Image.open(img_full_path) as img_file:
            image = img_file.convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to(device)

        with torch.no_grad():
            # Take token index 0 of the last hidden state for the single image in
            # the batch (the CLS position — TODO confirm against the model card).
            # No mean pooling is performed.
            embedding = model(**inputs).last_hidden_state[:, 0, :]

        embeddings[img_path] = embedding.cpu().numpy().flatten()

    # One row per path (as index), one integer-named column per embedding dimension.
    embeddings_df = pd.DataFrame.from_dict(embeddings, orient="index")

    # Left join keeps every metadata row and attaches its embedding by path.
    df_merged = df.merge(embeddings_df, left_on='Path', right_index=True, how='left')

    # Save the merged DataFrame to a CSV file
    output_file = f"/home/lfay/MedImageInsights/predictions/DINO/CheXpert/{name}.csv"
    df_merged.to_csv(output_file, index=False)
    print(f"Saved {name} dataset with embeddings to {output_file}")




Processing train dataset...


100%|██████████| 10/10 [00:00<00:00, 13.24it/s]


Saved train dataset with embeddings to /home/lfay/MedImageInsights/predictions/DINO/CheXpert/train.csv
Processing test dataset...


100%|██████████| 10/10 [00:00<00:00, 11.90it/s]


Saved test dataset with embeddings to /home/lfay/MedImageInsights/predictions/DINO/CheXpert/test.csv
Processing valid dataset...


100%|██████████| 10/10 [00:00<00:00, 12.32it/s]

Saved valid dataset with embeddings to /home/lfay/MedImageInsights/predictions/DINO/CheXpert/valid.csv





Unnamed: 0,Path,report,section_findings,section_impression,age,sex,race,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,...,758,759,760,761,762,763,764,765,766,767
0,/CheXpert-v1.0-512/images/train/patient41636/s...,"NARRATIVE:\nEXAM: Chest 2 Views, 9/6/2020\n \n...",\n \nSlightly low lung volumes. Vague left re...,\n \n1.VAGUE LEFT RETROCARDIAC OPACITY WHICH M...,82.0,Female,White,0.0,0.0,0.0,...,0.517187,0.137364,0.135248,-0.115836,-0.160161,-0.341121,0.140617,-0.087786,-0.109787,0.475072
1,/CheXpert-v1.0-512/images/train/patient20211/s...,NARRATIVE:\nCLINICAL HISTORY: Acute myocardial...,,\n1. INTERVAL INSERTION OF INTRAAORTIC BALLOON...,68.0,Male,Asian,0.0,0.0,0.0,...,0.384487,0.27615,-0.012666,0.386115,-0.058276,-0.002611,0.264599,-0.205247,-0.027271,-0.177243
2,/CheXpert-v1.0-512/images/train/patient23741/s...,"NARRATIVE:\nEXAM: Chest 1 View, 3-30-13\n \nCL...",Mild diffuse reticular opacities are seen thr...,\n \n1.MILD DIFFUSE RETICULAR OPACITIES ARE SE...,89.0,Female,White,0.0,0.0,0.0,...,0.028652,0.490437,-0.066536,0.208335,-0.461104,0.579142,0.365383,-0.309619,-0.352168,-0.276253
3,/CheXpert-v1.0-512/images/train/patient00673/s...,NARRATIVE:\nCHEST AP PORTABLE: 11/11\nCOMPARIS...,\nAn endotracheal tube is present approximatel...,\n1. MULTIPLE TUBES AND LINES.\n2. RELATIVELY ...,64.0,Female,Unknown,0.0,0.0,0.0,...,0.50712,0.206107,0.115506,0.046893,0.621015,-0.339397,-0.23689,-0.898211,-0.55739,0.38847
4,/CheXpert-v1.0-512/images/train/patient19403/s...,NARRATIVE:\nCHEST RADIOGRAPH: 03-2015 X2\n \n...,Single frontal view of the chest obtained 3-...,\n \n1. SERIES OF TWO CHEST RADIOGRAPHS DEMO...,36.0,Female,White,0.0,0.0,0.0,...,0.261567,-0.054837,-0.310386,-0.28689,0.052734,0.190555,0.462588,-0.306477,0.396755,-0.27443
5,/CheXpert-v1.0-512/images/train/patient52643/s...,NARRATIVE:\nCHEST AP PORTABLE: 6/20/2018\nCOMP...,"\nIn the interval, a left chest tube has been ...",\n1. LEFT APICAL PNEUMOTHORAX.\n2. VOLUME LOSS...,55.0,Male,Unknown,0.0,0.0,0.0,...,0.145011,0.215086,0.173036,0.518787,0.303144,0.059863,0.266999,-0.17826,0.258075,0.35024
6,/CheXpert-v1.0-512/images/train/patient37739/s...,"NARRATIVE:\nEXAM: Chest 1 View, 9-1-2004\n \nC...",,\n \n1.TIP OF ET TUBE IN RIGHT MAINSTEM BRONCH...,57.0,Male,Other,1.0,0.0,0.0,...,0.141114,0.27452,0.510233,-0.366037,-0.85317,-0.151341,0.448295,0.286317,0.032104,0.623084
7,/CheXpert-v1.0-512/images/train/patient39907/s...,"NARRATIVE:\nChest 1 View, 6/27/2003\n \nHISTOR...",,\n \n1.INTERVAL REMOVAL OF ENDOTRACHEAL TUBE...,71.0,Male,White,0.0,0.0,0.0,...,0.425867,0.495116,0.248706,0.693935,-0.588446,-0.13024,0.158417,-0.08441,0.399467,0.497938
8,/CheXpert-v1.0-512/images/train/patient61087/s...,NARRATIVE:\nChest 1 View 6/11/2020\n \nClinica...,Frontal view of the chest at 5:37 p.m. demon...,\n \n1.IMPROVING AERATION OF THE LUNGS.\n \n2...,67.0,Female,White,1.0,0.0,0.0,...,0.111728,-0.108809,0.051421,0.449714,0.250766,-0.270738,0.137605,0.138568,-0.11097,0.346025
9,/CheXpert-v1.0-512/images/train/patient06848/s...,NARRATIVE:\nSINGLE AP VIEW OF THE CHEST: 1/7...,,\n \n 1. RIGHT CHEST PORT WITH TIP NEAR TH...,53.0,Female,Other,0.0,0.0,1.0,...,0.518781,0.567393,-0.957494,0.23524,-0.126061,0.163441,-0.143717,0.009975,0.241318,0.102796


In [46]:
# Join embeddings onto the validation metadata by image path and export the result.
# NOTE(review): `embeddings_df` is not created in this cell; it must already exist
# in the kernel, and nothing here checks which split it was computed from.
# NOTE(review): df_val is overwritten with the merged frame, so re-running this
# cell merges into an already-merged frame.
val_csv_path = "/home/lfay/MedImageInsights/predictions/DINO/CheXpert/val.csv"
df_val = df_val.merge(embeddings_df, how='left', left_on='Path', right_index=True)
df_val.to_csv(val_csv_path, index=False)