In [12]:
import kagglehub, os

# This downloads (or reuses cached) NIH Chest X-rays dataset and gives you the local path
dataset_root = kagglehub.dataset_download("nih-chest-xrays/data")

print("Dataset root:", dataset_root)
print("Immediate contents of dataset_root:")
print(os.listdir(dataset_root))


Downloading from https://www.kaggle.com/api/v1/datasets/download/nih-chest-xrays/data?dataset_version_number=3...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 42.0G/42.0G [23:54<00:00, 31.4MB/s]

Extracting files...





Dataset root: /Users/graysonrichard/.cache/kagglehub/datasets/nih-chest-xrays/data/versions/3
Immediate contents of dataset_root:
['images_006', 'images_001', 'images_008', 'images_009', 'images_007', 'FAQ_CHESTXRAY.pdf', 'images_012', 'Data_Entry_2017.csv', 'BBox_List_2017.csv', 'ARXIV_V5_CHESTXRAY.pdf', 'train_val_list.txt', 'README_CHESTXRAY.pdf', 'images_002', 'images_005', 'LOG_CHESTXRAY.pdf', 'images_004', 'images_003', 'test_list.txt', 'images_010', 'images_011']


In [14]:
# If needed (first time in this environment) you can install:
# !pip install torch torchvision torchaudio scikit-learn

import os
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cpu


In [15]:
# dataset_root is already defined from your previous cell
csv_path = os.path.join(dataset_root, "Data_Entry_2017.csv")
df = pd.read_csv(csv_path)

print("CSV shape:", df.shape)
print("Columns:", df.columns.tolist())
print(df.head())

# Option A: Binary target
# 0 = No Finding, 1 = Has at least one finding
df["target"] = (df["Finding Labels"] != "No Finding").astype(int)

print("\nTarget value counts (0 = No Finding, 1 = Any finding):")
print(df["target"].value_counts())
print("\nTarget proportion:")
print(df["target"].value_counts(normalize=True))

# OPTIONAL: if you want to prototype faster, uncomment to subsample
# df = df.sample(n=20000, random_state=SEED).reset_index(drop=True)
# print("Subsampled CSV shape:", df.shape)


CSV shape: (112120, 12)
Columns: ['Image Index', 'Finding Labels', 'Follow-up #', 'Patient ID', 'Patient Age', 'Patient Gender', 'View Position', 'OriginalImage[Width', 'Height]', 'OriginalImagePixelSpacing[x', 'y]', 'Unnamed: 11']
        Image Index          Finding Labels  Follow-up #  Patient ID  \
0  00000001_000.png            Cardiomegaly            0           1   
1  00000001_001.png  Cardiomegaly|Emphysema            1           1   
2  00000001_002.png   Cardiomegaly|Effusion            2           1   
3  00000002_000.png              No Finding            0           2   
4  00000003_000.png                  Hernia            0           3   

   Patient Age Patient Gender View Position  OriginalImage[Width  Height]  \
0           58              M            PA                 2682     2749   
1           58              M            PA                 2894     2729   
2           58              M            PA                 2500     2048   
3           81             

In [29]:
patients = df["Patient ID"].unique()

train_patients, temp_patients = train_test_split(
    patients, test_size=0.4, random_state=SEED
)
val_patients, test_patients = train_test_split(
    temp_patients, test_size=0.5, random_state=SEED
)

def assign_split(pid):
    if pid in train_patients:
        return "train"
    elif pid in val_patients:
        return "val"
    else:
        return "test"

df["split"] = df["Patient ID"].apply(assign_split)

train_df = df[df["split"] == "train"].reset_index(drop=True)
val_df   = df[df["split"] == "val"].reset_index(drop=True)
test_df  = df[df["split"] == "test"].reset_index(drop=True)

print("Train size:", len(train_df))
print("Val size:",   len(val_df))
print("Test size:",  len(test_df))

print("\nTrain target distribution:")
print(train_df["target"].value_counts(normalize=True))


Train size: 66590
Val size: 22701
Test size: 22829

Train target distribution:
target
0    0.540201
1    0.459799
Name: proportion, dtype: float64


In [30]:
image_path_map = {}

for root, dirs, files in os.walk(dataset_root):
    for fname in files:
        if fname.lower().endswith((".png", ".jpg", ".jpeg")):
            full_path = os.path.join(root, fname)
            image_path_map[fname] = full_path

print("Number of images found:", len(image_path_map))

coverage = df["Image Index"].isin(image_path_map.keys()).mean()
print(f"Proportion of CSV rows with a matching file: {coverage:.4f}")


Number of images found: 112120
Proportion of CSV rows with a matching file: 1.0000


In [31]:
from PIL import Image

# Transforms for ResNet
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],  # ImageNet stats
        std=[0.229, 0.224, 0.225]
    ),
])

eval_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    ),
])

class ChestXrayDataset(Dataset):
    def __init__(self, df, image_path_map, transform=None):
        self.df = df.reset_index(drop=True)
        self.image_path_map = image_path_map
        self.transform = transform
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_name = row["Image Index"]
        
        if img_name not in self.image_path_map:
            raise FileNotFoundError(f"Image file not found for {img_name}")
        
        img_path = self.image_path_map[img_name]
        img = Image.open(img_path).convert("RGB")
        
        if self.transform:
            img = self.transform(img)
        
        label = int(row["target"])  # 0 or 1
        return img, label


In [32]:
BATCH_SIZE = 32
NUM_WORKERS = 0  # keep 0 on Mac / Jupyter to avoid multiprocessing issues

train_dataset = ChestXrayDataset(train_df, image_path_map, transform=train_transform)
val_dataset   = ChestXrayDataset(val_df,   image_path_map, transform=eval_transform)
test_dataset  = ChestXrayDataset(test_df,  image_path_map, transform=eval_transform)

train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=NUM_WORKERS,
    pin_memory=True if device.type == "cuda" else False,
)

val_loader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True if device.type == "cuda" else False,
)

test_loader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=NUM_WORKERS,
    pin_memory=True if device.type == "cuda" else False,
)

len(train_loader), len(val_loader), len(test_loader)


(2081, 710, 714)

In [33]:
from torchvision import models

resnet = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
num_feats = resnet.fc.in_features
resnet.fc = nn.Linear(num_feats, 1)

resnet = resnet.to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(resnet.parameters(), lr=1e-4)


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /Users/graysonrichard/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 44.7M/44.7M [00:02<00:00, 18.6MB/s]


In [34]:
def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    
    for images, labels in tqdm(loader, desc="Train", leave=False):
        images = images.to(device)
        labels = labels.float().unsqueeze(1).to(device)  # (N, 1)
        
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * images.size(0)
    
    epoch_loss = running_loss / len(loader.dataset)
    return epoch_loss


def eval_one_epoch(model, loader, criterion, device):
    model.eval()
    running_loss = 0.0
    all_labels = []
    all_probs = []
    
    with torch.no_grad():
        for images, labels in tqdm(loader, desc="Eval", leave=False):
            images = images.to(device)
            labels = labels.float().unsqueeze(1).to(device)
            
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item() * images.size(0)
            
            probs = torch.sigmoid(outputs).cpu().numpy().flatten()
            all_probs.extend(probs)
            all_labels.extend(labels.cpu().numpy().flatten())
    
    epoch_loss = running_loss / len(loader.dataset)
    all_labels = np.array(all_labels, dtype=np.float32)
    all_probs = np.array(all_probs, dtype=np.float32)
    
    return epoch_loss, all_labels, all_probs


In [35]:
N_PER_CLASS = 3000  # total ~6000 images; bump to e.g. 5000 for ~10k

df_small = (
    df.groupby("target", group_keys=False)
      .apply(lambda x: x.sample(min(len(x), N_PER_CLASS), random_state=SEED))
      .reset_index(drop=True)
)

print("Original dataset size:", len(df))
print("Smaller balanced dataset size:", len(df_small))
print(df_small["target"].value_counts())


Original dataset size: 112120
Smaller balanced dataset size: 6000
target
0    3000
1    3000
Name: count, dtype: int64


  .apply(lambda x: x.sample(min(len(x), N_PER_CLASS), random_state=SEED))


In [36]:
from sklearn.model_selection import train_test_split

patients = df_small["Patient ID"].unique()

train_patients, temp_patients = train_test_split(
    patients, test_size=0.4, random_state=SEED
)
val_patients, test_patients = train_test_split(
    temp_patients, test_size=0.5, random_state=SEED
)

def assign_split(pid):
    if pid in train_patients:
        return "train"
    elif pid in val_patients:
        return "val"
    else:
        return "test"

df_small["split"] = df_small["Patient ID"].apply(assign_split)

train_df = df_small[df_small["split"] == "train"].reset_index(drop=True)
val_df   = df_small[df_small["split"] == "val"].reset_index(drop=True)
test_df  = df_small[df_small["split"] == "test"].reset_index(drop=True)

print("Train size:", len(train_df))
print("Val size:",   len(val_df))
print("Test size:",  len(test_df))
print("\nTrain target distribution:")
print(train_df["target"].value_counts(normalize=True))


Train size: 3615
Val size: 1183
Test size: 1202

Train target distribution:
target
1    0.505671
0    0.494329
Name: proportion, dtype: float64


In [37]:
from PIL import Image
from torch.utils.data import Dataset, DataLoader

class ChestXrayDataset(Dataset):
    def __init__(self, df, image_path_map, transform=None):
        self.df = df.reset_index(drop=True)
        self.image_path_map = image_path_map
        self.transform = transform
        
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_name = row["Image Index"]
        
        img_path = self.image_path_map[img_name]
        img = Image.open(img_path).convert("RGB")
        
        if self.transform:
            img = self.transform(img)
        
        label = int(row["target"])
        return img, label


BATCH_SIZE = 32
NUM_WORKERS = 0  # keep 0 on Mac / notebooks

train_dataset = ChestXrayDataset(train_df, image_path_map, transform=train_transform)
val_dataset   = ChestXrayDataset(val_df,   image_path_map, transform=eval_transform)
test_dataset  = ChestXrayDataset(test_df,  image_path_map, transform=eval_transform)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS)
val_loader   = DataLoader(val_dataset,   batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=NUM_WORKERS)
test_loader  = DataLoader(test_dataset,  batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=NUM_WORKERS)

len(train_loader), len(val_loader), len(test_loader)


(113, 37, 38)

In [38]:
NUM_EPOCHS = 2  # start with 2; bump to 3â€“5 if it's still okay

best_val_loss = float("inf")
best_state_dict = None

for epoch in range(NUM_EPOCHS):
    print(f"\n===== Epoch {epoch+1}/{NUM_EPOCHS} =====")
    
    train_loss = train_one_epoch(resnet, train_loader, optimizer, criterion, device)
    val_loss, val_y_true, val_y_prob = eval_one_epoch(resnet, val_loader, criterion, device)
    
    print(f"Train loss: {train_loss:.4f} | Val loss: {val_loss:.4f}")
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_state_dict = resnet.state_dict()
        print("ðŸ‘‰ New best model (saved in memory)")

if best_state_dict is not None:
    resnet.load_state_dict(best_state_dict)
    print("\nLoaded best model based on validation loss.")
else:
    print("\nWARNING: No best_state_dict saved; using last-epoch model.")



===== Epoch 1/2 =====


                                                                                

Train loss: 0.6470 | Val loss: 0.6622
ðŸ‘‰ New best model (saved in memory)

===== Epoch 2/2 =====


                                                                                

Train loss: 0.5636 | Val loss: 0.6722

Loaded best model based on validation loss.




In [39]:
test_loss, test_y_true, test_y_prob = eval_one_epoch(
    resnet, test_loader, criterion, device
)

print(f"\nTest loss: {test_loss:.4f}")
print("Number of test samples:", len(test_y_true))

print("First 10 true labels:", test_y_true[:10])
print("First 10 predicted probabilities:", test_y_prob[:10])

# Save results in a DataFrame for further analysis (confusion matrix, subgroups, etc.)
test_results_df = pd.DataFrame({
    "y_true": test_y_true,
    "y_prob": test_y_prob,
})

test_results_df.head()


                                                                                


Test loss: 0.6382
Number of test samples: 1202
First 10 true labels: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
First 10 predicted probabilities: [0.39562777 0.42700183 0.40202335 0.19539285 0.50387543 0.26866668
 0.34148452 0.39076948 0.29376507 0.60493916]




Unnamed: 0,y_true,y_prob
0,0.0,0.395628
1,0.0,0.427002
2,0.0,0.402023
3,0.0,0.195393
4,0.0,0.503875


In [41]:
# MODEL EVAL AND DATA EXPLORATION

In [42]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Threshold at 0.5 (you could tune this later if you want)
test_y_pred = (test_y_prob >= 0.5).astype(int)

cm = confusion_matrix(test_y_true.astype(int), test_y_pred)
print("Confusion matrix (rows = true, cols = pred):")
print(cm)

print("\nClassification report:")
print(classification_report(
    test_y_true.astype(int),
    test_y_pred,
    target_names=["No Finding", "Any Finding"]
))

# ROC-AUC is nice to report too
try:
    auc = roc_auc_score(test_y_true, test_y_prob)
    print(f"ROC-AUC: {auc:.4f}")
except ValueError:
    print("Could not compute ROC-AUC (only one class present?).")


Confusion matrix (rows = true, cols = pred):
[[489 121]
 [311 281]]

Classification report:
              precision    recall  f1-score   support

  No Finding       0.61      0.80      0.69       610
 Any Finding       0.70      0.47      0.57       592

    accuracy                           0.64      1202
   macro avg       0.66      0.64      0.63      1202
weighted avg       0.65      0.64      0.63      1202

ROC-AUC: 0.7159


In [43]:
# Evaluate best model on test set
test_loss, test_y_true, test_y_prob = eval_one_epoch(
    resnet, test_loader, criterion, device
)

print(f"\nTest loss: {test_loss:.4f}")
print("Number of test samples:", len(test_y_true))
print("First 10 true labels:", test_y_true[:10])
print("First 10 predicted probabilities:", test_y_prob[:10])


                                                                                


Test loss: 0.6382
Number of test samples: 1202
First 10 true labels: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
First 10 predicted probabilities: [0.39562777 0.42700183 0.40202335 0.19539285 0.50387543 0.26866668
 0.34148452 0.39076948 0.29376507 0.60493916]


