# Tutorial 1: Loading the CheXpert Dataset and Data Preprocessing

## Downloading the Dataset

In [None]:
!pip install kaggle
from google.colab import drive
drive.mount('/content/drive')

# put kaggle.json in /content/drive/MyDrive
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# download dataset
!kaggle datasets download -d ashery/chexpert
!unzip chexpert.zip -d chexpert


Mounted at /content/drive
cp: cannot stat '/content/drive/MyDrive/kaggle.json': No such file or directory
chmod: cannot access '/root/.kaggle/kaggle.json': No such file or directory
Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 10, in <module>
    sys.exit(main())
             ^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/kaggle/cli.py", line 68, in main
    out = args.func(**command_args)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/kaggle/api/kaggle_api_extended.py", line 1741, in dataset_download_cli
    with self.build_kaggle_client() as kaggle:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/kaggle/api/kaggle_api_extended.py", line 688, in build_kaggle_client
    username=self.config_values['username'],
             ~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^
KeyError: 'username'
unzip:  cannot find or open chexpert.zip, chexpert.zip.zip or chexpert.zip.ZIP.


## Upload to Colab

Note that this demonstration uses only a subset of the small (downsampled) CheXpert training set, copied from Google Drive to the Colab runtime's local disk.

In [None]:
!rsync -avh --ignore-errors "/content/drive/MyDrive/CheXpertSubset/CheXpert-v1.0-small/valid" "/content/chexpert/"
!rsync -avh --ignore-errors "/content/drive/MyDrive/CheXpertSubset/train.csv" "/content/chexpert/"
!rsync -avh --ignore-errors "/content/drive/MyDrive/CheXpertSubset/valid.csv" "/content/chexpert/"

In [None]:
import os
import subprocess
import time

SRC = "/content/drive/MyDrive/CheXpertSubset/CheXpert-v1.0-small/train"
DST = "/content/chexpert/train"

BATCH_SIZE = 50
MAX_PATIENTS = 2500   # cap on how many patient folders to copy; None copies all of them
retries = 3           # rsync attempts per patient folder
RETRY_DELAY_S = 5     # pause between failed attempts

os.makedirs(DST, exist_ok=True)

patients = sorted(os.listdir(SRC))

# Never claim (or iterate over) more folders than actually exist on Drive.
total = len(patients) if MAX_PATIENTS is None else min(MAX_PATIENTS, len(patients))

print(f"Found {len(patients)} patient folders; copying the first {total} in batches of {BATCH_SIZE}...")

failed = []  # patient folders that could not be copied after all retries

for i in range(0, total, BATCH_SIZE):
    batch = patients[i:min(i + BATCH_SIZE, total)]
    print(f"\n=== Copying batch {i//BATCH_SIZE + 1} ({i} to {i+len(batch)-1}) ===")

    for p in batch:
        src_path = os.path.join(SRC, p)
        dst_path = os.path.join(DST, p)

        # Skip if already copied.
        # NOTE: only the folder's existence is checked, so a folder left
        # half-copied by an interrupted run is treated as done.
        if os.path.exists(dst_path):
            print(f"{p} already exists — skipping")
            continue

        # Attempt copy with retry
        for attempt in range(1, retries + 1):
            print(f"Copying {p} (attempt {attempt})...")

            # No trailing slash on src_path: rsync recreates the folder itself inside DST.
            result = subprocess.run(
                ["rsync", "-a", src_path, DST],
                stderr=subprocess.PIPE,
                stdout=subprocess.PIPE,
                text=True
            )

            if result.returncode == 0:
                print(f"Finished {p}")
                break

            print(f"Error copying {p}: {result.stderr.strip()}")
            # Only wait when another attempt will follow.
            if attempt < retries:
                print(f"Sleeping {RETRY_DELAY_S} seconds before retry...")
                time.sleep(RETRY_DELAY_S)
        else:
            # Loop finished without a successful `break`.
            print(f"Failed to copy {p} after {retries} attempts.")
            failed.append(p)

if failed:
    print(f"\n{len(failed)} patient folder(s) could not be copied: {failed}")


### Filter train.csv to the copied subset and fix its paths (frontal views only)

In [None]:
def collect_valid_paths(root):
    """Return the set of ``.jpg`` files found anywhere below ``root``.

    Each entry is the file's path relative to ``root``. Only names ending in
    lowercase ``.jpg`` are collected; a ``root`` that does not exist yields an
    empty set because ``os.walk`` produces nothing for it.
    """
    return {
        os.path.relpath(os.path.join(folder, name), root)
        for folder, _subdirs, names in os.walk(root)
        for name in names
        if name.endswith(".jpg")
    }

# Index every .jpg under the local train folder; entries are relative to
# /content/chexpert/train, which is what the CSV paths get compared against.
valid_paths = collect_valid_paths("/content/chexpert/train")

In [None]:
import pandas as pd

df = pd.read_csv("/content/chexpert/train.csv").fillna(0)

# valid_paths holds paths relative to /content/chexpert/train, so the CSV paths
# must lose the "train/" component as well as the dataset prefix. Stripping only
# "CheXpert-v1.0-small/" leaves "train/..." and no row matches (size 0).
# regex=False: the prefix contains "." and must be treated literally.
df["Path"] = df["Path"].str.replace("CheXpert-v1.0-small/train/", "", regex=False)

# Frontal views only
df = df[df["Path"].str.contains("frontal", na=False)]

# Keep only the images that were actually copied to the local disk
df = df[df["Path"].isin(valid_paths)]
df.to_csv("/content/chexpert/train_subset.csv", index=False)

print(f"Filtered dataset size: {len(df)}")

Filtered dataset size: 0


In [None]:

df = pd.read_csv("/content/chexpert/train.csv").fillna(0)

# Strip the dataset and split prefixes so the CSV paths are relative to the
# train folder, like the entries of valid_paths.
df["Path"] = (
    df["Path"]
    .str.replace("CheXpert-v1.0-small/train/", "", regex=False)
)

# frontal only
df = df[df["Path"].str.contains("frontal", na=False)]

# keep only files that exist
df = df[df["Path"].isin(valid_paths)]

print(f"Filtered dataset size: {len(df)}")
df.to_csv("/content/chexpert/train_subset.csv", index=False)

# Diagnostics: show one path from each side so a prefix mismatch is easy to spot.
# Guarded, because an empty result is exactly the case being debugged and
# indexing [0] on it would raise IndexError instead of printing anything.
print("CSV Path example:")
print(df["Path"].iloc[0] if not df.empty else "<no rows left after filtering>")

print("\nValid path example:")
print(next(iter(valid_paths), "<no .jpg files found on disk>"))


# Creating a PyTorch Dataset Class

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class CheXpertDataset(Dataset):
    """Single-label image dataset driven by a CheXpert-style CSV index.

    Every item is a pair ``(image, label)``: ``image`` is the grayscale picture
    after ``transform`` has been applied, and ``label`` is a float32 tensor of
    shape ``(1,)`` holding 1.0 when the ``label_name`` column equals 1 and 0.0
    for any other value.

    Args:
        csv_path: CSV file with a ``Path`` column and a ``label_name`` column.
        root: Directory the ``Path`` values are relative to, once the
            ``CheXpert-v1.0-small/`` prefix has been removed.
        label_name: Column used as the binary target.
        transform: Optional callable applied to the PIL image. Defaults to a
            resize to 128x128 followed by ``ToTensor``.

    Raises:
        KeyError: If ``label_name`` is not a column of the CSV.
    """

    def __init__(self, csv_path, root, label_name="Pneumonia", transform=None):
        df = pd.read_csv(csv_path)

        # Keep only rows with valid image paths. This has to happen before
        # fillna(0): afterwards no value is null any more and a missing path
        # would survive as the number 0.
        df = df[df['Path'].notna()]

        # Surface a wrong label name here rather than on the first item access.
        if label_name not in df.columns:
            raise KeyError(f"Label column {label_name!r} not found in {csv_path}")

        self.df = df.fillna(0)
        self.root = root
        self.label_name = label_name

        if transform is None:
            transform = transforms.Compose([
                transforms.Resize((128, 128)),
                transforms.ToTensor()
            ])
        self.transform = transform

    def __len__(self):
        """Number of rows in the CSV index, whether or not the files exist."""
        return len(self.df)

    def _image_path(self, row):
        """Absolute image path for one CSV row."""
        rel_path = row["Path"].replace("CheXpert-v1.0-small/", "")
        return os.path.join(self.root, rel_path)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for ``idx``.

        If the image file is missing, the following rows are tried in order
        (wrapping around) and the first existing one is returned instead.

        Raises:
            IndexError: If ``idx`` is out of range.
            FileNotFoundError: If no row points to an existing image.
        """
        total = len(self)

        # The first lookup uses idx untouched so an out-of-range index still
        # raises IndexError, which plain iteration relies on to stop.
        pos = idx
        row = self.df.iloc[pos]
        img_path = self._image_path(row)

        # Skip missing images, visiting each row at most once instead of
        # recursing without bound.
        tried = 1
        while not os.path.exists(img_path):
            print(f"Missing image: {img_path}")
            if tried >= total:
                raise FileNotFoundError(
                    f"None of the {total} images listed in the CSV exist under {self.root}"
                )
            pos = (pos + 1) % total
            row = self.df.iloc[pos]
            img_path = self._image_path(row)
            tried += 1

        # No import of Image is visible in this part of the notebook, so bind it
        # here; Pillow is installed as a torchvision dependency.
        from PIL import Image

        # The context manager closes the file once the pixels have been converted.
        with Image.open(img_path) as raw:
            img = raw.convert("L")
        img = self.transform(img)

        # Labels are -1,0,1 in CheXpert → convert to {0,1}
        y = torch.tensor([1.0 if row[self.label_name] == 1 else 0.0], dtype=torch.float32)

        return img, y

