In [1]:
!pip install --quiet openslide-python opencv-python scikit-image pillow matplotlib pandas
!pip install --quiet timm segmentation-models-pytorch

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.8/154.8 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m105.5 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m79.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/

In [None]:
from pathlib import Path
import os, random, glob, sys, subprocess
import numpy as np
import pandas as pd
import cv2
import matplotlib.pyplot as plt
import openslide
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

In [36]:
# --- Reproducibility ---------------------------------------------------------
# Seed every random number generator the notebook draws from.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# --- Locations ---------------------------------------------------------------
TRAIN_DIR = Path("/kaggle/input/prostate-cancer-grade-assessment/train_images")
MASK_DIR  = Path("/kaggle/input/prostate-cancer-grade-assessment/train_label_masks")
OUT_DIR   = Path("/kaggle/working/panda_simple")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# --- How many slides go into each split --------------------------------------
N_TRAIN_SLIDES = 10
N_VAL_SLIDES   = 3
N_TEST_SLIDES  = 3

# --- Tiling (defaults of process_one_slide) ----------------------------------
LEVEL        = 1      # requested pyramid level of the slide
PATCH_SIZE   = 512    # tile edge length in pixels at that level
STRIDE       = 512    # step between tile origins; equal to PATCH_SIZE, so tiles do not overlap
POS_MAX      = 500    # cap on tumour tiles per slide
NEG_MAX      = 300    # cap on non-tumour tiles per slide
TISSUE_MIN_PCT = 0.05 # minimum tissue fraction for a non-tumour tile to be kept

# --- Training ----------------------------------------------------------------
BATCH_SIZE   = 8
EPOCHS       = 40
LR           = 2e-4
NUM_WORKERS  = 0
# Per-channel mean / std used to normalise network inputs
# (the names suggest ImageNet statistics).
IMNET_MEAN   = (0.485, 0.456, 0.406)
IMNET_STD    = (0.229, 0.224, 0.225)
MODEL_PATH   = OUT_DIR / "unet_best.pth"

In [None]:

def tissue_mask_rgb(img_rgb):
    """Return a boolean per-pixel mask of pixels that look like tissue.

    The image is converted from RGB to HSV; a pixel is kept when its
    saturation (scaled by 1/255) exceeds 0.07 and its value (scaled by 1/255)
    lies strictly between 0.20 and 0.95.

    Args:
        img_rgb: RGB image array accepted by ``cv2.cvtColor``
            (presumably uint8, H x W x 3 -- confirm against callers).

    Returns:
        Boolean array with the image's spatial shape.
    """
    hsv = cv2.cvtColor(img_rgb, cv2.COLOR_RGB2HSV)
    saturation = hsv[..., 1] / 255.0
    value = hsv[..., 2] / 255.0

    saturated_enough = saturation > 0.07
    not_too_dark = value > 0.20
    not_too_bright = value < 0.95
    return saturated_enough & not_too_dark & not_too_bright

def tumor_bool_from_gray(mask_gray, provider=None):
    """Convert a PANDA label mask into a boolean tumour mask.

    The PANDA masks use two label encodings depending on the data provider:
    Radboud marks cancerous epithelium with 3, 4 and 5 (2 is benign
    epithelium), whereas Karolinska marks cancerous tissue with 2.

    Args:
        mask_gray: Array of integer labels.
        provider: Optional data provider name, ``"radboud"`` or
            ``"karolinska"`` (case-insensitive). When given, the matching
            encoding is applied unconditionally. When ``None`` (default) the
            encoding is guessed from the labels present in ``mask_gray``:
            any of 3/4/5 selects the Radboud rule, otherwise a 2 selects the
            Karolinska rule.

    Returns:
        Boolean array with the same shape as ``mask_gray``.

    Raises:
        ValueError: If ``provider`` is given but not recognised.

    Note:
        The default guess is made per array. A Radboud tile that contains
        benign epithelium (2) but no 3/4/5 is therefore treated as tumour;
        pass ``provider`` to avoid this.
    """
    if provider is not None:
        key = str(provider).strip().lower()
        if key == "radboud":
            return np.isin(mask_gray, [3, 4, 5])
        if key == "karolinska":
            return mask_gray == 2
        raise ValueError(f"unknown data provider: {provider!r}")

    # Backward-compatible heuristic based on the labels present in this array.
    present = np.unique(mask_gray)
    if np.isin(present, [3, 4, 5]).any():
        return np.isin(mask_gray, [3, 4, 5])
    if (present == 2).any():
        return mask_gray == 2
    return np.zeros_like(mask_gray, dtype=bool)

def list_slide_ids():
    """Return the sorted ids of slides in TRAIN_DIR that also have a mask file.

    A slide id is the stem of a ``*.tiff`` file in ``TRAIN_DIR``; it is kept
    only when ``MASK_DIR / "<id>_mask.tiff"`` exists.
    """
    return sorted(
        tiff.stem
        for tiff in TRAIN_DIR.glob("*.tiff")
        if (MASK_DIR / f"{tiff.stem}_mask.tiff").exists()
    )

def choose_mask_level(mask, target_down):
    """Return the index of the level whose downsample is closest to ``target_down``.

    Args:
        mask: Object exposing ``level_downsamples`` (a sequence of numbers).
        target_down: Downsample factor to match.

    Returns:
        Integer index into ``mask.level_downsamples``; the first index wins
        on ties.
    """
    factors = np.asarray([float(f) for f in mask.level_downsamples], dtype=np.float64)
    gaps = np.abs(factors - target_down)
    return int(gaps.argmin())

In [None]:
def process_one_slide(sid, level=LEVEL, patch=PATCH_SIZE, stride=STRIDE,
                      pos_max=POS_MAX, neg_max=NEG_MAX, tissue_min=TISSUE_MIN_PCT):
    """Cut one slide and its label mask into PNG tiles and return a manifest.

    The slide is scanned on a regular grid at pyramid level ``level`` (clamped
    to the last available level). For every grid position the mask tile is
    read first; tiles containing tumour pixels are saved as positives, other
    tiles are saved as negatives only when enough tissue is visible. Images go
    to ``OUT_DIR/tiles/<sid>/img`` and binary masks (0/255) to
    ``OUT_DIR/tiles/<sid>/msk``.

    Args:
        sid: Slide id (file stem in TRAIN_DIR / MASK_DIR).
        level: Requested pyramid level of the slide.
        patch: Tile edge length in pixels at that level.
        stride: Step between tile origins in pixels at that level.
        pos_max: Maximum number of tumour tiles to save for this slide.
        neg_max: Maximum number of non-tumour tiles to save for this slide.
        tissue_min: Minimum tissue fraction (see ``tissue_mask_rgb``) required
            to keep a non-tumour tile.

    Returns:
        ``(rows, pos, neg)`` where ``rows`` is a list of
        ``[img_path, mask_path, has_tumor, sid]`` and ``pos`` / ``neg`` are the
        numbers of saved tumour / non-tumour tiles.
    """
    slide_path = str(TRAIN_DIR / f"{sid}.tiff")
    mask_path  = str(MASK_DIR  / f"{sid}_mask.tiff")

    rows, pos, neg, unreadable = [], 0, 0, 0
    # Context managers close both handles even if reading or writing raises.
    with openslide.OpenSlide(slide_path) as ss, openslide.OpenSlide(mask_path) as ms:
        level_slide = min(level, ss.level_count - 1)
        down_slide  = float(ss.level_downsamples[level_slide])
        wL, hL      = ss.level_dimensions[level_slide]
        level_mask  = choose_mask_level(ms, down_slide)
        # Scale factors from slide level-0 coordinates to mask level-0 coordinates.
        W0s, H0s = ss.level_dimensions[0]
        W0m, H0m = ms.level_dimensions[0]
        rx, ry   = (W0m / W0s, H0m / H0s)

        sdir = OUT_DIR / "tiles" / sid
        img_dir, msk_dir = sdir / "img", sdir / "msk"
        img_dir.mkdir(parents=True, exist_ok=True)
        msk_dir.mkdir(parents=True, exist_ok=True)

        for y in range(0, max(hL - patch, 0) + 1, stride):
            if pos >= pos_max and neg >= neg_max:
                break
            for x in range(0, max(wL - patch, 0) + 1, stride):
                if pos >= pos_max and neg >= neg_max:
                    break
                # read_region expects the origin in level-0 coordinates.
                x0, y0 = int(x * down_slide), int(y * down_slide)
                mx0, my0 = int(x0 * rx), int(y0 * ry)
                # NOTE(review): the mask is read with the same (patch, patch)
                # size at `level_mask`; this covers the same area as the image
                # tile only if that level's downsample matches the slide's --
                # confirm for the data in use.
                try:
                    m_rgba = ms.read_region((mx0, my0), level_mask, (patch, patch))
                except Exception:
                    # Best effort: skip tiles whose mask cannot be read, but
                    # count them so the problem is reported below.
                    unreadable += 1
                    continue
                m_arr = np.array(m_rgba)
                if m_arr.size == 0:
                    continue
                # The label is taken from the first channel of the RGBA tile.
                mgray = m_arr[..., 0].astype(np.uint8)
                tumor = tumor_bool_from_gray(mgray)
                has_tumor = bool(tumor.any())

                # Enforce each cap on its own class, so one class cannot grow
                # past its limit while the other is still being filled.
                if has_tumor and pos >= pos_max:
                    continue
                if not has_tumor and neg >= neg_max:
                    continue

                # Only now read the (more expensive) image tile.
                img = np.array(ss.read_region((x0, y0), level_slide, (patch, patch)).convert("RGB"))
                if not has_tumor:
                    # Non-tumour tiles must show enough tissue to be useful.
                    if tissue_mask_rgb(cv2.resize(img, (256, 256))).mean() < tissue_min:
                        continue

                ip = img_dir / f"{sid}_x{x}_y{y}.png"
                mp = msk_dir / f"{sid}_x{x}_y{y}.png"
                cv2.imwrite(str(ip), cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
                cv2.imwrite(str(mp), (tumor.astype(np.uint8) * 255))
                rows.append([str(ip), str(mp), int(has_tumor), sid])
                if has_tumor:
                    pos += 1
                else:
                    neg += 1

    if unreadable:
        print(f"[warn] {sid}: skipped {unreadable} tiles whose mask could not be read")
    return rows, pos, neg

In [None]:
def build_split(ids, name):
    """Tile every slide in ``ids`` and write the shuffled manifest ``<name>.csv``.

    Args:
        ids: Iterable of slide ids, each passed to ``process_one_slide``.
        name: Split name; used in log lines and as the CSV file stem in OUT_DIR.

    Returns:
        The manifest as a DataFrame with columns
        ``img``, ``mask``, ``has_tumor`` and ``slide_id``, shuffled with SEED.
    """
    records = []
    n_pos = 0
    n_neg = 0
    for slide_id in ids:
        slide_rows, slide_pos, slide_neg = process_one_slide(slide_id)
        records += slide_rows
        n_pos += slide_pos
        n_neg += slide_neg
        print(f"[{name}] {slide_id}: pos={slide_pos}, neg={slide_neg}, tiles={slide_pos+slide_neg}")

    manifest = pd.DataFrame(records, columns=["img","mask","has_tumor","slide_id"])
    # Shuffle rows deterministically so tiles of one slide are not contiguous.
    manifest = manifest.sample(frac=1.0, random_state=SEED).reset_index(drop=True)

    csv_path = OUT_DIR / f"{name}.csv"
    manifest.to_csv(csv_path, index=False)
    print(f"[{name}] saved {len(manifest)} tiles → {csv_path} | pos={n_pos} neg={n_neg}")
    return manifest

In [None]:
# Pick a random subset of slides and divide it into train / val / test.
all_ids = list_slide_ids()
# Shuffle with a dedicated RNG seeded from SEED. On a fresh run this gives the
# same order as seeding the global RNG and shuffling immediately, but unlike
# the global RNG it yields the same selection when this cell is re-run.
random.Random(SEED).shuffle(all_ids)
need = min(len(all_ids), N_TRAIN_SLIDES + N_VAL_SLIDES + N_TEST_SLIDES)
picked = all_ids[:need]
ntr = min(N_TRAIN_SLIDES, len(picked))
nva = min(N_VAL_SLIDES, max(0, len(picked)-ntr))
train_ids = picked[:ntr]
val_ids   = picked[ntr:ntr+nva]
test_ids  = picked[ntr+nva:]
print(f"Slides → train={len(train_ids)} val={len(val_ids)} test={len(test_ids)}")

train_df = build_split(train_ids, "train")
val_df   = build_split(val_ids,   "val")
test_df  = build_split(test_ids,  "test")
# Fallback when no test tiles were produced: reuse the validation manifest.
# build_split always writes test.csv (possibly empty), so the fallback has to
# test for an empty manifest rather than for a missing file.
if test_df.empty and not val_df.empty:
    print("[test] no test tiles available; reusing the validation manifest")
    test_df = val_df.copy()
    test_df.to_csv(OUT_DIR / "test.csv", index=False)


In [None]:
class SegDataset(Dataset):
    """Tile / mask pairs listed in a manifest CSV, served as normalised tensors.

    The manifest must contain the columns ``img`` and ``mask`` (file paths);
    rows with a missing value in either column are dropped.
    """

    def __init__(self, manifest_csv, augment=False):
        """
        Args:
            manifest_csv: Path of the manifest CSV.
            augment: When True, apply independent random horizontal and
                vertical flips (each with probability 0.5) to image and mask.
        """
        self.df = pd.read_csv(manifest_csv)
        self.augment = augment
        self.mean = np.array(IMNET_MEAN, dtype=np.float32)
        self.std  = np.array(IMNET_STD,  dtype=np.float32)
        self.df = self.df.dropna(subset=["img","mask"]).reset_index(drop=True)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        """Return ``(image, mask)`` for row ``i``.

        Returns:
            image: float32 tensor, channels first, normalised with the
                configured mean / std.
            mask: float32 tensor with a leading channel axis, pixel values
                divided by 255.

        Raises:
            FileNotFoundError: If the image or the mask file cannot be read.
        """
        row = self.df.iloc[i]
        img_path, msk_path = str(row["img"]), str(row["mask"])
        bgr = cv2.imread(img_path)
        msk = cv2.imread(msk_path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread returns None instead of raising; check before any further
        # OpenCV call so that the caller gets a clear error with the path.
        if bgr is None:
            raise FileNotFoundError(f"could not read image tile: {img_path}")
        if msk is None:
            raise FileNotFoundError(f"could not read mask tile: {msk_path}")
        img = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        if self.augment:
            # Flips produce negative-stride views; make them contiguous so
            # torch.from_numpy accepts the result.
            if random.random() < 0.5:
                img = np.ascontiguousarray(np.fliplr(img)); msk = np.ascontiguousarray(np.fliplr(msk))
            if random.random() < 0.5:
                img = np.ascontiguousarray(np.flipud(img)); msk = np.ascontiguousarray(np.flipud(msk))
        img = img.astype(np.float32)/255.0
        img = (img - self.mean)/self.std
        img = np.transpose(img,(2,0,1))
        msk = (msk.astype(np.float32)/255.0)[None]
        return torch.from_numpy(img), torch.from_numpy(msk)

In [None]:
class DoubleConv(nn.Module):
    """Two 3x3 convolutions (padding 1), each followed by BatchNorm and ReLU.

    The first convolution maps ``in_ch`` to ``out_ch`` channels, the second
    keeps ``out_ch``. Spatial size is preserved.
    """

    def __init__(self, in_ch, out_ch):
        super().__init__()
        layers = []
        # Same layer order as conv-bn-relu twice; only the input width differs.
        for src_ch in (in_ch, out_ch):
            layers += [
                nn.Conv2d(src_ch, out_ch, 3, padding=1),
                nn.BatchNorm2d(out_ch),
                nn.ReLU(inplace=True),
            ]
        self.seq = nn.Sequential(*layers)

    def forward(self, x):
        return self.seq(x)

class UNet(nn.Module):
    """Four-level U-Net built from DoubleConv blocks.

    The encoder halves the resolution four times with max pooling
    (channels 32, 64, 128, 256, 256); the decoder upsamples with stride-2
    transposed convolutions and concatenates the matching encoder feature map
    before each DoubleConv. A final 1x1 convolution outputs ``n_classes``
    channels of raw logits.
    """

    def __init__(self, n_classes=1):
        super().__init__()

        def pooled(in_ch, out_ch):
            # One encoder step: halve the resolution, then convolve.
            return nn.Sequential(nn.MaxPool2d(2), DoubleConv(in_ch, out_ch))

        # Encoder
        self.inc   = DoubleConv(3, 32)
        self.down1 = pooled(32, 64)
        self.down2 = pooled(64, 128)
        self.down3 = pooled(128, 256)
        self.down4 = pooled(256, 256)
        # Decoder: each DoubleConv sees the upsampled map plus the skip connection.
        self.up1   = nn.ConvTranspose2d(256, 256, 2, stride=2)
        self.dec1  = DoubleConv(256+256, 256)
        self.up2   = nn.ConvTranspose2d(256, 128, 2, stride=2)
        self.dec2  = DoubleConv(128+128, 128)
        self.up3   = nn.ConvTranspose2d(128, 64, 2, stride=2)
        self.dec3  = DoubleConv(64+64, 64)
        self.up4   = nn.ConvTranspose2d(64, 32, 2, stride=2)
        self.dec4  = DoubleConv(32+32, 32)
        # Head
        self.outc  = nn.Conv2d(32, n_classes, 1)

    def forward(self, x):
        # Encoder pass; keep every feature map for the skip connections.
        skips = [self.inc(x)]
        for down in (self.down1, self.down2, self.down3, self.down4):
            skips.append(down(skips[-1]))

        # The deepest map is the decoder input; the rest are consumed deepest first.
        x = skips.pop()
        stages = (
            (self.up1, self.dec1),
            (self.up2, self.dec2),
            (self.up3, self.dec3),
            (self.up4, self.dec4),
        )
        for up, dec in stages:
            x = dec(torch.cat([up(x), skips.pop()], dim=1))
        return self.outc(x)

In [None]:
def dice_loss(logits, targets, eps=1e-6):
    """Soft Dice loss with squared terms in the denominator.

    Args:
        logits: Raw network outputs; reduced over dims 2 and 3.
        targets: Target tensor of the same shape as ``logits``.
        eps: Constant added to the denominator only.

    Returns:
        Scalar tensor: ``1 - 2*sum(p*t) / (sum(p^2 + t^2) + eps)`` with
        ``p = sigmoid(logits)``, averaged over the remaining dims.
    """
    spatial = (2, 3)
    probs = logits.sigmoid()
    overlap = torch.sum(probs * targets, dim=spatial)
    energy = torch.sum(probs.pow(2) + targets.pow(2), dim=spatial) + eps
    dice = 2.0 * overlap / energy
    return torch.mean(1.0 - dice)





In [None]:
# Select the device first so the loaders can decide whether to pin memory.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Pinned host memory only speeds up host-to-GPU copies; requesting it on a
# CPU-only run has no benefit and makes PyTorch emit a warning.
pin_memory = device.type == "cuda"

train_loader = DataLoader(SegDataset(OUT_DIR/"train.csv", augment=True),
                          batch_size=BATCH_SIZE, shuffle=True,
                          num_workers=NUM_WORKERS, pin_memory=pin_memory)
val_loader   = DataLoader(SegDataset(OUT_DIR/"val.csv", augment=False),
                          batch_size=BATCH_SIZE, shuffle=False,
                          num_workers=NUM_WORKERS, pin_memory=pin_memory)

model  = UNet().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=1e-4)
bce = nn.BCEWithLogitsLoss()
# Lowest validation loss seen so far; starts above any realistic loss value.
best_val = 1e9

In [None]:
# Train for EPOCHS epochs with an equal mix of BCE and Dice loss. After each
# epoch the model is evaluated on the validation set and the checkpoint at
# MODEL_PATH is overwritten only when the validation loss improves, so the
# file really holds the best model. Without validation tiles the checkpoint
# is written after every epoch instead.
best_val = float("inf")  # reset here so re-running this cell starts a fresh search
has_val = len(val_loader.dataset) > 0

for epoch in range(1, EPOCHS+1):
    # ---- training pass ----
    model.train(); tr_loss = 0.0
    for imgs, msks in train_loader:
        imgs, msks = imgs.to(device), msks.to(device)
        logits = model(imgs)
        loss = 0.5 * bce(logits, msks) + 0.5 * dice_loss(logits, msks)
        optimizer.zero_grad(); loss.backward(); optimizer.step()
        # Weight by batch size so the epoch value is a per-sample mean.
        tr_loss += loss.item() * imgs.size(0)
    tr_loss /= len(train_loader.dataset)

    if not has_val:
        print(f"Epoch {epoch:02d}  train_loss={tr_loss:.4f} ")
        torch.save(model.state_dict(), MODEL_PATH)
        continue

    # ---- validation pass ----
    model.eval(); va_loss = 0.0
    with torch.no_grad():
        for imgs, msks in val_loader:
            imgs, msks = imgs.to(device), msks.to(device)
            logits = model(imgs)
            loss = 0.5 * bce(logits, msks) + 0.5 * dice_loss(logits, msks)
            va_loss += loss.item() * imgs.size(0)
    va_loss /= len(val_loader.dataset)
    print(f"Epoch {epoch:02d}  train_loss={tr_loss:.4f}  val_loss={va_loss:.4f}")

    improved = va_loss < best_val
    # Always write a checkpoint after the first epoch so the file exists even
    # if the first validation loss is not a finite number.
    if improved or epoch == 1:
        if improved:
            best_val = va_loss
        torch.save(model.state_dict(), MODEL_PATH)
        print("saved best model")

In [None]:
def preprocess_rgb_uint8(img_rgb):
    """Turn an H x W x 3 image array into a normalised 1 x 3 x H x W tensor.

    Pixel values are scaled by 1/255, normalised per channel with IMNET_MEAN /
    IMNET_STD, moved to channels-first layout and given a leading batch axis.
    """
    mean = np.array(IMNET_MEAN, np.float32)
    std = np.array(IMNET_STD, np.float32)
    scaled = img_rgb.astype(np.float32) / 255.0
    chw = ((scaled - mean) / std).transpose(2, 0, 1)
    return torch.from_numpy(chw).unsqueeze(0)

def to_red(mask01):
    """Render a 0/1 mask as a uint8 RGB image that is red where the mask is 1.

    The first channel holds ``mask01 * 255``; the other two channels are zero.
    """
    red = mask01.astype(np.uint8) * 255
    blank = np.zeros_like(red)
    return np.dstack((red, blank, blank))

# Load the weights stored at MODEL_PATH into the existing model (tensors are
# mapped onto `device`) and put the model into evaluation mode.
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.eval()

In [None]:
# Choose up to K tiles of a single test slide for visual inspection,
# preferring tiles that contain tumour.
df_test = pd.read_csv(OUT_DIR/"test.csv")
if "slide_id" in df_test.columns and len(df_test["slide_id"])>0:
    # Restrict to the slide of the first manifest row.
    slide_id = df_test["slide_id"].iloc[0]
    df_slide = df_test[df_test["slide_id"] == slide_id].reset_index(drop=True)
else:
    df_slide = df_test.reset_index(drop=True)

K = 12
sel_pos = df_slide[df_slide["has_tumor"] == 1]
if len(sel_pos) >= K:
    sel = sel_pos.head(K).reset_index(drop=True)
else:
    # Fill the remaining slots from the non-tumour tiles only. Sampling from
    # the whole slide would draw tumour tiles that are already selected and
    # show them twice.
    df_other = df_slide[df_slide["has_tumor"] != 1]
    n_fill = min(K - len(sel_pos), len(df_other))
    sel = pd.concat([sel_pos, df_other.sample(n_fill, random_state=SEED)], axis=0).reset_index(drop=True)


In [None]:





# One figure row per selected tile:
# input | predicted mask | overlay | ground truth | probability heatmap.
rows = len(sel)
if rows == 0:
    # A figure of height zero is invalid, so there is nothing to draw.
    print("No test tiles selected; nothing to plot.")
else:
    fig, axes = plt.subplots(rows, 5, figsize=(15, 3.2*rows), squeeze=False)
    for ax in axes.ravel():
        ax.axis('off')
    with torch.no_grad():
        # enumerate() keeps the row position independent of the DataFrame index.
        for i, (_, row) in enumerate(sel.iterrows()):
            img_path = str(row["img"]); msk_path = str(row["mask"])
            bgr = cv2.imread(img_path)
            gt = cv2.imread(msk_path, cv2.IMREAD_GRAYSCALE)
            # cv2.imread returns None for unreadable files; check before
            # converting, otherwise cvtColor raises instead of skipping.
            if bgr is None or gt is None:
                continue
            im = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
            prob = torch.sigmoid(model(preprocess_rgb_uint8(im).to(device)))[0,0].cpu().numpy()
            pred01 = (prob >= 0.5).astype(np.uint8)
            pred_red = to_red(pred01)
            gt_red   = to_red((gt > 127).astype(np.uint8))
            overlay  = cv2.addWeighted(im, 0.70, pred_red, 0.60, 0)
            panels = (
                (im,       f"Patch #{i+1}", {}),
                (pred_red, "Predicted",     {}),
                (overlay,  "Overlay",       {}),
                (gt_red,   "Ground truth",  {}),
                (prob,     "Heatmap",       {"cmap": "turbo", "vmin": 0, "vmax": 1}),
            )
            for ax, (data, title, kwargs) in zip(axes[i], panels):
                ax.imshow(data, **kwargs)
                ax.set_title(title)
    fig.tight_layout(); plt.show()


Slides → train=10 val=3 test=3
[train] e3a86173361dbf0574a3781d540b5259: pos=21, neg=1, tiles=22
[train] ba99968b0499b6e2dca442a101a2259c: pos=18, neg=0, tiles=18
[train] 92d2a85fdb81d51f59c5781eb2c5d0a8: pos=5, neg=0, tiles=5
[train] fa1a79a5248bf5f5742fb14dabc070c6: pos=7, neg=0, tiles=7
[train] 6ad78c89d6b8feadad6d3ad85c743e32: pos=6, neg=0, tiles=6
[train] f2f9aa01d545ac770dce5e61e6d756d4: pos=5, neg=0, tiles=5
[train] 933f22ce388303435d472aa4f9a56d66: pos=13, neg=2, tiles=15
[train] fa6a2270986c61fcb0a34e9dd91a910c: pos=6, neg=10, tiles=16
[train] 903c88d5dc12315b4f6e9d6ad6b82537: pos=13, neg=0, tiles=13
[train] 919408ee796bb1d71ca9fc5ee0019400: pos=15, neg=11, tiles=26
[train] saved 133 tiles → /kaggle/working/panda_simple/train.csv | pos=109 neg=24
[val] c449f43f419bef77e8f8569de2858199: pos=11, neg=12, tiles=23
[val] bcc84573e785de42d7af076ae2b30479: pos=10, neg=0, tiles=10
[val] 80b57cbfbaa4a0565a5e4839c24223b6: pos=8, neg=0, tiles=8
[val] saved 41 tiles → /kaggle/working/pand