In [None]:
#http://clam.mahmoodlab.org/

#SSL models for hOptimus

In [1]:
import os

In [3]:
hf_token = os.environ['HUGGINGFACE_TOKEN']

In [4]:
from huggingface_hub import login
import torch
import timm 
from torchvision import transforms


# Login to the Hugging Face hub, using your user access token that can be found here:
# https://huggingface.co/settings/tokens.
login(hf_token)

model = timm.create_model(
    "hf-hub:bioptimus/H-optimus-0", pretrained=True, init_values=1e-5, dynamic_img_size=False
)
model.to("cuda")
model.eval()

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.707223, 0.578729, 0.703617), 
        std=(0.211883, 0.230117, 0.177517)
    ),
])

input = torch.rand(3, 224, 224)
input = transforms.ToPILImage()(input)

# We recommend using mixed precision for faster inference.
with torch.autocast(device_type="cuda", dtype=torch.float16):
    with torch.inference_mode():
        features = model(transform(input).unsqueeze(0).to("cuda"))

assert features.shape == (1, 1536)


In [8]:
import pandas as pd

def download_tcga_pathology():
    """
    Downloads TCGA pathology reports for downstream analysis.
    
    Returns:
        pd.DataFrame: TCGA pathology reports
    """
    # Downloads TCGA pathology reports for downstream analysis.
    os.system("curl -L https://data.mendeley.com/public-files/datasets/hyg5xkznpx/files/60abe141-9352-4a54-943c-3d015eabefea/file_downloaded --output TCGA_Reports.csv.zip")
    os.system("unzip -qq -o TCGA_Reports.csv.zip")
    tcga_reports = pd.read_csv("TCGA_Reports.csv")
    return tcga_reports
# Downloads TCGA pathology reports and extracts simplified disease phrases.
os.chdir("/gpfs/scratch/nk4167/TCGA_Path")
path_use=1
tcga_reports = download_tcga_pathology()
res_all = []
for i in range(path_use):
    patient_name = tcga_reports['patient_filename'][i].split('.')[0]
    text = tcga_reports['text'][i]
    break

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   134  100   134    0     0    266      0 --:--:-- --:--:-- --:--:--   266
100 10.4M  100 10.4M    0     0  5155k      0  0:00:02  0:00:02 --:--:-- 8693k


In [15]:
#Download a TCGA slide SVS using their API
def download_bs_svs(patient_id,output_dir='./output'):
    """
    Searches GDC for files matching BS slides and downloads them.
    
    Args:
        patient_id (str): TCGA patient identifier
        output_dir (str): Directory to save downloaded slides
    """
    api_url = "https://api.gdc.cancer.gov/files"
    query = {
        "filters": {
            "op": "and",
            "content": [
                {"op": "=", "content": {"field": "cases.submitter_id", "value": [patient_id]}},
                {"op": "=", "content": {"field": "data_format", "value": "SVS"}},
                {"op": "=", "content": {"field": "file_name", "value": "*BS*"}}  # Strict BS filter
            ]
        },
        "fields": "file_id,file_name",
        "format": "JSON",
        "size": 1000
    }

    response = requests.post(api_url, json=query).json()
    files = response.get('data', {}).get('hits', [])
    
    if not files:
        raise ValueError(f"No BS slides found for {patient_id}")

    os.makedirs(output_dir, exist_ok=True)
    downloaded_files = []
    print(files)
    for f in files:
        # Builds the download URL and streams the file to disk.
        dl_url = f"https://api.gdc.cancer.gov/data/{f['file_id']}"
        path = os.path.join(output_dir, f['file_name'])
        
        with requests.get(dl_url, stream=True) as r:
            r.raise_for_status()
            with open(path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        downloaded_files.append(path)
    
    return downloaded_files

def generate_wsi_patches(wsi_image, wsi_mask, patch_size=224, overlap=256//8, output_dir="./output", 
                         tissue_threshold=0.75, resolution=0.51, min_patches=100):
    """
    Generate overlapping tissue patches from WSIs with quality control.
    
    Args:
        image_path (str): Path to input SVS image
        mask_path (str): Path to segmentation mask
        patch_size (int): Patch size in pixels (square)
        overlap (int): Overlap between patches in pixels
        output_dir (str): Directory to save patches
        tissue_threshold (float): Minimum tissue content (0-1)
        resolution (float): Microns per pixel (default 0.5 = 20x)
        min_patches (int): Minimum number of patches to collect
    
    Returns:
        dict: Metadata about generated patches
    """
    
    # Verify matching dimensions at target resolution
    img_dims = wsi_image.slide_dimensions(resolution=resolution, units='mpp')
    mask_dims = wsi_mask.slide_dimensions(resolution=resolution, units='mpp')
    print(img_dims)
    print(mask_dims)
    # Calculate stride and initialize patch generator
    stride = patch_size - overlap
    patch_gen = SlidingWindowPatchExtractor(
        # Configures sliding window on WSI image and mask.
        input_img=wsi_image,
        patch_size=(patch_size, patch_size),
        stride=(stride, stride),
        input_mask=wsi_mask,
        min_mask_ratio=tissue_threshold
    )
    return(patch_gen)
    
def download_slide(patient_id):
    """
    Downloads a TCGA slide SVS and prepares patches for processing.
    Output directory is named after the patient ID.
    
    Args:
        patient_id (str): TCGA patient identifier

    Returns:
        tuple: Path to SVS, WSIReader object, tissue mask, patch generator
    """
    # Orchestrates download of slide SVS and preparation of patches.
    svs_path = download_bs_svs(patient_id, output_dir=patient_id + 'output')[0]
    wsi = WSIReader.open(svs_path)
    mask = wsi.tissue_mask(resolution=1, units="power")
    patch_gen = generate_wsi_patches(wsi, mask, resolution= wsi.info.mpp[0])
    return(svs_path,wsi,mask,patch_gen)

In [16]:
import requests
import tiatoolbox
from tiatoolbox.models.engine.semantic_segmentor import SemanticSegmentor  # For WSI segmentation.
from tiatoolbox.wsicore.wsireader import WSIReader
from tiatoolbox.utils import imwrite
import numpy as np
from tiatoolbox.tools.patchextraction import SlidingWindowPatchExtractor
from tiatoolbox.models import IOSegmentorConfig
import pandas as pd #type: ignore
from tiatoolbox import rcParam #type: ignore
from pathlib import Path

In [17]:
 svs_path,wsi,mask,patch_gen = download_slide(patient_name)

[{'id': 'ba4c46ba-08dc-4193-a8b2-20f20c1f7a76', 'file_name': 'TCGA-BP-5195-01A-02-BS2.f7182403-4158-4823-9eb7-70a11a6ae26c.svs', 'file_id': 'ba4c46ba-08dc-4193-a8b2-20f20c1f7a76'}, {'id': '333eab78-57c0-4e27-82fa-67dc068b91ec', 'file_name': 'TCGA-BP-5195-01A-01-BS1.6ea4093c-d6bb-4926-b2c3-b7b2d4c12689.svs', 'file_id': '333eab78-57c0-4e27-82fa-67dc068b91ec'}]
[34001 31777]
[34001 31777]


In [19]:
from huggingface_hub import login
import torch
import timm 
from torchvision import transforms


# Login to the Hugging Face hub, using your user access token that can be found here:
# https://huggingface.co/settings/tokens.
login(hf_token)

model = timm.create_model(
    "hf-hub:bioptimus/H-optimus-0", pretrained=True, init_values=1e-5, dynamic_img_size=False
)
model.to("cuda")
model.eval()

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.707223, 0.578729, 0.703617), 
        std=(0.211883, 0.230117, 0.177517)
    ),
])

features_all = []
# We recommend using mixed precision for faster inference.
with torch.autocast(device_type="cuda", dtype=torch.float16):
    for i in patch_gen:
        input = i
        with torch.inference_mode():
            features = model(transform(input).unsqueeze(0).to("cuda"))
            features_all.append(features)

|2025-04-03|05:51:09.812| [INFO] Loading pretrained weights from Hugging Face hub (bioptimus/H-optimus-0)


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 15.77 GiB of which 704.00 KiB is free. Process 2858670 has 3.90 GiB memory in use. Including non-PyTorch memory, this process has 11.87 GiB memory in use. Of the allocated memory 11.44 GiB is allocated by PyTorch, and 60.72 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)