In [None]:
#http://clam.mahmoodlab.org/

#SSL models for hOptimus

In [1]:
import os

In [3]:
hf_token = os.environ['HUGGINGFACE_TOKEN']

In [4]:
from huggingface_hub import login
import torch
import timm 
from torchvision import transforms


# Login to the Hugging Face hub, using your user access token that can be found here:
# https://huggingface.co/settings/tokens.
login(hf_token)

model = timm.create_model(
    "hf-hub:bioptimus/H-optimus-0", pretrained=True, init_values=1e-5, dynamic_img_size=False
)
model.to("cuda")
model.eval()

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.707223, 0.578729, 0.703617), 
        std=(0.211883, 0.230117, 0.177517)
    ),
])

input = torch.rand(3, 224, 224)
input = transforms.ToPILImage()(input)

# We recommend using mixed precision for faster inference.
with torch.autocast(device_type="cuda", dtype=torch.float16):
    with torch.inference_mode():
        features = model(transform(input).unsqueeze(0).to("cuda"))

assert features.shape == (1, 1536)


In [8]:
import pandas as pd

def download_tcga_pathology():
    """
    Downloads TCGA pathology reports for downstream analysis.
    
    Returns:
        pd.DataFrame: TCGA pathology reports
    """
    # Downloads TCGA pathology reports for downstream analysis.
    os.system("curl -L https://data.mendeley.com/public-files/datasets/hyg5xkznpx/files/60abe141-9352-4a54-943c-3d015eabefea/file_downloaded --output TCGA_Reports.csv.zip")
    os.system("unzip -qq -o TCGA_Reports.csv.zip")
    tcga_reports = pd.read_csv("TCGA_Reports.csv")
    return tcga_reports
# Downloads TCGA pathology reports and extracts simplified disease phrases.
os.chdir("/gpfs/scratch/nk4167/TCGA_Path")
path_use=1
tcga_reports = download_tcga_pathology()
res_all = []
for i in range(path_use):
    patient_name = tcga_reports['patient_filename'][i].split('.')[0]
    text = tcga_reports['text'][i]
    break

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   134  100   134    0     0    266      0 --:--:-- --:--:-- --:--:--   266
100 10.4M  100 10.4M    0     0  5155k      0  0:00:02  0:00:02 --:--:-- 8693k


In [None]:
# Custom WSI Reader and Tiler Implementation (replacing tiatoolbox)
import numpy as np
from PIL import Image
try:
    import cv2
    HAS_CV2 = True
except ImportError:
    HAS_CV2 = False
    print("OpenCV not available, using basic thresholding for tissue mask")
from typing import Tuple, Iterator, Optional

class CustomWSIReader:
    """Custom WSI reader that works with SVS files using openslide or PIL fallback"""
    def __init__(self, svs_path: str):
        self.svs_path = svs_path
        try:
            import openslide
            self.slide = openslide.OpenSlide(svs_path)
            self.use_openslide = True
            # Get level 0 dimensions (highest resolution)
            self.level0_dimensions = self.slide.level_dimensions[0]
            # Get microns per pixel if available
            try:
                mpp_x = float(self.slide.properties.get('openslide.mpp-x', 0.25))
                mpp_y = float(self.slide.properties.get('openslide.mpp-y', 0.25))
                self.mpp = (mpp_x + mpp_y) / 2
            except:
                self.mpp = 0.25  # Default assumption
        except ImportError:
            # Fallback to PIL for regular images
            print("openslide not available, using PIL fallback")
            self.slide = Image.open(svs_path)
            self.use_openslide = False
            self.level0_dimensions = self.slide.size
            self.mpp = 0.25  # Default assumption
        
        self.info = type('obj', (object,), {'mpp': [self.mpp]})()
    
    def slide_dimensions(self, resolution: float = 0.51, units: str = 'mpp') -> Tuple[int, int]:
        """Get slide dimensions at specified resolution"""
        if units == 'mpp':
            # Calculate scale factor
            scale = self.mpp / resolution
            w = int(self.level0_dimensions[0] * scale)
            h = int(self.level0_dimensions[1] * scale)
            return (w, h)
        return self.level0_dimensions
    
    def read_region(self, location: Tuple[int, int], level: int, size: Tuple[int, int]) -> Image.Image:
        """Read a region from the slide"""
        if self.use_openslide:
            return self.slide.read_region(location, level, size)
        else:
            # For PIL, crop the image
            return self.slide.crop((location[0], location[1], 
                                   location[0] + size[0], location[1] + size[1]))
    
    def get_thumbnail(self, size: Tuple[int, int] = (512, 512)) -> Image.Image:
        """Get a thumbnail of the slide"""
        if self.use_openslide:
            return self.slide.get_thumbnail(size)
        else:
            return self.slide.resize(size, Image.Resampling.LANCZOS)
    
    def tissue_mask(self, resolution: float = 1.0, units: str = "power") -> np.ndarray:
        """Create a simple tissue mask using color thresholding"""
        # Get thumbnail for mask generation
        thumb_size = (1024, 1024)
        thumb = self.get_thumbnail(thumb_size)
        thumb_np = np.array(thumb)
        
        # Convert to HSV for better color-based segmentation
        if HAS_CV2 and len(thumb_np.shape) == 3:
            hsv = cv2.cvtColor(thumb_np, cv2.COLOR_RGB2HSV)
            # Create mask: exclude white/light background
            # Tissue typically has lower brightness
            lower_bound = np.array([0, 0, 0])
            upper_bound = np.array([180, 255, 240])
            mask = cv2.inRange(hsv, lower_bound, upper_bound)
            # Resize mask to match slide dimensions if needed
            target_dims = self.slide_dimensions(resolution=self.mpp, units='mpp')
            mask_resized = cv2.resize(mask, target_dims, interpolation=cv2.INTER_NEAREST)
        else:
            # Fallback: simple brightness-based thresholding
            if len(thumb_np.shape) == 3:
                gray = np.mean(thumb_np, axis=2)
            else:
                gray = thumb_np
            # Simple threshold: tissue is darker than background
            threshold = np.percentile(gray, 80)  # Top 20% brightest = background
            mask = (gray < threshold).astype(np.uint8) * 255
            # Resize to target dimensions
            target_dims = self.slide_dimensions(resolution=self.mpp, units='mpp')
            mask_pil = Image.fromarray(mask).resize(target_dims, Image.Resampling.NEAREST)
            mask_resized = np.array(mask_pil)
        
        return mask_resized / 255.0  # Normalize to 0-1


class SlidingWindowPatchExtractor:
    """Custom sliding window patch extractor"""
    def __init__(self, input_img: CustomWSIReader, patch_size: Tuple[int, int], 
                 stride: Tuple[int, int], input_mask: Optional[np.ndarray] = None,
                 min_mask_ratio: float = 0.75):
        self.input_img = input_img
        self.patch_size = patch_size
        self.stride = stride
        self.input_mask = input_mask
        self.min_mask_ratio = min_mask_ratio
        
        # Get image dimensions at target resolution
        self.dims = input_img.slide_dimensions(resolution=input_img.mpp, units='mpp')
        self.w, self.h = self.dims
        
        # Calculate number of patches
        self.n_patches_x = (self.w - patch_size[0]) // stride[0] + 1
        self.n_patches_y = (self.h - patch_size[1]) // stride[1] + 1
        
        self.patches = []
        self._generate_patches()
    
    def _generate_patches(self):
        """Generate all valid patches"""
        for y in range(0, self.h - self.patch_size[1] + 1, self.stride[1]):
            for x in range(0, self.w - self.patch_size[0] + 1, self.stride[0]):
                # Check mask if available
                if self.input_mask is not None:
                    patch_mask = self.input_mask[y:y+self.patch_size[1], x:x+self.patch_size[0]]
                    mask_ratio = np.mean(patch_mask)
                    if mask_ratio < self.min_mask_ratio:
                        continue
                
                # Extract patch
                patch = self.input_img.read_region((x, y), 0, self.patch_size)
                # Convert to RGB if needed and resize to patch_size
                if isinstance(patch, Image.Image):
                    if patch.mode != 'RGB':
                        patch = patch.convert('RGB')
                    # Resize to ensure exact patch_size
                    if patch.size != self.patch_size:
                        patch = patch.resize(self.patch_size, Image.Resampling.LANCZOS)
                
                self.patches.append(patch)
    
    def __iter__(self):
        """Iterator interface"""
        return iter(self.patches)
    
    def __len__(self):
        return len(self.patches)


def download_bs_svs(patient_id, output_dir='./output'):
    """
    Searches GDC for files matching BS slides and downloads them.
    
    Args:
        patient_id (str): TCGA patient identifier
        output_dir (str): Directory to save downloaded slides
    """
    api_url = "https://api.gdc.cancer.gov/files"
    query = {
        "filters": {
            "op": "and",
            "content": [
                {"op": "=", "content": {"field": "cases.submitter_id", "value": [patient_id]}},
                {"op": "=", "content": {"field": "data_format", "value": "SVS"}},
                {"op": "=", "content": {"field": "file_name", "value": "*BS*"}}  # Strict BS filter
            ]
        },
        "fields": "file_id,file_name",
        "format": "JSON",
        "size": 1000
    }

    response = requests.post(api_url, json=query).json()
    files = response.get('data', {}).get('hits', [])
    
    if not files:
        raise ValueError(f"No BS slides found for {patient_id}")

    os.makedirs(output_dir, exist_ok=True)
    downloaded_files = []
    print(files)
    for f in files:
        # Builds the download URL and streams the file to disk.
        dl_url = f"https://api.gdc.cancer.gov/data/{f['file_id']}"
        path = os.path.join(output_dir, f['file_name'])
        
        with requests.get(dl_url, stream=True) as r:
            r.raise_for_status()
            with open(path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        downloaded_files.append(path)
    
    return downloaded_files

def generate_wsi_patches(wsi_image: CustomWSIReader, wsi_mask: np.ndarray, 
                         patch_size: int = 224, overlap: int = 256//8, 
                         output_dir: str = "./output", 
                         tissue_threshold: float = 0.75, 
                         resolution: float = 0.51, 
                         min_patches: int = 100):
    """
    Generate overlapping tissue patches from WSIs with quality control.
    
    Args:
        wsi_image: CustomWSIReader object for the image
        wsi_mask: numpy array tissue mask
        patch_size (int): Patch size in pixels (square)
        overlap (int): Overlap between patches in pixels
        output_dir (str): Directory to save patches
        tissue_threshold (float): Minimum tissue content (0-1)
        resolution (float): Microns per pixel (default 0.51 = 20x)
        min_patches (int): Minimum number of patches to collect
    
    Returns:
        SlidingWindowPatchExtractor: Patch generator object
    """
    
    # Verify matching dimensions at target resolution
    img_dims = wsi_image.slide_dimensions(resolution=resolution, units='mpp')
    print(f"Image dimensions: {img_dims}")
    print(f"Mask shape: {wsi_mask.shape}")
    
    # Calculate stride and initialize patch generator
    stride = patch_size - overlap
    patch_gen = SlidingWindowPatchExtractor(
        input_img=wsi_image,
        patch_size=(patch_size, patch_size),
        stride=(stride, stride),
        input_mask=wsi_mask,
        min_mask_ratio=tissue_threshold
    )
    return patch_gen
    
def download_slide(patient_id):
    """
    Downloads a TCGA slide SVS and prepares patches for processing.
    Output directory is named after the patient ID.
    
    Args:
        patient_id (str): TCGA patient identifier (e.g., 'TCGA-BP-5195')

    Returns:
        tuple: Path to SVS, CustomWSIReader object, tissue mask, patch generator
    """
    # Orchestrates download of slide SVS and preparation of patches.
    svs_path = download_bs_svs(patient_id, output_dir=patient_id + '_output')[0]
    wsi = CustomWSIReader(svs_path)
    mask = wsi.tissue_mask(resolution=1, units="power")
    patch_gen = generate_wsi_patches(wsi, mask, resolution=wsi.mpp)
    return (svs_path, wsi, mask, patch_gen)

In [16]:
import requests
import numpy as np
import pandas as pd
from pathlib import Path
# Note: Custom WSI reader and tiler implementation replaces tiatoolbox
# See the previous cell for CustomWSIReader and SlidingWindowPatchExtractor classes

In [17]:
 svs_path,wsi,mask,patch_gen = download_slide(patient_name)

[{'id': 'ba4c46ba-08dc-4193-a8b2-20f20c1f7a76', 'file_name': 'TCGA-BP-5195-01A-02-BS2.f7182403-4158-4823-9eb7-70a11a6ae26c.svs', 'file_id': 'ba4c46ba-08dc-4193-a8b2-20f20c1f7a76'}, {'id': '333eab78-57c0-4e27-82fa-67dc068b91ec', 'file_name': 'TCGA-BP-5195-01A-01-BS1.6ea4093c-d6bb-4926-b2c3-b7b2d4c12689.svs', 'file_id': '333eab78-57c0-4e27-82fa-67dc068b91ec'}]
[34001 31777]
[34001 31777]


In [19]:
from huggingface_hub import login
import torch
import timm 
from torchvision import transforms


# Login to the Hugging Face hub, using your user access token that can be found here:
# https://huggingface.co/settings/tokens.
login(hf_token)

model = timm.create_model(
    "hf-hub:bioptimus/H-optimus-0", pretrained=True, init_values=1e-5, dynamic_img_size=False
)
model.to("cuda")
model.eval()

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(
        mean=(0.707223, 0.578729, 0.703617), 
        std=(0.211883, 0.230117, 0.177517)
    ),
])

features_all = []
# We recommend using mixed precision for faster inference.
with torch.autocast(device_type="cuda", dtype=torch.float16):
    for patch in patch_gen:
        # Ensure patch is a PIL Image
        if isinstance(patch, np.ndarray):
            patch = Image.fromarray(patch)
        elif not isinstance(patch, Image.Image):
            patch = Image.fromarray(np.array(patch))
        
        # Transform and process
        input_tensor = transform(patch).unsqueeze(0).to("cuda")
        with torch.inference_mode():
            features = model(input_tensor)
            features_all.append(features.cpu())  # Move to CPU to save GPU memory

|2025-04-03|05:51:09.812| [INFO] Loading pretrained weights from Hugging Face hub (bioptimus/H-optimus-0)


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB. GPU 0 has a total capacity of 15.77 GiB of which 704.00 KiB is free. Process 2858670 has 3.90 GiB memory in use. Including non-PyTorch memory, this process has 11.87 GiB memory in use. Of the allocated memory 11.44 GiB is allocated by PyTorch, and 60.72 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)