## Tutorial 1: Visium HD datasets preprocess demo
#### 1. Download the raw data from 10x Genoimcs ：Metadata: https://cf.10xgenomics.com/samples/spatial-exp/4.0.1/Visium_HD_Mouse_Brain/Visium_HD_Mouse_Brain_binned_outputs.tar.gz; Tif image: https://cf.10xgenomics.com/samples/spatial-exp/4.0.1/Visium_HD_Mouse_Brain/Visium_HD_Mouse_Brain_tissue_image.tif
#### 2. Choose the square_008um as raw data.
#### 3. Preprocess-data format
- `re_image.tif`：Raw histology image
- `pseudo_st.csv`：Gene count matrix。
  - Row 1: Gene names.
  - Row 2 and after: Each row is a spot.
    - Column 1: Spot ID.
    - Column 2 and after: Each column is a gene.
- `pseudo_locs.csv`：Spot locations
  - Row 1: Header
  - Row 2 and after: Each row is a spot. Must match rows in `pseudo_st.csv`
    - Column 1: Spot ID
    - Column 2: x-coordinate (horizontal axis). Must be in the same space as axis-1 (column) of the array indices of pixels in `re_image.tif`.
    - Column 3: y-coordinate (vertical axis). Must be in the same space as axis-0 (row) of the array indices of pixels in `re_image.tif`.
- `mask.png`：Tissue segmentation based on the valid sequencing area.
- `gene_names.txt`: Gene list.
- `pixel-size-raw.txt`：Side length (in micrometers) of pixels in `re_image.tif`.This value is usually between 0.1 and 1.0.
- `radius-raw.txt`：Number of pixels per spot radius in `re_image.tif`.


In [None]:
import os
import gc
import warnings
import pickle
import numpy as np
import pandas as pd
from PIL import Image
import scipy.sparse as sp
import cv2
from skimage.transform import rescale
from scipy.ndimage import zoom
from matplotlib import pyplot as plt
import scanpy as sc
import bin2cell as b2c
from joblib import dump, load
import joblib
warnings.filterwarnings("ignore", category=UserWarning, module='anndata')
Image.MAX_IMAGE_PIXELS = None  # Disable pixel limit for very large images


base_dir = "/home/lixiaoyu/VISD/hd-data"
pre_dir = os.path.join(base_dir, "Pre-data")
raw_dir = os.path.join(base_dir, "Raw-data")

In [3]:
gc.collect()


def load_pickle(filename, verbose=True):
    with open(filename, 'rb') as file:
        x = pickle.load(file)
    if verbose:
        print(f'Pickle loaded from {filename}')
    return x

def save_image(img, filename):
    mkdir(filename)
    Image.fromarray(img).save(filename)
    print(filename)

def read_lines(filename):
    with open(filename, 'r') as file:
        lines = [line.rstrip() for line in file]
    return lines

def load_image(filename, verbose=True):
    img = Image.open(filename)
    img = np.array(img)
    if img.ndim == 3 and img.shape[-1] == 4:
        img = img[..., :3]
    if verbose:
        print(f'Image loaded from {filename}')
    return img

def load_tsv(filename, index=True):
    if index:
        index_col = 0
    else:
        index_col = None
    df = pd.read_csv(filename, header=0, index_col=None)
    print(f'Dataframe loaded from {filename}')
    return df

def get_locs(prefix, target_shape=None):
    locs = load_tsv(f'{prefix}pseudo_locs.csv')
    locs = np.stack([locs['x'], locs['y']], -1)
    if target_shape is not None:
        wsi = load_image(f'{prefix}re_image.tif')
        current_shape = np.array(wsi.shape[:2])
        rescale_factor = current_shape // target_shape
        locs = locs.astype(float)
        locs /= rescale_factor
    locs = locs.round().astype(int)
    return locs

def get_disk_mask(radius, boundary_width=None):
    radius_ceil = np.array(radius).astype(int)
    locs = np.meshgrid(
        np.arange(-radius_ceil, radius_ceil + 1),
        np.arange(-radius_ceil, radius_ceil + 1),
        indexing='ij'
    )
    locs = np.stack(locs, -1)
    distsq = (locs ** 2).sum(-1)
    isin = distsq <= radius ** 2
    if boundary_width is not None:
        isin *= distsq >= (radius - boundary_width) ** 2
    return isin

def get_patches_flat(img, locs, mask):
    shape = np.array(mask.shape)
    center = shape // 2
    r = np.stack([-center, shape - center], axis=-1)
    x_list = []
    for s in locs:
        y, x = s
        if isinstance(y, np.ndarray):
            y = y.item()
        if isinstance(x, np.ndarray):
            x = x.item()
        top_left_y = int(y + r[0][0])
        bottom_right_y = int(y + r[0][1])
        top_left_x = int(x + r[1][0])
        bottom_right_x = int(x + r[1][1])
        patch = img[
            max(0, top_left_y):min(img.shape[0], bottom_right_y),
            max(0, top_left_x):min(img.shape[1], bottom_right_x)
        ]
        if patch.shape[:2] != mask.shape:
            patch = np.pad(
                patch,
                (
                    (max(0, -top_left_y), max(0, bottom_right_y - img.shape[0])),
                    (max(0, -top_left_x), max(0, bottom_right_x - img.shape[1])),
                    (0, 0)
                ),
                mode='constant'
            )
        if mask.all():
            x = patch
        else:
            if patch.shape[:2] == mask.shape:
                x = patch[mask]
            else:
                continue
        x_list.append(x)
    if len(x_list) > 0:
        x_list = np.stack(x_list)
    else:
        x_list = np.array([])
    return x_list

print("✓ Step 1 complete: Utility functions loaded.")


✓ Step 1 complete: Utility functions loaded.


In [4]:
folder_path = os.path.join(pre_dir, "binned_outputs/square_008um")

adata = b2c.read_visium(
    folder_path,
    spaceranger_image_path=os.path.join(folder_path, "spatial")
)

adata = adata[(adata.obs >= 0).all(axis=1)]

raw_pix_size = adata.uns['spatial']['Visium_HD_Mouse_Brain']['scalefactors']['microns_per_pixel']
scale_pix_size = 0.5
scale = raw_pix_size / scale_pix_size

loc = adata.obs.copy()
loc[['pxl_row_in_fullres', 'pxl_col_in_fullres']] *= scale
loc = loc.round().astype(int)

loc.to_csv(os.path.join(pre_dir, "location.csv"))

img = np.array(Image.open(os.path.join(raw_dir, "Visium_HD_Mouse_Brain_tissue_image.tif")))

with open(os.path.join(pre_dir, "data_descrip.txt"), 'w') as f:
    f.write(f"Original pixl-x: {img.shape[0]}\n")
    f.write(f"Original pixl-y: {img.shape[1]}\n")

if img.max() > 255 or img.min() < 0:
    img = (255 * (img - img.min()) / (img.max() - img.min())).astype(np.uint8)
else:
    img = img.astype(np.uint8)

new_size = (int(img.shape[1] * scale), int(img.shape[0] * scale))
img_rescaled = cv2.resize(img, new_size, interpolation=cv2.INTER_AREA)

H, W, _ = img_rescaled.shape
img_rescaled = img_rescaled[:H // 224 * 224, :W // 224 * 224]

with open(os.path.join(pre_dir, "data_descrip.txt"), 'a') as f:
    f.write(f"Rescale pixl-x: {img_rescaled.shape[0]}\n")
    f.write(f"Rescale pixl-y: {img_rescaled.shape[1]}\n")

Image.fromarray(img_rescaled).save(os.path.join(pre_dir, "re_image.tif"))

print("✓ Step 2 complete: Data loaded, coordinates rescaled, and image processed.")


✓ Step 2 complete: Data loaded, coordinates rescaled, and image processed.


In [5]:
def get_adata():
    folder_path = os.path.join(pre_dir, "binned_outputs/square_008um")
    adata = b2c.read_visium(
        folder_path,
        spaceranger_image_path=os.path.join(folder_path, "spatial")
    )
    adata = adata[(adata.obs >= 0).all(axis=1)]
    adata.var_names_make_unique()
    raw_pix_size = adata.uns['spatial']['Visium_HD_Mouse_Brain']['scalefactors']['microns_per_pixel']
    scale_pix_size = 0.5
    scale = raw_pix_size / scale_pix_size
    adata.obs[['pxl_row_in_fullres', 'pxl_col_in_fullres']] *= scale
    adata.obs = adata.obs.round().astype(int)
    return adata

img = np.array(Image.open(os.path.join(pre_dir, "re_image.tif")))
H, W, _ = img.shape

genes_count = 3000
genes_3D = np.zeros((H // 16, W // 16, genes_count), dtype=np.float32)

adata = get_adata()
num_spots = adata.shape[0]
num_genes = adata.shape[1]

sc.pp.highly_variable_genes(adata, flavor="seurat_v3", n_top_genes=genes_count)
adata = adata[:, adata.var['highly_variable']]

pd.Series(adata.var_names).to_csv(
    os.path.join(pre_dir, "gene_names.txt"),
    index=False, header=False
)

X = np.array(adata.X.todense()).astype(np.float32)
for k in range(num_spots):
    i = adata.obs['pxl_row_in_fullres'].iloc[k] // 16
    j = adata.obs['pxl_col_in_fullres'].iloc[k] // 16
    i = min(max(i, 0), genes_3D.shape[0] - 1)
    j = min(max(j, 0), genes_3D.shape[1] - 1)
    genes_3D[i, j, :] = X[k, :]

with open(os.path.join(pre_dir, "data_descrip.txt"), 'a') as f:
    f.write(f"Original spot-number: {num_spots}\n")
    f.write(f"Original gene-number: {num_genes}\n")
    f.write(f"Prepro spot-number: {adata.shape[0]}\n")
    f.write(f"Prepro gene-number: {adata.shape[1]}\n")

with open(os.path.join(pre_dir, "genes_3D.pkl"), 'wb') as f:
    pickle.dump(genes_3D, f, protocol=pickle.HIGHEST_PROTOCOL)

gc.collect()
print("✓ Step 3 complete: 3D gene matrix created and saved.")


✓ Step 3 complete: 3D gene matrix created and saved.


In [6]:
with open(os.path.join(pre_dir, "genes_3D.pkl"), 'rb') as f:
    gene = pickle.load(f)
    matrix = gene

mask = np.any(matrix, axis=2).astype(np.uint8) * 255

plt.imsave(os.path.join(pre_dir, "mask.png"), mask, cmap='gray')

print(f"✓ Step 4 complete: Mask saved. Shape: {mask.shape}")


✓ Step 4 complete: Mask saved. Shape: (812, 644)


In [7]:
diameter = 55
distance = 100
pixel_size = 0.5

he = cv2.imread(os.path.join(pre_dir, "re_image.tif"))

pseudo_coords = []
for i in range(0, int(he.shape[1] * pixel_size - diameter / 2), distance):
    offset = 0 if (i // 100) % 2 == 0 else 50
    for j in range(0, int(he.shape[0] * pixel_size - diameter / 2 - (127.5 - offset)), distance):
        x = i + diameter / 2
        y = j + diameter / 2 + offset
        pseudo_coords.append([x, y])

pseudo_coords = np.array(pseudo_coords).astype(np.float32)
pseudo_coords = pseudo_coords // pixel_size
df = pd.DataFrame(pseudo_coords, columns=['x', 'y'])

mask = load_image(os.path.join(pre_dir, "mask.png")) > 0
mask = mask[:, :, 0]

rows_to_delete = []
H, W = mask.shape
for i in range(H):
    for j in range(W):
        if not mask[i, j]:
            matched = df[(df['x'] // 16 - 3 == j) & (df['y'] // 16 - 3 == i)]
            if not matched.empty:
                rows_to_delete.append(matched.index[0])

df = df.drop(rows_to_delete)

with open(os.path.join(pre_dir, "data_descrip.txt"), 'a') as f:
    f.write(f"Pseudo_locs-x: {df.shape[0]}\n")
    f.write(f"Pseudo_locs-y: {df.shape[1]}\n")

df.to_csv(os.path.join(pre_dir, "pseudo_locs.csv"), index=False, float_format='%.2f')

print(f"✓ Step 5 complete: Pseudo-spots generated and filtered. Total kept: {df.shape[0]}")


Image loaded from /home/lixiaoyu/VISD/data/Pre-data/mask.png
✓ Step 5 complete: Pseudo-spots generated and filtered. Total kept: 2509


In [8]:
with open(os.path.join(pre_dir, "genes_3D.pkl"), 'rb') as f:
    img = pickle.load(f)

radius = 55 / 16
mask = get_disk_mask(radius)
locs = get_locs(pre_dir + "/", np.array([img.shape[0], img.shape[1]]))
x = get_patches_flat(img, locs, mask)
cnts = np.sum(x, axis=1)

with open(os.path.join(pre_dir, "gene_names.txt"), 'r') as file:
    gene_names = [line.strip() for line in file]

cnts_df = pd.DataFrame(data=cnts, columns=gene_names)

with open(os.path.join(pre_dir, "data_descrip.txt"), 'a') as f:
    f.write(f"Pseudo_locs-x: {cnts_df.shape[0]}\n")
    f.write(f"Pseudo_locs-y: {cnts_df.shape[1]}\n")

cnts_df.to_csv(os.path.join(pre_dir, "pseudo_st.csv"), index=False)

print(f"✓ Step 6 complete: Pseudo-spot expression matrix saved. Shape: {cnts_df.shape}")


Dataframe loaded from /home/lixiaoyu/VISD/data/Pre-data/pseudo_locs.csv
Image loaded from /home/lixiaoyu/VISD/data/Pre-data/re_image.tif
✓ Step 6 complete: Pseudo-spot expression matrix saved. Shape: (2162, 3000)
