In [3]:
import scanpy as sc
import squidpy as sq
import numpy as np
import pandas as pd
from anndata import AnnData
import pathlib
import matplotlib.pyplot as plt
import matplotlib as mpl
import skimage
import seaborn as sns
import tangram as tg

sc.logging.print_header()
print(f"squidpy=={sq.__version__}")

%load_ext autoreload
%autoreload 2
%matplotlib inline

scanpy==1.9.8 anndata==0.9.2 umap==0.5.7 numpy==1.22.4 scipy==1.10.1 pandas==2.0.3 scikit-learn==1.3.2 statsmodels==0.14.1 igraph==0.11.8 pynndescent==0.5.13
squidpy==1.2.3


In [4]:
# Load the datasets.
# Spatial data: keep only the spots whose `cluster` label is Cortex_1 .. Cortex_4.
adata_st = sq.datasets.visium_fluo_adata_crop()
adata_st = adata_st[
    adata_st.obs["cluster"].isin([f"Cortex_{layer}" for layer in range(1, 5)])
].copy()

# Image that accompanies the spatial crop.
img = sq.datasets.visium_fluo_image_crop()

# Single-cell reference dataset.
adata_sc = sq.datasets.sc_mouse_cortex()

In [5]:
# Restrict both datasets to the genes they have in common.
#
# The intersection is sorted on purpose: iterating a set of strings yields an
# order that can change between interpreter runs (string hash randomization),
# which would make the column order of the filtered matrices, and of every
# checkpoint derived from them, differ from one kernel session to the next.
overlapping_genes = sorted(set(adata_st.var_names) & set(adata_sc.var_names))
print(f"Number of overlapping genes: {len(overlapping_genes)}")

# Both subsets use the same gene list, so their columns are aligned.
adata_st_filtered = adata_st[:, overlapping_genes].copy()
adata_sc_filtered = adata_sc[:, overlapping_genes].copy()

Number of overlapping genes: 15102


In [6]:
# Normalize each dataset in place: scale every observation to a total of 1e4,
# then apply log1p. The spatial data is processed first, then the single-cell
# data.
for _adata in (adata_st_filtered, adata_sc_filtered):
    sc.pp.normalize_total(_adata, target_sum=1e4)
    sc.pp.log1p(_adata)

In [8]:
# SAVING CHECKPOINT
# Persist the gene-filtered, normalized AnnData objects to disk.

checkpoint_dir = pathlib.Path("tangram_from_scratch")
# Create the target directory up front so the writes do not depend on it
# already existing (e.g. on a fresh checkout).
checkpoint_dir.mkdir(parents=True, exist_ok=True)

adata_st_filtered.write(checkpoint_dir / "adata_st_filtered.h5ad")
adata_sc_filtered.write(checkpoint_dir / "adata_sc_filtered.h5ad")

In [10]:
import scipy.sparse
from scipy import sparse

In [11]:
# Extract the gene expression matrices (feature matrices, NOT embeddings)
# from the AnnData objects as dense arrays.
#
# For each dataset:
#   * if `.X` is stored as a scipy sparse matrix, densify it with `.toarray()`
#     so that later steps can work with a plain array;
#   * otherwise `.X` is used as-is.
#
# Either way the result does not depend on how the AnnData object happened to
# store its matrix.

if scipy.sparse.issparse(adata_st_filtered.X):
    st_features = adata_st_filtered.X.toarray()
else:
    st_features = adata_st_filtered.X

if scipy.sparse.issparse(adata_sc_filtered.X):
    sc_features = adata_sc_filtered.X.toarray()
else:
    sc_features = adata_sc_filtered.X

In [12]:
# SAVING CHECKPOINT
# Write both dense feature matrices to .npy files in the current working
# directory (spatial first, then single-cell).

for _filename, _matrix in (
    ("st_features.npy", st_features),
    ("sc_features.npy", sc_features),
):
    np.save(_filename, _matrix)

In [1]:
# RELOAD CHECKPOINT

# This cell runs on a fresh kernel, so it imports everything it needs itself.
import pathlib

import numpy as np

centroids_path = pathlib.Path("tangram_from_scratch/mouse_centroids_minibatch.csv")

# Fail early with an actionable message: the path is relative, so a missing
# file usually means either the CSV has not been generated yet or the notebook
# was started from a different working directory.
if not centroids_path.is_file():
    raise FileNotFoundError(
        f"{centroids_path} not found (resolved to {centroids_path.resolve()}). "
        "Generate the centroids CSV first, or start the notebook from the "
        "directory that contains 'tangram_from_scratch/'."
    )

# The CSV is parsed as comma-separated numeric values.
centroids = np.loadtxt(centroids_path, delimiter=",")
print("Loaded centroids as NumPy array:")
print(centroids)

FileNotFoundError: mouse_centroids_minibatch.csv not found.