# VeloViz with Unwounded Ker IFE cells
Marissa Esteban
1/12/2026
 

In [1]:
# Setup
import scanpy as sc
import scvelo as scv
import anndata
import loompy
import mygene
import scipy
import seaborn as sns
import matplotlib.pyplot as plt
from VelocityObject import VelocityObject
import pandas as pd
import scipy.sparse as sp
import numpy as np
from scipy.io import mmwrite

In [11]:
import numpy as np
import scipy.sparse as sp
import scvelo as scv
import pandas as pd

def _flat_data(X):
    """Return 1D array of stored values (sparse: data only; dense: ravel)."""
    if sp.issparse(X):
        return X.data
    return np.asarray(X).ravel()

def layer_value_stats(adata, layer="velocity", eps=1e-12):
    """Entry-wise stats for a layer: finite/nan/inf/zero/neg counts & fractions."""
    X = adata.layers[layer]
    d = _flat_data(X)

    n = d.size
    finite = np.isfinite(d)
    nan = np.isnan(d)
    inf = np.isinf(d)

    # Note: for sparse matrices, implicit zeros are NOT in X.data
    # so "zero count" here is among stored values only.
    zeros = (d == 0) & finite
    neg = (d < 0) & finite
    pos = (d > 0) & finite

    out = {
        "layer": layer,
        "stored_n": int(n),
        "finite_n": int(finite.sum()),
        "nan_n": int(nan.sum()),
        "inf_n": int(inf.sum()),
        "zero_n(stored)": int(zeros.sum()),
        "neg_n(stored)": int(neg.sum()),
        "pos_n(stored)": int(pos.sum()),
        "finite_frac": float(finite.mean()) if n else np.nan,
        "nan_frac": float(nan.mean()) if n else np.nan,
        "inf_frac": float(inf.mean()) if n else np.nan,
        "zero_frac(stored, finite)": float(zeros.sum() / max(finite.sum(), 1)),
        "neg_frac(stored, finite)": float(neg.sum() / max(finite.sum(), 1)),
        "pos_frac(stored, finite)": float(pos.sum() / max(finite.sum(), 1)),
    }
    return out

def cell_velocity_norm_stats(adata, layer="velocity", eps=1e-12, assume_cells_by_genes=True):
    """
    Cell-wise velocity magnitude stats.
    If your velocity layer is cells x genes (typical for AnnData), assume_cells_by_genes=True.
    """
    V = adata.layers[layer]

    if sp.issparse(V):
        # compute L2 norm per row (cell) efficiently
        if assume_cells_by_genes:
            row_sq_sum = np.array(V.multiply(V).sum(axis=1)).ravel()
            norms = np.sqrt(row_sq_sum)
        else:
            col_sq_sum = np.array(V.multiply(V).sum(axis=0)).ravel()
            norms = np.sqrt(col_sq_sum)
    else:
        Vd = np.asarray(V)
        norms = np.linalg.norm(Vd, axis=1 if assume_cells_by_genes else 0)

    zero = norms == 0
    near_zero = norms < eps
    out = {
        "cells_n": int(norms.size),
        "zero_vel_cells_n": int(zero.sum()),
        "zero_vel_cells_frac": float(zero.mean()),
        "near_zero_vel_cells_n": int(near_zero.sum()),
        "near_zero_vel_cells_frac": float(near_zero.mean()),
        "vel_norm_median": float(np.median(norms)),
        "vel_norm_mean": float(np.mean(norms)),
        "vel_norm_p95": float(np.quantile(norms, 0.95)),
        "vel_norm_max": float(np.max(norms)),
    }
    return out

def report_velocity(adata, label, eps=1e-12):
    """Combine entry-wise + cell-wise stats into a compact report dict."""
    entry = layer_value_stats(adata, layer="velocity", eps=eps)
    cell  = cell_velocity_norm_stats(adata, layer="velocity", eps=eps, assume_cells_by_genes=True)
    return {"mode": label, **entry, **cell}


In [2]:
h5ad_path = "/Volumes/PortableSSD/SRSP/CITEseq_Keratinocytes/UW_IFE.h5ad"
loom_paths = ["/Volumes/PortableSSD/SRSP/CITEseq_looms/YS001/possorted_genome_bam_KLWV0.loom",
              "/Volumes/PortableSSD/SRSP/CITEseq_looms/YS002/possorted_genome_bam_PNEM4.loom",
              "/Volumes/PortableSSD/SRSP/CITEseq_looms/YS003/possorted_genome_bam_8Q2XV.loom",
              "/Volumes/PortableSSD/SRSP/CITEseq_looms/YS004/possorted_genome_bam_0XB2N.loom",
              "/Volumes/PortableSSD/SRSP/CITEseq_looms/YS005/possorted_genome_bam_HX08G.loom",
              "/Volumes/PortableSSD/SRSP/CITEseq_looms/YS006/possorted_genome_bam_751VB.loom"]

citeSEQ_Keratinocytes_IFE = VelocityObject(h5ad_path, loom_paths)

INFO:biothings.client:querying 1-248 ...



==== Converting ENSEMBL Genes ====
Total ldata genes: 33696
ENSEMBL-like genes: 248
Already-symbol-like genes: 33448 



INFO:biothings.client:Finished.
INFO:biothings.client:Pass "returnall=True" to return complete lists of duplicate or missing query terms.


Mapped ENSMUSG → symbol: 203
Unmapped ENSMUSG: 45
Total ENSMUSG IDs after mapping:  45

=== Removing duplicates ===
adata
Cells (original): 3854
Genes (original): 30481
Duplicate cell barcodes: 0
Duplicate genes: 0
ldata
Cells (original): 45487
Genes (original): 33696
Duplicate cell barcodes: 235
Duplicate genes: 27

 adata after dedup: (3854, 30481)
ldata after dedup: (45252, 33696)

=== Subsetting cells by GENES and BARCODES ===
Overlapping Cells: 3794
Overlapping genes: 29365

adata shape: (3794, 29365)
ldata shape: (3794, 29365)
Cells match: True
Genes match: True

Layers in adata_subset: ['spliced', 'unspliced', 'ambiguous']
Spliced shape: (3794, 29365)
Unspliced shape: (3794, 29365)

=== Seurat-derived AnnData (adata) ===
Cells (obs): 3794
Genes (var): 29365
First 10 cell names: ['AAACGAAAGATTCGCT', 'AAACGAAAGCTCACTA', 'AAACGAAGTGTATCCA', 'AAACGAAGTTGCTCGG', 'AAACGAATCTCAACCC', 'AAACGCTCAAGAGATT', 'AAACGCTGTAGGTCAG', 'AAACGCTTCGCACTCT', 'AAAGAACCAACCGGAA', 'AAAGAACCATACAGAA']
Fir

## USING DYNAMICAL MODEL

In [None]:
# pre process and compute velocity


citeSEQ_Keratinocytes_IFE.scVeloPreprocess(n_pcs=20)
citeSEQ_Keratinocytes_IFE.computeVelocity()

velocity_ker_ife = citeSEQ_Keratinocytes_IFE.getAdata()
adata_dyn = velocity_ker_ife.copy()

Filtered out 73 genes that are detected 20 counts (spliced).
Logarithmized X.
computing moments based on connectivities


  log1p(adata)


    finished (0:00:02) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
recovering dynamics (using 1/16 cores)
    finished (0:00:46) --> added 
    'fit_pars', fitted parameters for splicing dynamics (adata.var)
computing velocities
    finished (0:00:00) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocity graph (using 1/16 cores)
    finished (0:00:00) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)


In [None]:
dyn_report = report_velocity(adata_dyn, label="dynamical")
dyn_report

{'mode': 'dynamical',
 'layer': 'velocity',
 'stored_n': 45653202,
 'finite_n': 1897000,
 'nan_n': 43756202,
 'inf_n': 0,
 'zero_n(stored)': 54020,
 'neg_n(stored)': 1083866,
 'pos_n(stored)': 759114,
 'finite_frac': 0.041552397573339984,
 'nan_frac': 0.95844760242666,
 'inf_frac': 0.0,
 'zero_frac(stored, finite)': 0.028476541908276226,
 'neg_frac(stored, finite)': 0.5713579335793358,
 'pos_frac(stored, finite)': 0.40016552451238796,
 'cells_n': 3794,
 'zero_vel_cells_n': 0,
 'zero_vel_cells_frac': 0.0,
 'near_zero_vel_cells_n': 0,
 'near_zero_vel_cells_frac': 0.0,
 'vel_norm_median': nan,
 'vel_norm_mean': nan,
 'vel_norm_p95': nan,
 'vel_norm_max': nan}

## RUNNING STOCHASTIC

In [9]:
citeSEQ_Keratinocytes_IFE.scVeloPreprocess(n_pcs=20)
citeSEQ_Keratinocytes_IFE.computeVelocity(mode='s')

velocity_ker_ife = citeSEQ_Keratinocytes_IFE.getAdata()
adata_sto = velocity_ker_ife.copy()

sto_report = report_velocity(adata_sto, label="stochastic")
sto_report


Logarithmized X.
computing moments based on connectivities


  log1p(adata)


    finished (0:00:02) --> added 
    'Ms' and 'Mu', moments of un/spliced abundances (adata.layers)
recovering dynamics (using 1/16 cores)
    finished (0:00:46) --> added 
    'fit_pars', fitted parameters for splicing dynamics (adata.var)
computing velocities
    finished (0:00:01) --> added 
    'velocity', velocity vectors for each individual cell (adata.layers)
computing velocity graph (using 1/16 cores)
    finished (0:00:04) --> added 
    'velocity_graph', sparse matrix with cosine correlations (adata.uns)


{'mode': 'stochastic',
 'layer': 'velocity',
 'stored_n': 45653202,
 'finite_n': 45653202,
 'nan_n': 0,
 'inf_n': 0,
 'zero_n(stored)': 11631568,
 'neg_n(stored)': 15956798,
 'pos_n(stored)': 18064836,
 'finite_frac': 1.0,
 'nan_frac': 0.0,
 'inf_frac': 0.0,
 'zero_frac(stored, finite)': 0.25478098995115395,
 'neg_frac(stored, finite)': 0.3495219897171725,
 'pos_frac(stored, finite)': 0.39569702033167353,
 'cells_n': 3794,
 'zero_vel_cells_n': 0,
 'zero_vel_cells_frac': 0.0,
 'near_zero_vel_cells_n': 0,
 'near_zero_vel_cells_frac': 0.0,
 'vel_norm_median': 57.56725311279297,
 'vel_norm_mean': 58.76070785522461,
 'vel_norm_p95': 79.12604179382323,
 'vel_norm_max': 104.34687042236328}

In [None]:
df = pd.DataFrame([dyn_report, sto_report])

cols = [
    "mode",
    "stored_n", "finite_n", "nan_n", "inf_n",
    "zero_n(stored)", "neg_n(stored)", "pos_n(stored)",
    "finite_frac", "nan_frac", "inf_frac",
    "zero_frac(stored, finite)", "neg_frac(stored, finite)", "pos_frac(stored, finite)",
    "cells_n", "zero_vel_cells_n", "zero_vel_cells_frac",
    "near_zero_vel_cells_n", "near_zero_vel_cells_frac",
    "vel_norm_median", "vel_norm_mean", "vel_norm_p95", "vel_norm_max",
]
df[cols]


Unnamed: 0,mode,stored_n,finite_n,nan_n,inf_n,zero_n(stored),neg_n(stored),pos_n(stored),finite_frac,nan_frac,...,"pos_frac(stored, finite)",cells_n,zero_vel_cells_n,zero_vel_cells_frac,near_zero_vel_cells_n,near_zero_vel_cells_frac,vel_norm_median,vel_norm_mean,vel_norm_p95,vel_norm_max
0,dynamical,45653202,1897000,43756202,0,54020,1083866,759114,0.041552,0.958448,...,0.400166,3794,0,0.0,0,0.0,,,,
1,stochastic,45653202,45653202,0,0,11631568,15956798,18064836,1.0,0.0,...,0.395697,3794,0,0.0,0,0.0,57.567253,58.760708,79.126042,104.34687


In [None]:
import scipy.sparse as sp

# choose current
curr = adata.layers["Ms"] if "Ms" in adata.layers else (
       adata.layers["spliced"] if "spliced" in adata.layers else adata.X)

V = adata.layers["velocity"]
proj = curr + V


In [None]:
import numpy as np
import pandas as pd

# if you have HVGs stored
if "highly_variable" in adata.var.columns:
    gene_mask = adata.var["highly_variable"].to_numpy()
else:
    # fallback: take top 2000 by variance of curr
    X = curr.A if sp.issparse(curr) else np.asarray(curr)
    vars_ = X.var(axis=0)
    gene_mask = vars_.argsort()[-2000:]

# apply subset
adata_sub = adata[:, gene_mask].copy()

curr = adata_sub.layers["Ms"] if "Ms" in adata_sub.layers else adata_sub.X
V    = adata_sub.layers["velocity"]
proj = curr + V


In [None]:
import numpy as np
from scipy.io import mmwrite
import scipy.sparse as sp

if not sp.issparse(curr): curr = sp.csr_matrix(curr)
if not sp.issparse(proj): proj = sp.csr_matrix(proj)

mmwrite("curr.mtx", curr)
mmwrite("proj.mtx", proj)

np.savetxt("cells.txt", adata_sub.obs_names.to_numpy(), fmt="%s")
np.savetxt("genes.txt", adata_sub.var_names.to_numpy(), fmt="%s")

# optional: cluster labels / metadata for coloring later
adata_sub.obs.to_csv("obs.csv")
