## RNA Velocity using scVelo
Marissa Esteban

Data: CITEseq YS006_UW

I did pre processing in Suerat and saved object as .h5Seurat to perform RNAvelo in Python wtih scVelo

In [64]:
# Setup
import scanpy as sc
import scvelo as scvelo
import anndata
import loompy
import mygene

In [103]:
# loading in Seurat object and loading in data

adata_import = sc.read_h5ad("/Users/marissaestaban/Documents/CITEseq_data/ys006_named.h5ad")
ldata_import = sc.read_loom('/Users/marissaestaban/SRSP Laboratory Dropbox/SRSP Lab/Resources/Town Square - Data Share/Sequencing Repository/Data/CITEseq Data/YS006_Splicing-Alignment/possorted_genome_bam_751VB.loom')

ldata = ldata_import
adata = adata_import

# match CELL names
ldata.obs.index = [x.split(':')[1] for x in ldata.obs.index]    # possorted_genome_bam_751VB:AAAGTCCGTAAGGTCGx
ldata.obs.index = [x[:-1] for x in ldata.obs.index] 
adata.obs.index = [x.split('-')[0] for x in adata.obs.index]    # TTTGGAGTCGGTTGTA-1_1

# subset loom to only cells in adata
ldata = ldata[adata.obs.index, :]

In [104]:
print("=== Seurat-derived AnnData (adata) ===")
print("Cells (obs):", adata.n_obs)
print("Genes (var):", adata.n_vars)
print("\nFirst 10 cell names:", adata.obs.index[:10].tolist())
print("\nFirst 10 gene names:", adata.var.index[:10].tolist())

print("\n\n=== Loom RNA velocity AnnData (ldata) ===")
print("Cells (obs):", ldata.n_obs)
print("Genes (var):", ldata.n_vars)
print("\nFirst 10 cell names:", ldata.obs.index[:10].tolist())
print("\nFirst 10 gene names:", ldata.var.index[:10].tolist())

print("\n\n=== Layer shapes in ldata ===")
for layer in ["spliced", "unspliced", "ambiguous"]:
    if layer in ldata.layers:
        print(f"{layer}: {ldata.layers[layer].shape}")


=== Seurat-derived AnnData (adata) ===
Cells (obs): 4207
Genes (var): 19581

First 10 cell names: ['AAACCCAAGGGATGTC', 'AAACCCAAGGTACCTT', 'AAACGAAAGGAATTAC', 'AAACGAAAGTGCACAG', 'AAACGAAGTTGCCATA', 'AAACGAATCCAAGGGA', 'AAACGCTAGGGCATGT', 'AAACGCTAGTACTGTC', 'AAACGCTAGTCCTACA', 'AAACGCTCATGAATCC']

First 10 gene names: ['Xkr4', 'Gm1992', 'Gm19938', 'Rp1', 'Sox17', 'Gm37587', 'Mrpl15', 'Lypla1', 'Tcea1', 'Rgs20']


=== Loom RNA velocity AnnData (ldata) ===
Cells (obs): 4207
Genes (var): 33696

First 10 cell names: ['AAACCCAAGGGATGTC', 'AAACCCAAGGTACCTT', 'AAACGAAAGGAATTAC', 'AAACGAAAGTGCACAG', 'AAACGAAGTTGCCATA', 'AAACGAATCCAAGGGA', 'AAACGCTAGGGCATGT', 'AAACGCTAGTACTGTC', 'AAACGCTAGTCCTACA', 'AAACGCTCATGAATCC']

First 10 gene names: ['ENSMUSG00000079800', 'ENSMUSG00000095092', 'ENSMUSG00000079794', 'ENSMUSG00000079192', 'ENSMUSG00000094799', 'ENSMUSG00000095250', 'ENSMUSG00000095787', 'ENSMUSG00000095672', 'ENSMUSG00000094514', 'ENSMUSG00000096100']


=== Layer shapes in ldata ===
splic

In [None]:
# converting the loom's ENSEMBL ID's to match 

mg = mygene.MyGeneInfo()
gene_index = ldata.var.index.to_series()

# only grab the genes that have ENSEMBL IDs
is_ensembl = gene_index.str.startswith("ENSMUSG")

print("Total genes:", gene_index.shape[0])
print("ENSEMBL-like genes:", is_ensembl.sum())
print("Already-symbol-like genes:", (~is_ensembl).sum(), '\n')

# only query the ENSMUSG IDs
ens_ids = gene_index[is_ensembl].unique().tolist()

results = mg.querymany(
    ens_ids,
    scopes="ensemblgene",   # or "ensembl.gene" depending on mygene version, but this usually works
    fields="symbol",
    species="mouse",
    as_dataframe=False
)

# Build mapping dict ENSMUSG -> symbol
ens_to_symbol = {r["query"]: r.get("symbol", None) for r in results}

# QC on mapping
total_ens = len(ens_ids)
mapped_ens = sum(v is not None for v in ens_to_symbol.values())
print("Total ENSMUSG IDs:", total_ens)
print("Mapped ENSMUSG → symbol:", mapped_ens)
print("Unmapped ENSMUSG:", total_ens - mapped_ens)

# 4) Create a 'symbol' column:
# default: keep existing name
ldata.var["symbol"] = gene_index.copy()

# replace ENSMUSG entries with their symbol
for g in gene_index[is_ensembl]:
    sym = ens_to_symbol.get(g, None)
    if sym is not None:
        ldata.var.at[g, "symbol"] = sym


Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed


Total genes: 33696
ENSEMBL-like genes: 248
Already-symbol-like genes: 33448 



13 input query terms found no hit:	['ENSMUSG00000095742', 'ENSMUSG00000095728', 'ENSMUSG00000095076', 'ENSMUSG00000121317', 'ENSMUSG000


Total ENSMUSG IDs: 248
Mapped ENSMUSG → symbol: 203
Unmapped ENSMUSG: 45


In [106]:
common_genes = adata.var.index.intersection(ldata.var.index)
print("Overlapping genes:", len(common_genes))

Overlapping genes: 19572


In [None]:
# Use gene symbols as var index
ldata.var.index = ldata.var["symbol"].astype(str)
ldata.var.index

Index(['ENSMUSG00000079800', 'ENSMUSG00000095092', 'ENSMUSG00000079794',
       'ENSMUSG00000079192', 'LOC105242404', 'LOC118568473',
       'ENSMUSG00000095787', 'ENSMUSG00000095672', 'LOC102636558',
       'ENSMUSG00000096100',
       ...
       'Gm28406', 'Gm29436', 'Gm28407', 'Gm29393', 'Gm21294', 'Gm28672',
       'Gm28670', 'Gm29504', 'Gm20837', 'Gm47283'],
      dtype='object', name='symbol', length=33696)

In [101]:
# putting new names back into loom obj

# Use gene symbols as var index
ldata.var.index = ldata.var["symbol"].astype(str)

# Make them unique
ldata.var.index = (
    ldata.var.index.to_series()
    + "_" + ldata.var.index.to_series().groupby(ldata.var.index).cumcount().astype(str)
)

print("Duplicated symbols after fixing:", ldata.var.index.duplicated().sum())

# Intersect with Seurat genes
common_genes = adata.var.index.intersection(ldata.var.index)
print("Overlapping genes:", len(common_genes))

adata = adata[:, common_genes]
ldata = ldata[:, common_genes]

adata.layers["spliced"]   = ldata.layers["spliced"].toarray()
adata.layers["unspliced"] = ldata.layers["unspliced"].toarray()
adata.layers["ambiguous"] = ldata.layers["ambiguous"].toarray()

Duplicated symbols after fixing: 0
Overlapping genes: 0


In [100]:
ldata.var.index

Index([], dtype='object')