In [2]:
import scanpy as sc
import pandas as pd
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

In [3]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.settings.set_figure_params(dpi=80, frameon=False, figsize=(20, 10), facecolor='white')

-----
anndata     0.11.3
scanpy      1.10.4
-----
PIL                         11.1.0
anyio                       NA
appnope                     0.1.2
asttokens                   NA
attr                        24.3.0
attrs                       24.3.0
babel                       2.16.0
backports                   NA
brotli                      1.0.9
certifi                     2025.01.31
charset_normalizer          3.3.2
comm                        0.2.1
cycler                      0.12.1
cython_runtime              NA
dateutil                    2.9.0.post0
debugpy                     1.8.11
decorator                   5.1.1
defusedxml                  0.7.1
exceptiongroup              1.2.0
executing                   0.8.3
fastjsonschema              NA
h5py                        3.12.1
idna                        3.7
ipykernel                   6.29.5
jaraco                      NA
jedi                        0.19.2
jinja2                      3.1.5
joblib                      1.4.

  mod_version = _find_version(mod.__version__)


Data in Peng et al. (2019) can be downloaded from https://singlecell.broadinstitute.org/single_cell/study/SCP212/molecular-specification-of-retinal-cell-types-underlying-central-and-peripheral-vision-in-primates.

Data in Shekhar et al. (2016) can be downloaded from https://singlecell.broadinstitute.org/single_cell/study/SCP3/retinal-bipolar-neuron-drop-seq.

In [4]:
adata_ref = sc.read_h5ad("MacaqueFovea_BC_velo_ann_v1.h5ad")
adata_ref

AnnData object with n_obs × n_vars = 19990 × 37781
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'barcode', 'annotated', 'cell_class', 'RNA_snn_res.0.5', 'seurat_clusters'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable'

In [5]:
adata_ref.var.index

Index(['PGBD2', 'ENSMFAG00000064508', 'ENSMFAG00000053700',
       'ENSMFAG00000047552', 'OR9G1', 'ENSMFAG00000050442', 'OR2G6',
       'ENSMFAG00000031836', 'OR2T10', 'Y-RNA',
       ...
       'VBP1', 'ENSMFAG00000058523', 'ENSMFAG00000064770',
       'ENSMFAG00000052149', 'ENSMFAG00000048608', 'SPRY3', 'VAMP7', 'IL9R',
       'Metazoa-SRP.3247', 'ENSMFAG00000064014'],
      dtype='object', length=37781)

In [8]:
adata = sc.read_h5ad("MouseBC_int_ann_v3.h5ad")
adata

AnnData object with n_obs × n_vars = 5555 × 31053
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'orig.file', 'animal', 'RNA_snn_res.0.5', 'seurat_clusters', 'dendro_order', 'integrated_snn_res.0.8', 'integrated_snn_res.0.5', 'barcode', 'annotated'
    var: '_index', 'features'

In [9]:
adata.var.index = adata.var['_index']
adata.var.index

Index(['XKR4', 'GM37381', 'RP1', 'SOX17', 'GM37323', 'MRPL15', 'RGS20',
       'NPBWR1', '4732440D04RIK', 'GM26901',
       ...
       'GM28406', 'GM29436', 'GM28407', 'GM29393', 'GM21294', 'GM28672',
       'GM28670', 'GM29504', 'GM20837', 'GM47283'],
      dtype='object', name='_index', length=31053)

In [10]:
var_names = adata_ref.var.index.intersection(adata.var.index)
adata_ref = adata_ref[:, var_names]
adata = adata[:, var_names]

In [11]:
adata_ref

View of AnnData object with n_obs × n_vars = 19990 × 13040
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'barcode', 'annotated', 'cell_class', 'RNA_snn_res.0.5', 'seurat_clusters'
    var: 'vst.mean', 'vst.variance', 'vst.variance.expected', 'vst.variance.standardized', 'vst.variable'

In [12]:
adata

View of AnnData object with n_obs × n_vars = 5555 × 13040
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'orig.file', 'animal', 'RNA_snn_res.0.5', 'seurat_clusters', 'dendro_order', 'integrated_snn_res.0.8', 'integrated_snn_res.0.5', 'barcode', 'annotated'
    var: '_index', 'features'

In [13]:
df_ref = pd.DataFrame.sparse.from_spmatrix(adata_ref.X)
df_ref.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13030,13031,13032,13033,13034,13035,13036,13037,13038,13039
0,0,0,0,0.0,0.0,0.0,1.700059,0,0,0,...,0.0,1.700059,0,0.0,0.0,0.0,0,0,0.0,0
1,0,0,0,0.0,1.429261,0.0,0.0,0,0,0,...,1.429261,0.0,0,1.429261,0.0,0.0,0,0,0.0,0
2,0,0,0,0.0,1.61781,1.61781,0.0,0,0,0,...,1.61781,0.0,0,0.0,0.0,1.61781,0,0,0.0,0
3,0,0,0,1.77109,0.0,0.0,0.0,0,0,0,...,0.0,0.0,0,0.0,0.0,0.0,0,0,0.965367,0
4,0,0,0,0.0,0.0,0.0,1.460467,0,0,0,...,0.0,0.0,0,0.0,1.460467,0.0,0,0,0.0,0


In [14]:
df_ref.columns = adata_ref.var.index
df_ref.index = adata_ref.obs.annotated
df_ref.head()

Unnamed: 0_level_0,TRIM58,NLRP3,SCCPDH,CNST,COX20,DESI2,SDCCAG8,EXO1,WDR64,KMO,...,ATP6AP1,GDI1,FAM50A,PLXNA3,IKBKG,BRCC3,VBP1,SPRY3,VAMP7,IL9R
annotated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
FMB,0,0,0,0.0,0.0,0.0,1.700059,0,0,0,...,0.0,1.700059,0,0.0,0.0,0.0,0,0,0.0,0
FMB,0,0,0,0.0,1.429261,0.0,0.0,0,0,0,...,1.429261,0.0,0,1.429261,0.0,0.0,0,0,0.0,0
DB3b,0,0,0,0.0,1.61781,1.61781,0.0,0,0,0,...,1.61781,0.0,0,0.0,0.0,1.61781,0,0,0.0,0
DB4,0,0,0,1.77109,0.0,0.0,0.0,0,0,0,...,0.0,0.0,0,0.0,0.0,0.0,0,0,0.965367,0
FMB,0,0,0,0.0,0.0,0.0,1.460467,0,0,0,...,0.0,0.0,0,0.0,1.460467,0.0,0,0,0.0,0


In [15]:
df_ref.to_pickle('mk_df_bc.pkl')

In [16]:
df = pd.DataFrame.sparse.from_spmatrix(adata.X)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13030,13031,13032,13033,13034,13035,13036,13037,13038,13039
0,0,0,0.0,0.0,0,0.0,0.0,0,0,0,...,1.181849,1.181849,1.181849,0,0,0.0,0.0,0,0,0
1,0,0,0.995651,0.995651,0,0.0,0.0,0,0,0,...,1.811473,0.0,0.0,0,0,0.0,0.995651,0,0,0
2,0,0,1.176548,1.176548,0,0.0,1.702257,0,0,0,...,1.176548,0.0,0.0,0,0,1.176548,1.176548,0,0,0
3,0,0,0.0,0.0,0,0.0,0.0,0,0,0,...,0.0,1.861259,0.0,0,0,1.861259,0.0,0,0,0
4,0,0,0.0,0.0,0,2.068235,0.0,0,0,0,...,0.0,0.0,0.0,0,0,2.068235,0.0,0,0,0


In [17]:
df.columns = adata.var.index
df.index = adata.obs.annotated
df.head()

_index,TRIM58,NLRP3,SCCPDH,CNST,COX20,DESI2,SDCCAG8,EXO1,WDR64,KMO,...,ATP6AP1,GDI1,FAM50A,PLXNA3,IKBKG,BRCC3,VBP1,SPRY3,VAMP7,IL9R
annotated,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0,0,0.0,0.0,0,0.0,0.0,0,0,0,...,1.181849,1.181849,1.181849,0,0,0.0,0.0,0,0,0
5,0,0,0.995651,0.995651,0,0.0,0.0,0,0,0,...,1.811473,0.0,0.0,0,0,0.0,0.995651,0,0,0
14,0,0,1.176548,1.176548,0,0.0,1.702257,0,0,0,...,1.176548,0.0,0.0,0,0,1.176548,1.176548,0,0,0
14,0,0,0.0,0.0,0,0.0,0.0,0,0,0,...,0.0,1.861259,0.0,0,0,1.861259,0.0,0,0,0
1,0,0,0.0,0.0,0,2.068235,0.0,0,0,0,...,0.0,0.0,0.0,0,0,2.068235,0.0,0,0,0


In [18]:
df.to_pickle('ms_df_bc.pkl')