In [24]:
import os, sys
import scipy.io
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc

In [25]:
# original data with all the information: spatial, classification, and marker genes
# data is from https://datadryad.org/stash/dataset/doi:10.5061/dryad.8t8s248
path = os.path.join('data', 'GSE113576_spatial_classification_marker.csv')
sp = pd.read_csv(path, index_col=0)
sp

Unnamed: 0_level_0,Animal_ID,Animal_sex,Behavior,Bregma,Centroid_X,Centroid_Y,Cell_class,Neuron_cluster_ID,Ace2,Adora2a,...,Penk,Scg2,Sln,Sst,Tac1,Tac2,Th,Trh,Ucn3,Vgf
Cell_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6749ccb4-2ed1-4029-968f-820a287f43c8,1,Female,Naive,0.26,-3211.562145,2608.541476,Astrocyte,,0.0,1.638275,...,0.133016,0.000000,0.865263,0.002977,0.054826,0.008934,0.000000,0.000000,0.000000,0.000000
6cac74bd-4ea7-4701-8701-42563cc65eb8,1,Female,Naive,0.26,-3207.923151,2621.795437,Inhibitory,I-5,0.0,0.000000,...,0.000000,0.000000,0.277939,0.868702,0.580957,0.010079,0.000000,0.000000,0.000000,0.000000
9f29bd57-16a5-4b26-b9f5-37598809da9e,1,Female,Naive,0.26,-3209.578004,2633.153494,Inhibitory,I-6,0.0,0.000000,...,0.213939,0.000000,0.377907,0.049332,0.084898,0.008951,0.000000,0.000000,0.000000,0.000000
d7eb4e0b-276e-47e3-a55c-0b033180a2fe,1,Female,Naive,0.26,-3203.853515,2756.045983,Inhibitory,I-5,0.0,0.000000,...,0.050882,0.089038,0.000000,0.000000,0.001530,0.031364,0.000000,0.000000,0.000000,0.001138
54434f3a-eba9-4aec-af35-c9d317ffa1d5,1,Female,Naive,0.26,-3202.682705,2608.803635,Inhibitory,I-9,0.0,0.000000,...,1.250661,0.159618,0.211159,0.000000,0.087730,0.000000,0.000000,0.000000,0.000000,0.029419
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eaaf93ba-75b1-40cd-af08-a4d803511354,30,Male,Mating,0.11,2732.438894,-2322.578167,Ambiguous,,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.040998,0.000000,0.000000,0.024618,0.000000
bb9caf59-d960-452e-a9fd-9238ce5e44ca,30,Male,Mating,0.11,2732.665807,-2120.226450,Ambiguous,,0.0,0.000000,...,0.000000,0.852081,0.000000,0.695548,0.000000,0.024392,0.475944,0.000000,0.000000,0.000000
2f45d61d-3a80-470b-8cb8-251a2c8d2e59,30,Male,Mating,0.11,2826.247461,-2308.947366,Ambiguous,,0.0,0.000000,...,0.058011,0.000000,0.000000,0.000000,0.000000,0.061898,0.000000,0.000000,0.000000,0.000000
180ae0ff-9817-48b9-8d34-b374bda6e316,30,Male,Mating,0.11,2900.963778,-2174.600160,Inhibitory,I-2,0.0,0.000000,...,0.000000,0.326172,0.000000,0.000000,0.000000,0.063518,0.000000,0.000000,0.000000,0.000000


In [26]:
sp.Cell_class.unique()

array(['Astrocyte', 'Inhibitory', 'OD Mature 2', 'Endothelial 1',
       'Ambiguous', 'Pericytes', 'Endothelial 2', 'OD Mature 1',
       'OD Immature 1', 'Excitatory', 'Microglia', 'Endothelial 3',
       'OD Mature 4', 'OD Immature 2', 'OD Mature 3', 'Ependymal'],
      dtype=object)

In [10]:
# single-cell data
# data is from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE113576

path = os.path.join('data', 'GSE113576_barcodes.tsv.gz')
sc_barcodes = pd.read_csv(path, 
                          compression='gzip', 
                          header=None, 
                          names=['barcodes'],
                          index_col=0)
sc_barcodes

AAACCTGAGATGTGGC-1
AAACCTGCACACAGAG-1
AAACCTGCACTACAGT-1
AAACCTGTCAGGATCT-1
AAACCTGTCGCACTCT-1
...
TTTGGTTGTTATCACG-6
TTTGGTTGTTATTCTC-6
TTTGTCAGTTCCGTCT-6
TTTGTCATCGTGGGAA-6
TTTGTCATCTTTACAC-6


In [11]:
path = os.path.join('data', 'GSE113576_genes.tsv.gz')
sc_genes = pd.read_csv(path, 
                      compression='gzip', 
                      header=None, 
                      names=['genes'],
                      index_col=0)
sc_genes

ENSMUSG00000051951\tXkr4
ENSMUSG00000089699\tGm1992
ENSMUSG00000102343\tGm37381
ENSMUSG00000025900\tRp1
ENSMUSG00000109048\tRp1
...
ENSMUSG00000079808\tAC168977.1
ENSMUSG00000095041\tPISD
ENSMUSG00000063897\tDHRSX
ENSMUSG00000096730\tVmn2r122
ENSMUSG00000095742\tCAAA01147332.1


In [17]:
path = os.path.join('data', 'GSE113576_matrix.mtx.gz')
sc_mtx = scipy.io.mmread(path)
sc_mtx = sc_mtx.todense()
sc_mtx

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [18]:
sc_mtx.shape

(27998, 31299)

In [19]:
ad_sc = sc.AnnData(sc_mtx.transpose(), 
                   obs=sc_barcodes, 
                   var=sc_genes)
ad_sc

AnnData object with n_obs × n_vars = 31299 × 27998

In [20]:
ad_sc.obs

AAACCTGAGATGTGGC-1
AAACCTGCACACAGAG-1
AAACCTGCACTACAGT-1
AAACCTGTCAGGATCT-1
AAACCTGTCGCACTCT-1
...
TTTGGTTGTTATCACG-6
TTTGGTTGTTATTCTC-6
TTTGTCAGTTCCGTCT-6
TTTGTCATCGTGGGAA-6
TTTGTCATCTTTACAC-6


In [21]:
ad_sc.var

ENSMUSG00000051951\tXkr4
ENSMUSG00000089699\tGm1992
ENSMUSG00000102343\tGm37381
ENSMUSG00000025900\tRp1
ENSMUSG00000109048\tRp1
...
ENSMUSG00000079808\tAC168977.1
ENSMUSG00000095041\tPISD
ENSMUSG00000063897\tDHRSX
ENSMUSG00000096730\tVmn2r122
ENSMUSG00000095742\tCAAA01147332.1


In [22]:
ad_sc.X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [27]:
# ad_sc.write_h5ad('data/GSE113576_sc.h5ad')