In [1]:
from os import listdir
from os import makedirs
from os import path

import anndata
import pandas as pd
import scipy.sparse as sp

Download the following two files into the `data/macosko_2015` folder (the notebook reads them from `../../data/macosko_2015`):

Main page: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE63472

- https://ftp.ncbi.nlm.nih.gov/geo/series/GSE63nnn/GSE63472/suppl/GSE63472%5FP14Retina%5Fmerged%5Fdigital%5Fexpression%2Etxt%2Egz
- http://mccarrolllab.org/wp-content/uploads/2015/05/retina_clusteridentities.txt

In [2]:
# Folder holding the raw downloads, located two directories above the working directory.
data_root = path.join("..", "..", "data")
data_path = path.join(data_root, "macosko_2015")

In [3]:
# Show what is in the download folder so missing input files are spotted
# before the slow parsing step.
available_files = listdir(data_path)
available_files

['retina_clusteridentities.txt',
 'GSE63472_P14Retina_merged_digital_expression.txt.gz']

In [4]:
%time data = pd.read_table(path.join(data_path, "GSE63472_P14Retina_merged_digital_expression.txt.gz"), index_col=0)

CPU times: user 16min 34s, sys: 30.6 s, total: 17min 5s
Wall time: 17min 1s


In [5]:
# Preview the first five rows (genes); columns are the per-cell barcodes.
data.head(n=5)

Unnamed: 0_level_0,r1_GGCCGCAGTCCG,r1_CTTGTGCGGGAA,r1_GCGCAACTGCTC,r1_GATTGGGAGGCA,r1_CCTCCTAGTTGG,r1_AGTCAAGCCCTC,r1_GTGCCGCCTCTC,r1_CCTGTGACACAC,r1_AATCTCGTTAAT,r1_GATTTCCTCTGA,...,p1_GAGGGGCTCTAA,p1_AGCCAAGGCTCG,p1_TGAGTCGTCTTA,p1_AACGGTCGCTTT,p1_CGAATACGTGTC,p1_TCAAAAGCCGGG,p1_ATTAAGTTCCAA,p1_CTGTCTGAGACC,p1_TAACGCGCTCCT,p1_ATTCTTGTTCTT
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
KITL,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
TMTC3,3,0,0,0,2,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
CEP290,1,3,0,2,1,18,10,3,4,3,...,0,0,0,0,0,0,0,0,0,0
4930430F08RIK,2,1,2,0,1,1,0,1,1,1,...,0,0,0,0,0,0,1,0,0,0
1700017N19RIK,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Dimensions of the raw table: rows are genes, columns are cell barcodes.
n_genes, n_cells = data.shape
n_genes, n_cells

(24658, 49300)

In [7]:
# Header-less table mapping each cell barcode (first column, used as the index)
# to its integer cluster id.
#
# `read_table(..., squeeze=True)` was deprecated in pandas 1.4 and removed in
# pandas 2.0, so the single remaining column is squeezed into a Series
# explicitly instead; the resulting Series is the same.
cluster_ids = pd.read_table(
    path.join(data_path, "retina_clusteridentities.txt"),
    header=None,
    index_col=0,
).squeeze("columns")
cluster_ids.head()

0
r1_GGCCGCAGTCCG     2
r1_CTTGTGCGGGAA     2
r1_GCGCAACTGCTC     2
r1_GATTGGGAGGCA     2
r1_GTGCCGCCTCTC    25
Name: 1, dtype: int64

In [8]:
# Keep only the cells that have a cluster assignment, ordered exactly like
# `cluster_ids`, so expression columns and cluster labels line up one-to-one.
labelled_barcodes = cluster_ids.index
data = data.loc[:, labelled_barcodes]

In [9]:
# The AnnData construction further down pairs expression columns with cluster
# labels purely by position, so verify that the barcodes agree in both content
# and order instead of only eyeballing the shapes.
assert data.columns.equals(cluster_ids.index), (
    "expression columns and cluster_ids index are not aligned"
)
data.shape, cluster_ids.shape

((24658, 44808), (44808,))

In [10]:
# Lookup table from cluster id to cell type label. Two cell types span a
# contiguous run of clusters: 3-23 are amacrine cells, 26-33 are bipolar cells.
cluster_to_cell_type = {
    1: "Horizontal cells",
    2: "Retinal ganglion cells",
    24: "Rods",
    25: "Cones",
    34: "Muller glia",
    35: "Astrocytes",
    36: "Fibroblasts",
    37: "Vascular endothelium",
    38: "Pericytes",
    39: "Microglia",
}
cluster_to_cell_type.update(dict.fromkeys(range(3, 24), "Amacrine cells"))
cluster_to_cell_type.update(dict.fromkeys(range(26, 34), "Bipolar cells"))

# A single vectorised lookup replaces the chain of in-place assignments, which
# compared against a column that was gradually turning from ints into strings.
cell_types = cluster_ids.map(cluster_to_cell_type)

# Ids missing from the table map to NaN; fail loudly rather than letting an
# unlabelled cluster slip into the label column unnoticed.
unmapped_ids = cluster_ids[cell_types.isna()].unique()
assert len(unmapped_ids) == 0, "cluster ids without a cell type: {}".format(sorted(unmapped_ids))

cell_types.value_counts()

Rods                      29400
Bipolar cells              6285
Amacrine cells             4426
Cones                      1868
Muller glia                1624
Retinal ganglion cells      432
Vascular endothelium        252
Horizontal cells            252
Fibroblasts                  85
Microglia                    67
Pericytes                    63
Astrocytes                   54
Name: 1, dtype: int64

In [11]:
# Convert the dense genes x cells table into compressed sparse row format.
dense_counts = data.values
x = sp.csr_matrix(dense_counts)
x

<24658x44808 sparse matrix of type '<class 'numpy.longlong'>'
	with 32805477 stored elements in Compressed Sparse Row format>

In [12]:
# Unstructured, data-set level metadata.
dataset_info = {"name": "macosko_2015", "year": 2015, "organism": "mouse", "tissue": "retina"}

# Per-gene (variable) annotations: only the gene names.
gene_annotations = {"var_names": data.index.values}

# Per-cell (observation) annotations; the three arrays are matched by position.
cell_annotations = {
    "obs_names": data.columns.values,
    "cluster_ids": cluster_ids.values,
    "labels": cell_types.values,
}

# `x` is genes x cells, so it is transposed to put cells on the observation axis.
adata = anndata.AnnData(
    x.T,
    uns=dataset_info,
    var=gene_annotations,
    obs=cell_annotations,
)
adata

AnnData object with n_obs × n_vars = 44808 × 24658 
    obs: 'cluster_ids', 'labels'
    uns: 'name', 'year', 'organism', 'tissue'

In [13]:
# Persist the assembled AnnData object as an .h5ad file.
# NOTE(review): write_h5ad is not guaranteed to create missing parent
# directories, so make sure the output folder exists first instead of failing
# on the very last step of a long-running notebook.
output_dir = path.join("..", "..", "data", "h5ad")
makedirs(output_dir, exist_ok=True)
adata.write_h5ad(path.join(output_dir, "macosko_2015.h5ad"))

... storing 'labels' as categorical
