In [1]:
from os import path
from os import listdir

import pandas as pd
import scipy.sparse as sp
import anndata

Download the following two files into the `data/hochgerner_2018` folder:

Main page:
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE104323

- ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE104nnn/GSE104323/suppl/GSE104323_10X_expression_data_V2.tab.gz
- ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE104nnn/GSE104323/suppl/GSE104323_metadata_barcodes_24185cells.txt.gz

In [2]:
# Folder holding the downloaded GEO archives, relative to the working directory.
data_path = path.join(
    "..",
    "..",
    "data",
    "hochgerner_2018",
)

In [3]:
# Show the contents of the data folder to confirm the downloaded archives are present.
listdir(data_path)

['GSE104323_metadata_barcodes_24185cells.txt.gz',
 'GSE104323_10X_expression_data_V2.tab.gz']

In [4]:
%time data = pd.read_table(path.join(data_path, "GSE104323_10X_expression_data_V2.tab.gz"), index_col=0)

CPU times: user 3min 49s, sys: 13 s, total: 4min 2s
Wall time: 4min 2s


In [5]:
# Preview the first rows of the expression table (rows are indexed by gene
# name, columns are cell barcodes).
data.head()

Unnamed: 0_level_0,10X79_1_TCTACCATGCCTAA-,10X79_2_GTACTAGTGAACAT-,10X79_2_AATCAGTACCTACA-,10X79_1_CGGGTTCTTGAGGT-,10X79_1_GTGGAAGGCGTACA-,10X79_1_GTCCGCAAGCCATT-,10X79_2_TAAAGCAATACGCT-,10X79_1_AGTGATCAGCAACT-,10X79_1_CTCAATCCCAAGAT-,10X79_1_CCTTGTCGGATGTT-,...,10X79_2_CCAAATCCTCCTAG-,10X79_2_ATGTAGTTATCGGT-,10X80_2_GCTAATCTTATCTG-,10X80_1_CATTTAGTACGCGA-,10X80_2_GAATCTCAGCGACC-,10X80_1_CACGGTCTACGAGT-,10X80_2_ATTGCAGCCACGTC-,10X80_1_GCGGTTCGGCATCG-,10X83_4_TTCAGCATACTCTT-,10X80_1_TTACCAGGAGTAGA-
cellid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0610007P14Rik,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009B22Rik,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009L18Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009O20Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
0610010F05Rik,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# (number of rows, number of columns) of the expression table.
data.shape

(27933, 24185)

In [7]:
# Load the per-cell annotation table; the first column (the sample name)
# is used as the row index.
metadata = pd.read_table(
    path.join(data_path, "GSE104323_metadata_barcodes_24185cells.txt.gz"),
    index_col=0,
)
metadata.head()

Unnamed: 0_level_0,source name,organism,characteristics: strain,characteristics: age,characteristics: sex of pooled animals,characteristics: cell cluster,molecule,SRR run accession,raw file (original file name),UMI_CellularBarcode
Sample name (24185 single cells),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
10X79_1_AAACTAGCTAGCCC-,dentate gyrus,Mus musculus,hGFAP-GFP,P120,2males+1female,Neuroblast,total RNA,SRR6089817,10X79_1_AAACTAGCTAGCCC.fq.gz,CGGCGATCCC_AAACTAGCTAGCCC
10X79_1_AAACTAGGATGTAT-,dentate gyrus,Mus musculus,hGFAP-GFP,P120,2males+1female,OPC,total RNA,SRR6089947,10X79_1_AAACTAGGATGTAT.fq.gz,AGTGGTAATG_AAACTAGGATGTAT
10X79_1_AAACTCACGGCGTT-,dentate gyrus,Mus musculus,hGFAP-GFP,P120,2males+1female,GC-adult,total RNA,SRR6089529,10X79_1_AAACTCACGGCGTT.fq.gz,GGGTGCGCTC_AAACTCACGGCGTT
10X79_1_AAACTGTCGGCTCA-,dentate gyrus,Mus musculus,hGFAP-GFP,P120,2males+1female,MOL,total RNA,SRR6089595,10X79_1_AAACTGTCGGCTCA.fq.gz,CCTTTCAACG_AAACTGTCGGCTCA
10X79_1_AAACTGTGATAAGT-,dentate gyrus,Mus musculus,hGFAP-GFP,P120,2males+1female,OPC,total RNA,SRR6090058,10X79_1_AAACTGTGATAAGT.fq.gz,CCTTTCAGGT_AAACTGTGATAAGT


The metadata contains some NaN rows at the end, so remove those.

In [8]:
# Flag the rows in which every metadata column is missing, then count them.
nan_mask = metadata.isnull().all(axis="columns")
nan_mask.sum()

31

In [9]:
# Keep only the rows that were not flagged as entirely empty.
metadata = metadata[~nan_mask]
metadata.shape

(24185, 10)

Select the subset of cells for which we have metadata.

In [10]:
# Restrict the expression table to the annotated cells; the columns come out
# in the same order as the metadata rows.
data = data[metadata.index]
data.shape

(27933, 24185)

In [11]:
# Translate the cluster abbreviations from the metadata into readable cell
# type names. `replace` returns a new Series, so `metadata` is left untouched,
# and any cluster without an entry below keeps its original label.
cell_types = metadata["characteristics: cell cluster"].replace({
    "Immature-Pyr": "immature pyramidal neuron",
    "GC-juv": "juvenile granule cell",  # spelling fixed (was "juevnile")
    "GC-adult": "granule cell",
    "Immature-GC": "immature granule cell",
    "Neuroblast": "neuroblast",
    "Astro-adult": "astrocyte",
    "Immature-GABA": "immature GABAergic neuron",
    "Astro-juv": "juvenile astrocyte",
    "OPC": "oligodendrocyte precursor cell",
    "MOL": "oligodendrocyte",
    "RGL_young": "radial glial cell",
    "Immature-Astro": "immature astrocyte",
    "Endothelial": "endothelial cell",
    "Cajal-Retzius": "Cajal-Retzius cell",
    "CA3-Pyr": "pyramidal neuron",
    "nIPC-perin": "neuronal intermediate progenitor cells",
    "nIPC": "neuronal intermediate progenitor cells",
    "Microglia": "glial cell",
    # NOTE(review): the key below looks like "Microglia" corrupted by a
    # find-and-replace of "cr" -> "Cajal-Retzius". It is kept so the mapping
    # stays backward-compatible; confirm against the raw metadata and drop it
    # if it never occurs.
    "MiCajal-Retziusoglia": "glial cell",
    "NFOL": "newly formed oligodendrocyte",
    "GABA": "GABAergic neuron",
    "RGL": "radial glial cell",
    "Ependymal": "ependymal cell",
    "VLMC": "vascular and leptomeningeal cell",
    "PVM": "perivascular macrophage",
})

# Number of cells per (renamed) cell type, most frequent first.
cell_types.value_counts()

immature pyramidal neuron                 4520
juevnile granule cell                     3420
granule cell                              2613
immature granule cell                     2419
neuroblast                                1381
astrocyte                                 1232
immature GABAergic neuron                 1024
radial glial cell                          885
juvenile astrocyte                         821
neuronal intermediate progenitor cells     821
oligodendrocyte precursor cell             794
oligodendrocyte                            704
immature astrocyte                         651
endothelial cell                           543
Cajal-Retzius cell                         535
pyramidal neuron                           532
glial cell                                 428
newly formed oligodendrocyte               232
GABAergic neuron                           209
ependymal cell                             182
vascular and leptomeningeal cell           160
perivascular 

In [12]:
# Convert the dense expression table into a SciPy CSR sparse matrix.
# The orientation is unchanged here: rows follow `data`'s index and columns
# follow `data`'s columns.
x = sp.csr_matrix(data.values)
x

<27933x24185 sparse matrix of type '<class 'numpy.longlong'>'
	with 46646438 stored elements in Compressed Sparse Row format>

In [13]:
# Assemble the annotated data object. The sparse matrix is transposed so that
# cells (the columns of `data`) end up on the observation axis and genes (the
# index of `data`) on the variable axis.
# NOTE(review): the "obs_names" / "var_names" entries appear to be consumed as
# the axis labels rather than stored as columns (the repr lists only 'labels'
# and 'age' under obs) - confirm against the installed anndata version.
adata = anndata.AnnData(
    x.T,
    obs={
        "obs_names": data.columns.values,
        "labels": cell_types.values.astype(str),
        "age": metadata["characteristics: age"].values,
    },
    var={"var_names": data.index.values},
    uns={
        "name": "hochgerner_2018",
        "year": 2018,
        "organism": "mouse",
        "tissue": "dentate gyrus",
    },
)
adata

AnnData object with n_obs × n_vars = 24185 × 27933 
    obs: 'labels', 'age'
    uns: 'name', 'year', 'organism', 'tissue'

In [14]:
# Save the assembled object as an .h5ad file next to the other datasets.
adata.write_h5ad(
    path.join("..", "..", "data", "h5ad", "hochgerner_2018.h5ad"),
)

... storing 'labels' as categorical
... storing 'age' as categorical
