In [1]:
from os import listdir
from os import makedirs
from os import path

import pandas as pd
import scipy.sparse as sp
import anndata

Before running this notebook, download the file listed below into the `data/harris_2018` folder:

GEO series page (accession GSE99888):
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE99888

- ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE99nnn/GSE99888/suppl/GSE99888_gene_expression.tab.gz

In [2]:
# Location of the downloaded GEO files, relative to this notebook's folder.
data_path_parts = ("..", "..", "data", "harris_2018")
data_path = path.join(*data_path_parts)

In [3]:
# List the dataset folder to confirm the downloaded archive is in place.
listdir(data_path)

['GSE99888_gene_expression.tab.gz']

In [4]:
%time data = pd.read_table(path.join(data_path, "GSE99888_gene_expression.tab.gz"), index_col=0)

CPU times: user 25.1 s, sys: 1.7 s, total: 26.8 s
Wall time: 26.8 s


In [5]:
# Preview the first rows: the index holds gene names and the columns look like per-cell barcodes.
data.head()

Unnamed: 0,10X36_1_AAGGCTACTTTCTG,10X36_1_CACCGGGATAAGCC,10X36_1_CAGCCTTGCCCACT,10X36_1_CCTATAACCTACCC,10X36_1_CTTTGATGACACAC,10X36_1_GGATACTGGGTACT,10X36_1_GGATTTCTGACACT,10X36_1_GTAGTGTGACACCA,10X36_1_TGGAAGCTACTAGC,10X36_1_TTCCAAACTACGCA,...,10X38_2_TTCACAACTTGCTT,10X38_2_TTCACCCTCCATGA,10X38_2_TTCATTCTTTTGCT,10X38_2_TTCCTAGATCCAGA,10X38_2_TTCTGATGTGTCTT,10X38_2_TTGACACTGTCAAC,10X38_2_TTGCTATGAGCATC,10X38_2_TTGTAGCTCAAAGA,10X38_2_TTTATCCTTAGACC,10X38_2_TTTGCATGGTACGT
Xkr4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
Gm1992,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Gm37381,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Rp1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Rp1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Size of the raw table as (rows, columns), before any filtering.
data.shape

(27998, 6971)

In [7]:
# Some gene names occur more than once in the index (e.g. Rp1 in the preview).
# Keep only the first row for each name so the index becomes unique.
first_occurrence = ~data.index.duplicated()
data = data[first_occurrence]
data.shape

(27933, 6971)

In [8]:
# Most entries of the table are zero, so hold the values in a compressed sparse row matrix.
# The orientation is unchanged here: rows are genes, columns are cells.
dense_counts = data.values
x = sp.csr_matrix(dense_counts)
x

<27933x6971 sparse matrix of type '<class 'numpy.longlong'>'
	with 13974597 stored elements in Compressed Sparse Row format>

In [9]:
# Dataset-level metadata stored alongside the matrix.
dataset_info = {"name": "harris_2018", "year": 2018, "organism": "mouse", "tissue": "brain"}

# Row labels of the table (genes) become variable names; column labels (cells) become observation names.
gene_names = {"var_names": data.index.values}
cell_names = {"obs_names": data.columns.values}

# The sparse matrix is genes × cells, so transpose it to get observations (cells) × variables (genes).
adata = anndata.AnnData(x.T, obs=cell_names, var=gene_names, uns=dataset_info)
adata

AnnData object with n_obs × n_vars = 6971 × 27933 
    uns: 'name', 'year', 'organism', 'tissue'

In [10]:
# Save the assembled dataset as an .h5ad file in the shared output folder.
h5ad_dir = path.join("..", "..", "data", "h5ad")

# The setup instructions only cover the download folder, so the output folder may not
# exist yet on a fresh checkout; create it first (a no-op when it is already there).
makedirs(h5ad_dir, exist_ok=True)

adata.write_h5ad(path.join(h5ad_dir, "harris_2018.h5ad"))