In [1]:
from os import listdir
from os import makedirs
from os import path

import anndata
import pandas as pd
import scipy.sparse as sp

Download the following files into the `data/cao_2019` folder before running this notebook:

GEO accession page: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE119945

- ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE119nnn/GSE119945/suppl/GSE119945_gene_count.txt.gz
- ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE119nnn/GSE119945/suppl/GSE119945_cell_annotate.csv.gz
- ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE119nnn/GSE119945/suppl/GSE119945_gene_annotate.csv.gz

In [2]:
# Directory holding the downloaded GEO files, two levels above the working directory.
data_path = path.join(path.pardir, path.pardir, "data", "cao_2019")

In [3]:
# List the data directory so the presence of the three downloaded files can be checked by eye.
listdir(data_path)

['GSE119945_gene_count.txt.gz',
 'GSE119945_cell_annotate.csv.gz',
 'GSE119945_gene_annotate.csv.gz']

In [4]:
%time adata = anndata.read_mtx(path.join(data_path, "GSE119945_gene_count.txt.gz"))

CPU times: user 1h 20min 27s, sys: 1min 28s, total: 1h 21min 55s
Wall time: 1h 21min 56s


In [5]:
# Transpose the object; in the recorded output the result is 2058652 obs x 26183 vars.
# NOTE(review): this rebinds `adata` to its own transpose, so running the cell
# a second time flips the axes back. Run it exactly once after loading.
adata = adata.transpose()
adata

AnnData object with n_obs × n_vars = 2058652 × 26183 

In [6]:
%time cell_annotations = pd.read_csv(path.join(data_path, "GSE119945_cell_annotate.csv.gz"))

CPU times: user 8.26 s, sys: 512 ms, total: 8.77 s
Wall time: 8.77 s


In [7]:
# Index the annotation table by its "sample" column and attach it as the
# observation metadata, then preview the first rows.
# NOTE(review): presumably the CSV rows are in the same order as the matrix
# observations; nothing here verifies that, so confirm against the data source.
obs_table = cell_annotations.set_index("sample")
adata.obs = obs_table
adata.obs.head()

Unnamed: 0_level_0,id,sex,day,Total_mRNAs,tsne_1,tsne_2,Main_Cluster,sub_tsne_1,sub_tsne_2,Sub_Cluster,db_score,detected_doublet,potential_doublet_cluster
sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
sci3-me-001.GTCGGAGTTTGAGGTAGAA,38,F,13.5,335.0,2.502835,2.589219,5.0,0.0,0.0,unknown,0.013367,False,
sci3-me-001.ATTAGTCTGTGTATAATACG,9,M,11.5,1989.0,19.352971,-5.523709,20.0,4.01933,18.306513,1,0.00416,False,False
sci3-me-001.GAGGAACTTAATACCATCC,26,F,10.5,393.0,-15.372493,-5.7596,21.0,0.0,0.0,unknown,0.002524,False,
sci3-me-001.TTCGCGGATACTCTCTCAA,16,F,13.5,227.0,-0.750546,1.439024,17.0,0.0,0.0,unknown,0.006938,False,
sci3-me-001.ACTGGTTATTTGCGCCATCT,33,M,12.5,241.0,2.402283,12.564681,6.0,0.0,0.0,unknown,0.002918,False,


In [8]:
%time gene_annotations = pd.read_csv(path.join(data_path, "GSE119945_gene_annotate.csv.gz"))

CPU times: user 72 ms, sys: 0 ns, total: 72 ms
Wall time: 72.7 ms


In [9]:
# Preview the first rows of the gene annotation table before attaching it to `adata`.
gene_annotations.head()

Unnamed: 0,gene_id,gene_type,gene_short_name
0,ENSMUSG00000051951.5,protein_coding,Xkr4
1,ENSMUSG00000103377.1,TEC,Gm37180
2,ENSMUSG00000104017.1,TEC,Gm37363
3,ENSMUSG00000103025.1,TEC,Gm37686
4,ENSMUSG00000089699.1,antisense,Gm1992


In [10]:
# Index the gene table by "gene_short_name" and attach it as the variable
# metadata, then preview the first rows.
# NOTE(review): short gene names are not guaranteed to be unique; verify, and
# consider indexing by "gene_id" or making the names unique if duplicates exist.
var_table = gene_annotations.set_index("gene_short_name")
adata.var = var_table
adata.var.head()

Unnamed: 0_level_0,gene_id,gene_type
gene_short_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Xkr4,ENSMUSG00000051951.5,protein_coding
Gm37180,ENSMUSG00000103377.1,TEC
Gm37363,ENSMUSG00000104017.1,TEC
Gm37686,ENSMUSG00000103025.1,TEC
Gm1992,ENSMUSG00000089699.1,antisense


In [11]:
# Display the object summary to confirm the obs and var columns are attached.
adata

AnnData object with n_obs × n_vars = 2058652 × 26183 
    obs: 'id', 'sex', 'day', 'Total_mRNAs', 'tsne_1', 'tsne_2', 'Main_Cluster', 'sub_tsne_1', 'sub_tsne_2', 'Sub_Cluster', 'db_score', 'detected_doublet', 'potential_doublet_cluster'
    var: 'gene_id', 'gene_type'

In [12]:
# Expose the main cluster assignment under the column name "cluster_id".
obs_column_renames = {"Main_Cluster": "cluster_id"}
adata.obs = adata.obs.rename(columns=obs_column_renames)

In [13]:
# Record dataset-level metadata in `uns`, in a fixed key order.
for uns_key, uns_value in (
    ("name", "cao_2019"),
    ("year", 2019),
    ("organism", "mouse"),
    ("tissue", "embryo"),
):
    adata.uns[uns_key] = uns_value

In [14]:
# Display the final object summary (renamed obs column and uns metadata) before writing it out.
adata

AnnData object with n_obs × n_vars = 2058652 × 26183 
    obs: 'id', 'sex', 'day', 'Total_mRNAs', 'tsne_1', 'tsne_2', 'cluster_id', 'sub_tsne_1', 'sub_tsne_2', 'Sub_Cluster', 'db_score', 'detected_doublet', 'potential_doublet_cluster'
    var: 'gene_id', 'gene_type'
    uns: 'name', 'year', 'organism', 'tissue'

In [15]:
# Write the assembled object to ../../data/h5ad/cao_2019.h5ad.
# The output directory is created first if it does not exist, so a fresh
# checkout does not fail here after the long load step.
output_dir = path.join("..", "..", "data", "h5ad")
makedirs(output_dir, exist_ok=True)
adata.write_h5ad(path.join(output_dir, "cao_2019.h5ad"))

... storing 'sex' as categorical
... storing 'Sub_Cluster' as categorical
... storing 'detected_doublet' as categorical
... storing 'potential_doublet_cluster' as categorical
... storing 'gene_type' as categorical
