In [1]:
from os import listdir
from os import makedirs
from os import path

import anndata
import pandas as pd
import scipy.sparse as sp

Download the following files to the `data/tasic_2018` folder:

Main page: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE115746

- https://ftp.ncbi.nlm.nih.gov/geo/series/GSE115nnn/GSE115746/suppl/GSE115746%5Fcells%5Fexon%5Fcounts%2Ecsv%2Egz
- https://ftp.ncbi.nlm.nih.gov/geo/series/GSE115nnn/GSE115746/suppl/GSE115746%5Fcomplete%5Fmetadata%5F28706%2Dcells%2Ecsv%2Egz

Download the cluster annotations:
https://raw.githubusercontent.com/berenslab/rna-seq-tsne/master/data/tasic-sample_heatmap_plot_data.csv

In [2]:
# Folder holding the downloaded files, given relative to this notebook.
data_path = path.join(*("..", "..", "data", "tasic_2018"))

In [3]:
# Show which files are present in the data folder.
listdir(path=data_path)

['GSE115746_cells_exon_counts.csv.gz',
 'tasic-sample_heatmap_plot_data.csv',
 'GSE115746_complete_metadata_28706-cells.csv.gz']

In [4]:
%time data = pd.read_csv(path.join(data_path, "GSE115746_cells_exon_counts.csv.gz"), index_col=0)

CPU times: user 6min 22s, sys: 14.9 s, total: 6min 37s
Wall time: 6min 37s


In [5]:
# Preview the first five rows of the count table.
data.head(n=5)

Unnamed: 0,F2S4_150422_002_A01,F2S4_150422_002_B01,F2S4_150422_002_C01,F2S4_150422_002_D01,F2S4_150422_002_E01,F2S4_150422_002_F01,F2S4_150422_002_G01,F2S4_150422_002_H01,F2S4_150427_001_A01,F2S4_150427_001_B01,...,F1S4_180124_316_G01,F1S4_180124_316_H01,F1S4_180124_317_A01,F1S4_180124_317_B01,F1S4_180124_317_C01,F1S4_180124_317_D01,F1S4_180124_317_E01,F1S4_180124_317_F01,F1S4_180124_317_G01,F1S4_180124_317_H01
0610005C13Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,25,0,0,0,0,0,0
0610006L08Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610007P14Rik,95,48,101,51,36,128,126,102,64,75,...,105,139,103,205,88,172,27,108,23,168
0610009B22Rik,68,47,43,42,58,14,27,68,19,83,...,41,86,271,118,101,174,102,121,111,119
0610009E02Rik,0,0,11,0,0,1,0,0,2,0,...,0,0,0,0,0,0,0,0,0,7


In [6]:
# (number of rows, number of columns) of the count table.
data.shape

(45768, 23178)

In [7]:
%time metadata = pd.read_csv(path.join(data_path, "GSE115746_complete_metadata_28706-cells.csv.gz"), index_col=0)

CPU times: user 256 ms, sys: 36 ms, total: 292 ms
Wall time: 292 ms


In [8]:
# Preview the first five rows of the metadata table.
metadata.head(n=5)

Unnamed: 0_level_0,title,source_name,organism,donor_id,donor_sex,donor_genotype,injection_type,injection_target,injected_material,dissected_region,...,sequencing_tube,sequencing_batch,sequencing_qc_pass_fail,cell_class,cell_subclass,cell_cluster,molecule,SRA_Run,GEO_Sample,GEO_Sample_Title
sample_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
F0S4_150820_019_A01,F0S4_150820_019_A01,Control: 1E-8 ERCC MIX1,Control,,,,,,,,...,LS-14690,R8S4-160203,Pass,ERCC,,,ERCC,SRR7322881,GSM3189899,Control: 1E-8 ERCC MIX1_1
F0S4_150820_019_B01,F0S4_150820_019_B01,Control: No Template Control,Control,,,,,,,,...,LS-14690,R8S4-160203,Pass,NTC,,,Total RNA,SRR7323629,GSM3189901,Control: No Template Control_1
F0S4_150820_019_C01,F0S4_150820_019_C01,Control: 1E-8 ERCC MIX1,Control,,,,,,,,...,LS-14690,R8S4-160203,Pass,ERCC,,,ERCC,SRR7322992,GSM3189899,Control: 1E-8 ERCC MIX1_1
F0S4_150820_019_D01,F0S4_150820_019_D01,Control: No Template Control,Control,,,,,,,,...,LS-14690,R8S4-160203,Pass,NTC,,,Total RNA,SRR7323740,GSM3189901,Control: No Template Control_1
F0S4_150820_019_E01,F0S4_150820_019_E01,Control: 10pg Zyagen Mouse Whole Brain Total RNA,Control,,,,,,,,...,LS-14690,R8S4-160203,Pass,MouseWholeRNA,,,Total RNA,SRR7321958,GSM3189897,Control: 10pg Zyagen Mouse Whole Brain Total R...


In [9]:
# Store the count table (same orientation as `data`) in compressed sparse row form.
x = sp.csr_matrix(data.values, copy=False)
x

<45768x23178 sparse matrix of type '<class 'numpy.longlong'>'
	with 214290227 stored elements in Compressed Sparse Row format>

In [10]:
# The count table has one column per cell and one row per gene, so its column
# labels become the observation names and its row labels the variable names.
cell_ids = data.columns.values
gene_ids = data.index.values

# The metadata file name advertises 28706 cells while the count table has
# 23178 columns, so annotations must be matched to cells by sample name and
# not by position. Passing an explicit index makes pandas align each Series
# to the cell IDs by label here, instead of relying on how AnnData treats a
# dict of differently sized Series.
obs = pd.DataFrame(
    {
        "cell_class": metadata["cell_class"].astype(str),
        "batch": metadata["source_name"].astype(str),
    },
    index=cell_ids,
)

# A cell without a metadata row would silently receive missing annotations;
# report how many there are (if any) rather than letting it pass unnoticed.
n_unannotated = int((~pd.Index(cell_ids).isin(metadata.index)).sum())
if n_unannotated:
    print("Warning: {} cells have no metadata entry".format(n_unannotated))

adata = anndata.AnnData(
    x.T,  # transpose the count matrix so that rows are cells and columns are genes
    obs=obs,
    var=pd.DataFrame(index=gene_ids),
    uns={"name": "tasic_2018", "year": 2018, "organism": "mouse", "tissue": "brain"},
)
adata

AnnData object with n_obs × n_vars = 23178 × 45768 
    obs: 'cell_class', 'batch'
    uns: 'name', 'year', 'organism', 'tissue'

In [11]:
# Save the assembled AnnData object. The output folder is created first so
# that a missing directory cannot make the notebook fail at its final step,
# after the slow CSV load above.
h5ad_dir = path.join("..", "..", "data", "h5ad")
makedirs(h5ad_dir, exist_ok=True)
adata.write_h5ad(path.join(h5ad_dir, "tasic_2018.h5ad"))

... storing 'cell_class' as categorical
... storing 'batch' as categorical
