In [1]:
from os import path
from os import listdir
from os import makedirs

import pandas as pd
import scipy.sparse as sp
import anndata

Download the following two files into the `data/shekhar_2016` folder:

Main page: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE81904

- https://ftp.ncbi.nlm.nih.gov/geo/series/GSE81nnn/GSE81904/suppl/GSE81904_BipolarUMICounts_Cell2016.txt.gz
- https://s3.amazonaws.com/scrnaseq-public-datasets/manual-data/shekhar/clust_retinal_bipolar.txt

In [2]:
# Folder holding the raw downloads; the path is relative to the current
# working directory, two levels up from it.
data_path = path.join("..", "..", "data", "shekhar_2016")

In [3]:
# List the download folder to confirm the expected input files are present.
listdir(data_path)

['GSE81904_BipolarUMICounts_Cell2016.txt.gz', 'clust_retinal_bipolar.txt']

In [4]:
%time data = pd.read_table(path.join(data_path, "GSE81904_BipolarUMICounts_Cell2016.txt.gz"), index_col=0)

CPU times: user 15min 13s, sys: 30.2 s, total: 15min 44s
Wall time: 15min 39s


In [5]:
# Preview the count table: rows are genes, columns are cell barcodes.
data.head()

Unnamed: 0,Bipolar1_CCCACAAGACTA,Bipolar1_TCGCCTCGTAAG,Bipolar1_CAAAGCATTTGC,Bipolar1_CTTTTGATTGAC,Bipolar1_GCTCCAATGACA,Bipolar1_AAATACCCTCAT,Bipolar1_TGCATGCGTCCA,Bipolar1_TTCCGGCTTTTC,Bipolar1_CAACGTATCCTT,Bipolar1_CATCGAACGACG,...,Bipolar6_GACACATTAATG,Bipolar6_GCCGCTTTCGTG,Bipolar6_GCGACGTGAATA,Bipolar6_GTACTTGGAGGA,Bipolar6_GTGTGTTCCTAG,Bipolar6_GAATTTGCTGAC,Bipolar6_GCGGGACAATAC,Bipolar6_GTGGTGCGAAGT,Bipolar6_GAAGAATGCGCT,Bipolar6_GGCAACACGATA
0610005C13Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610007P14Rik,0,0,0,0,2,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
0610009B22Rik,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009E02Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0610009L18Rik,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# (genes, cells) in the raw count table.
data.shape

(24904, 44994)

In [7]:
# Per-cell cluster assignments, indexed by cell barcode (first column).
metadata_file = path.join(data_path, "clust_retinal_bipolar.txt")
metadata = pd.read_table(metadata_file, index_col=0)
metadata.head()

Unnamed: 0_level_0,CLUSTER,SUB-CLUSTER
NAME,Unnamed: 1_level_1,Unnamed: 2_level_1
Bipolar1_CCCACAAGACTA,BC5D,BC5D
Bipolar1_TCGCCTCGTAAG,Doublets/Contaminants,Doublets/Contaminants
Bipolar1_CAAAGCATTTGC,BC6,BC6
Bipolar1_CTTTTGATTGAC,BC7 (Cone Bipolar cell 7),BC7 (Cone Bipolar cell 7)
Bipolar1_GCTCCAATGACA,RBC (Rod Bipolar cell),RBC (Rod Bipolar cell)


In [8]:
# Restrict the count table to the cells that have a cluster annotation,
# taking the columns in metadata order ...
annotated_cells = metadata.index.values
data = data.loc[:, annotated_cells]

# ... then re-select the metadata rows by the resulting column labels so both
# objects list the cells in the same order.
retained_cells = data.columns.values
metadata = metadata.loc[retained_cells]

In [9]:
# After alignment the column count of `data` should equal the row count of `metadata`.
data.shape, metadata.shape

((24904, 27499), (27499, 2))

In [10]:
# Work on an object-dtype version of the fine-grained annotation so labels
# can be overwritten freely.
cell_types = metadata["SUB-CLUSTER"].astype(object)

# Cone bipolar sub-clusters (e.g. "BC5D") are collapsed onto their parent type.
# Matching is by substring, applied in the order listed.
cone_bipolar_labels = [
    ("BC1", "type 1 cone bipolar cell"),
    ("BC2", "type 2 cone bipolar cell"),
    ("BC3", "type 3 cone bipolar cell"),
    ("BC4", "type 4 cone bipolar cell"),
    ("BC5", "type 5 cone bipolar cell"),
    ("BC6", "type 6 cone bipolar cell"),
    ("BC7", "type 7 cone bipolar cell"),
]
for tag, label in cone_bipolar_labels:
    has_tag = cell_types.str.contains(tag)
    cell_types[has_tag] = label

# Remaining clusters are renamed by exact match on the source label:
#   - BC8 and BC9 were not separated in the source clustering, so the mixture
#     keeps a combined label;
#   - rod bipolar cells;
#   - the non-bipolar populations (Mueller glia, amacrine cells, photoreceptors).
exact_labels = [
    ("BC8/9 (mixture of BC8 and BC9)", "type 8/9 cone bipolar cell"),
    ("RBC (Rod Bipolar cell)", "rod bipolar cell"),
    ("MG (Mueller Glia)", "Mueller cell"),
    ("AC (Amacrine cell)", "amacrine cell"),
    ("Rod Photoreceptors", "retinal rod cell"),
    ("Cone Photoreceptors", "retinal cone cell"),
]
for source_label, label in exact_labels:
    cell_types[cell_types == source_label] = label

# Anything not renamed above (e.g. "Doublets/Contaminants") keeps its source label.
cell_types.value_counts()

rod bipolar cell              10888
type 5 cone bipolar cell       4641
Mueller cell                   2945
type 1 cone bipolar cell       1883
type 7 cone bipolar cell       1759
type 6 cone bipolar cell       1702
type 3 cone bipolar cell       1352
Doublets/Contaminants           669
type 2 cone bipolar cell        558
type 4 cone bipolar cell        398
type 8/9 cone bipolar cell      313
amacrine cell                   252
retinal rod cell                 91
retinal cone cell                48
Name: SUB-CLUSTER, dtype: int64

In [11]:
# Drop every cell labelled as a doublet/contaminant from the three aligned
# objects (count table columns, metadata rows, label Series).
contaminant_mask = cell_types == "Doublets/Contaminants"

# Positional boolean array, computed once before any of the objects is replaced.
keep = ~contaminant_mask.values

data = data.iloc[:, keep]
metadata = metadata.iloc[keep]
cell_types = cell_types[~contaminant_mask]

In [12]:
# (genes, cells) after removing doublets/contaminants.
data.shape

(24904, 26830)

In [13]:
# The count table is mostly zeros, so keep it as a sparse matrix.
# Orientation is unchanged here: rows are genes, columns are cells.
dense_counts = data.values
x = sp.csr_matrix(dense_counts)
x

<24904x26830 sparse matrix of type '<class 'numpy.longlong'>'
	with 23628425 stored elements in Compressed Sparse Row format>

In [14]:
# Dataset-level descriptors stored alongside the matrix.
dataset_info = {"name": "shekhar_2016", "year": 2016, "organism": "mouse", "tissue": "retina"}

# Gene identifiers come from the count table's row index.
gene_annotations = {"var_names": data.index.values}

# Cell barcodes come from the count table's columns, paired with the harmonised labels.
cell_annotations = {
    "obs_names": data.columns.values,
    "labels": cell_types.values,
}

# `x` is genes x cells, so it is transposed to put cells on the observation axis.
# NOTE(review): transposing a CSR matrix yields a CSC matrix; confirm that
# downstream consumers are fine with a column-compressed X.
adata = anndata.AnnData(
    x.T,
    obs=cell_annotations,
    var=gene_annotations,
    uns=dataset_info,
)
adata

AnnData object with n_obs × n_vars = 26830 × 24904 
    obs: 'labels'
    uns: 'name', 'year', 'organism', 'tissue'

In [15]:
# Persist the assembled AnnData object. The output folder is created first so
# that a missing directory does not make the write fail at the very end of a
# notebook whose loading step alone takes many minutes.
output_dir = path.join("..", "..", "data", "h5ad")
makedirs(output_dir, exist_ok=True)
adata.write_h5ad(path.join(output_dir, "shekhar_2016.h5ad"))

... storing 'labels' as categorical
