# Download

- Drop-seq http://dropviz.org/

- 10X http://mousebrain.org/downloads.html

- SPLiT-seq https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE110823

See also the supplementary R scripts "preprocess_mousebrain_data_all.R" and "preprocess_mousebrain_meta_all.R".

# Preprocess

In [1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import rpy2.robjects as ro
import rpy2.robjects.numpy2ri
import anndata2ri
import scipy.io as sio
import anndata as ad

In [2]:
# Output directory for the preprocessed .h5ad files written at the end of this cell.
data_path = "../datasets/preprocessed"

# exist_ok=True creates the directory when it is missing and does nothing
# otherwise, so no separate existence check is needed.
os.makedirs(data_path, exist_ok=True)

# Locations of the raw inputs.
datapath_Drop = "../datasets/raw/MouseBrain"
datapath_10X = "../datasets/raw/MouseBrain/10X"

# Attach the R packages inside the embedded R session driven through rpy2.
ro.r['library']("DropSeq.util")
ro.r['library']("Seurat")
ro.r['library']("SingleCellExperiment")


## Drop-seq
# Load the expression data into the embedded R session; the RData file is
# expected to define `sce`, which is then passed through Seurat and converted
# to a SingleCellExperiment.
_dropseq_rdata = datapath_Drop + "/Drop-seq_all.RData"
ro.r(f'load("{_dropseq_rdata}")')
ro.r("sce <- CreateSeuratObject(sce)")
ro.r("sce <- as.SingleCellExperiment(sce)")

# Enable the anndata2ri conversion before pulling `sce` over to Python; the
# result is indexed through .obs / .var below.
anndata2ri.activate()
adata_dropseq = ro.r('as(sce, "SingleCellExperiment")')

# Keep only the first occurrence of every cell id and every gene name.
_keep_obs = ~adata_dropseq.obs.index.duplicated()
_keep_var = ~adata_dropseq.var.index.duplicated()
adata_dropseq = adata_dropseq[_keep_obs, _keep_var]

# Load the metadata; the RData file is expected to define `meta_all` with
# `cellid` and `class` columns.
_dropseq_meta_rdata = datapath_Drop + "/Drop-seq_meta_all.RData"
ro.r(f'load("{_dropseq_meta_rdata}")')
ro.r("cellid <- as.vector(meta_all$cellid)")
ro.r("celltype <- as.vector(meta_all$class)")
cellid = list(ro.r("cellid"))
celltype = list(ro.r("celltype"))

# One row per cell id, carrying its cell-type label.
meta_dropseq = pd.DataFrame({"celltype": celltype}, index=cellid)
_keep_meta = ~meta_dropseq.index.duplicated()
meta_dropseq = meta_dropseq[_keep_meta]

# Restrict (and order) the expression data to the cells listed in the metadata.
adata_dropseq = adata_dropseq[meta_dropseq.index, :]

## 10X
adata_10X = sc.read_loom(datapath_10X + "/l5_all.loom")
# Keep only the first occurrence of every cell id and every gene name.
adata_10X = adata_10X[~adata_10X.obs.index.duplicated(), ~adata_10X.var.index.duplicated()]

# Take an explicit copy of the two annotation columns so that the edits below
# act on an independent frame rather than on a slice of adata_10X.obs.
meta_10X = adata_10X.obs[["Tissue", "Class"]].copy()
meta_10X = meta_10X[~meta_10X.index.duplicated()].copy()
adata_10X = adata_10X[meta_10X.index, :]

# Tag every cell with the technology it came from.
meta_dropseq = meta_dropseq.assign(batch="Cell (Drop-seq)")
meta_10X["batch"] = "Cell (10X)"

# Harmonise the plural 10X class names with the singular labels used for the
# other datasets. Assigning the replaced column back (instead of the chained
# meta_10X["Class"][mask] = ... form) guarantees the frame is actually updated,
# including under pandas Copy-on-Write where chained assignment has no effect.
meta_10X["Class"] = meta_10X["Class"].replace({"Neurons": "Neuron", "Astrocytes": "Astrocyte"})

# Use the same column names as the other metadata tables.
meta_10X = meta_10X.rename(columns={"Tissue": "tissue", "Class": "celltype"})


## single-nuclei
# Read the SPLiT-seq MATLAB file.
data = sio.loadmat('../datasets/raw/MouseBrain/SPLiT-seq/GSM3017261_150000_CNS_nuclei.mat')

# Digital expression matrix; its rows become the observations below.
DGE = data['DGE']

# Synthetic barcodes "sn0", "sn1", ... numbered by row position in the
# unfiltered matrix.
barcodes = np.array([f"sn{i}" for i in range(DGE.shape[0])])

# Gene names, sample types and main cluster assignments, with padding spaces
# stripped from both ends.
genes = pd.Series(data['genes']).str.strip(' ')
sample_type = pd.Series(data['sample_type']).str.strip(' ')
cluster_assignment = pd.Series(data['cluster_assignment']).str.strip(' ')

# Keep only the rows whose sample type is p11_brain or p11_spine.
idx = sample_type.astype(str).isin(["p11_brain", "p11_spine"])

DGE = DGE[idx, :]
barcodes = barcodes[idx]
sample_type = sample_type[idx]
cluster_assignment = cluster_assignment[idx]
# Lookup table from the fine-grained SPLiT-seq cluster names to the broad
# cell-type labels used in the output metadata.
_broad_label = {
    "55 Oligo MFOL2": "Oligodendrocyte",
    "56 Oligo MFOL1": "Oligodendrocyte",
    "57 Oligo MOL": "Oligodendrocyte",
    "58 Oligo NFOL1": "Oligodendrocyte",
    "59 Oligo COP1": "Oligodendrocyte",
    "60 Oligo COP2": "Oligodendrocyte",
    "61 OPC": "Polydendrocyte",
    "62 Macrophage": "Macrophage",
    "63 Microglia": "Microglia",
    "64 Endothelia": "Endothelial",
    "65 SMC": "Mural",
    "66 VLMC Slc6a13": "Vascular and leptomeningeal cells",
    "67 VLMC Slc47a1": "Vascular and leptomeningeal cells",
    "68 Astro Slc7a10": "Astrocyte",
    "69 Astro Prdm16": "Astrocyte",
    "70 Astro Gfap": "Astrocyte",
    "71 Bergmann Glia": "Astrocyte",
    "72 Ependyma": "Ependymal",
    "73 OEC": "Olfactory ensheathing cells",
}

# Exact-match substitution; entries not present in the table are left as they are.
cluster = cluster_assignment.replace(_broad_label)

# Everything that did not end up with one of the broad labels above is
# labelled as a neuron.
_non_neuronal = ["Oligodendrocyte", "Polydendrocyte",
                 "Macrophage", "Microglia",
                 "Endothelial", "Mural",
                 "Vascular and leptomeningeal cells", "Astrocyte",
                 "Ependymal", "Olfactory ensheathing cells"]
idx = (~cluster.isin(_non_neuronal)).tolist()
cluster[idx] = "Neuron"

# Wrap the filtered expression matrix in an AnnData object, labelled with the
# synthetic barcodes (observations) and gene names (variables).
adata_sn = ad.AnnData(X=DGE.tocsr())
adata_sn.obs.index = barcodes
adata_sn.var.index = genes
# Keep only the first occurrence of every barcode and every gene name.
adata_sn = adata_sn[~adata_sn.obs.index.duplicated(), ~adata_sn.var.index.duplicated()]

# Per-nucleus metadata. The constant "batch" column is created together with
# the frame so that nothing is assigned onto a filtered frame afterwards.
meta_sn = pd.DataFrame(
    {
        "celltype": list(cluster),
        "sampleid": list(sample_type),
        "batch": "Nuclei (SPLiT-seq)",
    },
    index=barcodes,
)
meta_sn = meta_sn[~meta_sn.index.duplicated()]

# Restrict (and order) the expression data to the nuclei listed in the metadata.
adata_sn = adata_sn[meta_sn.index, :]


# meta.to_pickle(os.path.join(data_path, "meta_raw.pkl"))
# meta = pd.read_pickle(os.path.join(data_path, "meta_raw.pkl"))
# meta = meta.loc[list(adata_dropseq.obs.index) + list(adata_10X.obs.index), ]
# meta_dropseq.to_pickle(os.path.join(data_path, "meta_dropseq.pkl"))
# meta_10X.to_pickle(os.path.join(data_path, "meta_10X.pkl"))
# meta_sn.to_pickle(os.path.join(data_path, "meta_sn.pkl"))
# meta.to_csv(os.path.join(data_path, "meta.csv"))

# adata_dropseq.obs = adata_dropseq.obs[['ident']]
def _rebuild_with_meta(adata, meta):
    """Return a fresh AnnData holding only `adata.X`, its obs/var names and `meta` as obs."""
    rebuilt = ad.AnnData(X=adata.X)
    rebuilt.obs.index = adata.obs.index
    rebuilt.var.index = adata.var.index
    rebuilt.obs = meta
    return rebuilt


def _h5ad_path(name):
    """Return the path of output file `name` inside `data_path`."""
    return os.path.join(data_path, name)


# One file per dataset.
adata_dropseq_v2 = _rebuild_with_meta(adata_dropseq, meta_dropseq)
adata_dropseq_v2.write(filename=_h5ad_path("MouseBrain_Cell_Drop-seq.h5ad"))

adata_10X_v2 = _rebuild_with_meta(adata_10X, meta_10X)
adata_10X_v2.write(filename=_h5ad_path("MouseBrain_Cell_10X.h5ad"))

adata_sn_v2 = _rebuild_with_meta(adata_sn, meta_sn)
adata_sn_v2.write(filename=_h5ad_path("MouseBrain_Nuclei_SPLiT-seq.h5ad"))

# Pairwise combinations of the three datasets.
adata_dropseq_10X = sc.concat([adata_dropseq_v2, adata_10X_v2])
adata_dropseq_10X.write(filename=_h5ad_path("MouseBrain_Cell.h5ad"))

adata_dropseq_sn = sc.concat([adata_dropseq_v2, adata_sn_v2])
adata_dropseq_sn.write(filename=_h5ad_path("MouseBrain_Drop-seq_Nuclei.h5ad"))

adata_10X_sn = sc.concat([adata_10X_v2, adata_sn_v2])
adata_10X_sn.write(filename=_h5ad_path("MouseBrain_10X_Nuclei.h5ad"))

# All three datasets together.
adata_full = sc.concat([adata_dropseq_v2, adata_10X_v2, adata_sn_v2])
adata_full.write(filename=_h5ad_path("MouseBrain_Cell_Nuclei.h5ad"))

R[write to console]: Attaching SeuratObject

R[write to console]: Loading required package: SummarizedExperiment

R[write to console]: Loading required package: MatrixGenerics

R[write to console]: Loading required package: matrixStats

R[write to console]: 
Attaching package: ‘MatrixGenerics’


R[write to console]: The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQR

In [None]:
# Run the following code in R to convert each .h5ad file to .h5seurat, a format Seurat can handle.
# First check that R's current working directory is the one containing the .h5ad files.
# library(SeuratDisk)
# Convert('MouseBrain_Cell.h5ad', 'h5seurat')
# Convert('MouseBrain_Drop-seq_Nuclei.h5ad', 'h5seurat')
# Convert('MouseBrain_10X_Nuclei.h5ad', 'h5seurat')
# Convert('MouseBrain_Cell_Nuclei.h5ad', 'h5seurat')