# Download

- Human dataset GSE142585

- Macaque dataset GSE142585 (same GEO series as the human dataset)

- Mouse dataset https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-6946/

- Supplementary files: "orthologues_human_mouse.txt", "orthologues_human_macaque.txt" and "preprocess_spermatogenesis.R"

# Preprocess

In [6]:
import os
import warnings

import rpy2.robjects as ro
import rpy2.robjects.numpy2ri
import anndata2ri
import numpy as np
import pandas as pd
import anndata
import scanpy as sc

In [7]:
# Attach Seurat inside the embedded R session before touching the saved objects.
ro.r['library']("Seurat")

# Each .RData file is loaded into R's global environment; the objects used
# below are obj_human, obj_mouse and obj_macaque.
for species in ("human", "mouse", "macaque"):
    ro.r(f'load("../datasets/raw/Spermatogenesis/obj_{species}_sharedgenes.RData")')


def _pull_counts(species):
    """Copy the RNA count matrix of ``obj_<species>`` from R into Python.

    The matrix is first bound to ``cnts_<species>`` in the R session, then
    converted with ``as.matrix`` and wrapped in a NumPy array.

    Returns a ``(counts, genes, cells)`` tuple: the array, the R row names
    (used as gene names) and the R column names (used as cell names).
    """
    ro.r(f'cnts_{species} <- obj_{species}[["RNA"]]@counts')
    matrix = np.array(ro.r(f"as.matrix(cnts_{species})"))
    row_names = list(ro.r(f"as.vector(rownames(cnts_{species}))"))
    col_names = list(ro.r(f"as.vector(colnames(cnts_{species}))"))
    return matrix, row_names, col_names


counts_human, genes_human, cells_human = _pull_counts("human")


counts_mouse, genes_mouse, cells_mouse = _pull_counts("mouse")
# Cell-type labels stored on the mouse Seurat object itself.
# NOTE: `cell_type_mouse` is rebound further down in this cell when the
# mouse annotation CSV is read, so this list is not what ends up in the AnnData.
ro.r("cell_type <- as.vector(obj_mouse$cell_type)")
cell_type_mouse = list(ro.r("as.vector(cell_type)"))


counts_macaque, genes_macaque, cells_macaque = _pull_counts("macaque")


def _to_anndata(counts, cells, genes):
    """Wrap a matrix pulled from R in an AnnData object.

    `counts` is transposed so that observations follow `cells` and
    variables follow `genes`; both name lists are then set as the indices.
    """
    wrapped = anndata.AnnData(counts.T)
    wrapped.obs.index = cells
    wrapped.var.index = genes
    return wrapped


adata_human = _to_anndata(counts_human, cells_human, genes_human)
adata_mouse = _to_anndata(counts_mouse, cells_mouse, genes_mouse)
adata_macaque = _to_anndata(counts_macaque, cells_macaque, genes_macaque)

def _per_cell_attributes(species, stem, cells):
    """Load a per-cell attribute table and order its rows like `cells`.

    The text file ``<species>/<stem>.txt`` is parsed by R's ``read.table``
    (bound to ``anno_<species>`` in the R session) and re-exported with
    ``write.csv`` as ``create_<stem>.csv``, which pandas then reads using the
    first column as the index.

    Raises KeyError if any entry of `cells` is missing from the table.
    """
    raw_dir = "../datasets/raw/Spermatogenesis"
    txt_path = f"{raw_dir}/{species}/{stem}.txt"
    csv_path = f"{raw_dir}/create_{stem}.csv"
    ro.r(f'anno_{species} <- read.table("{txt_path}")')
    ro.r(f'write.csv(anno_{species}, file="{csv_path}")')
    table = pd.read_csv(csv_path, index_col=0)
    return table.loc[list(cells)]


cell_type_human_ref = _per_cell_attributes(
    "human", "GSE142585_MergedHumanTestis4_PerCellAttributes", adata_human.obs.index
)

cell_type_macaque_ref = _per_cell_attributes(
    "macaque", "GSE142585_MergedMonkeyTestis5_PerCellAttributes", adata_macaque.obs.index
)

cell_type_mouse = pd.read_csv("../datasets/raw/Spermatogenesis/mouseP30-anno.csv", index_col="barcode")
# The labels are later attached to adata_mouse positionally (via `.values`),
# so the rows must be in the same order as the AnnData cells, exactly as is
# done for human and macaque above. Align by barcode whenever that is possible.
mouse_cells = list(adata_mouse.obs.index)
if cell_type_mouse.index.is_unique and set(mouse_cells) <= set(cell_type_mouse.index):
    cell_type_mouse = cell_type_mouse.loc[mouse_cells]
else:
    # Barcodes cannot be matched one-to-one; keep the file's row order (the
    # previous behaviour) but make the unchecked assumption visible.
    warnings.warn(
        "mouseP30-anno.csv barcodes do not match the mouse cell names one-to-one; "
        "annotations are used in file order without alignment."
    )

# Attach the cell-type labels (taken positionally via `.values`) and a
# constant species tag to each dataset.
for adata, labels, species in (
    (adata_human, cell_type_human_ref["CellType"], "human"),
    (adata_mouse, cell_type_mouse["cell_type_new"], "mouse"),
    (adata_macaque, cell_type_macaque_ref["CellType"], "macaque"),
):
    adata.obs["celltype"] = labels.values
    adata.obs["batch"] = species

data_path = "../datasets/preprocessed/"

# Create the output directory if needed; exist_ok avoids the separate
# "does it exist?" check and is a no-op when the directory is already there.
os.makedirs(data_path, exist_ok=True)

species_adatas = (
    ("human", adata_human),
    ("mouse", adata_mouse),
    ("macaque", adata_macaque),
)

# Suffix every cell name with its species so names stay distinct after concatenation.
for species, adata in species_adatas:
    adata.obs.index = adata.obs.index + f"_{species}"

# Number of gene names shared by each pair of datasets
# (human/mouse, human/macaque, mouse/macaque).
for left, right in (
    (adata_human, adata_mouse),
    (adata_human, adata_macaque),
    (adata_mouse, adata_macaque),
):
    print(len(set(left.var.index) & set(right.var.index)))

# Per-species files are written first, then the combined ones.
for species, adata in species_adatas:
    adata.write(filename=os.path.join(data_path, f"Spermatogenesis_{species}.h5ad"))

adata_human_macaque = sc.concat((adata_human, adata_macaque))
adata_human_macaque.write(filename=os.path.join(data_path, "Spermatogenesis_human_macaque.h5ad"))

adata_full = sc.concat((adata_human, adata_mouse, adata_macaque))
adata_full.write(filename=os.path.join(data_path, "Spermatogenesis_human_macaque_mouse.h5ad"))

... storing 'celltype' as categorical
... storing 'batch' as categorical


11922
11922
11922


... storing 'celltype' as categorical
... storing 'batch' as categorical
... storing 'celltype' as categorical
... storing 'batch' as categorical
... storing 'batch' as categorical
... storing 'batch' as categorical


In [3]:
# Run the following R code to convert the .h5ad files to .h5seurat, a format Seurat can read.
# First check that R's working directory is the folder containing the .h5ad files.
# library(SeuratDisk)
# Convert('Spermatogenesis_human_macaque.h5ad', 'h5seurat')
# Convert('Spermatogenesis_human_macaque_mouse.h5ad', 'h5seurat')