# Download

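The raw datasets can be downloaded from GEO under accession numbers GSE109037 (human) and GSE109033 (mouse). To reproduce the results, make sure that the human and mouse dataset folders (`../datasets/raw/Sperm/human` and `../datasets/raw/Sperm/mouse`) each contain the following files: `barcodes.tsv`, `genes.tsv`, and `matrix.mtx`. The preprocessing step below also reads the cell-type annotation files `human_cell_type.csv` and `mouse_cell_type.csv` from `../datasets/raw/Sperm/`.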

# Preprocess

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc

In [None]:
# Load the per-species count matrices with scanpy's 10x reader.
adata_human = sc.read_10x_mtx('../datasets/raw/Sperm/human')
adata_mouse = sc.read_10x_mtx('../datasets/raw/Sperm/mouse')

# Rebuild every cell name as '<species>.<field0>.<field1>', where the two
# fields are the first two '-'-separated pieces of the original barcode.
adata_human.obs.index = [
    f'human.{parts[0]}.{parts[1]}'
    for parts in (barcode.split('-') for barcode in adata_human.obs.index)
]
adata_mouse.obs.index = [
    f'mouse.{parts[0]}.{parts[1]}'
    for parts in (barcode.split('-') for barcode in adata_mouse.obs.index)
]

# Read the annotation tables (first CSV column becomes the index) and rename
# the column 'x' to 'celltype'.
anno_human = pd.read_csv(
    '../datasets/raw/Sperm/human_cell_type.csv', index_col=0
).rename(columns={'x': 'celltype'})
anno_mouse = pd.read_csv(
    '../datasets/raw/Sperm/mouse_cell_type.csv', index_col=0
).rename(columns={'x': 'celltype'})

# Swap '-' for '.' in the annotation row labels so they use the same separator
# as the cell names built above.
anno_human.index = [cell_id.replace('-', '.') for cell_id in anno_human.index]
anno_mouse.index = [cell_id.replace('-', '.') for cell_id in anno_mouse.index]

# Replace each obs table with the annotation rows, ordered like the cells.
# `.loc` raises a KeyError if a cell has no matching annotation row.
adata_human.obs = anno_human.loc[adata_human.obs.index]
adata_mouse.obs = anno_mouse.loc[adata_mouse.obs.index]

# Record the species of origin for every cell.
adata_human.obs['batch'] = 'human'
adata_mouse.obs['batch'] = 'mouse'

# Stack both species into one object (default concat settings), lower-case the
# cell-type labels, and save the result.
adata_full = sc.concat([adata_human, adata_mouse])
adata_full.obs['celltype'] = [label.lower() for label in adata_full.obs['celltype']]

adata_full.write('../datasets/preprocessed/Sperm_human_mouse.h5ad')

In [None]:
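# Run the following code in R to convert the h5ad file to h5seurat, a format that Seurat can load
# First make sure the R working directory is the folder containing Sperm_human_mouse.h5ad (written above to ../datasets/preprocessed/)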
# library(SeuratDisk)
# Convert('Sperm_human_mouse.h5ad', 'h5seurat')