# Data preprocessing

This document describes the data preprocessing steps used in the paper for the following groups of datasets:
* six gold standard datasets
* six silver standard datasets
* two large datasets

## Six gold standard datasets

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import h5py

In [None]:
# Preprocess the gold-standard datasets and save one PCA-annotated .h5ad file
# per dataset. The second column of each index CSV is used as a list of file
# paths: count matrices in 'gold_countsfile.csv', label tables in
# 'gold_label.csv'.
c1 = pd.read_csv('gold_countsfile.csv')
label1 = pd.read_csv('gold_label.csv')
# list.sort() sorts in place and returns None, so sorted() is required to
# actually obtain the sorted lists.
counts_list = sorted(c1.iloc[:, 1].tolist())
label_list = sorted(label1.iloc[:, 1].tolist())
# NOTE(review): output names are paired with the sorted paths by position;
# confirm that the sorted path order matches the order of this list.
filename = ['Biase', 'deng', 'goolam', 'kolodziejczyk', 'pollen', 'yan']
for i, out_name in enumerate(filename):
    # The CSV is transposed after loading so that the label table lines up
    # with the observations axis.
    data = sc.read_csv(counts_list[i]).T
    label = pd.read_csv(label_list[i])
    data.obs['celltype'] = pd.Categorical(label['label'])

    # Per-gene dropout rate: percentage of cells with a zero count.
    counts_df = data.to_df()
    data.var['dropouts'] = np.sum(counts_df == 0, axis=0) / counts_df.shape[0] * 100
    # Keep genes whose dropout rate lies strictly between 10% and 90%.
    dropouts = data.var['dropouts'].values
    # .copy() turns the view into a real AnnData before in-place processing.
    data = data[:, (dropouts > 10) & (dropouts < 90)].copy()
    sc.pp.normalize_total(data, target_sum=1e4)

    sc.pp.log1p(data, base=2)

    sc.pp.highly_variable_genes(data, min_mean=0.0125, max_mean=3, min_disp=0.5)
    data = data[:, data.var.highly_variable].copy()

    sc.pp.scale(data, max_value=10)

    # Flag genes by name prefix so they can be removed below.
    # NOTE(review): 'MT' matches every gene whose name starts with "MT",
    # not only an "MT-" prefix — confirm this is intended.
    data.var['MT'] = data.var_names.str.startswith('MT')
    data.var['ercc'] = data.var_names.str.startswith('ERCC')  # annotate ERCC-prefixed genes as 'ercc'
    sc.pp.calculate_qc_metrics(data, qc_vars=['ercc'], percent_top=None, log1p=False, inplace=True)
    # Drop both flagged groups in one step.
    flagged = data.var['ercc'] | data.var['MT']
    data = data[:, ~flagged].copy()

    sc.tl.pca(data, svd_solver='arpack')
    data.write(out_name + '_pca.h5ad')

## Six silver standard datasets

In [None]:
# Preprocess the HDF5 datasets listed in `filenames` (read from `file`) and
# save each as '<file>results/<name>.h5ad'.
file = 'datasets/'
filenames = ['10X_PBMC.h5', 'human_kidney.h5', 'mouse_retain.h5', 'worm_neuron.h5']
# Iterate over the dataset names, not over the characters of the directory string.
for fname in filenames:
    print(fname)
    # Open read-only and close the handle as soon as the arrays are in memory.
    with h5py.File(file + fname, 'r') as data_mat:
        data = sc.AnnData(np.array(data_mat['X']))
        data.obs['label'] = np.array(data_mat['Y'])
    sc.pp.filter_genes(data, min_counts=1)
    sc.pp.filter_cells(data, min_counts=1)
    # normalize_per_cell has no `target_sum` keyword; normalize_total is the
    # function that takes it.
    sc.pp.normalize_total(data, target_sum=1e4)
    sc.pp.log1p(data, base=2)
    sc.pp.highly_variable_genes(data, min_mean=0.0125, max_mean=3, min_disp=0.5)
    # .copy() turns the view into a real AnnData before in-place processing.
    data = data[:, data.var.highly_variable].copy()
    sc.pp.scale(data, max_value=10)
    sc.tl.pca(data, svd_solver='arpack')
    # Build the output path as one string: '<file>results/<name>.h5ad'.
    # NOTE(review): this code does not create the 'results/' directory.
    filename = file + 'results/' + fname.split('.')[0]
    data.write(filename + '.h5ad')

### TAM FACS

In [None]:
# Preprocess the Tabula Muris Senis FACS object and save it as 'facs_pca.h5ad'.
adata = sc.read('tabula-muris-senis-facs-official-raw-obj.h5ad')
# Filter genes and cells using the thresholds below.
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.filter_cells(adata, min_genes=250)
sc.pp.filter_cells(adata, min_counts=5000)
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pp.scale(adata, max_value=10)
# NOTE(review): unlike the other cells, this one neither subsets to the
# highly variable genes nor runs sc.tl.pca, although the output name
# contains "pca" — confirm whether those steps are intentionally omitted.
# Write the object processed above (`adata`), not a `data` variable left
# over from an earlier cell.
adata.write('facs_pca.h5ad')

## Two large datasets

### Mouse brain

In [None]:
# Preprocess '1.3m_scAIDE_labels.h5ad' and save the result as 'brain_pca.h5ad'.
data = sc.read('1.3m_scAIDE_labels.h5ad')

# Copy the 'pred' column into a categorical 'label' column.
pred_values = data.obs['pred'].values.tolist()
data.obs['label'] = pd.Categorical(pred_values)
print(data.obs['label'])

data.var_names_make_unique()

# Basic cell and gene filters.
sc.pp.filter_cells(data, min_genes=200)
sc.pp.filter_genes(data, min_cells=3)

# Flag genes whose name starts with 'MT-' and compute QC metrics for that group.
# NOTE(review): mouse mitochondrial gene symbols are commonly lower-case
# ('mt-'); confirm that 'MT-' matches the gene names in this file.
data.var['mt'] = data.var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(
    data,
    qc_vars=['mt'],
    percent_top=None,
    log1p=False,
    inplace=True,
)

# Keep cells with fewer than 2500 detected genes and less than 5% 'mt' counts.
keep_cells = (data.obs.n_genes_by_counts < 2500) & (data.obs.pct_counts_mt < 5)
data = data[keep_cells, :]

sc.pp.normalize_total(data, target_sum=1e4)
sc.pp.log1p(data)

# Select highly variable genes, then scale and run PCA on that subset.
sc.pp.highly_variable_genes(data, min_mean=0.0125, max_mean=3, min_disp=0.5)
data = data[:, data.var['highly_variable']]
sc.pp.scale(data, max_value=10)
sc.tl.pca(data)

data.write('brain_pca.h5ad')

### MCA

In [None]:
# Preprocess the MCA dataset.
# NOTE(review): the result is written back to the same path it was read from,
# so the input file is overwritten — confirm this is intended.
mca_path = 'MCA_BatchRemoved_Merge_dge.h5ad'
dat = sc.read(mca_path)

# Gene and cell filters.
sc.pp.filter_genes(dat, min_cells=3)
sc.pp.filter_cells(dat, min_genes=100)

# Normalization followed by a base-2 log1p transform.
sc.pp.normalize_total(dat, target_sum=1e4)
sc.pp.log1p(dat, base=2)

# Select highly variable genes, then scale and run PCA on that subset.
sc.pp.highly_variable_genes(dat, min_mean=0.0125, max_mean=3, min_disp=0.5)
hvg_mask = dat.var['highly_variable']
data1 = dat[:, hvg_mask]
sc.pp.scale(data1, max_value=10)
sc.tl.pca(data1, svd_solver='arpack')

data1.write(mca_path)