# Data preprocessing

This document describes the data preprocessing steps used in the paper for the following groups of datasets:
* six gold standard datasets
* six silver standard datasets
* two large datasets

## six gold standard datasets

In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import h5py

In [None]:
# Preprocess every gold-standard dataset: filter genes by dropout rate,
# normalize, keep highly variable genes, scale, drop ERCC / 'MT'-prefixed
# genes, run PCA and write one '<name>_pca.h5ad' file per dataset.
file ='scRNA-seq/data/gold_standard/gold_counts_label/'
c1=pd.read_csv(file+'gold_countsfile.csv')
label1=pd.read_csv(file+'gold_label.csv')
# The second column of each index CSV is read as the per-dataset file name;
# the entries are paired by position with the output names below.
counts_list=c1.iloc[:,1].tolist()
label_list=label1.iloc[:,1].tolist()
filename=['Biase','deng','goolam','kolodziejczyk','pollen','yan']
for i, out_prefix in enumerate(filename):
    # The CSV is transposed after loading (presumably genes x cells on disk,
    # so that observations become cells -- TODO confirm against the files).
    data=sc.read_csv(file+counts_list[i]).T
    label=pd.read_csv(file+label_list[i])
    print(label.iloc[0:5,-1])
    # The last column of the label file is used as the cell-type annotation.
    data.obs['celltype']=pd.Categorical(label.iloc[:,-1])
    # Per-gene percentage of cells with a zero count, computed once directly
    # on the matrix instead of building a DataFrame twice.
    data.var['dropouts'] = np.sum(data.X == 0, axis=0) / data.n_obs * 100

    print('***********************step1: gene/cell filtering')
    # Keep genes whose dropout rate lies strictly between 10% and 90%.
    dropouts = data.var['dropouts'].values
    # .copy() turns the view into a real AnnData before it is modified in place.
    data = data[:, (dropouts > 10) & (dropouts < 90)].copy()

    print('***********************step2: normalization')
    sc.pp.normalize_total(data, target_sum=1e4)
    sc.pp.log1p(data)

    print('***********************step3: selection of highly variable genes')
    sc.pp.highly_variable_genes(data, min_mean=0.0125, max_mean=3, min_disp=0.5)
    data = data[:, data.var.highly_variable].copy()

    print('***********************step4: dimension reduction by PCA')
    sc.pp.scale(data, max_value=10)
    # Flag genes whose name starts with 'MT' and ERCC spike-in genes.
    # NOTE(review): the prefix 'MT' (no dash) also matches non-mitochondrial
    # genes whose symbol begins with 'MT'; kept as-is to reproduce the paper.
    data.var['MT']=data.var_names.str.startswith('MT')
    data.var['ercc'] = data.var_names.str.startswith('ERCC')
    # NOTE(review): these QC metrics are computed on the scaled, HVG-only
    # matrix, not on raw counts -- confirm this is intended.
    sc.pp.calculate_qc_metrics(data, qc_vars=['ercc'], percent_top=None, log1p=False, inplace=True)
    # Drop both flagged gene groups in a single subsetting step.
    keep_genes = ~(data.var['ercc'].values | data.var['MT'].values)
    data = data[:, keep_genes].copy()
    sc.tl.pca(data,svd_solver='arpack')
    print(f'{counts_list[i]}: after processing: {data.shape}')
    data.write(out_prefix+'_pca.h5ad')

## six silver standard datasets

In [None]:
import h5py
import scanpy as sc
import numpy as np
# Preprocess each silver-standard HDF5 dataset: filter, normalize, keep
# highly variable genes, scale, run PCA and write '<name>.h5ad'.
file=['10X_PBMC.h5','human_kidney_counts.h5','Shekhar_mouse_retina_raw_data.h5',
'CITE_CBMC_counts_top2000.h5','worm_neuron_cell.h5']
for i, h5_path in enumerate(file):
    print(h5_path)
    # Open read-only and close the handle as soon as 'X' and 'Y' have been
    # copied into memory; the original left one open file per dataset.
    with h5py.File(h5_path, 'r') as data_mat:
        data = sc.AnnData(np.array(data_mat['X']))
        data.obs['label'] = np.array(data_mat['Y'])
    print('***********************step1: gene/cell filtering')
    # Remove genes and cells with no counts at all.
    sc.pp.filter_genes(data, min_counts=1)
    sc.pp.filter_cells(data, min_counts=1)
    
    print('***********************step2: normalization')
    # NOTE(review): normalize_per_cell is deprecated in current scanpy in
    # favour of normalize_total; kept so the output matches the paper.
    sc.pp.normalize_per_cell(data)
    sc.pp.log1p(data)
    
    print('***********************step3: selection of highly variable genes')
    sc.pp.highly_variable_genes(data, min_mean=0.0125, max_mean=3, min_disp=0.5)
    # .copy() turns the view into a real AnnData before it is scaled in place.
    data = data[:, data.var.highly_variable].copy()
    print(f'{h5_path}: after processing: {data.shape}')
    print('***********************step4: dimension reduction by PCA')
    sc.pp.scale(data, max_value=10)
    sc.tl.pca(data, svd_solver='arpack')
    # Output name is the input name up to its first dot.
    filename=h5_path.split('.')[0]
    data.write(filename+'.h5ad')

### TAM FACS --- checked

In [None]:
# Preprocess the FACS object: filter, normalize, keep highly variable genes,
# scale, run PCA and write 'facs_pca.h5ad'.
adata=sc.read('tabula-muris-senis-facs-official-raw-obj.h5ad')
print('***********************step1: gene/cell filtering')
sc.pp.filter_genes(adata,min_cells=3)
sc.pp.filter_cells(adata,min_genes=250)
# Compute QC metrics on this dataset. The original passed `data`, a variable
# left over from the previous cell, so the metrics went to the wrong object.
sc.pp.calculate_qc_metrics(adata, percent_top=None, log1p=False, inplace=True)
sc.pp.filter_cells(adata,min_counts=5000)

print('***********************step2: normalization')
# NOTE(review): normalize_per_cell is deprecated in current scanpy in favour
# of sc.pp.normalize_total(adata, target_sum=1e4); kept so the output matches.
sc.pp.normalize_per_cell(adata,counts_per_cell_after=1e4)
sc.pp.log1p(adata)

print('***********************step3: selection of highly variable genes')
sc.pp.highly_variable_genes(adata,min_mean=0.0125, max_mean=3, min_disp=0.5)
# .copy() turns the view into a real AnnData before it is scaled in place.
adata = adata[:, adata.var.highly_variable].copy()

print('***********************step4: dimension reduction by PCA')
sc.pp.scale(adata,max_value=10)
sc.tl.pca(adata, svd_solver='arpack')
adata.write('facs_pca.h5ad')

## two large datasets

### Mouse brain 

In [None]:
# '/public/home/zhengxq/nieyating/H/1.3m_scAIDE_labels.h5ad'
# Preprocess the mouse brain object: filter, normalize, keep highly variable
# genes, scale, run PCA and write 'brain_pca.h5ad'.
data=sc.read('1.3m_scAIDE_labels.h5ad')
# The 'pred' column of the input is reused as the categorical label.
data.obs['label']=pd.Categorical(data.obs['pred'].values.tolist())
print(data.obs['label'])
data.var_names_make_unique()
print('***********************step1: gene/cell filtering')
sc.pp.filter_cells(data, min_genes=200)
sc.pp.filter_genes(data, min_cells=3)
# Flag mitochondrial genes so that calculate_qc_metrics produces the
# 'pct_counts_mt' column used below; the original never requested it.
# NOTE(review): assumes gene symbols with an 'mt-'/'MT-' prefix -- confirm
# against the var_names of this file.
if 'mt' not in data.var:
    data.var['mt'] = data.var_names.str.upper().str.startswith('MT-')
sc.pp.calculate_qc_metrics(data, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
# data = data[data.obs.n_genes_by_counts < 2500, :]
# Keep cells with less than 5% mitochondrial counts; .copy() turns the view
# into a real AnnData before it is modified in place.
data = data[data.obs.pct_counts_mt < 5, :].copy()

print('***********************step2: normalization')
sc.pp.normalize_total(data, target_sum=1e4)
sc.pp.log1p(data)

print('***********************step3: selection of highly variable genes')
sc.pp.highly_variable_genes(data, min_mean=0.0125, max_mean=3, min_disp=0.5)
data = data[:,data.var.highly_variable].copy()

print('***********************step4: dimension reduction by PCA')
sc.pp.scale(data, max_value=10)
sc.tl.pca(data)
data.write('brain_pca.h5ad')

### MCA

In [None]:
# /public/home/zhengxq/weinn/Clustering_scRNA/scRNA-seq/Cluster0906/guodata
# MCA pipeline: filter -> normalize -> highly variable genes -> scale -> PCA,
# then save the processed object as 'MCA_processed.h5ad'.
dat = sc.read('MCA_BatchRemoved_Merge_dge.h5ad')

print('***********************step1: gene/cell filtering')
# Genes are filtered first (min_cells=3), then cells (min_genes=100).
sc.pp.filter_genes(dat, min_cells=3)
sc.pp.filter_cells(dat, min_genes=100)

print('***********************step2: normalization')
# Normalize every cell to a total of 1e4, then log1p-transform.
sc.pp.normalize_total(dat, target_sum=1e4)
sc.pp.log1p(dat)

print('***********************step3: selection of highly variable genes')
hvg_cutoffs = {'min_mean': 0.0125, 'max_mean': 3, 'min_disp': 0.5}
sc.pp.highly_variable_genes(dat, **hvg_cutoffs)
# Restrict the object to the genes flagged as highly variable.
hvg_mask = dat.var['highly_variable']
dat = dat[:, hvg_mask]

print('***********************step4: dimension reduction by PCA')
# Scale with values clipped at 10, then run PCA with the ARPACK solver.
sc.pp.scale(dat, max_value=10)
sc.tl.pca(dat, svd_solver='arpack')
dat.write('MCA_processed.h5ad')