# Preprocessing of individual datasets

Import packages... 
- [scanpy](https://scanpy.readthedocs.io/en/stable/index.html)


In [1]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd

import os

In [2]:
DATADIR="/workdir/dwm269/scMuscle2/data/align_out"

Load metadata

In [78]:
meta = pd.read_csv("../scMuscle2_metadata_v1-0.csv")
meta = meta.loc[meta["include"],] # remove unwanted metadata
meta = meta[[x in ["fastq", "bam"] for x in meta["file.format"]]] # remove samples w/ download issues

meta = meta[[x in ["muscle", "tendon"] for x in meta["tissue"]]] # subset by tissue 
meta = meta[[x in ["Homo sapiens"] for x in meta["species"]]] # subset by species , "Mus musculus"
# meta = meta.iloc[[8,12,23,34,45],] # subset by row index

meta = meta[[x != "" for x in meta["GSM.accession"]]] # 
meta = meta[[x != "" for x in meta["sample"]]] # 

meta.index = list(range(0, len(meta))) # reset row indices
meta

Unnamed: 0,source.label,sample,description,tissue,subtissue,comments,include,species,GSE.accession,GSM.accession,...,sample.accession,SAMN.accession,file_checksum,other.accession,source,manuscript.doi,manuscript.pubmed,experiment.instrument,study.title,study.abstract
0,Dyer 2022,SJRHB030680_R1,A single-cell/nucleus atlas of pediatric rhabd...,muscle,tumor,Patient tumor,True,Homo sapiens,GSE174376,GSM5293229,...,,SAMN19159611,,,,,,Illumina NovaSeq 6000,,
1,Nakajima 2021,1_iPS,"Human iPS cell line, 1231A3",tendon,iPSC,,True,Homo sapiens,GSE156753,GSM4743488,...,SRS7251319,SAMN15893659,,,"Najajima et al, Nature Communications, 20021",,https://pubmed.ncbi.nlm.nih.gov/34408142/,Illumina HiSeq 2500,Grafting induced pluripotent stem cells-derive...,Self-renewal of tendons is rare since the vasc...
2,Nakajima 2021,2_PSM,iPS-derived presomitic mesoderm,tendon,iPSC,,True,Homo sapiens,GSE156753,GSM4743489,...,SRS7251320,SAMN15893658,,,"Najajima et al, Nature Communications, 20021",,https://pubmed.ncbi.nlm.nih.gov/34408142/,Illumina HiSeq 2500,Grafting induced pluripotent stem cells-derive...,Self-renewal of tendons is rare since the vasc...
3,Nakajima 2021,3_SM,iPS-derived somitic mesoderm,tendon,iPSC,,True,Homo sapiens,GSE156753,GSM4743490,...,SRS7251321,SAMN15893657,,,"Najajima et al, Nature Communications, 20021",,https://pubmed.ncbi.nlm.nih.gov/34408142/,Illumina HiSeq 2500,Grafting induced pluripotent stem cells-derive...,Self-renewal of tendons is rare since the vasc...
4,Nakajima 2021,4_SCL,iPS-derived sclerotome,tendon,iPSC,,True,Homo sapiens,GSE156753,GSM4743491,...,SRS7251322,SAMN15893656,,,"Najajima et al, Nature Communications, 20021",,https://pubmed.ncbi.nlm.nih.gov/34408142/,Illumina HiSeq 2500,Grafting induced pluripotent stem cells-derive...,Self-renewal of tendons is rare since the vasc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,Lee 2021,BTS_D7_2,Day 7 post Burn/Tenotomy with sham neurectomy ...,tendon,,151bp R1; 28bp until poly(T),True,Homo sapiens,GSE163446,GSM4979414,...,SRS7895118,SAMN17106058,,,"Lee et al, Nature Communications, 2021",,https://pubmed.ncbi.nlm.nih.gov/34400627/,Illumina NovaSeq 6000,scRNA sequencing of cells harvested from the t...,scRNA sequencing was performed on cells harves...
111,Lee 2021,BTS_D7_3,Day 7 post Burn/Tenotomy with sham neurectomy ...,tendon,,151bp R1; 28bp until poly(T),True,Homo sapiens,GSE163446,GSM4979415,...,SRS7895119,SAMN17106057,,,"Lee et al, Nature Communications, 2021",,https://pubmed.ncbi.nlm.nih.gov/34400627/,Illumina NovaSeq 6000,scRNA sequencing of cells harvested from the t...,scRNA sequencing was performed on cells harves...
112,Lee 2021,BTN_D7_1,Day 7 post Burn/Tenotomy with neurectomy injur...,tendon,,151bp R1; 28bp until poly(T),True,Homo sapiens,GSE163446,GSM4979416,...,SRS7895120,SAMN17106056,,,"Lee et al, Nature Communications, 2021",,https://pubmed.ncbi.nlm.nih.gov/34400627/,Illumina NovaSeq 6000,scRNA sequencing of cells harvested from the t...,scRNA sequencing was performed on cells harves...
113,Lee 2021,BTN_D7_2,Day 7 post Burn/Tenotomy with neurectomy injur...,tendon,,151bp R1; 28bp until poly(T),True,Homo sapiens,GSE163446,GSM4979417,...,SRS7895121,SAMN17106065,,,"Lee et al, Nature Communications, 2021",,https://pubmed.ncbi.nlm.nih.gov/34400627/,Illumina NovaSeq 6000,scRNA sequencing of cells harvested from the t...,scRNA sequencing was performed on cells harves...


In [25]:
meta.species.value_counts()

Mus musculus                        378
Homo sapiens                        172
Danio rerio                          38
Homo sapiens and Pan troglodytes     22
Rattus norvegicus                     8
Gallus gallus                         7
Macaca fascicularis                   1
Name: species, dtype: int64

In [4]:
# Calculate the number of PCs that contain some proportion (default is 95%) of the variance
def npcs(
  ADATA,
  var_perc=0.95,
  reduction="pca"
):
    import numpy as np
    get_var = lambda i: np.var(ADATA.obsm[reduction][:,i])

    if ADATA.obsm[reduction] is None:
        print(f"Reduction {reduction}, not found!")
        return None
    else:
        var_tmp = [get_var(i) for i in list(range(0,ADATA.obsm[reduction].shape[1]))]
        var_cut = var_perc * np.sum(var_tmp)
        n_pcs = 0
        var_sum = 0
        while var_sum<var_cut and n_pcs<ADATA.obsm[reduction].shape[1]-1:
            var_sum = var_sum + var_tmp[n_pcs]
            n_pcs = n_pcs + 1

        return(n_pcs)

Read in count data & initialize anndata objects. ALso add metadata to each object

In [85]:
# try:
#     scm_list.index
# except:
scm_list = pd.Series(
    index = meta["GSM.accession"],
    data = [""]*meta.shape[0]
)

In [86]:
for i in range(0,meta.shape[0]):
    if os.path.exists(DATADIR+"/"+meta["GSM.accession"][i]+"/STARsolo/Solo.out/GeneFull/filtered/matrix.mtx.gz"):
        print("Sample: " + meta["GSM.accession"][i]) 
#         try:
        scm_list[i] = sc.read_10x_mtx(
                path=DATADIR+"/"+meta["GSM.accession"][i]+"/STARsolo/Solo.out/GeneFull/filtered", 
                var_names='gene_symbols',
                make_unique=True,
                cache=True
            )
          
        scm_list[i].var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

        for j in range(0,meta.shape[1]): #metadata features
            scm_list[i].obs[meta.columns[j]] = meta.iloc[i,j]

        scm_list[i].layers['counts'] = scm_list[i].X # save counts as a layer for future plotting

        print("     Loaded " + str(scm_list[i].shape[0]) + " cells and " + str(scm_list[i].shape[1]) + " genes...")
    else:
        print("Can't find counts for " + str(meta["GSM.accession"][i]) + "...")
#         except:
#             print("Exception with " + meta["sample"][i])

Can't find counts for GSM5293229...
Sample: GSM4743488
     Loaded 4838 cells and 61860 genes...
Sample: GSM4743489
     Loaded 3151 cells and 61860 genes...
Sample: GSM4743490
     Loaded 3047 cells and 61860 genes...
Sample: GSM4743491
     Loaded 2500 cells and 61860 genes...
Sample: GSM4743492
     Loaded 2172 cells and 61860 genes...
Sample: GSM4743493
     Loaded 2642 cells and 61860 genes...
Sample: GSM4743494
     Loaded 2706 cells and 61860 genes...
Sample: GSM4743495
     Loaded 3199 cells and 61860 genes...
Sample: GSM5848681
     Loaded 5498 cells and 61860 genes...
Sample: GSM5848680
     Loaded 5875 cells and 61860 genes...
Sample: GSM5848679
     Loaded 6434 cells and 61860 genes...
Sample: GSM5848678
     Loaded 10062 cells and 61860 genes...
Sample: GSM4272893
     Loaded 2392 cells and 61860 genes...
Sample: GSM4272894
     Loaded 902 cells and 61860 genes...
Sample: GSM4272895
     Loaded 1667 cells and 61860 genes...
Sample: GSM4272896
     Loaded 381 cells and 6186

Add ambient-RNA-scrubbed counts

In [None]:
#TODO

In [83]:
import gc

gc.collect()

272464425

QC filter and preprocess individual datasets

In [87]:
# gene/transcript, mito filters count filters
for i in range(0,meta.shape[0]):
    try:
        print(meta["sample"][i] + ': ' + str(scm_list[i].shape[0]) + " cells and " + str(scm_list[i].shape[1]) +' features...')  

        # Hard filters for feature and UMI counts
        sc.pp.filter_cells(
            scm_list[i],
            min_genes=500
        )
        sc.pp.filter_cells(
            scm_list[i], 
            min_counts=1000
        )
#         sc.pp.filter_cells(
#             scm_list[i], 
#             max_counts=25000
#         )

        # Hard filter for sparsely detected features
#         sc.pp.filter_genes(
#             scm_list[i],
#             min_cells=5
#         ) 

        scm_list[i].var['mito'] = scm_list[i].var_names.str.startswith('MT-')  # annotate the group of mitochondrial genes as 'mt'
        sc.pp.calculate_qc_metrics(
            scm_list[i], 
            qc_vars=['mito'], 
            percent_top=None, 
            log1p=False, 
            inplace=True
        )  

        # QC filter(s)
        scm_list[i] = scm_list[i][scm_list[i].obs.pct_counts_mito < 40, :] 

        print('After filtering: ' + str(scm_list[i].shape[0]) + " cells and " + str(scm_list[i].shape[1]) +' features...')  
    
    except:
            print("Exception with " + meta["sample"][i])
        
    print(" ")

Exception with SJRHB030680_R1
 
1_iPS: 4838 cells and 61860 features...
After filtering: 3051 cells and 61860 features...
 
2_PSM: 3151 cells and 61860 features...
After filtering: 1811 cells and 61860 features...
 
3_SM: 3047 cells and 61860 features...
After filtering: 2233 cells and 61860 features...
 
4_SCL: 2500 cells and 61860 features...
After filtering: 1869 cells and 61860 features...
 
5_SYN_day2: 2172 cells and 61860 features...
After filtering: 1936 cells and 61860 features...
 
6_SYN_day4: 2642 cells and 61860 features...
After filtering: 2484 cells and 61860 features...
 
7_SYN_day6: 2706 cells and 61860 features...
After filtering: 2577 cells and 61860 features...
 
8_SYN_day8: 3199 cells and 61860 features...
After filtering: 2531 cells and 61860 features...
 
RD cell line with LARRY barcodes after washing out Differentiation medium at third time point: 5498 cells and 61860 features...
After filtering: 5274 cells and 61860 features...
 
RD cell line with LARRY barcodes 

Add metadata to each object

## Doublet removal via Scrublet

Estimate doublet scores

In [None]:
sc.settings.set_figure_params(
    fontsize=8
)

for i in range(0,meta.shape[0]):
    try:
        sc.external.pp.scrublet(
            scm_list[i]
        )
        sc.external.pl.scrublet_score_distribution(
            scm_list[i],
            figsize =[6,2.25]
        )
            
    except:
            print("Exception with " + meta["sample"][i])
        
    print(" ")

Exception with SJRHB030680_R1
 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.48
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 0.7%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 9.1%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.41
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 2.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 2.7%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.44
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 3.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 2.7%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.42
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 1.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 11.5%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.42
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 6.6%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 3.1%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.45
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 2.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 5.8%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.45
Detected doublet rate = 0.3%
Estimated detectable doublet fraction = 2.6%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 10.3%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.45
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 2.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 6.7%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.55
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 0.6%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 9.4%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.55
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 0.9%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 8.9%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.56
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 1.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 3.3%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.63
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 20.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.41
Detected doublet rate = 0.7%
Estimated detectable doublet fraction = 10.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 6.9%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.29
Detected doublet rate = 0.6%
Estimated detectable doublet fraction = 27.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 2.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.36
Detected doublet rate = 0.4%
Estimated detectable doublet fraction = 10.8%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 3.9%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.20
Detected doublet rate = 1.2%
Estimated detectable doublet fraction = 26.9%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 4.4%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.32
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 7.7%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.29
Detected doublet rate = 1.5%
Estimated detectable doublet fraction = 31.8%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 4.7%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.43
Detected doublet rate = 0.3%
Estimated detectable doublet fraction = 25.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 1.2%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.53
Detected doublet rate = 0.6%
Estimated detectable doublet fraction = 9.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 6.2%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.20
Detected doublet rate = 4.4%
Estimated detectable doublet fraction = 51.2%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 8.6%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.21
Detected doublet rate = 3.4%
Estimated detectable doublet fraction = 52.5%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 6.5%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.50
Detected doublet rate = 0.3%
Estimated detectable doublet fraction = 3.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 9.8%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.54
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 9.1%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.48
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 12.5%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.40
Detected doublet rate = 0.4%
Estimated detectable doublet fraction = 1.7%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 26.4%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.49
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 2.2%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 11.4%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.46
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 1.6%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 4.7%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.39
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 0.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 33.3%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.39
Detected doublet rate = 0.5%
Estimated detectable doublet fraction = 3.5%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 13.2%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.55
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 1.2%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 4.8%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.52
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 0.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 13.8%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.46
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 0.6%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 29.4%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.51
Detected doublet rate = 0.3%
Estimated detectable doublet fraction = 4.9%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 7.1%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.40
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 0.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 73.7%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.43
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.9%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 5.4%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.41
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 0.2%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 32.3%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.44
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 11.1%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.49
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 32.3%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.60
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.58
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.5%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 3.2%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.58
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 4.1%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.55
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 2.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 4.7%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.62
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.13
Detected doublet rate = 9.2%
Estimated detectable doublet fraction = 15.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 61.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.60
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 1.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 3.5%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.26
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 20.2%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 1.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.18
Detected doublet rate = 1.1%
Estimated detectable doublet fraction = 24.9%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 4.5%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.21
Detected doublet rate = 0.9%
Estimated detectable doublet fraction = 15.7%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 5.5%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.14
Detected doublet rate = 3.4%
Estimated detectable doublet fraction = 5.6%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 60.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.22
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.30
Detected doublet rate = 1.3%
Estimated detectable doublet fraction = 30.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 4.5%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.12
Detected doublet rate = 8.1%
Estimated detectable doublet fraction = 4.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 184.6%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.17
Detected doublet rate = 4.9%
Estimated detectable doublet fraction = 4.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 120.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.14
Detected doublet rate = 1.1%
Estimated detectable doublet fraction = 6.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 18.2%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.37
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 0.5%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 12.8%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.14
Detected doublet rate = 5.0%
Estimated detectable doublet fraction = 5.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 90.9%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.14
Detected doublet rate = 8.0%
Estimated detectable doublet fraction = 6.9%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 116.7%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.60
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 0.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 18.2%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.10
Detected doublet rate = 22.2%
Estimated detectable doublet fraction = 21.7%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 102.3%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.11
Detected doublet rate = 9.3%
Estimated detectable doublet fraction = 12.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 77.8%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.16
Detected doublet rate = 2.0%
Estimated detectable doublet fraction = 5.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 37.5%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.11
Detected doublet rate = 6.9%
Estimated detectable doublet fraction = 20.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 34.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.09
Detected doublet rate = 18.0%
Estimated detectable doublet fraction = 27.5%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 65.3%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.10
Detected doublet rate = 28.7%
Estimated detectable doublet fraction = 17.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 169.2%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.16
Detected doublet rate = 6.3%
Estimated detectable doublet fraction = 17.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 36.4%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.12
Detected doublet rate = 12.8%
Estimated detectable doublet fraction = 6.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 200.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.16
Detected doublet rate = 2.1%
Estimated detectable doublet fraction = 4.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 50.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.14
Detected doublet rate = 4.7%
Estimated detectable doublet fraction = 5.7%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 83.3%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.24
Detected doublet rate = 3.2%
Estimated detectable doublet fraction = 40.8%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 7.8%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.54
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 0.0%


  pl.show()


 


  view_to_actual(adata)


Automatically set threshold at doublet score = 0.41
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 9.5%


Trying to set attribute `.obs` of view, copying.
  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.39
Detected doublet rate = 0.4%
Estimated detectable doublet fraction = 8.8%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 4.5%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.17
Detected doublet rate = 5.9%
Estimated detectable doublet fraction = 50.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 11.8%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.60
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 1.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 10.3%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.48
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 3.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 3.7%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.53
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 1.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 2.2%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.54
Detected doublet rate = 0.3%
Estimated detectable doublet fraction = 4.8%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 6.6%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.54
Detected doublet rate = 0.3%
Estimated detectable doublet fraction = 5.9%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 5.9%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.54
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 4.9%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 3.7%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.59
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 1.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 7.8%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.10
Detected doublet rate = 13.3%
Estimated detectable doublet fraction = 11.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 120.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.56
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 3.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 3.2%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.14
Detected doublet rate = 5.4%
Estimated detectable doublet fraction = 7.6%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 70.6%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.19
Detected doublet rate = 1.6%
Estimated detectable doublet fraction = 4.5%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 35.3%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.14
Detected doublet rate = 5.2%
Estimated detectable doublet fraction = 20.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 25.5%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.12
Detected doublet rate = 11.8%
Estimated detectable doublet fraction = 9.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 125.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.14
Detected doublet rate = 9.6%
Estimated detectable doublet fraction = 13.9%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 68.8%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.55
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 3.5%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 5.1%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.54
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 1.5%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 14.6%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.14
Detected doublet rate = 8.5%
Estimated detectable doublet fraction = 7.4%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 114.3%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.14
Detected doublet rate = 1.1%
Estimated detectable doublet fraction = 9.8%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 11.8%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.14
Detected doublet rate = 6.4%
Estimated detectable doublet fraction = 12.2%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 52.2%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.06
Detected doublet rate = 51.1%
Estimated detectable doublet fraction = 33.0%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 154.8%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.11
Detected doublet rate = 15.3%
Estimated detectable doublet fraction = 5.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 300.0%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.55
Detected doublet rate = 0.2%
Estimated detectable doublet fraction = 3.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 4.8%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.35
Detected doublet rate = 0.1%
Estimated detectable doublet fraction = 0.3%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 17.4%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.68
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.5%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 2.6%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.54
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.1%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 15.4%


  pl.show()


 


  view_to_actual(adata)
Trying to set attribute `.obs` of view, copying.


Automatically set threshold at doublet score = 0.57
Detected doublet rate = 0.0%
Estimated detectable doublet fraction = 0.2%
Overall doublet rate:
	Expected   = 5.0%
	Estimated  = 6.7%


  pl.show()


 


Estimate doublet score cutoff values for each sample

In [None]:
cutoff_threshold = [
    0.63, 0.63, 
    0.59, #D4_200um
    0.52, 
    0.6, #D4_1000um
    0.24, 
    0.2, 0.24,
#     0.58, # D20_600um
    0.63, # D21_200um
    0.18,
    0.18 #D21_1000um
]

In [56]:
scm_list

['',
 AnnData object with n_obs × n_vars = 5234 × 61860
     obs: 'source.label', 'sample', 'description', 'tissue', 'subtissue', 'comments', 'include', 'species', 'GSE.accession', 'GSM.accession', 'chemistry', 'file.format', 'SRR.accession', 'file_link', 'experiment.accession', 'study.accession', 'sample.accession', 'SAMN.accession', 'file_checksum', 'other.accession', 'source', 'manuscript.doi', 'manuscript.pubmed', 'experiment.instrument', 'study.title', 'study.abstract', 'n_genes', 'n_counts', 'n_genes_by_counts', 'total_counts', 'total_counts_mito', 'pct_counts_mito', 'doublet_score', 'predicted_doublet'
     var: 'gene_ids', 'feature_types', 'mito', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'
     uns: 'scrublet'
     layers: 'counts',
 AnnData object with n_obs × n_vars = 5110 × 61860
     obs: 'source.label', 'sample', 'description', 'tissue', 'subtissue', 'comments', 'include', 'species', 'GSE.accession', 'GSM.accession', 'chemistry', 'file.form

In [None]:
print("Final cell & feature counts:\n")
for i in range(0,meta.shape[0]):    
    scm_list[i] = scm_list[i][scm_list[i].obs["doublet_score"] < cutoff_threshold[i],]
    print(meta["sample"][i] + ': ' + str(scm_list[i].shape[0]) + " cells and " + str(scm_list[i].shape[1]) +' features...')  
    print("")

## Merge and save final AnnData object

In [None]:
scm = scm_list[0].concatenate(
    scm_list[1:],
    index_unique=None
#     join="inner"
#     batch_key="sample",
#     batch_categories=meta["sample"]
)

print(scm.shape)

In [9]:
meta['tissue']


0      muscle
1      muscle
2      muscle
3      muscle
4      muscle
        ...  
338    muscle
339    muscle
340    muscle
341    muscle
342    muscle
Name: tissue, Length: 343, dtype: object