# Preprocessing of individual datasets

Import packages... 
- [scanpy](https://scanpy.readthedocs.io/en/stable/index.html)


In [None]:
import scanpy as sc
import scanpy.external as sce
import pandas as pd

import os

In [None]:
# Root directory that holds one alignment-output folder per GSM accession
DATADIR = "/workdir/dwm269/scMuscle2/data/align_out"

Load metadata

In [None]:
# Load the sample sheet and keep only the rows that should be processed
meta = pd.read_csv("../scMuscle2_metadata_v1-0.csv")
meta = meta.loc[meta["include"]]  # remove unwanted metadata
meta = meta[meta["file.format"].isin(["fastq", "bam"])]  # remove samples w/ download issues

meta = meta[meta["tissue"].isin(["muscle", "tendon"])]  # subset by tissue
meta = meta[meta["species"].isin(["Homo sapiens"])]  # subset by species; add "Mus musculus" to the list to keep mouse
# meta = meta.iloc[[8,12,23,34,45],] # subset by row index

# Drop rows without identifiers. pd.read_csv parses empty cells as NaN, and
# NaN != "" evaluates to True, so missing values must be tested explicitly.
meta = meta[meta["GSM.accession"].notna() & (meta["GSM.accession"] != "")]
meta = meta[meta["sample"].notna() & (meta["sample"] != "")]

# Reset row labels to 0..n-1 so later `meta[col][i]` lookups line up with row positions
meta = meta.reset_index(drop=True)
meta

In [None]:
# Number of retained samples per species
meta["species"].value_counts()

In [None]:
# Calculate the number of PCs that contain some proportion (default is 95%) of the variance
def npcs(
  ADATA,
  var_perc=0.95,
  reduction="pca"
):
    """Return how many leading components reach a share of the embedding variance.

    The variance of every column of ``ADATA.obsm[reduction]`` is computed and
    columns are accumulated in order until their summed variance is at least
    ``var_perc`` times the total.

    Parameters
    ----------
    ADATA
        Object exposing a mapping-like ``obsm`` attribute of 2-D arrays.
    var_perc
        Fraction of the total variance that must be covered (default 0.95).
    reduction
        Key of the embedding in ``ADATA.obsm``. If the key is absent, the
        ``"X_"``-prefixed key (scanpy's naming, e.g. ``"X_pca"``) is tried.

    Returns
    -------
    int or None
        Number of components needed, or ``None`` (after printing a message)
        when the embedding cannot be found.
    """
    import numpy as np

    # Test membership first: indexing a missing key raises instead of yielding None.
    key = reduction
    if key not in ADATA.obsm and ("X_" + reduction) in ADATA.obsm:
        key = "X_" + reduction
    if key not in ADATA.obsm or ADATA.obsm[key] is None:
        print(f"Reduction {reduction}, not found!")
        return None

    embedding = ADATA.obsm[key]
    n_components = embedding.shape[1]
    var_tmp = [np.var(embedding[:, i]) for i in range(n_components)]
    var_cut = var_perc * np.sum(var_tmp)

    n_pcs = 0
    var_sum = 0
    # Bound by n_components (not n_components - 1) so that the last component
    # can be counted when it is needed to reach the cutoff.
    while var_sum < var_cut and n_pcs < n_components:
        var_sum = var_sum + var_tmp[n_pcs]
        n_pcs = n_pcs + 1

    return n_pcs

Read in count data and initialize AnnData objects. Also add metadata to each object.

In [None]:
# try:
#     scm_list.index
# except:
# Per-sample container, labelled by GSM accession. Every slot starts as an
# empty-string placeholder and is overwritten once that sample is loaded.
# dtype=object is explicit because the slots end up holding arbitrary Python
# objects rather than strings, so dtype inference from "" must not apply.
scm_list = pd.Series(
    data=[""] * meta.shape[0],
    index=meta["GSM.accession"],
    dtype=object,
)

In [None]:
# Read the STARsolo filtered GeneFull matrix of every sample listed in `meta`
for i in range(meta.shape[0]):
    gsm = meta["GSM.accession"][i]
    mtx_dir = os.path.join(DATADIR, gsm, "STARsolo", "Solo.out", "GeneFull", "filtered")

    # Samples without a matrix keep their placeholder entry in `scm_list`
    if not os.path.exists(os.path.join(mtx_dir, "matrix.mtx.gz")):
        print("Can't find counts for " + str(gsm) + "...")
        continue

    print("Sample: " + gsm)
    adata = sc.read_10x_mtx(
        path=mtx_dir,
        var_names='gene_symbols',
        make_unique=True,
        cache=True
    )
    adata.var_names_make_unique()  # this is unnecessary if using `var_names='gene_ids'` in `sc.read_10x_mtx`

    # Broadcast this sample's metadata row onto every cell
    for j in range(meta.shape[1]):
        adata.obs[meta.columns[j]] = meta.iloc[i, j]

    # Store a copy: assigning X itself would only alias it, so any later
    # in-place change of X would also change the saved counts
    adata.layers['counts'] = adata.X.copy()

    # Write by position: `scm_list` is labelled by accession, and integer keys
    # on a non-integer index are no longer treated as positions by newer pandas
    scm_list.iloc[i] = adata

    print("     Loaded " + str(adata.shape[0]) + " cells and " + str(adata.shape[1]) + " genes...")

Add ambient-RNA-scrubbed counts

In [None]:
#TODO

In [None]:
import gc

# Run a full garbage-collection pass now
gc.collect()

QC filter and preprocess individual datasets

In [None]:
# Per-sample QC: hard gene/UMI count filters, then a mitochondrial-fraction filter
for i in range(meta.shape[0]):
    gsm = meta["GSM.accession"][i]
    try:
        # Read by position: `scm_list` is labelled by accession, not by integer
        adata = scm_list.iloc[i]
        print(gsm + ': ' + str(adata.shape[0]) + " cells and " + str(adata.shape[1]) +' features...')

        # Hard filters for feature and UMI counts
        sc.pp.filter_cells(
            adata,
            min_genes=500
        )
        sc.pp.filter_cells(
            adata,
            min_counts=1000
        )
        # Optional upper bound on UMI counts (disabled):
        #   sc.pp.filter_cells(adata, max_counts=25000)
        # Optional filter for sparsely detected features (disabled):
        #   sc.pp.filter_genes(adata, min_cells=5)

        # Flag mitochondrial genes in var['mito'] via the 'MT-' symbol prefix
        adata.var['mito'] = adata.var_names.str.startswith('MT-')
        sc.pp.calculate_qc_metrics(
            adata,
            qc_vars=['mito'],
            percent_top=None,
            log1p=False,
            inplace=True
        )

        # QC filter(s); .copy() stores a standalone object instead of a view
        adata = adata[adata.obs.pct_counts_mito < 40, :].copy()
        scm_list.iloc[i] = adata

        print('After filtering: ' + str(adata.shape[0]) + " cells and " + str(adata.shape[1]) +' features...')

    except Exception as err:
        # Best-effort per sample (e.g. samples that were never loaded), but
        # report what went wrong instead of hiding it
        print("Exception with " + str(gsm) + ": " + repr(err))

    print(" ")

Add metadata to each object

## Doublet removal via Scrublet

Estimate doublet scores

In [None]:
sc.settings.set_figure_params(
    fontsize=8
)

# Run Scrublet on every sample and plot its doublet-score distribution
for i in range(meta.shape[0]):
    try:
        # Read by position: `scm_list` is labelled by accession, not by integer
        adata = scm_list.iloc[i]
        sc.external.pp.scrublet(
            adata
        )
        sc.external.pl.scrublet_score_distribution(
            adata,
            figsize=[6, 2.25]
        )

    except Exception as err:
        # Keep going with the remaining samples, but report the cause
        print("Exception with " + str(meta["sample"][i]) + ": " + repr(err))

    print(" ")

Estimate doublet score cutoff values for each sample

In [None]:
# Doublet-score cutoff for each sample, listed in the same order as the rows
# of `meta` (entry i is applied to sample i). Trailing labels are the
# author's sample notes.
cutoff_threshold = [
    0.63,
    0.63,
    0.59,  # D4_200um
    0.52,
    0.6,   # D4_1000um
    0.24,
    0.2,
    0.24,
    # 0.58,  # D20_600um (disabled)
    0.63,  # D21_200um
    0.18,
    0.18,  # D21_1000um
]

In [None]:
# Display the per-sample container to inspect its entries
scm_list

In [None]:
print("Final cell & feature counts:\n")

# Fail before touching any sample if some of them would be left without a
# cutoff; otherwise the loop would stop half-way with part of the data filtered
if len(cutoff_threshold) < meta.shape[0]:
    raise ValueError(
        "cutoff_threshold has " + str(len(cutoff_threshold)) + " entries but meta has " + str(meta.shape[0]) + " samples"
    )

# Keep only cells whose doublet score is below the sample-specific cutoff
for i in range(meta.shape[0]):
    # Read/write by position: `scm_list` is labelled by accession, not by integer
    adata = scm_list.iloc[i]
    # .copy() stores a standalone object instead of a view of the unfiltered one
    adata = adata[adata.obs["doublet_score"] < cutoff_threshold[i], :].copy()
    scm_list.iloc[i] = adata
    print(meta["sample"][i] + ': ' + str(adata.shape[0]) + " cells and " + str(adata.shape[1]) +' features...')
    print("")

## Merge and save final AnnData object

In [None]:
# Merge all per-sample objects into one, starting from the first sample.
# Entries are taken by position (`scm_list` is labelled by accession) and the
# remainder is passed as a plain list.
scm = scm_list.iloc[0].concatenate(
    scm_list.iloc[1:].tolist(),
    index_unique=None
#     join="inner"
#     batch_key="sample",
#     batch_categories=meta["sample"]
)

print(scm.shape)

In [None]:
# Display the tissue annotation of every retained sample
meta['tissue']
