In [3]:
import scanpy as sc
import glob
import os
from functools import reduce
import anndata as ad
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib as mpl
import importlib
from sctools import qc, plot

## Quality control of raw data

In [4]:
# Load the combined dataset and drop the cells that are out of scope for this analysis.
adata = sc.read_h5ad('../data/inflammatory_disease.h5ad')

# One boolean flag per exclusion criterion, all read from the per-cell annotations.
cell_annotations = adata.obs
# NOTE(review): the 'GSM' sample_id prefix presumably identifies the UC samples -- confirm.
is_gsm_sample = cell_annotations.sample_id.str.startswith('GSM')
# Blood-derived cells (pbmcs); both capitalisations occur in the tissue column.
is_blood_lowercase = cell_annotations.tissue.str.startswith('blood')
is_blood_capitalised = cell_annotations.tissue.str.startswith('Blood')
# Atopic dermatitis is annotated as 'atopic eczema' in the status column.
is_atopic_eczema = cell_annotations.status == 'atopic eczema'

keep_cell = (
    ~is_gsm_sample
    & ~is_blood_lowercase
    & ~is_blood_capitalised
    & ~cell_annotations.doublet
    & ~is_atopic_eczema
)
adata = adata[keep_cell].copy()
adata

AnnData object with n_obs × n_vars = 422087 × 33538
    obs: 'sample_id', 'patient_id', 'status', 'tissue', 'cell_fraction', 'doublet', 'doublet_score'

In [20]:
# Show the per-cell annotations of every cell whose status is 'psoriasis'.
is_psoriasis = adata.obs.status == 'psoriasis'
adata.obs.loc[is_psoriasis]

Unnamed: 0,sample_id,patient_id,status,tissue,cell_fraction,doublet,doublet_score
AAACCTGAGACTAGAT-57,SKN8090576,P1,psoriasis,dermis,CD45+,False,1.111986
AAACCTGAGAGACTAT-57,SKN8090576,P1,psoriasis,dermis,CD45+,False,0.000005
AAACCTGAGCGCTCCA-57,SKN8090576,P1,psoriasis,dermis,CD45+,False,0.000003
AAACCTGAGGACCACA-57,SKN8090576,P1,psoriasis,dermis,CD45+,False,0.007664
AAACCTGAGGCTCAGA-57,SKN8090576,P1,psoriasis,dermis,CD45+,False,3.118231
...,...,...,...,...,...,...,...
TTTGTCATCAACCAAC-80,SKN8090607,P3,psoriasis,epidermis,CD45-,False,0.318868
TTTGTCATCAGAAATG-80,SKN8090607,P3,psoriasis,epidermis,CD45-,False,11.435356
TTTGTCATCCCACTTG-80,SKN8090607,P3,psoriasis,epidermis,CD45-,False,0.002657
TTTGTCATCTATCGCC-80,SKN8090607,P3,psoriasis,epidermis,CD45-,False,0.000061


In [8]:
# NOTE(review): presumably computes the per-cell QC metrics on `adata` in place (the
# thresholds defined later key on 'nFeature_RNA', 'percent_mt' and 'percent_ribo') --
# confirm against sctools.qc.
qc.compute_qc_metrics(adata)

In [None]:
# Per-sample QC overview, drawn before any thresholds have been chosen.
fig = plot.qc.plot_qc(adata, sample_id_column='sample_id')

# Fixed width of 20; the height grows by 2.5 for every distinct sample.
n_samples = adata.obs.sample_id.nunique()
fig.set_figwidth(20)
fig.set_figheight(n_samples * 2.5)
fig.tight_layout()

In [None]:
# Default per-cell QC limits; every sample starts out with the same values.
MIN_RNA_FEATURES = 750
MAX_RNA_FEATURES = 6000
MAX_PERCENT_MT = 15
MIN_PERCENT_RIBO = 5
MAX_PERCENT_RIBO = 100

# One threshold dict per sample, so single samples can be overridden afterwards.
# The comprehension runs over the unique sample ids instead of one entry per
# cell, so each dict is built exactly once.
sample_ids = adata.obs.sample_id.unique()
qc_thresholds = {
    sample_id: {
        'nFeature_RNA': (MIN_RNA_FEATURES, MAX_RNA_FEATURES),
        'percent_mt': (0, MAX_PERCENT_MT),
        'percent_ribo': (MIN_PERCENT_RIBO, MAX_PERCENT_RIBO),
    }
    for sample_id in sample_ids
}

# Sanity checks: every sample needs exactly one threshold entry. The message is
# attached to the AssertionError itself so it shows up in the traceback.
assert len(qc_thresholds) == adata.obs.sample_id.nunique(), (
    'qc_threshold does not have the same length as there are unique sample_ids! '
    'Please make sure all sample_ids are in qc_thresholds!'
)

missing_sample_ids = [
    sample_id for sample_id in sample_ids if sample_id not in qc_thresholds
]
assert not missing_sample_ids, (
    f'{missing_sample_ids} are missing from qc_thresholds! Please check your code!'
)

# NOTE(review): apply_qc_thresholds presumably writes the boolean `qc_pass`
# column to adata.obs that is read right below -- confirm against sctools.qc.
qc.apply_qc_thresholds(
    adata,
    'sample_id',
    qc_thresholds
)

# Report how many cells fail the thresholds; nothing is removed in this cell.
ncells = adata.obs.shape[0]
nfiltered = ncells - adata.obs.qc_pass.sum()
print(
    f'{nfiltered} of {ncells} cells would be removed using the current qc thresholds'
)

In [None]:
# Same per-sample QC overview as before, now with the chosen thresholds passed in.
fig = plot.qc.plot_qc(adata, thresholds=qc_thresholds, sample_id_column='sample_id')

# Fixed width of 20; the height grows by 2.5 for every distinct sample.
n_samples = adata.obs.sample_id.nunique()
fig.set_figwidth(20)
fig.set_figheight(n_samples * 2.5)
fig.tight_layout()

In [6]:
# Keep only the cells flagged in adata.obs.qc_pass; the gene axis is left untouched.
passed_qc = adata.obs.qc_pass
adata = adata[passed_qc, :].copy()

In [13]:
# Report how many genes would survive the minimum-expressing-cells filter.
nexpressed_threshold = 100
# NOTE(review): presumably one value per gene, the number of cells expressing it -- confirm.
cells_expressing_gene = qc.get_nexpressed(adata)
ngenes_passed = (cells_expressing_gene >= nexpressed_threshold).sum()
ngenes = adata.n_vars
print(f'{ngenes_passed} of {ngenes} are retained requiring their expression in {nexpressed_threshold} or more cells')

20912 of 33538 are retained requiring their expression in 100 or more cells


In [14]:
# Subset the gene axis to the genes that reach nexpressed_threshold; all cells are kept.
gene_passes_filter = qc.get_nexpressed(adata) >= nexpressed_threshold
adata = adata[:, gene_passes_filter].copy()

In [15]:
# Persist the QC-filtered object to disk.
qcfiltered_path = '../data/skin_inflammatory_disease.qcfiltered.h5ad'
adata.write(qcfiltered_path)