### Basic QC for non-human primate samples (Gideon)

#### Objective: Run basic QC for week 10 BAL samples in non-human primates from Gideon et al 2021

- **Developed by**: Mairi McClean
- **Affiliation**: Institute of Computational Biology, Computational Health Centre, Helmholtz Munich
- **v221209**

## Load modules

In [1]:
import anndata
import logging
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import matplotlib.pyplot as plt
import igraph as ig
from matplotlib import colors
from matplotlib import rcParams

In [2]:
sc.settings.verbosity = 3
sc.logging.print_versions()
sc.settings.set_figure_params(dpi = 160, color_map = 'RdPu', dpi_save = 180, vector_friendly = True, format = 'svg')

-----
anndata     0.9.1
scanpy      1.9.3
-----
CoreFoundation      NA
Foundation          NA
PIL                 9.5.0
PyObjCTools         NA
anyio               NA
appnope             0.1.3
asttokens           NA
attr                23.1.0
babel               2.12.1
backcall            0.2.0
certifi             2023.05.07
cffi                1.15.1
charset_normalizer  3.1.0
cloudpickle         2.2.1
colorama            0.4.6
comm                0.1.3
cycler              0.10.0
cython_runtime      NA
cytoolz             0.12.0
dask                2023.5.1
dateutil            2.8.2
debugpy             1.6.7
decorator           5.1.1
defusedxml          0.7.1
executing           1.2.0
fastjsonschema      NA
h5py                3.8.0
idna                3.4
igraph              0.10.4
importlib_resources NA
ipykernel           6.23.1
jedi                0.18.2
jinja2              3.1.2
joblib              1.2.0
json5               NA
jsonschema          4.17.3
jupyter_events      0.6.3
ju

### Read in datasets

- Week 4 data

In [10]:
adata_1 = sc.read_10x_mtx('/Users/mairi.mcclean/example_data_lake/INBOX/tb_cc/raw_data_objects/nhp/gideon2021/week4/')
adata_1

--> This might be very slow. Consider passing `cache=True`, which enables much faster reading from a cache file.


AnnData object with n_obs × n_vars = 10006 × 24820
    var: 'gene_ids'

- Week 10

In [14]:
adata_2 = sc.read_10x_mtx('/Users/mairi.mcclean/example_data_lake/INBOX/tb_cc/raw_data_objects/nhp/gideon2021/week10/')
adata_2

--> This might be very slow. Consider passing `cache=True`, which enables much faster reading from a cache file.


AnnData object with n_obs × n_vars = 109584 × 28155
    var: 'gene_ids'

- Fix .var objects

 - Week 4

In [18]:
adata_1.var

Unnamed: 0,gene_ids
A2ML1,A2ML1
A3GALT2,A3GALT2
A4GALT,A4GALT
AAAS,AAAS
AACS,AACS
...,...
ZYG11A,ZYG11A
ZYG11B,ZYG11B
ZYX,ZYX
ZZEF1,ZZEF1


In [19]:
adata_1.var.rename(columns={'gene_ids': 'gene_name'}, inplace=True)
adata_1.var


Unnamed: 0,gene_name
A2ML1,A2ML1
A3GALT2,A3GALT2
A4GALT,A4GALT
AAAS,AAAS
AACS,AACS
...,...
ZYG11A,ZYG11A
ZYG11B,ZYG11B
ZYX,ZYX
ZZEF1,ZZEF1


In [20]:
adata_1.var = adata_1.var.drop('gene_name', axis=1)

In [21]:
adata_1.var

A2ML1
A3GALT2
A4GALT
AAAS
AACS
...
ZYG11A
ZYG11B
ZYX
ZZEF1
ZZZ3


- Week 10

In [22]:
adata_2.var

Unnamed: 0,gene_ids
A1BG,
A2ML1,0.0
A3GALT2,1.0
A4GALT,2.0
A4GNT,3.0
...,...
LOC107130452,28149.0
LOC107130791,28150.0
SCP2D1,28151.0
TAS2R8,28152.0


In [23]:
adata_2.var.rename(columns={'gene_ids': 'gene_name'}, inplace=True)
adata_2.var


Unnamed: 0,gene_name
A1BG,
A2ML1,0.0
A3GALT2,1.0
A4GALT,2.0
A4GNT,3.0
...,...
LOC107130452,28149.0
LOC107130791,28150.0
SCP2D1,28151.0
TAS2R8,28152.0


In [24]:
adata_2.var = adata_2.var.drop('gene_name', axis=1)

- Concatenate

In [30]:
adata_2.obs

Array1_3817_AAAACCCATATC
Array1_3817_AAAAGTAGTTTA
Array1_3817_AAACATTGACCC
Array1_3817_AAACGCAATCCT
Array1_3817_AAACGTCGTACA
...
Array9_4017_TTTTTGAGGCCT
Array9_4017_TTTTTGGAAACC
Array9_4017_TTTTTGTGCCTC
Array9_4017_TTTTTTAAAGAT
Array9_4017_TTTTTTCACCTT


In [28]:
adata_total = adata_1.concatenate(adata_2, join='inner', batch_key='dataset', batch_categories=['part1', 'part2'])
adata_total


See the tutorial for concat at: https://anndata.readthedocs.io/en/latest/concatenation.html


AttributeError: Can only use .str accessor with string values!

In [24]:
print(adata_3.var.dtypes)

Series([], dtype: object)


In [31]:
print(adata_3.obs.dtypes)

Series([], dtype: object)


In [None]:
adata_total.obs

- Make names unique 
> Done on concatenated object

In [None]:
adata_total.var_names_make_unique()
sample_object = adata_total.copy()
sample_object

In [None]:
# Make sure that all names are unique in new data object. 

# Turns var names into string - always required?
sample_object.var_names = [str(i) for i in sample_object.var_names]
sample_object.var_names_make_unique()

### Top 20 scatterplot

In [None]:
sc.pl.highest_expr_genes(sample_object, n_top=20)

### Inital filtering

- By gene number

In [None]:
sc.pp.filter_cells(sample_object, min_genes = 200)
print(sample_object.n_obs, sample_object.n_vars)

- By cell number 

In [None]:
sc.pp.filter_genes(sample_object, min_cells = 3)
sample_object.shape

### QC metrics

In [None]:
sample_object.var['mt'] = sample_object.var_names.str.startswith('MT')
sample_object.var['ribo'] = sample_object.var_names.str.startswith(("RPS","RPL"))
sample_object.var["hb"] = sample_object.var_names.str.contains(("^HB[^(P)]"))
sample_object.var

In [None]:
sample_object.var['mt'].value_counts()

In [None]:
sample_object.var['ribo'].value_counts()

In [None]:
sample_object.var['hb'].value_counts()

In [None]:
sc.pp.calculate_qc_metrics(sample_object, qc_vars = ['mt', 'ribo', 'hb'], percent_top = None, log1p = False, inplace = True)

In [None]:
sample_object.obs

- QC visualisation

In [None]:
p1 = sns.displot(sample_object.obs["total_counts"], bins=100, kde=False)
# sc.pl.violin(adata, 'total_counts')
p2 = sc.pl.violin(sample_object, "pct_counts_mt")
p3 = sc.pl.scatter(sample_object, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

In [None]:
sc.pl.violin(sample_object, ['n_genes_by_counts', 'total_counts', 'pct_counts_ribo'],
             jitter=0.4, multi_panel=True)

### QC-based filtering

In [None]:
sample_object.obs.head()

In [None]:
sample_object.shape

In [None]:
sample_object.obs["mt_ok"] = sample_object.obs['pct_counts_mt'] <= 20
sample_object.obs.head()

In [None]:
sample_object.obs.mt_ok.value_counts()

In [None]:
print(f"Total number of cells: {sample_object.n_obs}")
sample_object = sample_object[(sample_object.obs.mt_ok)].copy()

print(f"Number of cells after filtering of low quality cells: {sample_object.n_obs}")

In [None]:
p1 = sc.pl.scatter(sample_object, "total_counts", "n_genes_by_counts", color="pct_counts_mt")

### Sample sex covariate

In [None]:
annot = sc.queries.biomart_annotations(
        "mfascicularis",
        ["ensembl_gene_id", "external_gene_name", "start_position", "end_position", "chromosome_name"],
    ).set_index("ensembl_gene_id")


In [None]:
annot.head()

- Y chrom

In [None]:
chrY_genes = sample_object.var_names.intersection(annot.index[annot.chromosome_name == "Y"])
chrY_genes

In [None]:
sample_object.obs['percent_chrY'] = np.sum(
    sample_object[:, chrY_genes].X, axis = 1).A1 / np.sum(sample_object.X, axis = 1).A1 * 100

- X chromo

In [None]:
chrX_genes = sample_object.var_names.intersection(annot.index[annot.chromosome_name == "X"])
chrX_genes

In [None]:
chrX_sum = sample_object[:, chrX_genes].X.sum(axis=1)
total_sum = sample_object.X.sum(axis=1)

# Flatten the arrays and calculate percent_chrY
sample_object.obs['percent_chrX'] = (np.asarray(chrX_sum).flatten() / np.asarray(total_sum).flatten()) * 100
sample_object.obs

In [None]:
# Graph of samples with X chrom
sample_object.obs.percent_chrX.value_counts()

In [None]:
sample_object.obs.shape

In [None]:
sc.set_figure_params(figsize=(25, 7),dpi=100)
sc.pl.violin(sample_object, ["percent_chrX"], jitter = 0.4, groupby = 'part', rotation = 45)

### Cell cycle scores

In [None]:
!if [ ! -f /Users/mairi.mcclean/data/qc_files/mfasc_orthog_cell_cycle_genes.txt ]; then curl -o /Users/mairi.mcclean/data/qc_files/mfasc_orthog_cell_cycle_genes.txt https://raw.githubusercontent.com/Talavera-Lopez-Lab/Mairi-PhD/main/sc_temp_folder/cell_cycle_genes/mfasc_orthog_cell_cycle_genes.txt?token=GHSAT0AAAAAACBVCJYZCXEBCEQZZKWQJC22ZD7AUZA

In [None]:
cell_cycle_genes = [x.strip() for x in open('/Users/mairi.mcclean/data/qc_files/mfasc_orthog_cell_cycle_genes.txt')]
print(len(cell_cycle_genes))

# Split into 2 lists
g2m_genes = cell_cycle_genes[:46]
s_genes = cell_cycle_genes[46:]

cell_cycle_genes = [x for x in cell_cycle_genes if x in sample_object.var_names]
print(len(cell_cycle_genes))

In [None]:
adata_log = anndata.AnnData(X = sample_object.X,  var = sample_object.var, obs = sample_object.obs)
sc.pp.normalize_total(adata_log, target_sum = 1e6, exclude_highly_expressed = True)
sc.pp.log1p(adata_log)

In [None]:
# Set specific figure params

sc.set_figure_params(figsize=(10, 7))
sc.tl.score_genes_cell_cycle(adata_log, s_genes = s_genes, g2m_genes = g2m_genes)
sc.pl.violin(adata_log, ['S_score', 'G2M_score'],
             jitter = 0.4, groupby = 'part', rotation = 45)

In [None]:
sample_object.obs['S_score'] = adata_log.obs['S_score']
sample_object.obs['G2M_score'] = adata_log.obs['G2M_score']
sample_object

### Predict doublets

In [None]:
import scrublet

In [None]:
scrub = scrublet.Scrublet(sample_object.X)

In [None]:
sample_object.obs['doublet_scores'], sample_object.obs['predicted_doublets'] = scrub.scrub_doublets()
scrub.plot_histogram()

In [None]:
sum(sample_object.obs['predicted_doublets'])

In [None]:
# Re-set figure params
sc.settings.set_figure_params(dpi = 160, color_map = 'RdPu', dpi_save = 180, vector_friendly = True, format = 'svg')

# Plot doublet detection results
sc.pl.violin(sample_object, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt','pct_counts_ribo', 'predicted_doublets'],
             jitter = 0.2, groupby = 'part', rotation = 45, multi_panel=False)

### Prepare counts for individual slots

In [None]:
sample_object.raw = sample_object.copy()
sample_object.layers['counts'] = sample_object.X.copy()
sample_object.layers["sqrt_norm"] = np.sqrt(
    sc.pp.normalize_total(sample_object, inplace = False)["X"]
)
sample_object

### Export object

In [None]:
sample_object.write('/Volumes/Lacie/data_lake/Mairi_example/processed_files/abridged_qc/nhp/Gideon2021_scrna_granhomog_mm_221209_qcd.h5ad')