# Prepare resources

In [1]:
# download list of TFs
# Three transcription-factor name lists are fetched from the pySCENIC GitHub
# repository (resources/ folder on the master branch). `wget -O` writes each
# one to the given path under ../scenic_resource/.
!wget -O ../scenic_resource/hs_hgnc_curated_tfs.txt https://raw.githubusercontent.com/aertslab/pySCENIC/master/resources/hs_hgnc_curated_tfs.txt
!wget -O ../scenic_resource/hs_hgnc_tfs.txt https://raw.githubusercontent.com/aertslab/pySCENIC/master/resources/hs_hgnc_tfs.txt
!wget -O ../scenic_resource/lambert2018.txt https://raw.githubusercontent.com/aertslab/pySCENIC/master/resources/lambert2018.txt

--2024-06-07 09:36:13--  https://raw.githubusercontent.com/aertslab/pySCENIC/master/resources/hs_hgnc_curated_tfs.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8585 (8.4K) [text/plain]
Saving to: ‘../scenic_resource/hs_hgnc_curated_tfs.txt’


2024-06-07 09:36:14 (133 MB/s) - ‘../scenic_resource/hs_hgnc_curated_tfs.txt’ saved [8585/8585]

--2024-06-07 09:36:14--  https://raw.githubusercontent.com/aertslab/pySCENIC/master/resources/hs_hgnc_tfs.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11353 (11K) [text/plain]
Saving 

In [2]:
def read_tf_file(filename):
    """Read a transcription-factor list (one name per line) into a set.

    Surrounding whitespace is stripped from every line and empty lines are
    skipped, so a blank or trailing empty line in the file does not put an
    empty-string "TF" into the result.

    Parameters
    ----------
    filename : str or path-like
        Path to a plain-text file with one TF name per line.

    Returns
    -------
    set of str
        The unique, non-empty TF names found in the file.
    """
    with open(filename, 'r') as tf_file:
        stripped_lines = (line.strip() for line in tf_file)
        # drop lines that are empty after stripping (e.g. a trailing newline)
        tfs = {name for name in stripped_lines if name}

    return tfs


# Load the three downloaded TF lists as sets, in the order
# full HGNC list, Lambert 2018 list, curated HGNC list.
hgnc_tfs, lambert_tfs, hgnc_tfs_curated = map(
    read_tf_file,
    (
        '../scenic_resource/hs_hgnc_tfs.txt',
        '../scenic_resource/lambert2018.txt',
        '../scenic_resource/hs_hgnc_curated_tfs.txt',
    ),
)

# Is the curated list exactly the overlap of the HGNC and Lambert lists?
hgnc_tfs.intersection(lambert_tfs) == hgnc_tfs_curated

True

Since they already provide the curated list, we actually don't need to do all the things described [here](https://github.com/aertslab/pySCENIC/blob/master/notebooks/pySCENIC%20-%20List%20of%20Transcription%20Factors.ipynb). However, the [Nature protocol](https://www.nature.com/articles/s41596-020-0336-2) suggests using the full list found in `hs_hgnc_tfs.txt`, so we just follow that and use it.

In [5]:
# download cistarget databases
# First file: motif-to-TF annotation table (motifs v9, HGNC symbols).
# Second file: gene-based motif ranking database for hg38 (RefSeq r80,
# 10 kb up- and downstream of the TSS) in feather format.
# NOTE(review): the recorded output of this cell reports the annotation table
# being saved as `...tbl.html`, which does not match the -O path given here;
# re-run and confirm the file name that downstream cells expect.
!wget -O ../scenic_resource/motifs-v9-nr.hgnc-m0.001-o0.0.tbl https://resources.aertslab.org/cistarget/motif2tf/motifs-v9-nr.hgnc-m0.001-o0.0.tbl
!wget -O ../scenic_resource/hg38__refseq-r80__10kb_up_and_down_tss.mc9nr.genes_vs_motifs.rankings.feather https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc9nr/gene_based/hg38__refseq-r80__10kb_up_and_down_tss.mc9nr.genes_vs_motifs.rankings.feather

--2024-06-07 09:56:35--  https://resources.aertslab.org/cistarget/motif2tf/motifs-v9-nr.hgnc-m0.001-o0.0.tbl
Resolving resources.aertslab.org (resources.aertslab.org)... 134.58.50.9
Connecting to resources.aertslab.org (resources.aertslab.org)|134.58.50.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 103568514 (99M)
Saving to: ‘../scenic_resource/motifs-v9-nr.hgnc-m0.001-o0.0.tbl.html’


2024-06-07 09:56:37 (61.7 MB/s) - ‘../scenic_resource/motifs-v9-nr.hgnc-m0.001-o0.0.tbl.html’ saved [103568514/103568514]

--2024-06-07 09:56:37--  https://resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc9nr/gene_based/hg38__refseq-r80__10kb_up_and_down_tss.mc9nr.genes_vs_motifs.rankings.feather
Resolving resources.aertslab.org (resources.aertslab.org)... 134.58.50.9
Connecting to resources.aertslab.org (resources.aertslab.org)|134.58.50.9|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1248772066 (1.2G)
Saving to: ‘../scen

# Run SCENIC
This follows the description of the full pipeline [here](https://github.com/aertslab/pySCENIC/blob/master/notebooks/pySCENIC%20-%20Full%20pipeline.ipynb). Beware that you may need to install pyscenic and arboreto from their git sources. At the time of writing, there was a bug in either of their PyPI distributions that was already fixed in the git sources but had not been propagated to PyPI because the package versions had not been bumped.

In [1]:
import anndata as ad

from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

from ctxcore.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.utils import modules_from_adjacencies
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell

In [2]:
# Read the AnnData object from the .h5ad file and display its summary
# (the bare `adata` on the last line is the cell output).
adata = ad.read_h5ad('../data/tregs.tissue.scps.integrated.annotated.h5ad')
adata

AnnData object with n_obs × n_vars = 9773 × 20912
    obs: 'sample_id', 'patient_id', 'status', 'tissue', 'cell_fraction', 'doublet', 'doublet_score', 'nFeature_RNA', 'nCount_RNA', 'percent_mt', 'percent_ribo', 'qc_pass', '_scvi_batch', '_scvi_labels', 'leiden_scvi_0.1', 'coarse_cell_types', 'leiden_scvi_0.4', 'dataset', 'sat1_status', 'clustering', 'sat1_status_majority_vote'
    uns: '_scvi_manager_uuid', '_scvi_uuid', 'leiden', 'log1p', 'neighbors', 'umap'
    obsm: 'X_scvi', 'X_umap'
    layers: 'counts'
    obsp: 'connectivities', 'distances'

In [3]:
# Boolean masks over cells, derived from the obs annotations.
is_disease = adata.obs['status'] != 'normal'
is_sat1hi = adata.obs['sat1_status_majority_vote'] == 'SAT1_hi'
is_sat1lo = adata.obs['sat1_status_majority_vote'] == 'SAT1_lo'

# Subsets the pipeline is run on: every cell, every non-normal cell, and the
# non-normal cells split by their SAT1 majority-vote status.
data = {'all': adata}
data['all_disease'] = adata[is_disease]
data['sat1hi'] = adata[is_disease & is_sat1hi]
data['sat1lo'] = adata[is_disease & is_sat1lo]
data

{'all': AnnData object with n_obs × n_vars = 9773 × 20912
     obs: 'sample_id', 'patient_id', 'status', 'tissue', 'cell_fraction', 'doublet', 'doublet_score', 'nFeature_RNA', 'nCount_RNA', 'percent_mt', 'percent_ribo', 'qc_pass', '_scvi_batch', '_scvi_labels', 'leiden_scvi_0.1', 'coarse_cell_types', 'leiden_scvi_0.4', 'dataset', 'sat1_status', 'clustering', 'sat1_status_majority_vote'
     uns: '_scvi_manager_uuid', '_scvi_uuid', 'leiden', 'log1p', 'neighbors', 'umap'
     obsm: 'X_scvi', 'X_umap'
     layers: 'counts'
     obsp: 'connectivities', 'distances',
 'all_disease': View of AnnData object with n_obs × n_vars = 4982 × 20912
     obs: 'sample_id', 'patient_id', 'status', 'tissue', 'cell_fraction', 'doublet', 'doublet_score', 'nFeature_RNA', 'nCount_RNA', 'percent_mt', 'percent_ribo', 'qc_pass', '_scvi_batch', '_scvi_labels', 'leiden_scvi_0.1', 'coarse_cell_types', 'leiden_scvi_0.4', 'dataset', 'sat1_status', 'clustering', 'sat1_status_majority_vote'
     uns: '_scvi_manager_uu

In [4]:
# load databases
# TF names: one entry per line of the full HGNC list, trailing whitespace removed.
with open('../scenic_resource/hs_hgnc_tfs.txt', 'r') as tf_file:
    tf_names = list(map(str.rstrip, tf_file))

# Motif ranking database: feather file path first, database name second.
cistarget_db = RankingDatabase(
    '../scenic_resource/hg38__refseq-r80__10kb_up_and_down_tss.mc9nr.genes_vs_motifs.rankings.feather',
    'hg38__refseq-r80__10kb_up_and_down_tss.mc9nr',
)
cistarget_db

FeatherRankingDatabase(name="hg38__refseq-r80__10kb_up_and_down_tss.mc9nr")

In [5]:
# this takes quite some time so get comfortable
from distributed import Client, LocalCluster

# Run GRNBoost2 once per subset in `data` on a local dask cluster and collect
# the resulting adjacency tables in `grnboost_results`, keyed like `data`.
#
# The cluster and the client are both managed by context managers so that the
# worker processes are shut down even if one of the runs raises. Closing only
# the client is not enough here because the cluster is created separately and
# handed to the client.
#
# NOTE(review): no seed is passed to grnboost2, so the adjacencies may differ
# between runs -- consider its `seed` argument if reproducibility matters.
grnboost_results = {}
with LocalCluster(
    name = 'grn_call',
    n_workers = 8,
    threads_per_worker = 1
) as cluster, Client(cluster) as client:
    for k, bdata in data.items():
        print(k)
        # raw counts of the subset as a cells x genes DataFrame
        expr_data = bdata.to_df('counts')
        adjacencies = grnboost2(
            expression_data = expr_data,
            tf_names = tf_names,
            client_or_address = client,
            verbose = True
        )
        grnboost_results[k] = adjacencies

# the handles are closed at this point; drop the names as the original cell did
del client, cluster

all
preparing dask client
parsing input
creating dask graph
8 partitions
computing dask graph


This may cause some slowdown.
Consider scattering data ahead of time and using futures.


not shutting down client, client was created externally
finished
all_disease
preparing dask client
parsing input
creating dask graph
8 partitions
computing dask graph


This may cause some slowdown.
Consider scattering data ahead of time and using futures.


not shutting down client, client was created externally
finished
sat1hi
sat1lo


In [6]:
# Write one tab-separated adjacency table per subset.
for k, result in grnboost_results.items():
    adjacency_path = f'../scenic_results/adj_{k}.tsv'
    result.to_csv(adjacency_path, sep='\t')

In [5]:
import pandas as pd


# Re-read previously saved adjacency tables from disk. Only the 'sat1hi' and
# 'sat1lo' subsets are loaded, and the dict built here replaces any
# `grnboost_results` that is already in memory.
grnboost_results = {}
for k in ['sat1hi', 'sat1lo']:
    adjacency_path = f'../scenic_results/adj_{k}.tsv'
    # the first column of the file holds the index written by to_csv
    grnboost_results[k] = pd.read_csv(adjacency_path, sep='\t', index_col=0)

In [7]:
# Derive co-expression modules for every subset that has an adjacency table.
#
# The loop runs over `grnboost_results` rather than over `data`: when the
# adjacencies were reloaded for only some subsets, `data` has more keys than
# `grnboost_results`, and looking up the missing ones raised a KeyError.
modules = {}
for k, adjacencies in grnboost_results.items():
    bdata = data[k]
    inferred_modules = modules_from_adjacencies(
        adjacencies,
        bdata.to_df('counts'),
        # according to pyscenic warning
        rho_mask_dropouts = True
    )
    # wrapping in list is needed because returned value is a generator
    # which is not going to work with the subsequent pruning
    modules[k] = list(inferred_modules)


2024-06-07 17:19:21,373 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [True].

2024-06-07 17:20:49,886 - pyscenic.utils - INFO - Creating modules.

2024-06-07 17:21:41,090 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [True].

2024-06-07 17:22:14,974 - pyscenic.utils - INFO - Creating modules.


In [8]:
# Prune the modules of every subset against the cisTarget ranking database,
# using the motif annotation table and 8 worker processes.
motif_annotations_path = '../scenic_resource/motifs-v9-nr.hgnc-m0.001-o0.0.tbl'

pruned_modules = {}
for k, subset_modules in modules.items():
    pruned_modules[k] = prune2df(
        [cistarget_db],
        subset_modules,
        motif_annotations_path,
        client_or_address='custom_multiprocessing',
        num_workers=8,
    )


2024-06-07 17:22:59,333 - pyscenic.prune - INFO - Using 8 workers.

2024-06-07 17:22:59,333 - pyscenic.prune - INFO - Using 8 workers.

2024-06-07 17:23:06,307 - pyscenic.prune - INFO - Worker hg38__refseq-r80__10kb_up_and_down_tss.mc9nr(2): database loaded in memory.

2024-06-07 17:23:06,307 - pyscenic.prune - INFO - Worker hg38__refseq-r80__10kb_up_and_down_tss.mc9nr(2): database loaded in memory.

2024-06-07 17:23:06,377 - pyscenic.prune - INFO - Worker hg38__refseq-r80__10kb_up_and_down_tss.mc9nr(3): database loaded in memory.

2024-06-07 17:23:06,377 - pyscenic.prune - INFO - Worker hg38__refseq-r80__10kb_up_and_down_tss.mc9nr(3): database loaded in memory.

2024-06-07 17:23:06,471 - pyscenic.prune - INFO - Worker hg38__refseq-r80__10kb_up_and_down_tss.mc9nr(1): database loaded in memory.

2024-06-07 17:23:06,471 - pyscenic.prune - INFO - Worker hg38__refseq-r80__10kb_up_and_down_tss.mc9nr(1): database loaded in memory.

2024-06-07 17:23:07,059 - pyscenic.prune - INFO - Worker hg

In [9]:
# Write the pruning result of each subset as a tab-separated file.
for k, pruned_modules_df in pruned_modules.items():
    pruned_path = f'../scenic_results/pruned_modules_{k}.tsv'
    pruned_modules_df.to_csv(pruned_path, sep='\t')

In [10]:
# Turn each subset's pruning result into regulons, keyed like `pruned_modules`.
pruned_regulons = {}
for k, pruned_modules_df in pruned_modules.items():
    pruned_regulons[k] = df2regulons(pruned_modules_df)

Create regulons from a dataframe of enriched features.
Additional columns saved: []
Create regulons from a dataframe of enriched features.
Additional columns saved: []


In [11]:
# Score regulon activity per cell with AUCell for every subset that has regulons.
#
# The loop runs over `pruned_regulons` rather than over `data`: regulons may
# exist for only some of the subsets in `data`, and looking up a subset
# without regulons raised a KeyError.
auc_mtxs = {}
for k, regulons in pruned_regulons.items():
    bdata = data[k]
    auc_mtx = aucell(
        bdata.to_df('counts'),
        regulons,
        num_workers = 8
    )
    auc_mtxs[k] = auc_mtx

In [12]:
# Write the AUCell matrix of each subset as a tab-separated file.
for k, auc_mtx in auc_mtxs.items():
    auc_path = f'../scenic_results/auc_mtx_{k}.tsv'
    auc_mtx.to_csv(auc_path, sep='\t')

# Downstream analysis of SCENIC results