In [2]:
from pyreadr import read_r


from arboreto.algo import grnboost2
from arboreto.utils import load_tf_names
from pyscenic.utils import modules_from_adjacencies
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell

from ctxcore.rnkdb import FeatherRankingDatabase as RankingDatabase

import pandas as pd
import os

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [5]:
help(aucell)

Help on function aucell in module pyscenic.aucell:

aucell(exp_mtx: pandas.core.frame.DataFrame, signatures: Sequence[Type[ctxcore.genesig.GeneSignature]], auc_threshold: float = 0.05, noweights: bool = False, normalize: bool = False, seed=None, num_workers: int = 8) -> pandas.core.frame.DataFrame
    Calculate enrichment of gene signatures for single cells.
    
    :param exp_mtx: The expression matrix (n_cells x n_genes).
    :param signatures: The gene signatures or regulons.
    :param auc_threshold: The fraction of the ranked genome to take into account for the calculation of the
        Area Under the recovery Curve.
    :param noweights: Should the weights of the genes part of a signature be used in calculation of enrichment?
    :param normalize: Normalize the AUC values to a maximum of 1.0 per regulon.
    :param num_workers: The number of cores to use.
    :return: A dataframe with the AUCs (n_cells x n_modules).



### Prelimanary

Load count matrix and TFs names. \
\
Note: 
<ul>
    <li>count matrix generated by R script import_data.R, from file data/dpn.vnc.domain.labelled.rds </li>
    <li>TF names are copied from https://github.com/aertslab/pySCENIC/blob/master/resources/allTFs_dmel.txt (29.3.2023)</li>
<ul>

In [3]:
ex_matrix = pd.read_csv("../data/expression_mat.csv", index_col=0)  # load count matrix 
tf_names = load_tf_names("../data/allTFs_dmel.txt") # Derive list of Transcription Factors(TF) for Drosophila

In [34]:
ex_matrix.shape

(4672, 9751)

In [35]:
ex_matrix

Unnamed: 0,a,abd-A,Abd-B,Abl,abo,ac,acj6,Acph-1,Act5C,Act42A,...,lncRNA:CR43716,lncRNA:CR44997,asRNA:CR45151,lncRNA:CR45310,lncRNA:CR45425,asRNA:CR45822,asRNA:CR45891,lncRNA:CR45961,lncRNA:CR46032,lncRNA:CR46119
TP1_AACTCAGGTAAATACG,0.0,0.000000,0.0,0.0,0.466624,0.000000,0.0,0.0,2.764117,1.750383,...,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0
TP1_TCACGAATCTATCGCC,0.0,1.053208,0.0,0.0,0.000000,0.483831,0.0,0.0,2.879282,1.053208,...,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0
TP1_TACGGTACAATAGAGT,0.0,0.813928,0.0,0.0,0.000000,0.000000,0.0,0.0,2.458085,1.059571,...,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0
TP1_CAACTAGAGAGACGAA,0.0,0.000000,0.0,0.0,0.000000,0.888313,0.0,0.0,2.728522,2.006819,...,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0
TP1_GCGCCAAAGTCGATAA,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,3.294131,1.989672,...,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TP2.2_TCTTTCCGTACCGTTA,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,3.506908,2.466545,...,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0
TP2.2_AGCCTAATCTGCAAGG,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,3.109142,2.836230,...,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0
TP2.2_GATTCAGCACACGCTG,0.0,2.470005,0.0,0.0,0.000000,1.858055,0.0,0.0,3.119940,1.858055,...,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0
TP2.2_TGAGCATGTGATGTCT,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,1.854410,2.466052,...,0,0.0,0.0,0,0.0,0.0,0.0,0.0,0,0.0


Load ranking databases (for motif enrichment)  --> https://resources.aertslab.org/cistarget/ (https://resources.aertslab.org/cistarget/databases/drosophila_melanogaster/dm6/flybase_r6.02/mc8nr/gene_based/)

In [4]:
db_fnames = "../data/dm6-5kb-upstream-full-tx-11species.mc8nr.genes_vs_motifs.rankings.feather"
dbs = [RankingDatabase(fname=db_fnames, name=os.path.splitext(os.path.basename(db_fnames))[0])]
dbs


[FeatherRankingDatabase(name="dm6-5kb-upstream-full-tx-11species.mc8nr.genes_vs_motifs.rankings")]

Motif annotation file: https://resources.aertslab.org/cistarget/motif2tf/ (v8 matches mc8nr db)

In [5]:
motif_annotation_file = "../data/motifs-v8-nr.flybase-m0.001-o0.0.tbl"

#### Run Pipeline
line by line to get familiar with the outputs, and time the commands. 

In [6]:
# sample matrix:
ex_matrix_sub = ex_matrix.iloc[1:100,1:500]
# run first step
adjacencies = grnboost2(ex_matrix_sub, tf_names, verbose=True)

preparing dask client
parsing input
creating dask graph
4 partitions
computing dask graph
shutting down client and local cluster
finished


In [7]:
adjacencies

Unnamed: 0,TF,target,importance
26,aop,rib,1.326126e+01
465,sala,sca,1.304844e+01
248,Ldh,fz,1.302985e+01
8,Act42A,E(spl)malpha-BFM,1.204088e+01
171,for,eEF1alpha2,1.135674e+01
...,...,...,...
346,E(spl)malpha-BFM,crp,4.892274e-16
207,gro,crc,4.332341e-16
189,bnb,Hsp83,3.798419e-16
350,mago,D1,2.053042e-16


In [13]:
modules = list(modules_from_adjacencies(adjacencies, ex_matrix_sub)) 
modules


2023-04-12 12:22:27,994 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].

2023-04-12 12:22:28,180 - pyscenic.utils - INFO - Creating modules.


[Regulon(name='Regulon for Act42A', gene2weight=frozendict.frozendict({'E(spl)malpha-BFM': 12.040878622934304, 'Fib': 3.4921191869715797, 'Rm62': 2.16528854539237, 'Ras64B': 2.0316420832568785, 'PpV': 1.6612448232237484, 'l(2)37Bb': 1.4003149295614665, 'eEF1alpha2': 1.3190187521746066, 'disco': 0.9516326784685352, 'Arl1': 0.9145825012755547, 'aop': 0.8003132942468826, 'shi': 0.794056009473243, 'chif': 0.6961646261517448, 'Myb': 0.6555934992694421, 'l(1)10Bb': 0.5935451652521422, 'RpII215': 0.5906082529783443, 'H': 0.5673887821246373, 'E(spl)m3-HLH': 0.5416120232285409, 'fj': 0.524925799677731, 'Kr': 0.5106513082070937, 'Pp1alpha-96A': 0.4328984154427686, 'ase': 0.4161566516670784, 'dup': 0.3980519819504499, 'ben': 0.38776966523226053, 'Act42A': 1.0}), gene2occurrence=frozendict.frozendict({}), transcription_factor='Act42A', context=frozenset({'weight>75.0%', 'activating'}), score=0.0, nes=0.0, orthologous_identity=0.0, similarity_qvalue=0.0, annotation=''),
 Regulon(name='Regulon for A

In [14]:
df = prune2df(dbs, modules, motif_annotations_fname=motif_annotation_file) # Prune modules for targets with cis regulatory footprints (RcisTarget)
df

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.



Help on function prune2df in module pyscenic.prune:

prune2df(rnkdbs: Sequence[Type[ctxcore.rnkdb.RankingDatabase]], modules: Sequence[ctxcore.genesig.Regulon], motif_annotations_fname: str, rank_threshold: int = 1500, auc_threshold: float = 0.05, nes_threshold=3.0, motif_similarity_fdr: float = 0.001, orthologuous_identity_threshold: float = 0.0, weighted_recovery=False, client_or_address='dask_multiprocessing', num_workers=None, module_chunksize=100, filter_for_annotation=True) -> pandas.core.frame.DataFrame
    Calculate all regulons for a given sequence of ranking databases and a sequence of co-expression modules.
    The number of regulons derived from the supplied modules is usually much lower. In addition, the targets of the
    retained modules is reduced to only these ones for which a cis-regulatory footprint is present.
    
    :param rnkdbs: The sequence of databases.
    :param modules: The sequence of modules.
    :param motif_annotations_fname: The name of the file that 

In [15]:
regulons = df2regulons(df) # convert data frame to rergulons
regulons

Create regulons from a dataframe of enriched features.
Additional columns saved: []


[Regulon(name='Dfd(+)', gene2weight=frozendict.frozendict({'sm': 0.19055243383041204, 'grk': 0.07508450989774831, 'Cf2': 0.71248754867474, 'pnt': 0.10021463720864625, 'bru1': 0.2427972793443425, 'slou': 0.18313681493589215, 'hyd': 0.1957162159783155, 'g': 0.12260905764422787, 'rst': 0.15763936785210264, 'sd': 0.2669767101132622, 'Dfd': 1.0, 'Appl': 1.1389879395439284}), gene2occurrence=frozendict.frozendict({}), transcription_factor='Dfd', context=frozenset({'taipale__PDX1_DBD_NYAATTARNNNYAATTAN.png', 'activating'}), score=0.1724017313001147, nes=0.0, orthologous_identity=0.0, similarity_qvalue=0.0, annotation=''),
 Regulon(name='Eno(+)', gene2weight=frozendict.frozendict({'sgg': 0.6299259183478101, 'ref(2)P': 0.4742373554195821, 'ed': 0.12438608711692346, 'Eno': 1.0, 'slo': 0.26783304651307377}), gene2occurrence=frozendict.frozendict({}), transcription_factor='Eno', context=frozenset({'cisbp__M6210.png', 'activating'}), score=2.346322847870677, nes=0.0, orthologous_identity=0.0, simil

In [16]:
auc_ntx = aucell(ex_matrix_sub, regulons, num_workers=1)
auc_ntx

100%|██████████| 13/13 [00:00<00:00, 596.74it/s]


Regulon,Dfd(+),Eno(+),Jra(+),Myb(+),aop(+),cg(+),en(+),gsb-n(+),hb(+),mor(+),opa(+),pnt(+),slp1(+)
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
TP1_AAACCTGAGTAACCCT,0.0,0.0,0.000000,0.040138,0.000000,0.0,0.000000,0.0,0.017390,0.031351,0.000000,0.000744,0.010282
TP1_AAAGATGTCTCCCTGA,0.0,0.0,0.000000,0.005352,0.000000,0.0,0.000000,0.0,0.012693,0.009031,0.000000,0.003995,0.008078
TP1_AAAGCAATCTTCCTTC,0.0,0.0,0.000000,0.018731,0.000000,0.0,0.000000,0.0,0.013270,0.033843,0.000000,0.002686,0.008813
TP1_AAATGCCGTGGGTCAA,0.0,0.0,0.000000,0.008028,0.000000,0.0,0.000000,0.0,0.013270,0.022938,0.000000,0.002480,0.005141
TP1_AACTGGTGTCAACATC,0.0,0.0,0.001654,0.032185,0.001769,0.0,0.000000,0.0,0.029625,0.008429,0.000000,0.003288,0.008813
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TP1_TTAGGACCAAACCTAC,0.0,0.0,0.000000,0.026759,0.000000,0.0,0.000710,0.0,0.020146,0.021820,0.137836,0.003582,0.009547
TP1_TTCGAAGGTTAAGGGC,0.0,0.0,0.000000,0.037462,0.000000,0.0,0.001184,0.0,0.012693,0.033437,0.000000,0.003430,0.008078
TP1_TTCGGTCAGTTGAGTA,0.0,0.0,0.000000,0.029434,0.000000,0.0,0.000237,0.0,0.013270,0.015302,0.000000,0.002865,0.012485
TP1_TTGCCGTCACCAGCAC,0.0,0.0,0.000000,0.021407,0.000000,0.0,0.000000,0.0,0.013270,0.026008,0.000000,0.000000,0.011751


In [None]:
# TODO: save intermediate products to file - see tutorial https://github.com/aertslab/pySCENIC/blob/master/notebooks/pySCENIC%20-%20Full%20pipeline.ipynb
n = 1  # TODO: decide on n based on computational resources 
all_results = [None] * n
for i in range(0, n):
    """ phase 1 - GRN inference, generation of co-expression modules """
    adjacencies = grnboost2(ex_matrix, tf_names, verbose=True) # adjacencies table of tf, target and importance weight
    modules = list(modules_from_adjacencies(adjacencies, ex_matrix)) # module generation - candidate regulons from TF-target gene interactions 
    """ phase 2+3 - Regulon prediction """
    df = prune2df(dbs, modules, motif_annotations_fname=motif_annotation_file) # Prune modules for targets with cis regulatory footprints (RcisTarget)
    regulons = df2regulons(df) # convert data frame to rergulons
    auc_ntx = aucell(ex_matrix, regulons, num_workers=1)  # Calculate enrichment of gene signatures for single cells.
    """ phase 4 - cellular enrichment """
    all_results[i] = auc_ntx # save for later 

# TODO - keep only modules that appear in over X percent...
# AUCell returns A dataframe with the AUCs (n_cells x n_modules).

    