# Transcription Factor inference via pySCENIC

### Importing packages and dependencies

In [1]:
import pyscenic
import scanpy as sc
import pandas as pd
import numpy as np
import pickle
from pyscenic.utils import load_motifs
from pyscenic.prune import df2regulons
from pyscenic.cli.utils import load_signatures
from pyscenic.binarization import binarize
from cytoolz import compose
import operator as op
import os

from pyscenic.plotting import plot_binarization, plot_rss
from pyscenic.rss import regulon_specificity_scores
from pyscenic.export import add_scenic_metadata
from IPython.display import HTML, display
from adjustText import adjust_text
from math import ceil, floor
import matplotlib.pyplot as plt
import seaborn as sns

### Code

In [None]:
# Directories
# Every project location is derived from ROOT, so relocating the project only
# requires editing this one line.
ROOT = "/Users/xuanzhao/Documents/R/gradientWeighing"
ROOT_CISTARGET = os.path.join(ROOT, "auxiliary_datasets", "cisTarget_databases")
ROOT_PYSCENIC = os.path.join(ROOT, "pyscenic_5")
# The empty last component keeps the trailing path separator on these two.
ROOT_ANNOTATIONS = os.path.join(ROOT, "gene_ontology_annotations", "")
ROOT_ONTOLOGY = os.path.join(ROOT, "gene_ontology", "")

# Paths to the cisTarget database files (feather format) under ROOT_CISTARGET
db500 = os.path.join(ROOT_CISTARGET, "mm9-500bp-upstream-7species.mc9nr.feather")
db5k = os.path.join(ROOT_CISTARGET, "mm9-tss-centered-5kb-7species.mc9nr.feather")
db10k = os.path.join(ROOT_CISTARGET, "mm9-tss-centered-10kb-7species.mc9nr.feather")

# Command-line code for pySCENIC GRN inference (run outside this notebook; not shown here)

# Command-line code for pySCENIC CTX enrichment (run outside this notebook; not shown here)

# Predicting regulons (TF-targets)
# Both inputs are read from ROOT_PYSCENIC: the expression matrix (loom) and
# the CSV consumed by load_motifs, which df2regulons then converts to regulons.
loom_path = os.path.join(ROOT_PYSCENIC, "exprMat.loom")
motifs_path = os.path.join(ROOT_PYSCENIC, "regulons_s500_nes2.5_min20_500bp.csv")
exprMat = sc.read_loom(loom_path)
df_motifs = load_motifs(motifs_path)
regulons = df2regulons(df_motifs)

# Sorting by score and printing out regulon table
def score(x):
    """Return the ``score`` attribute of *x*; used as the sort key below."""
    return x.score


# Highest score first; ``regulons`` itself is left in its original order.
regulons_sorted = sorted(regulons, key=score, reverse=True)

# Printed columns: position in the sorted list, regulon name, score,
# and the number of entries in the regulon's gene2weight mapping.
for rank, regulon in enumerate(regulons_sorted):
    row = "{:3s} {:10s} {:5f} {}".format(
        str(rank), regulon.name, regulon.score, len(regulon.gene2weight)
    )
    print(row)

# Writing regulon table to csv
# One row per regulon, in the same order as regulons_sorted: the regulon name,
# its score, and the number of entries in its gene2weight mapping.
tf_list = [reg.name for reg in regulons_sorted]
num_list = [len(reg.gene2weight) for reg in regulons_sorted]
nes_list = [reg.score for reg in regulons_sorted]
summary_df = pd.DataFrame({'TF': tf_list, 'NES Score': nes_list, 'NumTargets': num_list})
summary_df.to_csv("/Users/xuanzhao/Desktop/mn_paper/regulon_summary_df.csv")
    
# Top 20 regulons by score. This is a positional cut of the sorted list;
# no NES threshold is applied at this step.
regulons_cutoff = regulons_sorted[:20]
regnames_cutoff = [reg.name for reg in regulons_cutoff]

# Unique genes covered by the top 20 regulons, kept in first-seen order.
# dict.fromkeys de-duplicates in a single pass while preserving insertion
# order, replacing a quadratic "gene not in list" scan.
unique_genes = list(dict.fromkeys(
    gene for reg in regulons_cutoff for gene in reg.gene2weight
))
# Prints the number of unique genes divided by a hard-coded 1000 (a gene
# fraction, not a variance measure).
# NOTE(review): 1000 is presumably the total number of genes in the
# expression matrix — confirm, or derive it from the data.
print(len(unique_genes)/1000)

# Command-line code for pySCENIC aucell quantification (run outside this notebook; not shown here)

# Regulon activity visualization
# The first CSV column becomes the row index of the AUC matrix.
auc_mtx = pd.read_csv(os.path.join(ROOT_PYSCENIC, "regAUC.csv"), index_col=0)

# Removing unknown cells
# Column ``x`` of this table holds the names of cells with unknown cell type.
unknown = pd.read_csv(os.path.join(ROOT_PYSCENIC, "unknown_names.csv"))
# Built once so each membership test below is O(1); previously the list of
# unknown names was rebuilt for every cell in the expression matrix.
unknown_cells = set(unknown.x)
auc_mtx_new = auc_mtx[~auc_mtx.index.isin(unknown.x)]
nonlist = [cell for cell in exprMat.obs.index if cell not in unknown_cells]
exprMat_new = exprMat[nonlist]

# Adding regulon + AUC metadata to expression mtx
metadata = pd.read_csv("/Users/xuanzhao/Documents/R/gradientWeighing/pyscenic_5/metadata_df.csv")
# Keep only rows whose celltype is not "Unknown", renumbered from 0.
celltype_new = metadata.loc[metadata['celltype'] != "Unknown"].reset_index(drop=True)

# Work on a copy so exprMat_new keeps its original obs table.
counts_new = exprMat_new.copy()
# NOTE(review): obs is replaced wholesale here, so this presumably relies on
# the filtered metadata rows matching the cells of exprMat_new in number and
# order — confirm before trusting per-cell annotations.
counts_new.obs = celltype_new.set_index('cellname')
add_scenic_metadata(counts_new, auc_mtx_new, regulons)

# Quantifying regulon activity by z-score of Cell specific regulons - Zscore based
df_obs = counts_new.obs
# Numeric obs columns whose names start with 'Regulon('.
signature_column_names = [
    col for col in df_obs.select_dtypes('number').columns
    if col.startswith('Regulon(')
]
df_scores = df_obs[signature_column_names + ['celltype']]

# Per regulon column: Z = (mean within cell type - mean over all cells) / std over all cells.
celltype_means = df_scores.groupby(by='celltype').mean()
overall_mean = df_obs[signature_column_names].mean()
overall_std = df_obs[signature_column_names].std()
z_wide = (celltype_means - overall_mean) / overall_std

# Reshape from wide (one row per cell type) to long: celltype, regulon, Z.
df_results = z_wide.stack().reset_index().rename(columns={'level_1': 'regulon', 0: 'Z'})
# Drop the 8-character 'Regulon(' prefix and the final character of each name.
df_results['regulon'] = [name[8:-1] for name in df_results.regulon]

