In [3]:
!pip install scglue==0.3.2 pyscenic==0.10.1 numpy==1.23.4 scanpy networkx pyarrow cytoolz scikit-misc


!conda install -c bioconda bedtools -y
!conda install -c pytorch -c nvidia faiss-gpu=1.8.0 -y
!conda install pytorch::faiss-gpu


In [None]:
import anndata as ad
import networkx as nx
import scanpy as sc
import scglue
from matplotlib import rcParams
import os 
import subprocess
import pandas as pd
import numpy as np
# Directory paths used by this notebook.
work_dir, data_dir, input_dir = 'output/', '/resources/', 'input/'

# Make sure the working directory exists; a no-op when it is already there.
os.makedirs(work_dir, exist_ok=True)

: 

In [None]:
# Run parameters: input files, scratch directory, worker count for the
# pyscenic commands, and the prediction output path.
par = dict(
    multiomics_rna="resources/grn-benchmark/multiomics_rna.h5ad",
    multiomics_atac="resources/grn-benchmark/multiomics_atac.h5ad",
    motif_file="resources/grn-benchmark/supp/JASPAR2022-hg38.bed.gz",
    annotation_file="resources/grn-benchmark/supp/gencode.v45.annotation.gtf.gz",
    temp_dir='output/scglue/',
    num_workers=40,
    prediction="output/prediction.csv",
)

: 

In [3]:
# --- Load the GLUE outputs stored in temp_dir -------------------------------
rna = ad.read_h5ad(f"{par['temp_dir']}/rna-emb.h5ad")
atac = ad.read_h5ad(f"{par['temp_dir']}/atac-emb.h5ad")
guidance = nx.read_graphml(f"{par['temp_dir']}/guidance.graphml.gz")

# The pyscenic commands further down are invoked with
# `--gene_attribute name --cell_id_attribute obs_id`, so both attributes must be
# present in the loom file written at the end of this cell.
rna.var["name"] = rna.var_names
atac.var["name"] = atac.var_names
# Provide `obs_id` only when it is not already available, either as an obs
# column or as the name of the obs index (NOTE(review): this assumes write_loom
# exports a named index under that name; confirm for the installed anndata).
if "obs_id" not in rna.obs.columns and rna.obs_names.name != "obs_id":
    rna.obs["obs_id"] = rna.obs_names

# All genes and all peaks take part in the inference (no subsetting here).
genes = rna.var.index
peaks = atac.var.index

# Stack gene and peak names, and their feature embeddings, in the same order.
features = pd.Index(np.concatenate([rna.var_names, atac.var_names]))
feature_embeddings = np.concatenate([rna.varm["X_glue"], atac.varm["X_glue"]])

# Keep only guidance edges whose "type" attribute is "fwd"; copy so the
# skeleton is an independent graph rather than a view on `guidance`.
skeleton = guidance.edge_subgraph(
    e for e, attr in dict(guidance.edges).items()
    if attr["type"] == "fwd"
).copy()

# Score the skeleton edges from the feature embeddings (seeded for repeatability).
reginf = scglue.genomics.regulatory_inference(
    features, feature_embeddings,
    skeleton=skeleton, random_state=0
)

# Retain edges with qval < 0.05 as the gene-peak links.
gene2peak = reginf.edge_subgraph(
    e for e, attr in dict(reginf.edges).items()
    if attr["qval"] < 0.05
)

# --- Export peaks and gene-peak links ----------------------------------------
scglue.genomics.Bed(atac.var).write_bed(f"{par['temp_dir']}/peaks.bed", ncols=3)
scglue.genomics.write_links(
    gene2peak,
    scglue.genomics.Bed(rna.var).strand_specific_start_site(),
    scglue.genomics.Bed(atac.var),
    f"{par['temp_dir']}/gene2peak.links", keep_attrs=["score"]
)

# --- Prepare pyscenic inputs --------------------------------------------------
# Motif hits; source: http://download.gao-lab.org/GLUE/cisreg/JASPAR2022-hg38.bed.gz
motif_bed = scglue.genomics.read_bed(par['motif_file'])
# TFs are the motif names that also occur among the RNA variable names.
tfs = pd.Index(motif_bed["name"]).intersection(rna.var_names)
# Expression matrix (genes plus TFs) and the TF list consumed by `pyscenic grn`.
rna[:, np.union1d(genes, tfs)].write_loom(f"{par['temp_dir']}/rna.loom")
np.savetxt(f"{par['temp_dir']}/tfs.txt", tfs, fmt="%s")

regulatory_inference: 100%|██████████| 95021/95021 [00:00<00:00, 116563.85it/s]


KeyboardInterrupt: 

In [None]:
# !pip install distributed 

Collecting distributed
  Using cached distributed-2024.7.0-py3-none-any.whl.metadata (3.4 kB)
Using cached distributed-2024.7.0-py3-none-any.whl (1.0 MB)
Installing collected packages: distributed
Successfully installed distributed-2024.7.0
[0m

In [4]:

# Build the `pyscenic grn` command: it reads rna.loom and tfs.txt from temp_dir
# and writes the draft network to draft_grn.csv.
command = (
    f"pyscenic grn {par['temp_dir']}/rna.loom {par['temp_dir']}/tfs.txt "
    f"-o {par['temp_dir']}/draft_grn.csv --seed 0 --num_workers {par['num_workers']} "
    "--cell_id_attribute obs_id --gene_attribute name"
)

# capture_output/text are required for result.stdout and result.stderr to hold
# the command's output; without them both are None and the prints below show
# nothing useful. This also matches how the `pyscenic ctx` step is run.
result = subprocess.run(command, shell=True, capture_output=True, text=True)

print("Output:")
print(result.stdout)
print("Error:")
print(result.stderr)

# Report the outcome without raising, so the cell always completes.
if result.returncode == 0:
    print("Command executed successfully")
else:
    print("Command failed with return code", result.returncode)


Output:

Error:
Traceback (most recent call last):
  File "/root/anaconda3/envs/scglue/bin/pyscenic", line 5, in <module>
    from pyscenic.cli.pyscenic import main
  File "/root/anaconda3/envs/scglue/lib/python3.10/site-packages/pyscenic/cli/pyscenic.py", line 19, in <module>
    from arboreto.algo import genie3, grnboost2
  File "/root/anaconda3/envs/scglue/lib/python3.10/site-packages/arboreto/algo.py", line 7, in <module>
    from arboreto.core import create_graph, SGBM_KWARGS, RF_KWARGS, EARLY_STOP_WINDOW_LENGTH
  File "/root/anaconda3/envs/scglue/lib/python3.10/site-packages/arboreto/core.py", line 12, in <module>
    from dask.dataframe import from_delayed
  File "/root/anaconda3/envs/scglue/lib/python3.10/site-packages/dask/dataframe/__init__.py", line 4, in <module>
    from dask.dataframe import backends, dispatch, rolling
  File "/root/anaconda3/envs/scglue/lib/python3.10/site-packages/dask/dataframe/backends.py", line 21, in <module>
    from dask.dataframe.core import Data

In [31]:
!

In [None]:


# Relies on `rna`, `atac`, `genes`, `peaks`, `tfs`, `motif_bed` and `gene2peak`
# from the regulatory-inference cell, and on draft_grn.csv written by the
# `pyscenic grn` step.
print("Generate TF cis-regulatory ranking bridged by ATAC peaks", flush=True)
peak_bed = scglue.genomics.Bed(atac.var.loc[peaks])
# Connect peaks to motif hits (window argument 0), then keep only edges whose
# target is one of the selected TFs.
peak2tf = scglue.genomics.window_graph(peak_bed, motif_bed, 0, right_sorted=True)
peak2tf = peak2tf.edge_subgraph(e for e in peak2tf.edges if e[1] in tfs)

# Gene -> TF ranking that goes through the GLUE-inferred gene-peak links;
# region_lens is each peak's length (chromEnd - chromStart).
gene2tf_rank_glue = scglue.genomics.cis_regulatory_ranking(
    gene2peak, peak2tf, genes, peaks, tfs,
    region_lens=atac.var.loc[peaks, "chromEnd"] - atac.var.loc[peaks, "chromStart"],
    random_state=0)

# Supplementary ranking: strand-specific start sites expanded by 500 on each
# side, connected to motif hits the same way as the peaks above.
flank_bed = scglue.genomics.Bed(rna.var.loc[genes]).strand_specific_start_site().expand(500, 500)
flank2tf = scglue.genomics.window_graph(flank_bed, motif_bed, 0, right_sorted=True)

# Each gene is linked only to itself, so a gene's flank stands in for the gene.
gene2flank = nx.Graph([(g, g) for g in genes])
gene2tf_rank_supp = scglue.genomics.cis_regulatory_ranking(
    gene2flank, flank2tf, genes, genes, tfs,
    n_samples=0
)

### Prune coexpression network using cis-regulatory ranking

# Suffix the ranking columns so the two databases carry distinct track ids;
# the same ids are used as "#motif_id" in the annotation table below.
gene2tf_rank_glue.columns = gene2tf_rank_glue.columns + "_glue"
gene2tf_rank_supp.columns = gene2tf_rank_supp.columns + "_supp"

scglue.genomics.write_scenic_feather(gene2tf_rank_glue, f"{par['temp_dir']}/glue.genes_vs_tracks.rankings.feather")
scglue.genomics.write_scenic_feather(gene2tf_rank_supp, f"{par['temp_dir']}/supp.genes_vs_tracks.rankings.feather")

# Annotation table mapping every suffixed track id back to its TF; the three
# remaining columns are filled with constant placeholder values.
pd.concat([
    pd.DataFrame({
            "#motif_id": tfs + "_glue",
            "gene_name": tfs
        }),
    pd.DataFrame({
        "#motif_id": tfs + "_supp",
        "gene_name": tfs
    })
]).assign(
    motif_similarity_qvalue=0.0,
    orthologous_identity=1.0,
    description="placeholder"
).to_csv(f"{par['temp_dir']}/ctx_annotation.tsv", sep="\t", index=False)

# Construct the command
#TODO: be sure that obs_id is in obs and name is in var
print("Run pyscenic ctx", flush=True)
command = (
    f" pyscenic ctx {par['temp_dir']}/draft_grn.csv {par['temp_dir']}/glue.genes_vs_tracks.rankings.feather "
    f" {par['temp_dir']}/supp.genes_vs_tracks.rankings.feather  --annotations_fname {par['temp_dir']}/ctx_annotation.tsv "
    f" --expression_mtx_fname {par['temp_dir']}/rna.loom --output {par['temp_dir']}/pruned_grn.csv "
    f" --rank_threshold 500 --min_genes 1  --num_workers {par['num_workers']} "
    " --cell_id_attribute obs_id --gene_attribute name"
)

# Capture stdout/stderr as text so they can be echoed into the cell output.
result = subprocess.run(command, shell=True, capture_output=True, text=True)

print("Output:")
print(result.stdout)
print("Error:")
print(result.stderr)

# Report the outcome without raising, so the cell always completes.
if result.returncode == 0:
    print("pyscenic ctx executed successfully")
else:
    print("pyscenic ctx failed with return code", result.returncode)