## Imports

In [1]:
import pandas as pd
import os 
from celloracle import motif_analysis as ma
import pandas as pd
import celloracle as co
import anndata
import numpy as np
import anndata as ad
from celloracle import motif_analysis as ma
import genomepy



In [2]:
## VIASH START
# Default parameters for running the notebook interactively.
# NOTE(review): under Viash the section between the START/END markers is
# expected to be replaced by the component's real arguments - confirm that
# `temp_dir` is declared in the component config as well.
par = {
  "multiomics_rna": "resources/grn-benchmark/multiomics_rna.h5ad",
  "multiomics_atac": "resources/grn-benchmark/multiomics_atac.h5ad",
  "annotation_file": "resources/grn-benchmark/annotation_file",
  "motif_file": "resources/grn-benchmark/motif_file",
  "prediction": "output/prediction.csv",
  # Directory for intermediate artefacts (base GRN csv, preprocessed AnnData,
  # CellOracle links). Later cells read par['temp_dir'], which was missing.
  "temp_dir": "output",
}
## VIASH END

In [3]:
# Load the RNA and ATAC modalities from the paths configured in `par`.
print('Reading input files', flush=True)
multiomics_rna, multiomics_atac = (
    ad.read_h5ad(par[modality_key])
    for modality_key in ("multiomics_rna", "multiomics_atac")
)

Reading input files


In [None]:
# Build the base GRN: annotate ATAC peaks with TSS information, scan the peak
# sequences for TF motifs and save the resulting peak/TF table as CSV.
genomes_dir = 'output'

# PLEASE make sure reference genome is correct.
# Defined once here so every call below uses the same assembly.
ref_genome = "hg38"

# Rewrite peak names so that ':' and '-' separators become '_' before they
# are handed to CellOracle.
peaks = multiomics_atac.var_names.to_numpy()
peaks = [peak.replace(':','_').replace("-",'_') for peak in peaks]
tss_annotated = ma.get_tss_info(peak_str_list=peaks, ref_genome=ref_genome)
# Re-assemble a "chr_start_end" identifier from the annotated coordinates.
tss_annotated['peak_id'] = tss_annotated['chr'].astype(str)+"_"+tss_annotated['start'].astype(str)+"_"+tss_annotated['end'].astype(str)
peak_gene = tss_annotated


# Install the reference genome into `genomes_dir`, then report whether
# CellOracle can see it.
genomepy.install_genome(name=ref_genome, provider="UCSC", genomes_dir=genomes_dir)

genome_installation = ma.is_genome_installed(ref_genome=ref_genome,
                                             genomes_dir=genomes_dir)
print(ref_genome, "installation: ", genome_installation)

# Instantiate TFinfo object
tfi = ma.TFinfo(peak_data_frame=peak_gene,
                ref_genome=ref_genome,
                genomes_dir=genomes_dir)

tfi.scan(fpr=0.05,
         motifs=None,  # If you enter None, default motifs will be loaded.
         verbose=True)
# Check motif scan results. A bare expression in the middle of a cell is not
# displayed, so print it explicitly.
print(tfi.scanned_df.head())
# Reset filtering
tfi.reset_filtering()

# Do filtering
tfi.filter_motifs_by_score(threshold=10)

# Format post-filtering results.
tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)

# Format and save. The outer quotes are double quotes because reusing the
# same quote character inside an f-string expression is a SyntaxError before
# Python 3.12. The DataFrame index is written as the first CSV column
# (pandas default).
df = tfi.to_dataframe()
df.to_csv(f"{par['temp_dir']}/grn_celloracle_base.csv")

## Annotations

## GRN construction


### Preprocessing scRNA-seq

In [86]:
def preprocess_rna(par):
    """Preprocess the scRNA-seq data and save it for GRN inference.

    Reads the AnnData file at ``par['multiomics_rna']``, keeps a copy of the
    original ``X`` in ``layers['counts']``, selects the 3000 most dispersed
    genes, then normalises, log-transforms and scales the data in order to
    compute PCA, a diffusion map, Louvain clusters, PAGA and a force-directed
    graph layout. Before saving, ``X`` is restored from ``layers['counts']``.

    Parameters
    ----------
    par : dict
        Must provide ``'multiomics_rna'`` (input .h5ad path) and
        ``'temp_dir'`` (directory that receives ``adata.h5ad``).

    Returns
    -------
    None
        The result is written to ``{par['temp_dir']}/adata.h5ad``.
    """
    # `sc` is used throughout this function but is not imported in the
    # notebook's import cell; import it here so a fresh kernel works.
    import scanpy as sc

    adata = ad.read_h5ad(par['multiomics_rna'])
    adata.layers["counts"] = adata.X.copy()
    sc.pp.normalize_per_cell(adata, key_n_counts='n_counts_all')

    filter_result = sc.pp.filter_genes_dispersion(adata.X,
                                                flavor='cell_ranger',
                                                n_top_genes=3000,
                                                log=False)

    # Subset the genes. Take an explicit copy: slicing yields a view, and the
    # steps below modify the object in place.
    adata = adata[:, filter_result.gene_subset].copy()

    # Renormalize after filtering
    sc.pp.normalize_per_cell(adata)

    # Log transformation and scaling
    sc.pp.log1p(adata)
    sc.pp.scale(adata)

    # PCA
    sc.tl.pca(adata, svd_solver='arpack')

    # Diffusion map
    sc.pp.neighbors(adata, n_neighbors=4, n_pcs=20)

    sc.tl.diffmap(adata)
    # Calculate neighbors again based on the diffusion map
    sc.pp.neighbors(adata, n_neighbors=10, use_rep='X_diffmap')

    sc.tl.louvain(adata, resolution=0.8)

    sc.tl.paga(adata, groups='louvain')

    # NOTE(review): the PAGA plot is kept ahead of draw_graph because
    # init_pos='paga' presumably relies on positions produced by this call -
    # confirm against the scanpy documentation before reordering.
    sc.pl.paga(adata)
    sc.tl.draw_graph(adata, init_pos='paga', random_state=123)
    sc.pl.draw_graph(adata, color='louvain', legend_loc='on data')
    # Check data in anndata
    print("Metadata columns :", list(adata.obs.columns))
    print("Dimensional reduction: ", list(adata.obsm.keys()))

    # Put the values saved in layers["counts"] back into X before saving.
    adata.X = adata.layers["counts"].copy()

    adata.write_h5ad(f"{par['temp_dir']}/adata.h5ad")

In [10]:
def infer_grn(par):
    """Infer cell-type specific GRNs with CellOracle and save the links.

    Loads the preprocessed AnnData (``adata.h5ad``) and the base GRN table
    (``grn_celloracle_base.csv``) from ``par['temp_dir']``, builds an
    ``Oracle`` object, runs PCA and KNN imputation, and fits links per
    ``cell_type`` cluster.

    Parameters
    ----------
    par : dict
        Must provide ``'temp_dir'``, the directory holding the two input
        files; it also receives ``links.celloracle.links``.

    Returns
    -------
    None
        The links object is written to
        ``{par['temp_dir']}/links.celloracle.links``.
    """
    # `plt` is used below but is not imported in the notebook's import cell.
    import matplotlib.pyplot as plt

    adata = ad.read_h5ad(f"{par['temp_dir']}/adata.h5ad")
    # The CSV was saved together with its index, so read the first column
    # back as the index instead of as an extra "Unnamed: 0" data column.
    # Double outer quotes: nesting the same quote type inside an f-string is
    # a SyntaxError before Python 3.12.
    base_grn = pd.read_csv(f"{par['temp_dir']}/grn_celloracle_base.csv", index_col=0)

    # Instantiate Oracle object and load the expression data.
    oracle = co.Oracle()
    oracle.import_anndata_as_raw_count(adata=adata,
                                    cluster_column_name="cell_type",
                                    embedding_name="X_draw_graph_fr")
    # Load the TF info dataframe read above (the original passed an
    # undefined name here).
    oracle.import_TF_data(TF_info_matrix=base_grn)

    oracle.perform_PCA()
    cumulative_variance = np.cumsum(oracle.pca.explained_variance_ratio_)
    plt.plot(cumulative_variance[:100])
    # First position where the "gain > 0.002" flag changes between
    # consecutive components; used as the number of PCs.
    elbow_candidates = np.where(np.diff(np.diff(cumulative_variance)>0.002))[0]
    # Fall back to the cap used below when no such position exists instead
    # of failing with an IndexError.
    n_comps = elbow_candidates[0] if elbow_candidates.size else 50
    plt.axvline(n_comps, c="k")
    plt.show()
    print(n_comps)
    n_comps = min(n_comps, 50)

    n_cell = oracle.adata.shape[0]
    print(f"cell number is :{n_cell}")
    # 2.5% of the cells, capped at 50 neighbours.
    k = min([int(0.025*n_cell), 50])
    print(f"Auto-selected k is :{k}")
    oracle.knn_imputation(n_pca_dims=n_comps, k=k, balanced=True, b_sight=k*8,
                        b_maxl=k*4, n_jobs=4)

    links = oracle.get_links(cluster_name_for_GRN_unit="cell_type", alpha=10,
                        verbose_level=10)
    links.to_hdf5(file_path=f"{par['temp_dir']}/links.celloracle.links")

Metadata columns : ['cell_type', 'donor_id', 'n_genes', 'louvain', 'n_counts_all', 'n_counts']
Dimensional reduction:  ['X_diffmap', 'X_draw_graph_fr', 'X_pca', 'X_umap']


### GRN calculation


# Post evaluation

## Peak gene connections


## Refined GRN 

In [4]:
# Optional suffix selecting which saved links file to load.
tag = ''  # '_hvg'

# NOTE(review): `work_dir` is not defined anywhere in the visible notebook,
# and this path differs from the one infer_grn() writes to - confirm both
# before relying on a fresh-kernel run.
links_path = f"{work_dir}/infer/celloracle/grn/links{tag}.celloracle.links"
links_o = co.load_hdf5(links_path)

In [7]:
# Keep only the links whose p-value is below the threshold, one table per
# cluster, then export every filtered table to its own CSV file.
tt = 0.05
links_dict = links_o.links_dict.copy()

links_dict_f = {}
for cluster_name, cluster_links in links_dict.items():
    # The 'agg_type' cluster is stored under the label 'T cells'.
    label = 'T cells' if cluster_name == 'agg_type' else cluster_name
    links_dict_f[label] = cluster_links[cluster_links.p < tt]

for cell_type, grn in links_dict_f.items():
    grn_csv_path = f'{work_dir}/infer/celloracle/grn/grn_{cell_type}.csv'
    grn.to_csv(grn_csv_path)