In [1]:
# python libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import liana as li
from liana.method import rank_aggregate
#import decoupler as dc

import session_info

In [2]:
# Setting up R dependencies
import anndata2ri
import rpy2
from rpy2.robjects import r
import random

anndata2ri.activate()

%load_ext rpy2.ipython

In [3]:
%%R
suppressPackageStartupMessages({
    library(reticulate)
    library(ggplot2)
    library(tidyr)
    library(dplyr)
    library(purrr)
    library(tibble)
})


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

In [4]:
%%R
library("nichenetr", lib="/home/d/danilina/mambaforge/envs/scanpy_r/lib/R/library")

The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
which was just loaded, will retire in October 2023.
Please refer to R-spatial evolution reports for details, especially
https://r-spatial.org/r/2023/05/15/evolution4.html.
It may be desirable to make the sf package available;
package maintainers should consider adding sf to Suggests:.
The sp package is now running under evolution status 2
     (status 2 uses the sf package in place of rgdal)


In [5]:
# figure settings
sc.settings.set_figure_params(dpi=200, frameon=False)
sc.set_figure_params(dpi=200, facecolor="white")
sc.set_figure_params(figsize=(5, 5))

In [3]:
file = "merged_data.h5ad"
adata = sc.read("../../../data/merged_data.h5ad")
adata

AnnData object with n_obs × n_vars = 87871 × 23767
    obs: 'author_annotation', 'scDblFinder_score', 'scDblFinder_class', 'manual_celltype_annotation', 'batch', 'condition', 'doublet_score', 'predicted_doublet', 'timepoint', 'author_annotation_coarse', 'dataset', 'fibrotic/control', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'low_hierarchy', 'high_hierarchy', 'harmonized_anno', 'coarse_harmonized_anno', 'low_hierarchy_fine', 'high_hierarchy_fine', 'low_hierarchy_coarse', 'high_hierarchy_coarse'
    var: 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches

In [4]:
#condition = [x for x in ['bleomycin', 'bleo', 'Bleo', 'asbestos'] if x in list(adata.obs["condition"].cat.categories)][0]
#control = [x for x in ['saline', 'healthy', 'UT', 'control'] if x in list(adata.obs["condition"].cat.categories)][0]

# data already log1p normalized
# make sure the format is correct
adata.obs["batch"] = adata.obs["batch"].astype("category")
adata.obs["manual_celltype_annotation"] = adata.obs["manual_celltype_annotation"].astype("category")

In [14]:
adata.obs.condition

AAACCTGAGGACATTA-1_xie    untreated
AAACCTGCAGTCGTGC-1_xie    untreated
AAACCTGCATGAAGTA-1_xie    untreated
AAACCTGTCTCGCATC-1_xie    untreated
AAACGGGTCCTAGAAC-1_xie    untreated
                            ...    
29291_schiller            untreated
29292_schiller            untreated
29293_schiller            untreated
29294_schiller            untreated
29295_schiller            untreated
Name: condition, Length: 87871, dtype: category
Categories (4, object): ['untreated', 'asbestos', 'bleomycin', 'saline']

In [15]:
adata.obs.condition[adata.obs.condition =="untreated"]

AAACCTGAGGACATTA-1_xie    untreated
AAACCTGCAGTCGTGC-1_xie    untreated
AAACCTGCATGAAGTA-1_xie    untreated
AAACCTGTCTCGCATC-1_xie    untreated
AAACGGGTCCTAGAAC-1_xie    untreated
                            ...    
29291_schiller            untreated
29292_schiller            untreated
29293_schiller            untreated
29294_schiller            untreated
29295_schiller            untreated
Name: condition, Length: 31495, dtype: category
Categories (4, object): ['untreated', 'asbestos', 'bleomycin', 'saline']

In [16]:
adata.obs.condition[adata.obs.condition =="saline"]

AAACCTGAGGAATTAC-1_peyser    saline
AAACCTGCATAACCTG-1_peyser    saline
AAACCTGCATTGGGCC-1_peyser    saline
AAACCTGGTCTCAACA-1_peyser    saline
AAACCTGGTTGACGTT-1_peyser    saline
                              ...  
TTTGGTTAGAATTCCC-1_peyser    saline
TTTGGTTAGTGGAGAA-1_peyser    saline
TTTGTCAGTAGGGTAC-1_peyser    saline
TTTGTCAGTGACGGTA-1_peyser    saline
TTTGTCAGTTCTGAAC-1_peyser    saline
Name: condition, Length: 6360, dtype: category
Categories (4, object): ['untreated', 'asbestos', 'bleomycin', 'saline']

In [17]:
adata.obs.condition[adata.obs.condition =="asbestos"]

SC15_AAACCTGAGACACGAC_misharin    asbestos
SC15_AAACCTGAGCTGAACG_misharin    asbestos
SC15_AAACCTGAGGGTCTCC_misharin    asbestos
SC15_AAACCTGCAATCCAAC_misharin    asbestos
SC15_AAACCTGCATATACGC_misharin    asbestos
                                    ...   
SC15_TTTGTCAGTGATGTGG_misharin    asbestos
SC15_TTTGTCAGTTCGCTAA_misharin    asbestos
SC15_TTTGTCATCAGTCAGT_misharin    asbestos
SC15_TTTGTCATCGCGTAGC_misharin    asbestos
SC15_TTTGTCATCGCTGATA_misharin    asbestos
Name: condition, Length: 7117, dtype: category
Categories (4, object): ['untreated', 'asbestos', 'bleomycin', 'saline']

In [18]:
adata.obs.condition[adata.obs.condition =="bleomycin"]

AAACCTGAGGAGTCTG-4_xie    bleomycin
AAACCTGCAACGCACC-4_xie    bleomycin
AAACCTGGTAGAGTGC-4_xie    bleomycin
AAACCTGGTCCTAGCG-4_xie    bleomycin
AAACCTGGTCGCGAAA-4_xie    bleomycin
                            ...    
28613_schiller            bleomycin
28614_schiller            bleomycin
28615_schiller            bleomycin
28616_schiller            bleomycin
28617_schiller            bleomycin
Name: condition, Length: 42899, dtype: category
Categories (4, object): ['untreated', 'asbestos', 'bleomycin', 'saline']

In [5]:
#adata.obs["condition"] = adata.obs["condition"].replace({"saline":"untreated"})
#adata.obs["condition"] = adata.obs["condition"].astype("category")
conditions = list(adata.obs.condition.cat.categories)
conditions

['untreated', 'asbestos', 'bleomycin', 'saline']

In [7]:
adata.X = adata.layers["log1p_norm"]
adatas = dict()

In [22]:
for cond in conditions[1:]:
    adatas[cond] = adata[adata.obs["condition"]==cond].copy()

    # run liana consensus
    print("Running rank_aggregate on "+file[:-5]+", "+cond)
    rank_aggregate(
        adatas[cond], groupby="manual_celltype_annotation", resource_name = 'mouseconsensus',
        return_all_lrs=True, use_raw=False, verbose=True)
    liana_res = adatas[cond].uns["liana_res"].drop_duplicates(["ligand_complex", "receptor_complex"]).sort_values(["magnitude_rank", "specificity_rank"],)
    liana_res.to_csv("./results/"+file[:-5]+"_"+cond+"_liana.csv")
    adatas[cond].write("../../../data/liana_anndatas/"+file[:-5]+"_"+cond+"_liana.h5ad", compression='gzip')
    fig = li.pl.dotplot(
        adata=adatas[cond],
        colour="magnitude_rank",
        size="specificity_rank",
        inverse_colour=True,  # we inverse sign since we want small p-values to have large sizes
        inverse_size=True,
        # since the rank_aggregate can also be interpreted as a probability distribution
        # we can again filter them according to their specificity significance
        # yet here the interactions are filtered according to
        # how consistently highly-ranked is their specificity across the methods
        filterby="specificity_rank",
        filter_lambda=lambda x: x <= 0.05,
        # again, we can also further order according to magnitude
        orderby="magnitude_rank",
        orderby_ascending=True,  # prioritize those with lowest values
        top_n=20,  # and we want to keep only the top 20 interactions
        figure_size=(46, 26),
        size_range=(1, 6),
        return_fig=True
    )   
    fig.save("./results/"+file[:-5]+"_"+cond+"_liana.png", dpi=500, limitsize=False)

Running rank_aggregate on merged_data, asbestos
Using `.X`!
4917 features of mat are empty, they will be removed.




0.18 of entities in the resource are missing from the data.
Generating ligand-receptor stats for 7117 samples and 18850 features
Assuming that counts were `natural` log-normalized!
Running CellPhoneDB


100%|██████████| 1000/1000 [00:11<00:00, 86.49it/s]


Running Connectome
Running log2FC
Running NATMI
Running SingleCellSignalR
Running CellChat


100%|██████████| 1000/1000 [02:25<00:00,  6.88it/s]


Running rank_aggregate on merged_data, bleomycin
Using `.X`!
417 features of mat are empty, they will be removed.




0.07 of entities in the resource are missing from the data.
Generating ligand-receptor stats for 42899 samples and 23350 features
Assuming that counts were `natural` log-normalized!
Running CellPhoneDB


100%|██████████| 1000/1000 [00:38<00:00, 25.84it/s]


Running Connectome
Running log2FC
Running NATMI
Running SingleCellSignalR
Running CellChat


100%|██████████| 1000/1000 [14:37<00:00,  1.14it/s]


Running rank_aggregate on merged_data, saline
Using `.X`!
5831 features of mat are empty, they will be removed.




0.14 of entities in the resource are missing from the data.
Generating ligand-receptor stats for 6360 samples and 17936 features
Assuming that counts were `natural` log-normalized!
Running CellPhoneDB


100%|██████████| 1000/1000 [00:11<00:00, 89.86it/s]


Running Connectome
Running log2FC
Running NATMI
Running SingleCellSignalR
Running CellChat


100%|██████████| 1000/1000 [01:52<00:00,  8.90it/s]


In [10]:
for cond in conditions[:1]:
    adatas[cond] = adata[adata.obs["condition"]==cond].copy()

    # run liana consensus
    print("Running rank_aggregate on "+file[:-5]+", "+cond)
    rank_aggregate(
        adatas[cond], groupby="manual_celltype_annotation", resource_name = 'mouseconsensus',
        return_all_lrs=True, use_raw=False, verbose=True)
    liana_res = adatas[cond].uns["liana_res"].drop_duplicates(["ligand_complex", "receptor_complex"]).sort_values(["magnitude_rank", "specificity_rank"],)
    liana_res.to_csv("./results/"+file[:-5]+"_"+cond+"_liana.csv")
    adatas[cond].write("../../../data/liana_anndatas/"+file[:-5]+"_"+cond+"_liana.h5ad", compression='gzip')
    fig = li.pl.dotplot(
        adata=adatas[cond],
        colour="magnitude_rank",
        size="specificity_rank",
        inverse_colour=True,  # we inverse sign since we want small p-values to have large sizes
        inverse_size=True,
        # since the rank_aggregate can also be interpreted as a probability distribution
        # we can again filter them according to their specificity significance
        # yet here the interactions are filtered according to
        # how consistently highly-ranked is their specificity across the methods
        filterby="specificity_rank",
        filter_lambda=lambda x: x <= 0.05,
        # again, we can also further order according to magnitude
        orderby="magnitude_rank",
        orderby_ascending=True,  # prioritize those with lowest values
        top_n=20,  # and we want to keep only the top 20 interactions
        figure_size=(46, 26),
        size_range=(1, 6),
        return_fig=True
    )   
    fig.save("./results/"+file[:-5]+"_"+cond+"_liana.png", dpi=500, limitsize=False)

Running rank_aggregate on merged_data, untreated
Using `.X`!
2175 features of mat are empty, they will be removed.




0.11 of entities in the resource are missing from the data.
Generating ligand-receptor stats for 31495 samples and 21592 features
Assuming that counts were `natural` log-normalized!
Running CellPhoneDB


100%|██████████| 1000/1000 [00:30<00:00, 33.11it/s]


Running Connectome
Running log2FC
Running NATMI
Running SingleCellSignalR
Running CellChat


100%|██████████| 1000/1000 [11:48<00:00,  1.41it/s]


In [None]:
adata.obs.condition

AAACCTGAGGACATTA-1_xie    untreated
AAACCTGCAGTCGTGC-1_xie    untreated
AAACCTGCATGAAGTA-1_xie    untreated
AAACCTGTCTCGCATC-1_xie    untreated
AAACGGGTCCTAGAAC-1_xie    untreated
                            ...    
29291_schiller            untreated
29292_schiller            untreated
29293_schiller            untreated
29294_schiller            untreated
29295_schiller            untreated
Name: condition, Length: 87871, dtype: category
Categories (4, object): ['untreated', 'asbestos', 'bleomycin', 'saline']

nichenet

In [None]:
%%R
# Increase timeout threshold
options(timeout=600)

# Load PK
ligand_target_matrix <- readRDS(url("https://zenodo.org/record/7074291/files/ligand_target_matrix_nsga2r_final_mouse.rds"))
lr_network <- readRDS(url("https://zenodo.org/record/7074291/files/lr_network_mouse_21122021.rds"))

UsageError: Cell magic `%%R` not found.


In [None]:
sender_celltypes = ["Aerocytes", "Alveolar_macrophages", "Endothelial", "Transitioning_epitheial", "Fibroblasts", "Interstitial_macrophages", "Epithelial"]
receiver_celltypes = ["Aerocytes", "Alveolar_macrophages", "Endothelial", "Transitioning_epitheial", "Fibroblasts", "Interstitial_macrophages", "Epithelial"]

In [None]:
# Helper function to obtain sufficiently expressed genes
from functools import reduce


def get_expressed_genes(adata, cell_type, expr_prop):
    # calculate proportions
    temp = adata[adata.obs["manual_celltype_annotation"] == cell_type, :]
    a = temp.X.getnnz(axis=0) / temp.X.shape[0]
    stats = (
        pd.DataFrame({"genes": temp.var_names, "props": a})
        .assign(cell_type=cell_type)
        .sort_values("genes")
    )

    # obtain expressed genes
    stats = stats[stats["props"] >= expr_prop]
    expressed_genes = stats["genes"].values

    return expressed_genes

In [None]:
sender_expressed = reduce(
    np.union1d,
    [
        get_expressed_genes(adata, cell_type=cell_type, expr_prop=0.1)
        for cell_type in sender_celltypes
    ],
)
receiver_expressed = reduce(
    np.union1d,
    [
        get_expressed_genes(adata, cell_type=cell_type, expr_prop=0.1)
        for cell_type in receiver_celltypes
    ],
)



In [None]:
%%R -i sender_expressed -i receiver_expressed
# get ligands and receptors in the resource
ligands <- lr_network %>% pull(from) %>% unique()
receptors <- lr_network %>% pull(to) %>% unique()

# only keep the intersect between the resource and the data
expressed_ligands <- intersect(ligands, sender_expressed)
expressed_receptors <- intersect(receptors, receiver_expressed)

# filter the network to only include ligands for which both the ligand and receptor are expressed
potential_ligands <- lr_network %>% 
  filter(from %in% expressed_ligands & to %in% expressed_receptors) %>%
  pull(from) %>% unique()

UsageError: Cell magic `%%R` not found.


In [None]:
deg = # all genes from table

SyntaxError: invalid syntax (1302648512.py, line 1)

In [None]:
# define background of sufficiently expressed genes
background_genes = deg["name"].values

# only keep significant and positive DE genes
deg = deg[(deg["pvals"] <= 0.05) & (deg["logFCs"] > 1)]
# get geneset of interest
geneset_oi = deg["name"].values

In [None]:
%%R -i geneset_oi -i background_genes -o ligand_activities

ligand_activities <- predict_ligand_activities(geneset = geneset_oi, 
                                               background_expressed_genes = background_genes,
                                               ligand_target_matrix = ligand_target_matrix,
                                               potential_ligands = potential_ligands)

ligand_activities <- ligand_activities %>% 
  arrange(-aupr) %>% 
  mutate(rank = rank(desc(aupr)))

# show top10 ligand activities
head(ligand_activities, n=10)

In [None]:
%%R -o vis_ligand_target
top_ligands <- ligand_activities %>%
  top_n(15, aupr) %>% 
  arrange(-aupr) %>%
  pull(test_ligand) %>%
  unique()

# get regulatory potentials
ligand_target_potential <- map(top_ligands,
                               ~get_weighted_ligand_target_links(.x,
                                                                 geneset = geneset_oi,
                                                                 ligand_target_matrix = ligand_target_matrix,
                                                                 n = 500)
                              ) %>%
    bind_rows() %>% 
    drop_na()
    
# prep for visualization
active_ligand_target_links <- 
  prepare_ligand_target_visualization(ligand_target_df = ligand_target_potential, 
                                      ligand_target_matrix = ligand_target_matrix)

# order ligands & targets
order_ligands <- intersect(top_ligands,
                           colnames(active_ligand_target_links)) %>% rev() %>% make.names()
order_targets <- ligand_target_potential$target %>%
  unique() %>% 
  intersect(rownames(active_ligand_target_links)) %>%
  make.names()
rownames(active_ligand_target_links) <- rownames(active_ligand_target_links) %>%
  make.names() # make.names() for heatmap visualization of genes like H2-T23
colnames(active_ligand_target_links) <- colnames(active_ligand_target_links) %>%
  make.names() # make.names() for heatmap visualization of genes like H2-T23

vis_ligand_target <- active_ligand_target_links[order_targets, order_ligands] %>%
  t()
    
# convert to dataframe, and then it's returned to py
vis_ligand_target <- vis_ligand_target %>%
    as.data.frame() %>%
    rownames_to_column("ligand") %>%
    as_tibble()

In [None]:
# convert dot to underscore and set ligand as index
vis_ligand_target["ligand"] = vis_ligand_target["ligand"].replace("\.", "_", regex=True)
vis_ligand_target.set_index("ligand", inplace=True)
# keep only columns where at least one gene has a regulatory potential >= 0.05
vis_ligand_target = vis_ligand_target.loc[
    :, vis_ligand_target[vis_ligand_target >= 0.05].any()
]
vis_ligand_target.head()

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 5))
sns.heatmap(vis_ligand_target, xticklabels=True, ax=ax)
plt.show()