In [1]:
# python libs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scanpy as sc
import liana as li
from liana.method import rank_aggregate
#import decoupler as dc

import session_info

In [42]:
# Setting up R dependencies
import anndata2ri
import rpy2
from rpy2 import robjects
from rpy2.robjects import r
import random

anndata2ri.activate()

%load_ext rpy2.ipython

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython


In [3]:
file = "merged_data.h5ad"
adata = sc.read("../../../data/merged_data.h5ad")
adata

AnnData object with n_obs × n_vars = 87871 × 23767
    obs: 'author_annotation', 'scDblFinder_score', 'scDblFinder_class', 'manual_celltype_annotation', 'batch', 'condition', 'doublet_score', 'predicted_doublet', 'timepoint', 'author_annotation_coarse', 'dataset', 'fibrotic/control', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'low_hierarchy', 'high_hierarchy', 'harmonized_anno', 'coarse_harmonized_anno', 'low_hierarchy_fine', 'high_hierarchy_fine', 'low_hierarchy_coarse', 'high_hierarchy_coarse'
    var: 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches

In [4]:
# data already log1p normalized
adata.X = adata.layers["log1p_norm"]
adatas = dict()
# make sure the format is correct
adata.obs["batch"] = adata.obs["batch"].astype("category")
adata.obs["manual_celltype_annotation"] = adata.obs["manual_celltype_annotation"].astype("category")

In [6]:
adata.obs["condition"] = adata.obs["condition"].replace({"untreated":"untreated1"})
#adata.obs["condition"] = adata.obs["condition"].astype("category")
conditions = list(adata.obs.condition.cat.categories)
conditions

['untreated1', 'asbestos', 'bleomycin']

In [7]:
for cond in conditions[:1]:
    adatas[cond] = adata[adata.obs["condition"]==cond].copy()

    # run liana consensus
    print("Running rank_aggregate on "+file[:-5]+", "+cond)
    rank_aggregate(
        adatas[cond], groupby="manual_celltype_annotation", resource_name = 'mouseconsensus',
        return_all_lrs=True, use_raw=False, verbose=True)
    liana_res = adatas[cond].uns["liana_res"].drop_duplicates(["ligand_complex", "receptor_complex"]).sort_values(["magnitude_rank", "specificity_rank"],)
    liana_res.to_csv("./results/"+file[:-5]+"_"+cond+"_liana.csv")
    adatas[cond].write("../../../data/liana_anndatas/"+file[:-5]+"_"+cond+"_liana.h5ad", compression='gzip')
    fig = li.pl.dotplot(
        adata=adatas[cond],
        colour="magnitude_rank",
        size="specificity_rank",
        inverse_colour=True,  # we inverse sign since we want small p-values to have large sizes
        inverse_size=True,
        # since the rank_aggregate can also be interpreted as a probability distribution
        # we can again filter them according to their specificity significance
        # yet here the interactions are filtered according to
        # how consistently highly-ranked is their specificity across the methods
        filterby="specificity_rank",
        filter_lambda=lambda x: x <= 0.05,
        # again, we can also further order according to magnitude
        orderby="magnitude_rank",
        orderby_ascending=True,  # prioritize those with lowest values
        top_n=20,  # and we want to keep only the top 20 interactions
        figure_size=(46, 26),
        size_range=(1, 6),
        return_fig=True
    )   
    fig.save("./results/"+file[:-5]+"_"+cond+"_liana.png", dpi=500, limitsize=False)

Running rank_aggregate on merged_data, untreated1
Using `.X`!
1050 features of mat are empty, they will be removed.




0.09 of entities in the resource are missing from the data.
Generating ligand-receptor stats for 37855 samples and 22717 features
Assuming that counts were `natural` log-normalized!
Running CellPhoneDB


100%|██████████| 1000/1000 [00:36<00:00, 27.05it/s]


Running Connectome
Running log2FC
Running NATMI
Running SingleCellSignalR
Running CellChat


100%|██████████| 1000/1000 [16:33<00:00,  1.01it/s]


In [8]:
for cond in conditions:
    adatas[cond] = sc.read(f"../../../data/liana_anndatas/merged_data_{cond}_liana.h5ad")
adatas

{'untreated1': AnnData object with n_obs × n_vars = 37855 × 23767
     obs: 'author_annotation', 'scDblFinder_score', 'scDblFinder_class', 'manual_celltype_annotation', 'batch', 'condition', 'doublet_score', 'predicted_doublet', 'timepoint', 'author_annotation_coarse', 'dataset', 'fibrotic/control', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt', 'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'log1p_total_counts_ribo', 'pct_counts_ribo', 'total_counts_hb', 'log1p_total_counts_hb', 'pct_counts_hb', 'low_hierarchy', 'high_hierarchy', 'harmonized_anno', 'coarse_harmonized_anno', 'low_hierarchy_fine', 'high_hierarchy_fine', 'low_hierarchy_coarse', 'high_hierarchy_coarse'
     var: 'mt', 'ribo', 'hb', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_

nichenet

In [9]:
%%R
suppressPackageStartupMessages({
    library(reticulate)
    library(ggplot2)
    library(tidyr)
    library(dplyr)
    library(purrr)
    library(tibble)
})


    an issue that caused a segfault when used with rpy2:
    https://github.com/rstudio/reticulate/pull/1188
    Make sure that you use a version of that package that includes
    the fix.
    

In [10]:
%%R
library("nichenetr", lib="/home/d/danilina/mambaforge/envs/scanpy_r/lib/R/library")

The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
which was just loaded, will retire in October 2023.
Please refer to R-spatial evolution reports for details, especially
https://r-spatial.org/r/2023/05/15/evolution4.html.
It may be desirable to make the sf package available;
package maintainers should consider adding sf to Suggests:.
The sp package is now running under evolution status 2
     (status 2 uses the sf package in place of rgdal)


In [11]:
%%R
# Increase timeout threshold
options(timeout=600)

# Load PK
ligand_target_matrix <- readRDS(url("https://zenodo.org/record/7074291/files/ligand_target_matrix_nsga2r_final_mouse.rds"))
lr_network <- readRDS(url("https://zenodo.org/record/7074291/files/lr_network_mouse_21122021.rds"))

In [25]:
sender_celltypes = ["Aerocytes", "Alveolar_macrophages", "Endothelial", "Transitioning_epithelial", "Fibroblasts", "Interstitial_macrophages", "Epithelial"]
receiver_celltypes = ["Aerocytes", "Alveolar_macrophages", "Endothelial", "Transitioning_epithelial", "Fibroblasts", "Interstitial_macrophages", "Epithelial"]

In [26]:
# Helper function to obtain sufficiently expressed genes
from functools import reduce


def get_expressed_genes(adata, cell_type, expr_prop):
    # calculate proportions
    temp = adata[adata.obs["manual_celltype_annotation"] == cell_type, :]
    a = temp.X.getnnz(axis=0) / temp.X.shape[0]
    stats = (
        pd.DataFrame({"genes": temp.var_names, "props": a})
        .assign(cell_type=cell_type)
        .sort_values("genes")
    )

    # obtain expressed genes
    stats = stats[stats["props"] >= expr_prop]
    expressed_genes = stats["genes"].values

    return expressed_genes

In [32]:
sender_expressed = reduce(
    np.union1d,
    [
        get_expressed_genes(adata, cell_type=cell_type, expr_prop=0.1)
        for cell_type in sender_celltypes
    ],
)

receiver_expressed = reduce(
    np.union1d,
    [
        get_expressed_genes(adata, cell_type=cell_type, expr_prop=0.1)
        for cell_type in receiver_celltypes
    ],
)



In [28]:
%%R -i sender_expressed -i receiver_expressed
# get ligands and receptors in the resource
ligands <- lr_network %>% pull(from) %>% unique()
receptors <- lr_network %>% pull(to) %>% unique()

# only keep the intersect between the resource and the data
expressed_ligands <- intersect(ligands, sender_expressed)
expressed_receptors <- intersect(receptors, receiver_expressed)

# filter the network to only include ligands for which both the ligand and receptor are expressed
potential_ligands <- lr_network %>% 
  filter(from %in% expressed_ligands & to %in% expressed_receptors) %>%
  pull(from) %>% unique()

In [29]:
d = pd.read_csv("../../../data/all_deg/Aerocytes_conditionasbestos-conditionuntreated.csv")
d = d.rename(columns={"Unnamed: 0":"name"})
d["contrast"] = "Aerocytes"
d

Unnamed: 0,name,logFC,logCPM,F,PValue,FDR,PValue_adj,contrast
0,Actr1b,-3.215889,5.195018,1.097403e+01,0.001146,1,1,Aerocytes
1,Mis18a,-5.747447,4.534618,1.113889e+01,0.001618,1,1,Aerocytes
2,Ampd2,-5.634205,4.197896,9.414385e+00,0.002536,1,1,Aerocytes
3,Ffar4,4.123256,4.792299,8.943457e+00,0.003233,1,1,Aerocytes
4,Tsen15,-5.634206,4.509011,8.881292e+00,0.003339,1,1,Aerocytes
...,...,...,...,...,...,...,...,...
14361,St5,0.000000,2.811764,-3.212423e-14,1.000000,1,1,Aerocytes
14362,Sult5a1,0.000000,3.211757,-8.675448e-13,1.000000,1,1,Aerocytes
14363,Tceb1,0.000000,4.715812,-3.546005e-14,1.000000,1,1,Aerocytes
14364,Tceb2,0.000000,6.305754,-1.903411e-14,1.000000,1,1,Aerocytes


In [33]:
deg = dict()
for cond in conditions[1:]:
    ld = []
    for ct in receiver_celltypes:
        d = pd.read_csv(f"../../../data/all_deg/{ct}_condition{cond}-conditionuntreated.csv")
        d = d.rename(columns={"Unnamed: 0":"name"})
        d["contrast"] = ct
        ld.append(d)
    deg[cond] = pd.concat(ld)

In [34]:
deg[cond]

Unnamed: 0,name,logFC,logCPM,F,PValue,FDR,PValue_adj,contrast
0,Inmt,-4.482831,7.948510,4.040107e+01,2.133601e-09,0.000031,0.000031,Aerocytes
1,Clca1,4.610492,3.326693,3.899095e+01,7.178720e-09,0.000052,0.000052,Aerocytes
2,Adamts15,-3.726447,3.939041,3.654612e+01,1.102336e-08,0.000053,0.000053,Aerocytes
3,Mmp3,-5.268465,4.572292,3.355748e+01,4.123833e-08,0.000148,0.000148,Aerocytes
4,C7,-4.724861,4.040843,3.216462e+01,6.916929e-08,0.000199,0.000199,Aerocytes
...,...,...,...,...,...,...,...,...
12146,Arl16,-0.000301,3.978986,1.903514e-07,9.996551e-01,0.999857,0.999857,Epithelial
12147,Ppp1r12c,0.006018,4.127733,5.950672e-04,9.996648e-01,0.999857,0.999857,Epithelial
12148,Itsn1,-0.003058,3.312961,1.092704e-04,9.996929e-01,0.999857,0.999857,Epithelial
12149,Agps,-0.000184,5.047182,8.106095e-08,9.997749e-01,0.999857,0.999857,Epithelial


In [37]:
genesets_oi = dict()
background_genesL = dict()
for cond in conditions[1:]:
    # define background of sufficiently expressed genes
    background_genesL[cond] = deg[cond]["name"].values

    # only keep significant and positive DE genes
    deg[cond] = deg[cond][(deg[cond]["PValue"] <= 0.05) & (deg[cond]["logFC"] > 1)]
    # get geneset of interest
    genesets_oi[cond] = deg[cond]["name"].values

In [44]:
for cond in conditions[1:]:
    # define background of sufficiently expressed genes
    background_genes = deg[cond]["name"].values

    # only keep significant and positive DE genes
    deg[cond] = deg[cond][(deg[cond]["PValue"] <= 0.05) & (deg[cond]["logFC"] > 1)]
    # get geneset of interest
    geneset_oi = deg[cond]["name"].values
    print(geneset_oi)

    robjects.globalenv['geneset_oi'] = geneset_oi
    robjects.globalenv['background_genes'] = background_genes
    robjects.r('''
        ligand_activities <- predict_ligand_activities(geneset = geneset_oi, 
                                                    background_expressed_genes = background_genes,
                                                    ligand_target_matrix = ligand_target_matrix,
                                                    potential_ligands = potential_ligands)

        ligand_activities <- ligand_activities %>% 
        arrange(-aupr) %>% 
        mutate(rank = rank(desc(aupr)))
                ''')
    ligand_activities = robjects.r["ligand_activities"]


R[write to console]: Error in evaluate_target_prediction(setting, ligand_target_matrix, ligands_position) : 
  all genes have same response



['Ffar4' 'Bloc1s5' 'Rnf128' 'Tsacc' 'Pomk' 'Skp2' 'Kcnk6' 'Tha1' 'Nupr1l'
 'Cyb5r4' 'Tyw3' 'Gm4890' 'Txnrd3' 'Clec9a' 'Tchh' 'AV356131' 'E2f2'
 'Dnase1l3' 'Dtx4' 'Rmi2' 'Akap7' 'Ubxn11' 'Hcrtr2' 'Irx2' 'Pbx4' 'Slamf7'
 'Rsph3a' 'BC049715' 'Catip' 'Anpep' 'Ank3' 'Adam23' 'Gm9920' 'Pop1'
 'Armc2' 'Dync2i2' 'Ccnb2' 'Gpatch3' 'Zfp937' 'Cep76' 'Gm45345' 'Rab26os'
 'Asb1' 'Gorasp1' '1700055D18Rik' 'Ears2' 'Lonrf3' 'Dnajc27' 'Fastkd5'
 'Gpnmb' 'Matn2' 'D430001F17Rik' 'Slc25a1' 'Zfp354a' 'Mtfp1' 'Tspan33'
 '2610028H24Rik' 'Amy1' 'Porcn' 'Gm26982' 'Tcp11l1' 'B3gntl1' 'Adgb'
 'Arsb' 'Nap1l3' 'Clnk' 'Barx2' 'Tapt1' '5730414N17Rik' 'Bcas1' 'Gm10419'
 'Ccdc181' 'Shank2' '9330151L19Rik' 'Gpatch2l' 'Rph3al' 'Xcr1' 'Cep126'
 'Gjb2' 'Fsip1' 'Kcnmb4' 'Pisd' 'Cyp39a1' 'Gm15964' 'Zbed4' 'Lrrc10b'
 'Pus10' 'Eya3' 'Efcab7' '4933406B17Rik' 'Cbr2' 'Snx22' 'Tmie' 'Efna5'
 'Igkv3-2' '5730405O15Rik' 'Rgs4' 'Lgi4' 'Anks6' 'Myrf' 'Recql4'
 'E230013L22Rik' 'Gm10605' 'E2f7' 'Tm4sf5' 'Col4a6' 'Bnc2' 'Vwc2'
 'Atp6v0a4

RRuntimeError: Error in evaluate_target_prediction(setting, ligand_target_matrix, ligands_position) : 
  all genes have same response
