# Integrating GRNs with multi-omics data
Author: Romana T. Pop^1^

1. Centre for Molecular Medicine Norway (NCMM), Faculty of Medicine, University of Oslo, Oslo, Norway

We use MOFA+ to investigate the contribution of GRNs to JDR models and their association with patient survival. We run MOFA+ on all omics data types with no GRN information and compare the results with MOFA+ models that also include network metrics (indegree, outdegree, or both).

In [None]:
# ensure environment is clean
# NOTE: this removes every object from the global environment of the
# current session before the analysis starts.
rm(list = ls())

# install MARMOT
# uncomment if not installed
#library(devtools)
#install_github("rtpop/MARMOT", ref = "v0.0.1")

# load libraries
# The attach order decides which package masks which when function names
# clash, so it is deliberately left as is.
# NOTE(review): later cells call msigdbr() and grid.arrange(), but neither
# msigdbr nor gridExtra is attached here -- confirm they are made available
# some other way, or attach them.
library(MARMOT)
library(tidyverse)
library(reshape2)
library(MOFA2)

In [None]:
# some intermediate files are provided for ease, set this parameter to FALSE
# if you do not wish to use them and wish to compute them again instead
precomputed <- TRUE

# setting working directory; every relative path below is resolved against it
wd <- "../JDRnet"
setwd(wd)

# specify directory for results to be saved
tcga_res <- "TCGA"
gep_res <- "GEPliver"

# directory in which the figures are written. The plotting cells refer to
# `figure_dir`, so it has to exist before they run; it is created if missing.
# NOTE(review): "figures" is a placeholder location -- adjust if needed.
figure_dir <- "figures"
if (!dir.exists(figure_dir)) {
  dir.create(figure_dir, recursive = TRUE)
}

# defining vector of cancer names for which to do the analysis
cancers_tcga <- c("aml", "breast", "colon", "gbm", "kidney", "liver", "lung",
                  "melanoma", "ovarian", "sarcoma")

# defining names for the JDR models that we will run
model <- c("nonet", "indeg", "out", "both")

# define vector of omic names that will be used
omics_tcga <- c("expression", "methylation", "miRNA", "indegree", "outdegree")

In [None]:
# running MOFA models
# For every TCGA cancer type, fit four MOFA+ models that differ only in the
# GRN views they include, and save each one as
# <tcga_res>/MOFA_<cancer>_pca_<model>.Rda (the pattern the later cells load).
for (cancer in cancers_tcga) {
  # brings the list `omics` into the workspace
  # NOTE(review): the positional subsetting below assumes the views are
  # ordered as in `omics_tcga` (indegree 4th, outdegree 5th) -- confirm.
  load(file.path(tcga_res, paste0(cancer, "_omics_pca.Rda")))

  # nonet: the first three views only
  data_nonet <- omics[1:3]
  mofa_nonet <- run_mofa2(data_nonet, n_fct = 5, seed = 13,
                          convergence = "slow", use_basilisk = TRUE)
  save(mofa_nonet,
       file = file.path(tcga_res, paste0("MOFA_", cancer, "_pca_nonet.Rda")))

  # with indeg: everything except the 5th view
  data_indeg <- omics[-5]
  mofa_indeg <- run_mofa2(data_indeg, n_fct = 5, seed = 13,
                          convergence = "slow", use_basilisk = TRUE)
  save(mofa_indeg,
       file = file.path(tcga_res, paste0("MOFA_", cancer, "_pca_indeg.Rda")))

  # with outdeg: everything except the 4th view
  data_out <- omics[-4]
  mofa_out <- run_mofa2(data_out, n_fct = 5, seed = 13,
                        convergence = "slow", use_basilisk = TRUE)
  save(mofa_out,
       file = file.path(tcga_res, paste0("MOFA_", cancer, "_pca_out.Rda")))

  # with both: all views
  # The file name follows the same "_pca_<model>" pattern as the other three
  # models; the downstream cells look for "MOFA_<cancer>_pca_both.Rda".
  data_both <- omics
  mofa_both <- run_mofa2(data_both, n_fct = 5, seed = 13,
                         convergence = "slow", use_basilisk = TRUE)
  save(mofa_both,
       file = file.path(tcga_res, paste0("MOFA_", cancer, "_pca_both.Rda")))
}

We now perform univariate Cox regression for each factor to determine its association with patient survival. We compare the association of the factors with patient survival between the models without GRNs and the models with GRNs, reproducing Figure 3 from the paper.

In [None]:
# Define model types
model_types <- c("nonet", "indeg", "out", "both")

# Read an .Rda file and return the first object stored in it. The file is
# loaded into a private environment, so an object with the same name that an
# earlier cell left in the workspace can never be returned by mistake.
load_rda_object <- function(file) {
  env <- new.env()
  obj_names <- load(file, envir = env)
  get(obj_names[1], envir = env)
}

# Process each cancer: load its models (one per model type, in the order of
# `model_types`), run univariate Cox regression on the factors of every
# model, and compare the models.
new_surv_df <- lapply(cancers_tcga, function(cancer) {
  message("Processing: ", cancer)

  # File paths of this cancer's MOFA models and survival data
  mofa_files <- file.path(
    tcga_res, paste0("MOFA_", cancer, "_pca_", model_types, ".Rda")
  )
  surv_file <- file.path(tcga_res, paste0(cancer, "_surv.Rda"))

  # Load MOFA models and survival data
  models <- lapply(mofa_files, load_rda_object)
  load(surv_file) # loads into 'surv' variable

  # Getting factors
  factors <- lapply(models, function(model) get_factors(model)[[1]])

  # Perform survival associations
  cox_models <- lapply(factors, surv_association, surv = surv,
                       univariate = TRUE)

  # Compare survival models
  df <- surv_compare(models = cox_models, model_labels = model_types,
                     univariate = TRUE, method = "BH")
  df$cancer <- cancer

  # Save individual cox models
  cox_all <- setNames(cox_models, model_types)
  save(cox_all,
       file = file.path(tcga_res, paste0(cancer, "_cox_models_PCA.Rda")))

  df
})

# Combine all results into a single data frame
surv_df <- do.call(rbind, new_surv_df)

# Save the combined survival data frame
save(surv_df, file = file.path(tcga_res, "MOFA_surv_df_all_PCA.Rda"))

In [None]:
# plotting
# One dot plot per GRN-containing model, each compared against the model
# without GRNs.
if (!exists("surv_df")) {
  load(file.path(tcga_res, "MOFA_surv_df_all_PCA.Rda"))
}

# get models to compare
model_comp <- setdiff(model, "nonet")

# set colours
# palette("Dark2") only *sets* the palette and returns the one that was
# active before the call, so the Dark2 colours are requested explicitly.
cols <- unname(palette.colors(palette = "Dark2"))

# NOTE(review): `figure_dir` must be defined before this cell is run.
for (i in model_comp) {
  models_to_compare <- c("nonet", i)
  p <- surv_compare_dotplot(surv_df = surv_df,
                            models_to_compare = models_to_compare,
                            colours = c(cols[8], "grey", cols[1]))

  ggsave(
    filename = file.path(
      figure_dir, paste0("surv_compare_", models_to_compare[2], "_tcga.pdf")
    ),
    plot = p
  )
}

# Investigating survival associated factors in liver cancer
We explore the survival-associated factors identified when including GRN features for the TCGA liver cancer samples. We evaluate which feature types were explained by the survival associated factors and we explore the correlation between the factors without GRNs and those with GRNs to assess whether the GRNs capture different heterogeneity than the other omics, or if they boost a preexisting signal. The code in this section reproduces Figure 4 from the manuscript.

In [None]:
# Plotting heatmap of the -log10(FDR) of the survival association of the factors (Figure 4A)
load(file.path(tcga_res, "MOFA_surv_df_all_PCA.Rda"))

# palette("Dark2") returns the previously active palette rather than Dark2,
# so the colours are requested explicitly.
cols <- unname(palette.colors(palette = "Dark2"))

# keep the liver results only
surv_can <- surv_df %>%
  filter(cancer == "liver")

# NOTE(review): .plot_heatmap() is dot-prefixed and is not defined in this
# notebook -- confirm it is reachable from the attached packages.
p <- .plot_heatmap(surv_can)
p <- p +
  scale_fill_gradient(low = "white", high = cols[1],
                      limits = c(0, 3),
                      breaks = seq(0, 3, by = 1)) +
  labs(x = "Model", y = NULL, fill = expression("-log"[10] * "FDR"))

# NOTE(review): `figure_dir` must be defined before this cell is run.
ggsave(filename = file.path(figure_dir, "liver_fct_sig_heat_tcga.pdf"),
       plot = p)


# Increased regulation of metabolic pathways is associated with liver cancer survival
To determine the contribution of the GRN features, we looked at what drives these SAFs in the indegree and outdegree spaces. We first performed GSEA on the indegree weights of each SAF in both the TCGA and GEP datasets, using the MSigDB Hallmark and KEGG gene sets. The code below reproduces Figures 5 and S5 from the manuscript. Before we can perform GSEA, we must map the original omics features to the MOFA factors.

In [None]:
# Pair every view's MOFA factor weights with the PCA results of the same
# position and pass each pair to map_wts().
map_view_wts <- function(fct_wts, pca_res) {
  Map(
    function(x, pca_wts) map_wts(fct_weights = x, pca_weights = pca_wts),
    fct_wts,
    pca_res
  )
}

# PCA results of the liver omics, one object per dataset
tcga_pca_file <- file.path(tcga_res, "liver_omics_pca_results.Rda")
gep_pca_file <- file.path(gep_res, "liver_omics_pca_results.Rda")
pca_tcga <- get(load(tcga_pca_file))
pca_gep <- get(load(gep_pca_file))

# MOFA models fitted on all views
tcga_mofa_file <- file.path(tcga_res, "MOFA_liver_pca_both.Rda")
gep_mofa_file <- file.path(gep_res, "MOFA_liver_pca_both.Rda")
mofa_tcga <- get(load(tcga_mofa_file))
mofa_gep <- get(load(gep_mofa_file))

# factor weights of each model
wts_tcga <- get_weights(mofa_tcga)
wts_gep <- get_weights(mofa_gep)

# weights mapped with map_wts(), first for TCGA, then for GEP
mapped_wts_tcga <- map_view_wts(wts_tcga, pca_tcga)
mapped_wts_gep <- map_view_wts(wts_gep, pca_gep)

# write the mapped weights next to the other results of each dataset
save(mapped_wts_tcga, file = file.path(tcga_res, "liver_mapped_feat_wts.Rda"))
save(mapped_wts_gep, file = file.path(gep_res, "liver_mapped_feat_wts.Rda"))

We can now perform GSEA using the Hallmark and KEGG gene sets from MSigDB. We show below the code used to retrieve the gene sets, but we recommend using the file provided on Zenodo for full reproducibility, as the data in the MSigDB repository might have changed.

In [None]:
# get pathways from MSigDb with msigdbr
# Only runs when the precomputed gene-set file is not used.
if (!precomputed) {
  # Retrieve one MSigDB collection and return it as a named list with one
  # character vector of human gene symbols per gene set. Gene-set names are
  # made lowercase and `prefix` is removed from their start.
  # `...` is forwarded to msigdbr::msigdbr() (category / subcategory).
  # msigdbr is called with its namespace because the package is not attached
  # by the library() calls of this notebook.
  get_gene_sets <- function(prefix, ...) {
    sets <- msigdbr::msigdbr(species = "human", ...)
    gs_name <- sub(paste0("^", prefix), "", tolower(sets$gs_name))
    split(sets$human_gene_symbol, gs_name)
  }

  # hallmark
  hallmark <- get_gene_sets("hallmark_", category = "H")

  # kegg
  kegg <- get_gene_sets("kegg_", category = "C2", subcategory = "CP:KEGG")

  save(hallmark, kegg, file = file.path(tcga_res, "hall_kegg_path.Rda"))
}

In [None]:
# load data: mapped indegree weights of each dataset
gep <- as.data.frame(get(load(file.path(gep_res, "liver_mapped_feat_wts.Rda")))[["indegree"]])
tcga <- as.data.frame(get(load(file.path(tcga_res, "liver_mapped_feat_wts.Rda")))[["indegree"]])

# load survival
surv_tcga <- get(load(file.path(tcga_res, "MOFA_surv_df_all_PCA.Rda")))
surv_gep <- get(load(file.path(gep_res, "MOFA_surv_df_all_PCA.Rda")))

# load gene sets (provides `hallmark` and `kegg`)
load(file.path(tcga_res, "hall_kegg_path.Rda"))

# filter to sig fct
sig_tcga <- surv_tcga %>% filter(cancer == "liver", label == "both", padj <= 0.05)
sig_gep <- surv_gep %>% filter(cancer == "liver", label == "both", padj <= 0.05)

# Names of the significant factors. as.character() guards against the column
# being stored as a factor, which would otherwise index by integer code.
sig_fct_tcga <- as.character(sig_tcga$factor)
sig_fct_gep <- as.character(sig_gep$factor)

# Turn the data into named lists with one single-column data frame per
# significant factor. drop = FALSE keeps the data-frame shape (and the row
# names) even when only one factor is significant.
tcga <- setNames(
  lapply(sig_fct_tcga, function(f) tcga[, f, drop = FALSE]),
  sig_fct_tcga
)
gep <- setNames(
  lapply(sig_fct_gep, function(f) gep[, f, drop = FALSE]),
  sig_fct_gep
)


In [None]:
# run gsea with hallmark
# One GSEA per element of `tcga` / `gep`; lapply() keeps the element names
# on the result. Nothing is written by perform_gsea() itself
# (save_file = FALSE), so no per-element file names are needed.

# tcga
gsea_res <- lapply(tcga, function(x) {
  perform_gsea(diff_results = x, gene_set = hallmark, save_file = FALSE,
               differential = FALSE)
})
save(gsea_res, file = file.path(tcga_res, "liver_gsea_indeg_hallmark.Rda"))

# gep
gsea_res <- lapply(gep, function(x) {
  perform_gsea(diff_results = x, gene_set = hallmark, save_file = FALSE,
               differential = FALSE)
})
save(gsea_res, file = file.path(gep_res, "liver_gsea_indeg_hallmark.Rda"))

In [None]:
# run gsea with kegg
# One GSEA per element of `tcga` / `gep`; lapply() keeps the element names
# on the result. Nothing is written by perform_gsea() itself
# (save_file = FALSE), so no per-element file names are needed.

# tcga
gsea_res <- lapply(tcga, function(x) {
  perform_gsea(diff_results = x, gene_set = kegg, save_file = FALSE,
               differential = FALSE)
})
save(gsea_res, file = file.path(tcga_res, "liver_gsea_indeg_kegg.Rda"))

# gep
gsea_res <- lapply(gep, function(x) {
  perform_gsea(diff_results = x, gene_set = kegg, save_file = FALSE,
               differential = FALSE)
})
save(gsea_res, file = file.path(gep_res, "liver_gsea_indeg_kegg.Rda"))

In [None]:
# Dot plots of the indegree GSEA results, one per dataset and gene-set
# collection.
pathways <- c("hallmark", "kegg")
datasets <- c("TCGA", "GEPliver")

# results directory of each dataset, i.e. where the GSEA results were saved
res_dirs <- c(TCGA = tcga_res, GEPliver = gep_res)

# NOTE(review): `figure_dir` must be defined before this cell is run.
for (dataset in datasets) {
  # set directory name
  direct <- res_dirs[[dataset]]
  for (path in pathways) {
    # provides `gsea_res`; the file name is assembled with paste0() because
    # file.path() would insert a separator between each of its pieces
    load(file.path(direct, paste0("liver_gsea_indeg_", path, ".Rda")))

    # Add the factor column to each data frame and merge them
    factors <- names(gsea_res)
    merged_df <- bind_rows(Map(function(df, factor) {
      df %>% mutate(factor = factor)
    }, gsea_res, factors))

    file_name <- file.path(
      figure_dir, paste0("liver_indeg_gsea_wts_", path, dataset, ".pdf")
    )

    gsea_plot <- gsea_dotplots(merged_df, gene_set = path,
                               title = dataset, file_name = file_name,
                               n_path = NULL, thresh = 3, width = 70,
                               height = 70, limitsize = FALSE)

    # enlarge the text to suit the 70 x 70 figure size
    gsea_plot <- gsea_plot +
      theme(text = element_text(size = 30),
            legend.key.size = unit(1.5, "cm"),
            legend.text = element_text(size = 70),
            axis.text.y = element_text(size = 90),
            axis.text.x = element_text(size = 70),
            axis.title.x = element_text(size = 70))

    ggsave(filename = file_name, plot = gsea_plot, width = 70, height = 70,
           limitsize = FALSE)
  }
}

## Immune regulatory and developmental TFs associate with liver cancer survival
To assess the contribution of the outdegrees, we opted to look at the top TFs driving the SAFs in each of the two datasets. We selected the top twenty TFs with the highest weights for each SAF in each dataset and looked at the TF overlap between the two datasets. We perform a Fisher's exact test to determine if the overlap is significantly more than expected by chance. 

In [None]:
# Melt a weights object to long format and label the three resulting columns.
melt_wts <- function(wts) {
  long <- reshape2::melt(wts)
  colnames(long) <- c("feature", "factor", "value")
  long
}

# Keep the rows of the requested factors, divide the values of each factor by
# that factor's largest absolute value, and keep the `n_top` rows with the
# largest absolute value per factor.
top_scaled_wts <- function(long_wts, keep_fct, n_top = 20) {
  long_wts %>%
    filter(factor %in% keep_fct) %>%
    group_by(factor) %>%
    mutate(value = value / max(abs(value), na.rm = TRUE)) %>%
    slice_max(order_by = abs(value), n = n_top) %>%
    ungroup()
}

# mapped outdegree weights of both datasets
gep <- get(load(file.path(gep_res, "liver_mapped_feat_wts.Rda")))[["outdegree"]]
tcga <- get(load(file.path(tcga_res, "liver_mapped_feat_wts.Rda")))[["outdegree"]]

# factors to keep in each dataset
gep_fct <- c("Factor2", "Factor3", "Factor4")
tcga_fct <- c("Factor2", "Factor4", "Factor5")

# long format
gep <- melt_wts(gep)
tcga <- melt_wts(tcga)

# every feature present in the GEP weights, taken before any filtering
tfs <- unique(gep$feature)

# selected factors only, scaled, top 20 per factor
gep <- top_scaled_wts(gep, gep_fct)
tcga <- top_scaled_wts(tcga, tcga_fct)

# counts for the 2 x 2 table: in both, GEP only, TCGA only, in neither
common_tfs <- unique(intersect(tcga$feature, gep$feature))
common <- length(common_tfs)
in_gep <- length(unique(gep$feature)) - common
in_tcga <- length(unique(tcga$feature)) - common
in_neither <- length(tfs) - (in_gep + in_tcga + common)

contingency_table <- rbind(c(common, in_gep),
                           c(in_tcga, in_neither))

# Fisher's exact test on the overlap
fisher_result <- fisher.test(contingency_table)

save(fisher_result, contingency_table, common_tfs,
     file = file.path(tcga_res, "outdeg_gep_tcga_fisher_result.Rda"))

In [None]:
# load data (provides `fisher_result`, `contingency_table`, `common_tfs`)
load(file.path(tcga_res, "outdeg_gep_tcga_fisher_result.Rda"))

# Factors to plot in each dataset. The two datasets do not share the same
# set, so each one is plotted with its own factors.
fct_plot_tcga <- c("Factor2", "Factor4", "Factor5")
fct_plot_gep <- c("Factor2", "Factor3", "Factor4")

# NOTE(review): `tcga` and `gep` hold outdegree weights at this point, while
# the file names say "indeg" -- confirm the intended names.
file_name_gep <- file.path(gep_res, "liver_indeg_wts_common.pdf")
file_name_tcga <- file.path(tcga_res, "liver_indeg_wts_common.pdf")

# plot, labelling the TFs common to both datasets
plots <- suppressMessages(plot_feat_wts(feat_wts = tcga, fct = fct_plot_tcga,
  file_name = file_name_tcga, thresh = NULL, plot_type = "dotplot",
  manual_lab = common_tfs, n_feat = 0,
  width = 20, height = 20, limitsize = FALSE))

plots <- suppressMessages(plot_feat_wts(feat_wts = gep, fct = fct_plot_gep,
  file_name = file_name_gep, thresh = NULL, plot_type = "dotplot",
  manual_lab = common_tfs, n_feat = 0,
  width = 20, height = 20, limitsize = FALSE))

In [None]:
# plotting heatmap of variance explained by each MOFA factor (Figure 4B)
# Only the TCGA liver model with all views is needed here.
mofa_both <- get(load(file.path(tcga_res, "MOFA_liver_pca_both.Rda")))

b <- suppressMessages(plot_var_heat(mofa_both))

# The file name states the cancer type explicitly instead of relying on a
# `cancer` variable left over from an earlier loop.
# NOTE(review): `figure_dir` must be defined before this cell is run.
ggsave(filename = file.path(figure_dir, "liver_var_explained_both_tcga.pdf"),
       plot = b, width = 10, height = 9)

In [None]:
# plotting the correlation of factors from the model without GRNs and the model with GRNs (Figure 4C)

# load models (named so that base::all() is not shadowed)
model_all <- get(load(file.path(tcga_res, "MOFA_liver_pca_both.Rda")))
model_nonet <- get(load(file.path(tcga_res, "MOFA_liver_pca_nonet.Rda")))

# get factors
all_fct <- get_factors(model_all)[[1]]
nonet_fct <- get_factors(model_nonet)[[1]]

corr <- fct_corr(all_fct, nonet_fct, labels = c("all", "no GRN"),
                 as_data_frame = TRUE, abs = TRUE)

# reformat as data frame
corr_df <- format_fct_corr(corr)

# plot
p <- plot_fct_corr(corr_df, abs = TRUE)

# removing facet headers
p <- p +
  theme(strip.background = element_blank(),  # Remove background of the facet label
        strip.text = element_blank())

# NOTE(review): `figure_dir` must be defined before this cell is run.
ggsave(filename = file.path(figure_dir, "liver_view_exclusion_both_abs.pdf"),
       plot = p, width = 10, height = 9)

# save dataframe
save(corr_df, file = file.path(tcga_res, "view_exclusion_corr_df_both_abs.Rda"))

# Validation in independent liver cancer cohort
We repeat the survival analysis in an independent liver cancer cohort from GEPliver and investigate the association of the MOFA factors with other clinical variables in both liver cancer datasets.

In [None]:
# running MOFA models for the validation cohort
# Only two models are fitted here: without GRNs and with all views.
# brings the list `omics` into the workspace
load(file.path(gep_res, "liver_omics_pca.Rda"))

# nonet: the first three views only
data_nonet <- omics[1:3]
mofa_nonet <- run_mofa2(data_nonet, n_fct = 5, seed = 13,
                        convergence = "slow", use_basilisk = TRUE)
save(mofa_nonet, file = file.path(gep_res, "MOFA_liver_pca_nonet.Rda"))

# with both: all views
data_both <- omics
mofa_both <- run_mofa2(data_both, n_fct = 5, seed = 13,
                       convergence = "slow", use_basilisk = TRUE)
save(mofa_both, file = file.path(gep_res, "MOFA_liver_pca_both.Rda"))

In [None]:
# running survival analysis for validation cohort
# Define model types
model_types <- c("nonet", "both")

# the validation cohort contains liver cancer only
cancers_gep <- "liver"

# Define file paths for MOFA models, one per model type
mofa_files <- file.path(
  gep_res, paste0("MOFA_", cancers_gep, "_pca_", model_types, ".Rda")
)

# Load MOFA models and survival data
# load() returns the *names* of the restored objects, so each model is
# fetched from a private environment after loading; this also avoids picking
# up a same-named object already present in the workspace.
models <- lapply(mofa_files, function(file) {
  env <- new.env()
  get(load(file, envir = env)[1], envir = env)
})
# expected to provide the object `surv` used below
load(file.path(gep_res, "liver_surv.Rda"))

# Getting factors
factors <- lapply(models, function(model) get_factors(model)[[1]])

# Perform survival associations
cox_models <- lapply(factors, surv_association, surv = surv, univariate = TRUE)

# Compare survival models
surv_df <- surv_compare(models = cox_models, model_labels = model_types,
                        univariate = TRUE, method = "BH")
surv_df$cancer <- cancers_gep

# Save individual cox models
cox_all <- setNames(cox_models, model_types)
save(cox_all,
     file = file.path(gep_res, paste0(cancers_gep, "_cox_models_PCA.Rda")))

# Save the survival data frame
# file.path() (not paste0()) so that the file lands inside `gep_res`, where
# the GSEA preparation cell reads it from.
save(surv_df, file = file.path(gep_res, "MOFA_surv_df_all_PCA.Rda"))

Now that we've identified survival associated factors in both datasets, we look for associations with other clinical features. This reproduces Figure S4.

In [None]:
# loading clinical data
# NOTE(review): `data_tcga` and `data_gep` (directories of the raw data) are
# not defined in the visible part of this notebook -- define them before
# running this cell.
clin_tcga <- read.table(file.path(data_tcga, "liver", "clin"), header = TRUE, sep = "\t")
# Replace "-" with "." in the sample IDs to match the sample names in the
# omics. Only the ID column is modified; the rest of the table is kept.
clin_tcga$sampleID <- gsub("-", ".", clin_tcga$sampleID, fixed = TRUE)
colnames(clin_tcga)[1] <- "sample"
clin_gep <- read.table(file.path(data_gep, "clin"), header = TRUE, sep = "\t")
colnames(clin_gep)[1] <- "sample"
clin <- list(tcga = clin_tcga, gep = clin_gep)

# define features of interest
feat_gep <- c("Phenotype", "Age", "Sex", "Fibrosis", "Tumor.stage", "Tumor.size")
feat_tcga <- c("additional_pharmaceutical_therapy", "additional_radiation_therapy",
               "age_at_initial_pathologic_diagnosis", "histological_type", "pathologic_stage",
               "gender", "fibrosis_ishak_score")
feat <- list(tcga = feat_tcga, gep = feat_gep)

# get mofa models
mofa_tcga <- get(load(file.path(tcga_res, "MOFA_liver_pca_both.Rda")))
mofa_gep <- get(load(file.path(gep_res, "MOFA_liver_pca_both.Rda")))

# make sure missing values are NAs: empty strings become NA in every column
clin <- lapply(clin, function(df) {
  mutate(df, across(everything(), ~ ifelse(.x == "", NA, .x)))
})

# extract factors
fct_tcga <- as.data.frame(get_factors(mofa_tcga)[[1]])
fct_gep <- as.data.frame(get_factors(mofa_gep)[[1]])
fct <- list(tcga = fct_tcga, gep = fct_gep)

# Associate the factors of each dataset with that dataset's clinical
# features. The three lists are walked in parallel (tcga, then gep).
# NOTE(review): the function name is spelled `clin_associaton` here --
# confirm this matches the name exported by MARMOT.
association <- Map(function(factors, clinical, features) {
  clin_associaton(factors, clin = clinical, clin_feat = features,
                  sample_label = "sample")
}, fct, clin, feat)

save(association, file = file.path(tcga_res, "liver_clin_feat_assoc_both.Rda"))

In [None]:
# plot: one association plot per dataset
p <- lapply(association, plot_clin_association)

# Arrange the ggplot objects from the list 'p' in a grid (single column).
# grid.arrange() is called with its namespace because gridExtra is not
# attached by the library() calls of this notebook.
p_combined <- gridExtra::grid.arrange(grobs = p, ncol = 1)

# NOTE(review): `figure_dir` must be defined before this cell is run.
ggsave(filename = file.path(figure_dir, "liver_clin_feat_assoc_both.pdf"),
       plot = p_combined)