# ARCHS4 model building with PLIER2 and PLIER

# PLIER2

## Load libraries

In [1]:
if (!requireNamespace("PLIER", quietly = TRUE)) {
    devtools::install_github("wgmao/PLIER")
}

if (!requireNamespace("PLIER2", quietly = TRUE)) {
    REPO_PATH <- "/home/msubirana/Documents/pivlab/PLIER2" 
    remotes::install_local(REPO_PATH, force = TRUE, dependencies = FALSE)
}

library(bigstatsr)
library(data.table)
library(dplyr)
library(rsvd)
library(glmnet)
library(Matrix)
library(knitr)
library(here)
library(PLIER)
library(PLIER2)
library(hdf5r)
library(biomaRt)

source(here("config.R"))


Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: Matrix

Loaded glmnet 4.1-10

here() starts at /home/msubirana/Documents/pivlab/plier2-analyses

Loading required package: RColorBrewer

Loading required package: gplots


Attaching package: ‘gplots’


The following object is masked from ‘package:stats’:

    lowess


Loading required package: pheatmap

Loading required package: qvalue


Attaching package: ‘PLIER2’


The following objects are masked from ‘package:PLIER’:

    commonRows, num.pc, projectPLIER


The following object is masked from ‘package:bigstatsr’:

    AUC




## Output directory

In [2]:
output_dir <- config$ARCHS4$DATASET_FOLDER
dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)

## Download ARCHS4

In [3]:
# Download the ARCHS4 HDF5 file unless it is already on disk.
# A failed or interrupted transfer must not leave a truncated file behind:
# the existence check would then skip the download on the next run and the
# HDF5 open in the following cell would fail on a corrupt file.
if (!file.exists(config$ARCHS4$DATASET_FILE)) {
  # download.file() gives up after getOption("timeout") seconds (60 by
  # default); raise the limit for this transfer and restore it afterwards.
  old_opts <- options(timeout = max(3600, getOption("timeout")))
  download_status <- tryCatch(
    download.file(config$ARCHS4$URL, config$ARCHS4$DATASET_FILE, mode = "wb"),
    error = function(e) {
      cat("Error during download:", conditionMessage(e), "\n")
      1L
    }
  )
  options(old_opts)

  # download.file() returns 0 on success; anything else is a failure.
  if (!identical(as.integer(download_status), 0L)) {
    unlink(config$ARCHS4$DATASET_FILE)
    stop(
      "ARCHS4 download failed; any partial file was removed: ",
      config$ARCHS4$DATASET_FILE,
      call. = FALSE
    )
  }
  cat("File downloaded successfully to:", config$ARCHS4$DATASET_FILE, "\n")
} else {
  cat("File already exists. Skipping download.\n")
}

File already exists. Skipping download.


## Preprocess ARCHS4 data

In [4]:
# Open the ARCHS4 HDF5 file read-only. The expression dataset is kept as a
# handle (`dset`); the metadata vectors are read fully into memory.
file_path <- config$ARCHS4$DATASET_FILE
h5 <- H5File$new(file_path, mode = "r")

# Read one complete dataset from the open file by its HDF5 path.
read_h5_vector <- function(h5_path) {
  h5[[h5_path]]$read()
}

dset <- h5[["/data/expression"]]

# Gene-level metadata
gene_symbols <- read_h5_vector("/meta/genes/symbol")
gene_ids <- read_h5_vector("/meta/genes/ensembl_gene")

# Sample-level metadata
sample_names <- read_h5_vector("/meta/samples/geo_accession")
sc_prob <- read_h5_vector("/meta/samples/singlecellprobability")
lib_strategy <- read_h5_vector("/meta/samples/library_strategy")

In [5]:
# Tabulate the library strategy of every sample to confirm they are all
# RNA-Seq. This is an inspection only: nothing is filtered on this field.
table(lib_strategy)

lib_strategy
RNA-Seq 
 888821 

In [6]:
# Filter out single-cell samples: keep the positions of samples whose
# single-cell probability is below 0.5. which() also drops samples whose
# probability is NA.
keep_samples_idx <- which(sc_prob < 0.5)
# Abort early if the filter removed every sample.
stopifnot(length(keep_samples_idx) > 0)

In [None]:
# Keep one row per real gene symbol.
# Rows whose symbol is missing or is only an Ensembl ID (starts with "ENSG")
# are dropped; when a symbol occurs more than once only its first row is kept.
# Results:
#   gene_symbols     - the unique retained symbols, in file order
#   gene_symbols_idx - their positions on the original gene axis (increasing)
all_symbols <- gene_symbols
valid_idx <- which(!is.na(all_symbols) & !grepl("^ENSG", all_symbols))
first_idx <- valid_idx[!duplicated(all_symbols[valid_idx])]

gene_symbols_idx <- first_idx
gene_symbols <- all_symbols[first_idx]

In [8]:
# Dimensions of the matrix to build (genes x samples), plus the accession
# labels of the retained samples in the same order as `keep_samples_idx`.
n_genes <- length(gene_symbols)
sample_names <- sample_names[keep_samples_idx]
n_samples <- length(sample_names)

In [9]:
# Create the file-backed matrix (genes x samples), or attach to its backing
# file when one already exists in the output directory.
fbm_file <- file.path(output_dir, "FBMarchs4")
bkfile <- paste0(fbm_file, ".bk")

bk_missing <- !file.exists(bkfile)

message(
  if (bk_missing) {
    "Backing file not found – building FBM from scratch"
  } else {
    "Found existing FBM backing file – re-opening"
  }
)

# create_bk = TRUE creates a new backing file; FALSE re-uses the existing one.
archs4FBM <- FBM(
  nrow = n_genes,
  ncol = n_samples,
  backingfile = fbm_file,
  create_bk = bk_missing
)

Backing file not found – building FBM from scratch



In [10]:
# Fill the FBM block by block: read `block_size` retained samples at a time
# from the HDF5 expression dataset and write them as genes x samples columns.
block_size <- 100
n_blocks   <- ceiling(n_samples / block_size)
n_kept_genes <- length(gene_symbols_idx)

pb <- txtProgressBar(min = 0, max = n_blocks, style = 3)

for (i in seq_len(n_blocks)) {
  setTxtProgressBar(pb, i)

  start_col <- (i - 1) * block_size + 1
  end_col   <- min(i * block_size, n_samples)

  # Positions of this block's samples on the original HDF5 sample axis
  sample_indices <- keep_samples_idx[start_col:end_col]

  # Best effort: a block that cannot be read is filled with a tiny constant
  # so the run continues; the failing block number is reported.
  raw_block <- tryCatch(
    dset[sample_indices, gene_symbols_idx],
    error = function(e) {
      message("Error with block: ", i)
      matrix(1e-10, length(sample_indices), n_kept_genes)
    }
  )

  # A block holding a single sample may come back as a plain vector
  # (dimension dropped). Restore the explicit samples x genes shape so the
  # transpose below always yields genes x samples. This also fails loudly if
  # the block does not have the expected number of values.
  dim(raw_block) <- c(length(sample_indices), n_kept_genes)

  # Transpose to get genes x samples
  raw_block_t <- t(raw_block)

  # gene_symbols is unique and aligned 1:1 with gene_symbols_idx, so nothing
  # is actually summed here. Note that rowsum() (default reorder = TRUE)
  # returns its rows ordered by sorted group label, so the FBM rows follow
  # the sorted symbols rather than the order of `gene_symbols`.
  raw_block_summed <- rowsum(
    raw_block_t,
    group = gene_symbols
  )

  # Write to FBM (genes x samples)
  archs4FBM[, start_col:end_col] <- raw_block_summed
}

close(pb)

message("Completed FBM: ", archs4FBM$nrow, " genes × ", archs4FBM$ncol, " samples")

  |                                                                      |   0%

In [None]:
# CPM normalization
# `block_size_n` is not defined anywhere in this notebook, so the call would
# stop with "object 'block_size_n' not found" unless config.R provides it.
# Fall back to the block size used when filling the FBM.
# NOTE(review): confirm what the second argument of cpmPLIER2FBM() expects.
if (!exists("block_size_n")) {
  block_size_n <- block_size
}
cpmPLIER2FBM(archs4FBM, block_size_n)

In [None]:
library(bigstatsr)
library(matrixStats)

#' Clean an FBM of expression values and keep rows passing mean/variance cutoffs
#'
#' Copies `fbm` to a new on-disk FBM, cleans the copy in place with
#' `cleanFBM()` (log2(x + 1) when the maximum is >= 100, NAs set to 0),
#' computes per-row means and variances on the cleaned values, and writes the
#' rows that pass both cutoffs to a second, filtered FBM. `fbm` itself is not
#' modified.
#'
#' @param fbm An FBM (rows are filtered, columns are kept as they are).
#' @param mean_cutoff Minimum row mean to keep a row; `NULL` disables the
#'   mean filter.
#' @param var_cutoff Minimum row variance to keep a row; `NULL` disables the
#'   variance filter.
#' @param backingfile Base path for the working copy; defaults to
#'   `paste0(fbm$backingfile, "_preproc")`. The filtered FBM uses this base
#'   with `"_filtered"` appended.
#' @param block_size Number of rows processed per block.
#' @return A list with `fbm_filtered` (the filtered FBM), `rowStats` (a list
#'   with `row_means` and `row_variances` of the kept rows) and `kept_rows`
#'   (indices of the kept rows in `fbm`).
preprocessPLIER2FBM <- function(fbm,
                                mean_cutoff = NULL,
                                var_cutoff  = NULL,
                                backingfile = NULL,
                                block_size  = 1000L) {
  n_r <- nrow(fbm)
  n_c <- ncol(fbm)

  # Consecutive index blocks of at most `block_size` elements covering 1..n
  index_blocks <- function(n) {
    starts <- seq(1L, n, by = block_size)
    lapply(starts, function(s) s:min(s + block_size - 1L, n))
  }
  all_blocks <- index_blocks(n_r)

  # 1. Choose base names
  base_bk <- if (is.null(backingfile)) {
    paste0(fbm$backingfile, "_preproc")
  } else {
    backingfile
  }

  # 2. Make a writable on-disk copy (same dims)
  fbm_copy <- FBM(
    nrow        = n_r,
    ncol        = n_c,
    backingfile = base_bk,
    create_bk   = TRUE
  )

  # 3. Copy data in blocks (drop = FALSE keeps a one-row block a matrix)
  for (rows in all_blocks) {
    fbm_copy[rows, ] <- fbm[rows, , drop = FALSE]
  }

  # 4. Clean in-place (log2 if needed, fill NAs)
  cleanFBM(fbm_copy)

  # 5. Compute per-row means & variances in blocks
  row_means <- numeric(n_r)
  row_vars  <- numeric(n_r)
  for (rows in all_blocks) {
    mat <- fbm_copy[rows, , drop = FALSE]
    row_means[rows] <- rowMeans(mat, na.rm = TRUE)
    row_vars[rows]  <- rowVars(mat, na.rm = TRUE)
  }

  # 6. Select which rows to keep.
  # A NULL cutoff must mean "no filter". `TRUE | (x >= NULL)` evaluates to
  # logical(0), which would silently drop every row, so each mask is built
  # explicitly.
  pass_mean <- if (is.null(mean_cutoff)) {
    rep(TRUE, n_r)
  } else {
    row_means >= mean_cutoff
  }
  pass_var <- if (is.null(var_cutoff)) {
    rep(TRUE, n_r)
  } else {
    row_vars >= var_cutoff
  }
  keep_rows <- which(pass_mean & pass_var)
  message(length(keep_rows), " / ", n_r, " genes passed filters.")

  if (length(keep_rows) == 0L) {
    stop("No genes passed the mean/variance filters; relax the cutoffs.",
         call. = FALSE)
  }

  # 7. Create filtered FBM
  filt_bk <- paste0(base_bk, "_filtered")
  fbm_filtered <- FBM(
    nrow        = length(keep_rows),
    ncol        = n_c,
    backingfile = filt_bk,
    create_bk   = TRUE
  )

  # 8. Copy only kept rows into the filtered FBM, in blocks of kept rows.
  # Destination row j receives original row keep_rows[j].
  for (dest_rows in index_blocks(length(keep_rows))) {
    fbm_filtered[dest_rows, ] <- fbm_copy[keep_rows[dest_rows], , drop = FALSE]
  }

  # 9. Return filtered FBM + stats
  stats_filt <- list(
    row_means     = row_means[keep_rows],
    row_variances = row_vars[keep_rows]
  )

  list(
    fbm_filtered = fbm_filtered,
    rowStats     = stats_filt,
    kept_rows    = keep_rows
  )
}

#' Clean an FBM in place: log2-transform if needed and replace NAs with 0
#'
#' Scans the matrix once for its maximum and for missing values. If the
#' maximum is >= 100 the values are taken to be on a linear scale and are
#' replaced by log2(x + 1). Any NA is then replaced by 0. All work is done in
#' row blocks through `big_apply()`.
#'
#' @param fbm An FBM; it is modified in place.
#' @return A list with `max_value` (maximum before any transformation) and
#'   `had_na` (whether any NA was found).
cleanFBM <- function(fbm) {
  # Check for NA and max value
  max_value <- -Inf
  has_na <- FALSE

  big_apply(fbm, a.FUN = function(X, ind) {
    block <- X[ind, , drop = FALSE]
    if (anyNA(block)) {
      has_na <<- TRUE
    }
    # Skip all-NA blocks: max(..., na.rm = TRUE) would warn and return -Inf.
    if (!all(is.na(block))) {
      max_value <<- max(max_value, max(block, na.rm = TRUE))
    }
    NULL  # Nothing to collect; only the enclosing variables are updated
  }, ind = rows_along(fbm))

  # Log2 transform if necessary
  if (max_value >= 100) {
    message("Applying log2 transformation")
    big_apply(fbm, a.FUN = function(X, ind) {
      X[ind, ] <- log2(X[ind, , drop = FALSE] + 1)
      # Return NULL explicitly: an assignment evaluates to the assigned
      # value, so without this big_apply() would collect every transformed
      # block and hold the whole matrix in memory.
      NULL
    }, ind = rows_along(fbm))
  } else {
    message("Already on log scale")
  }

  # Fill NAs with 0 if necessary
  if (has_na) {
    message("Filling NAs with 0")
    big_apply(fbm, a.FUN = function(X, ind) {
      block <- X[ind, , drop = FALSE]
      block[is.na(block)] <- 0
      X[ind, ] <- block
      NULL
    }, ind = rows_along(fbm))
  } else {
    message("No NA values found")
  }

  list(max_value = max_value, had_na = has_na)
}


In [None]:
# Clean the FBM and drop genes below the configured mean / variance cutoffs.
# The z-scoring itself happens in the next cell.
prep_archs4 <- preprocessPLIER2FBM(
  fbm = archs4FBM,
  mean_cutoff = config$ARCHS4$GENES_MEAN_CUTOFF,
  var_cutoff = config$ARCHS4$GENES_VAR_CUTOFF
)

# Unpack the pieces used downstream: the filtered matrix and its row stats.
archs4_rowStats <- prep_archs4[["rowStats"]]
archs4_fbm_filt <- prep_archs4[["fbm_filtered"]]

In [None]:
zscorePLIER2FBM(archs4_fbm_filt, archs4_rowStats)

# Gene symbols of the rows that survived filtering.
# The original indexed an undefined object `genes`. The FBM rows were written
# through rowsum(..., group = gene_symbols), which returns rows ordered by
# sorted group label, so the row order is NOT the order of `gene_symbols`.
# Recover the exact row labels by asking rowsum() for them on a dummy vector.
fbm_row_genes <- rownames(
  rowsum(integer(length(gene_symbols)), group = gene_symbols)
)
archs4_genes <- fbm_row_genes[prep_archs4$kept_rows]

In [None]:
# Persist the accessions of the retained samples (one per FBM column).
saveRDS(sample_names, file = file.path(output_dir, "archs4_samples.rds"))

In [None]:
# Persist the gene symbols of the rows kept after filtering.
saveRDS(archs4_genes, file = file.path(output_dir, "archs4_genes.rds"))

In [None]:
# Persist the filtered FBM object.
# NOTE(review): presumably this stores only the R-side descriptor and the
# values stay in the backing file on disk - confirm the backing file is kept
# with the .rds.
saveRDS(archs4_fbm_filt, file = file.path(output_dir, "archs4_fbm_filt.rds"))

## SVD computation and SVD K estimation

In [None]:
# Number of components requested from the SVD: one less than the smaller
# dimension of the filtered matrix.
g_fb <- nrow(archs4_fbm_filt)
samples_fb <- ncol(archs4_fbm_filt)
SVD_K_archs4  <- min(g_fb, samples_fb) - 1

message("Using SVD K = ", SVD_K_archs4)

# Fix the RNG seed before the randomized SVD.
# NOTE(review): presumably this is for reproducibility of the decomposition -
# confirm big_randomSVD() draws from R's RNG.
set.seed(1)
archs4_svdRes <- big_randomSVD(archs4_fbm_filt, k = SVD_K_archs4)

In [None]:
# Persist the SVD result so later steps can be re-run without recomputing it.
saveRDS(archs4_svdRes, file = file.path(output_dir, "archs4_svdRes.rds"))

## Estimate K for PLIER

In [None]:
# Estimate the number of latent variables from the singular values only.
# PLIER2 is attached after PLIER and masks its num.pc, so this call resolves
# to the PLIER2 version.
PLIER_K_archs4 <- num.pc(list(d = archs4_svdRes$d))
message("Inferred PLIER K = ", PLIER_K_archs4)

In [None]:
# Persist the inferred number of latent variables.
saveRDS(PLIER_K_archs4, file = file.path(output_dir, "PLIER_K_archs4.rds"))

## PLIERbase initialization

In [None]:
# Initialise the model with PLIERbase on the filtered FBM, re-using the
# precomputed SVD and the inferred K; trace = TRUE requests progress output.
archs4_baseRes <- PLIERbase(
  Y      = archs4_fbm_filt,
  k      = PLIER_K_archs4,
  svdres = archs4_svdRes,
  trace  = TRUE
)

In [None]:
# Persist the PLIERbase result (passed to PLIERfull as plier.base.result).
saveRDS(archs4_baseRes, file = file.path(output_dir, "archs4_baseRes.rds"))

## Prepare pathway priors

In [None]:
# Download the gene-set libraries used as pathway priors (Enrichr GMT files).
# No trailing comma after the last element: list() stops with
# "argument 4 is empty" if one is left there.
archs4_gmtList <- list(
  KEGG = getGMT("https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=text&libraryName=KEGG_2021_Human"),
  BP = getGMT("https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=text&libraryName=GO_Biological_Process_2025"),
  GTEx_Tissues = getGMT("https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=text&libraryName=GTEx_Tissues_V8_2023")
)

# Prefix each gene-set name with its library to guarantee uniqueness
for (lib in names(archs4_gmtList)) {
  names(archs4_gmtList[[lib]]) <- paste0(lib, "_", names(archs4_gmtList[[lib]]))
}

# Build the pathway matrix, align it to the retained genes, and derive the
# Chat object that PLIERfull takes alongside the prior matrix.
archs4_pathMat <- gmtListToSparseMat(archs4_gmtList)
archs4_matched <- getMatchedPathwayMat(archs4_pathMat, archs4_genes)
archs4_chatObj <- getChat(archs4_matched)

## PLIERfull

In [None]:
# Fit the full PLIER2 model with pathway priors, starting from the PLIERbase
# result and re-using the precomputed SVD and Chat object.
# NOTE(review): as.matrix() presumably turns a sparse pathway matrix into a
# dense one - confirm that the dense copy fits in memory.
archs4_fullRes <- PLIERfull(
  Y                 = archs4_fbm_filt,
  priorMat          = as.matrix(archs4_matched),
  svdres            = archs4_svdRes,
  plier.base.result = archs4_baseRes,
  Chat              = archs4_chatObj,
  k                 = PLIER_K_archs4,
  doCrossval        = TRUE,
  trace             = TRUE,
  max.U.updates=50
)

In [None]:
# `output_data_dir` is not defined anywhere in this notebook (every other
# artifact is written to `output_dir`). Fall back to `output_dir` unless
# config.R already provides it.
if (!exists("output_data_dir")) {
  output_data_dir <- output_dir
}

# Persist the fitted PLIER2 model.
saveRDS(archs4_fullRes, file = file.path(output_data_dir, "archs4_PLIER2.rds"))

# Fix col, row names and summary

In [None]:
# `output_data_dir` is not defined anywhere in this notebook (every other
# artifact is written to `output_dir`). Fall back to `output_dir` unless
# config.R already provides it.
if (!exists("output_data_dir")) {
  output_data_dir <- output_dir
}

model_path <- file.path(output_data_dir, "archs4_PLIER2.rds")
archs4_fullRes <- readRDS(model_path)

# Label the columns of B with the sample accessions.
colnames(archs4_fullRes$B) <- sample_names

# Label the columns of Z as LV1..LVk.
colnames(archs4_fullRes$Z) <- paste0("LV", seq_len(ncol(archs4_fullRes$Z)))

# Rename the summary's `LV index` column to `LV` and use the same LV<n> labels.
archs4_fullRes$summary <- archs4_fullRes$summary %>%
  dplyr::rename(LV = `LV index`) %>%
  dplyr::mutate(LV = paste0("LV", LV))

# Overwrite the saved model with the labelled version.
saveRDS(archs4_fullRes, file = model_path)