# ARCHS4 model building with PLIER2

## Load libraries

In [2]:
# Install PLIER from GitHub only when the package cannot be found.
if (isFALSE(requireNamespace("PLIER", quietly = TRUE))) {
  devtools::install_github("wgmao/PLIER")
}

# Install PLIER2 from a local checkout only when the package cannot be found.
# REPO_PATH is a machine-specific absolute path.
if (isFALSE(requireNamespace("PLIER2", quietly = TRUE))) {
  REPO_PATH <- "/home/msubirana/Documents/pivlab/PLIER2"
  remotes::install_local(path = REPO_PATH, dependencies = FALSE, force = TRUE)
}

library(bigstatsr)
library(data.table)
library(dplyr)
library(rsvd)
library(glmnet)
library(Matrix)
library(knitr)
library(here)
library(PLIER)
library(PLIER2)
library(hdf5r)
library(biomaRt)

# Run the project's config.R; the cells below read their settings from
# `config` (presumably defined by that script — confirm).
source(here("config.R"))

# Fix the random seed for the session.
set.seed(123)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:data.table’:

    between, first, last


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: Matrix

Loaded glmnet 4.1-10

here() starts at /home/msubirana/Documents/pivlab/plier2-analyses

Loading required package: RColorBrewer

Loading required package: gplots


Attaching package: ‘gplots’


The following object is masked from ‘package:stats’:

    lowess


Loading required package: pheatmap

Loading required package: qvalue


Attaching package: ‘PLIER2’


The following objects are masked from ‘package:PLIER’:

    num.pc, projectPLIER




## Output directory

In [3]:
# Folder used for the FBM backing file and the saved .rds objects; create it
# (including missing parents) and stay quiet if it already exists.
output_dir <- config$ARCHS4$DATASET_FOLDER
dir.create(path = output_dir, recursive = TRUE, showWarnings = FALSE)

## Download ARCHS4

In [4]:
# Download the ARCHS4 HDF5 file unless a local copy is already present.
#
# A failed download must not leave a partial file behind: the next run would
# see the file, skip the download and try to read truncated data. On any
# failure the partial file is removed and the notebook stops here, since
# every following cell needs the file.
local({
  dest <- config$ARCHS4$DATASET_FILE

  if (file.exists(dest)) {
    cat("File already exists. Skipping download.\n")
  } else {
    status <- tryCatch(
      download.file(config$ARCHS4$URL, dest, mode = "wb"),
      error = function(e) {
        unlink(dest)
        stop("Error during download: ", conditionMessage(e), call. = FALSE)
      }
    )

    # download.file() signals some failures through a non-zero return code
    # instead of an error.
    if (!identical(as.integer(status), 0L)) {
      unlink(dest)
      stop("Error during download: exit status ", status, call. = FALSE)
    }

    cat("File downloaded successfully to:", dest, "\n")
  }
})

File already exists. Skipping download.


## Preprocess ARCHS4 data

In [5]:
# Open the ARCHS4 HDF5 file read-only. The handle is kept open because the
# expression dataset is read lazily, block by block, further down.
file_path <- config$ARCHS4$DATASET_FILE
h5 <- H5File$new(file_path, mode = "r")

# Handle to the expression dataset (nothing is read into memory yet).
dset <- h5[["/data/expression"]]

# Small helper: read one metadata dataset fully into memory.
read_h5_dataset <- function(path) {
  h5[[path]]$read()
}

# Gene-level metadata.
gene_symbols <- read_h5_dataset("/meta/genes/symbol")
gene_ids <- read_h5_dataset("/meta/genes/ensembl_gene")

# Sample-level metadata.
sample_names <- read_h5_dataset("/meta/samples/geo_accession")
sc_prob <- read_h5_dataset("/meta/samples/singlecellprobability")
lib_strategy <- read_h5_dataset("/meta/samples/library_strategy")

In [6]:
# Sanity check: tabulate the library strategies to confirm that every sample
# is RNA-Seq.
table(lib_strategy)

lib_strategy
RNA-Seq 
 888821 

In [None]:
# Filter out single-cell samples: keep the indices of samples whose
# single-cell probability is below 0.5 (which() also drops NA entries).
# These indices select both the sample names and the expression data read
# below. Abort early if no sample passes the filter.
keep_samples_idx <- which(sc_prob < 0.5)
stopifnot(length(keep_samples_idx) > 0)

In [9]:
# Keep one entry per usable gene symbol.
#
# A symbol is usable when it is not NA and is not a bare Ensembl ID (does not
# start with "ENSG"). Where a symbol occurs more than once, only its first
# occurrence is kept. `gene_symbols_idx` holds the positions of the kept
# entries in the original symbol vector and `gene_symbols` the matching
# symbols, so the two vectors correspond element by element.
#
# NOTE: this overwrites `gene_symbols`; re-read it from the HDF5 file before
# running this cell a second time.
gene_symbols_idx <- which(
  !is.na(gene_symbols) &
    !grepl("^ENSG", gene_symbols) &
    !duplicated(gene_symbols)
)
gene_symbols <- gene_symbols[gene_symbols_idx]

In [10]:
# Matrix dimensions: one row per retained gene symbol, one column per
# retained sample.
n_genes <- length(gene_symbols)

# Restrict the sample accessions to the samples that passed the filter.
sample_names <- sample_names[keep_samples_idx]
n_samples <- length(sample_names)

In [11]:
# Create the FBM
fbm_file <- file.path(output_dir, "FBMarchs4")
bkfile <- paste0(fbm_file, ".bk")

# A new backing file is only needed when none is on disk yet; otherwise the
# existing one is attached with the same dimensions.
needs_new_bk <- !file.exists(bkfile)

if (needs_new_bk) {
  status_msg <- "Backing file not found – building FBM from scratch"
} else {
  status_msg <- "Found existing FBM backing file – re-opening"
}
message(status_msg)

archs4FBM <- FBM(
  nrow = n_genes,
  ncol = n_samples,
  backingfile = fbm_file,
  create_bk = needs_new_bk
)

Backing file not found – building FBM from scratch



In [12]:
# Fill the FBM (genes x samples) from the HDF5 expression dataset, reading
# `block_size` samples at a time to bound memory use.
block_size <- 100
n_blocks   <- ceiling(n_samples / block_size)

pb <- txtProgressBar(min = 0, max = n_blocks, style = 3)

# Remember which blocks could not be read so they are reported at the end
# instead of being lost in the progress output.
block_failed <- logical(n_blocks)

for (i in seq_len(n_blocks)) {
  setTxtProgressBar(pb, i)

  # FBM columns covered by this block
  start_col <- (i - 1) * block_size + 1
  end_col   <- min(i * block_size, n_samples)

  # Matching sample positions in the HDF5 file
  sample_indices <- keep_samples_idx[start_col:end_col]

  raw_block <- tryCatch(
    dset[sample_indices, gene_symbols_idx],
    error = function(e) {
      message("Error with block: ", i, " (", conditionMessage(e), ")")
      NULL
    }
  )

  if (is.null(raw_block)) {
    # Best-effort fallback kept from the original: unreadable blocks are
    # filled with a small placeholder value.
    block_failed[i] <- TRUE
    raw_block <- matrix(
      1e-10, length(sample_indices), length(gene_symbols_idx)
    )
  } else if (is.null(dim(raw_block))) {
    # A block with a single sample (or a single gene) can come back as a
    # plain vector; restore the samples x genes shape so t() below yields
    # genes x samples.
    dim(raw_block) <- c(length(sample_indices), length(gene_symbols_idx))
  }

  # Transpose to genes x samples and aggregate by symbol.
  # reorder = FALSE is essential: the default sorts the rows by group name,
  # which would put the FBM rows in a different order than `gene_symbols`
  # and mislabel every gene looked up by row index later on.
  raw_block_summed <- rowsum(
    t(raw_block),
    group = gene_symbols,
    reorder = FALSE
  )

  # Write to FBM (genes x samples)
  archs4FBM[, start_col:end_col] <- raw_block_summed
}

close(pb)

if (any(block_failed)) {
  warning(
    sum(block_failed), " block(s) could not be read and were filled with ",
    "placeholder values: ", paste(which(block_failed), collapse = ", "),
    call. = FALSE
  )
}

message("Completed FBM: ", archs4FBM$nrow, " genes × ", archs4FBM$ncol, " samples")

  |=                                                                     |   2%



Completed FBM: 40097 genes × 5000 samples



In [13]:
# CPM normalization of the count FBM, using the configured number of cores
# and the same block size as the fill loop.
N_CORES <- config$GENERAL$N_CORES

cpmPLIER2FBM(fbm_counts = archs4FBM, block_size = block_size, ncores = N_CORES)

In [14]:
# Run PLIER2 preprocessing on the FBM with the configured mean and variance
# cutoffs (z-scoring is a separate step).
prep_archs4 <- preprocessPLIER2FBM(
  fbm         = archs4FBM,
  mean_cutoff = config$ARCHS4$GENES_MEAN_CUTOFF,
  var_cutoff  = config$ARCHS4$GENES_VAR_CUTOFF,
  block_size  = block_size,
  ncores      = N_CORES
)

# Pull out the two pieces the z-score step needs.
archs4_rowStats <- prep_archs4$rowStats
archs4_fbm_filt <- prep_archs4$fbm_filtered

Applying log2 transformation

No NA values found



In [15]:
# Z-score the filtered FBM using the row statistics from preprocessing.
zscorePLIER2FBM(
  fbm_filtered = archs4_fbm_filt,
  rowStats     = archs4_rowStats,
  chunk_size   = block_size,
  ncores       = N_CORES
)

Applying Z-score transformation



In [16]:
# Save the complete result object returned by preprocessPLIER2FBM().
saveRDS(prep_archs4, file = file.path(output_dir, "prep_archs4.rds"))

In [19]:
# Save the filtered, de-duplicated gene symbols (before mean/variance filtering).
saveRDS(gene_symbols, file = file.path(output_dir, "gene_symbols.rds"))

In [20]:
# Save the GEO accessions of the samples that passed the single-cell filter.
saveRDS(sample_names, file = file.path(output_dir, "archs4_samples.rds"))

In [21]:
# Save the filtered FBM object.
# NOTE(review): presumably this stores only the R-side handle and the matrix
# data stays in the FBM's backing file — confirm that file is kept alongside.
saveRDS(archs4_fbm_filt, file = file.path(output_dir, "archs4_fbm_filt.rds"))

In [22]:
# Symbols of the genes retained by preprocessing, looked up by row index.
# Assumes row i of the unfiltered FBM holds gene_symbols[i] — TODO confirm
# against the row order written when the FBM was filled.
archs4_genes <- gene_symbols[prep_archs4$kept_rows]

In [23]:
# Save the symbols of the genes kept after preprocessing.
saveRDS(archs4_genes, file = file.path(output_dir, "archs4_genes.rds"))