# Create the same model with PLIER and DelayedPLIER

Marc Subirana-Granés (2024)

Create a basic PLIER model on the same data with both PLIER and DelayedPLIER in order to compare their results.

# Load libraries/modules

In [8]:
`%>%` <- dplyr::`%>%`
library(PLIER)
library(dplyr)
library(reticulate)

# PLIER utils (project helper script, resolved relative to the project root)
source(here::here('scripts/plier_util.R'))

# DelayedPLIER functions.
# The location of funcs.R is machine-specific, so it can be overridden through
# the DELAYEDPLIER_FUNCS environment variable; the original path stays the
# default. Because this is sourced after library(PLIER), anything funcs.R
# defines lives in the global environment and masks same-named PLIER functions
# in unqualified calls.
path_script_funcs <- Sys.getenv(
  'DELAYEDPLIER_FUNCS',
  unset = '/home/msubirana/Documents/pivlab/DelayedPLIER/funcs.R'
)
if (!file.exists(path_script_funcs)) {
  stop(
    "DelayedPLIER funcs.R not found at '", path_script_funcs,
    "'; set the DELAYEDPLIER_FUNCS environment variable to its location",
    call. = FALSE
  )
}
source(path_script_funcs)

# Load PLIER pathway and cell type data
data(bloodCellMarkersIRISDMAP)
data(svmMarkers)
data(canonicalPathways)

# Load data

In [None]:
# Directory that receives every file written by this notebook.
# `here` is never attached with library(), so the call is namespace-qualified;
# recursive = TRUE lets the nested 'output/nbs/...' path be created in one go.
output_nb_path <- here::here('output/nbs/create_same_model_PLIER_DelayedPLIER')
dir.create(output_nb_path, showWarnings = FALSE, recursive = TRUE)

# Input expression table (RDS) read by the data-preparation cell
expression_dataset_path <- here::here('output/gtex/GTEx_v8_gene_median_tpm.rds')

# Prepare data for all the models

In [6]:
# Build the z-scored expression matrix and the matching prior-knowledge matrix
# that are shared by the PLIER and DelayedPLIER runs.

# Prepare output directory.
# This used to be derived from `output_file`, which is never defined in this
# notebook (and the nested dirname() pointed one level too high); the notebook
# output directory is used instead.
output_file_path <- output_nb_path
dir.create(output_file_path, showWarnings = FALSE, recursive = TRUE)

# Load data
expression_dataset <- readRDS(expression_dataset_path)

# Remove gene ens id column and duplicate genes (first occurrence is kept)
expression_dataset <- subset(expression_dataset, select = -c(gene_ens_id))
expression_dataset <- expression_dataset[!duplicated(expression_dataset["gene_symbol"]), ]

# Rename rows with gene symbols
rownames(expression_dataset) <- expression_dataset[, "gene_symbol"]

# Remove gene symbol column
expression_dataset <- subset(expression_dataset, select = -c(gene_symbol))

# Remove rows containing NA
expression_dataset <- na.omit(expression_dataset)

# Convert to matrix
expression_matrix <- as.matrix(expression_dataset)

# Combine the pathway data from PLIER
all_paths <- PLIER::combinePaths(bloodCellMarkersIRISDMAP, svmMarkers, canonicalPathways)

# What genes are common to the pathway data and the expression matrix
cm_genes <- PLIER::commonRows(all_paths, expression_matrix)

# Filter to common genes before row normalization to save on computation
expression_matrix_cm <- expression_matrix[cm_genes, ]

# Z-score normalization
expression_matrix_cm <- PLIER::rowNorm(expression_matrix_cm)

# Drop rows that contain NA after normalization
# (presumably zero-variance genes -- TODO confirm)
expression_matrix_cm <- na.omit(expression_matrix_cm)

# Recompute the common genes, since the NA filter above may have removed rows
cm_genes <- PLIER::commonRows(all_paths, expression_matrix_cm)

# Restrict both the expression matrix and the prior matrix to the same genes,
# in the same order
expression_matrix_cm <- expression_matrix_cm[cm_genes, ]
all_paths_cm <- all_paths[cm_genes, ]

# PLIER

## GTEx PLIER preparation

In [26]:
# Decompose the normalized expression matrix once, so the same decomposition
# can be supplied to every PLIER call below.
set.seed(123456)
ns <- ncol(expression_matrix_cm)
message("Computing SVD")
if (ns <= 500) {
  # Few enough columns: use the exact decomposition
  svdres <- svd(expression_matrix_cm)
} else {
  # Many columns: use the randomized SVD; re-seed right before the call
  message("Using rsvd")
  set.seed(123456)
  svdres <- rsvd(expression_matrix_cm, k = min(ns, max(200, ns / 4)), q = 3)
}
message("Done")

# Bundle everything PLIER needs (z-scored expression data, prior information
# matrix, SVD result) and persist it.
# NOTE(review): this file name differs from the PLIER result file
# ('gtex_comparison_PLIER.rds') only by letter case; on a case-insensitive
# file system the two would collide -- confirm this is acceptable.
output_file_preplier <- file.path(output_nb_path, 'gtex_comparison_plier.rds')

plier_data_list <- list(
  expression_matrix_cm = expression_matrix_cm,
  all_paths_cm = all_paths[cm_genes, ],
  svdres = svdres
)

saveRDS(plier_data_list, file = output_file_preplier)

Computing SVD

Using rsvd

Done



## GTEx PLIER run

In [28]:
# Run the reference (in-memory) PLIER model on the prepared data and save it.

# Assign arguments to variables
output_file_plier <- file.path(output_nb_path, 'gtex_comparison_PLIER.rds')
parameter_k <- 1 # multiplier applied to the automatically chosen k
frac <- 0.7

# Prepare output directory before the long-running fit, so a bad path fails
# early. This used to reference `output_file`, which is never defined in this
# notebook, and created the parent of the intended directory.
output_file_path <- dirname(output_file_plier)
dir.create(output_file_path, showWarnings = FALSE, recursive = TRUE)

# Load data
expression_matrix_cm <- plier_data_list[["expression_matrix_cm"]]
all_paths_cm <- plier_data_list[["all_paths_cm"]]
svdres <- plier_data_list[["svdres"]]

# compute k: twice num.pc(svdres), capped at 90% of the number of columns,
# then scaled by parameter_k and rounded
k <- num.pc(svdres) * 2
k <- min(k, floor(ncol(expression_matrix_cm) * 0.9))
k <- round(k * parameter_k, 0)
message("k is set to ", k)

# Run PLIER (with common genes); scale = FALSE because the data were already
# row-normalized in the preparation cell
plier_result <- PLIER::PLIER(
  data = expression_matrix_cm,
  priorMat = all_paths_cm,
  svdres = svdres,
  k = k,
  frac = frac,
  scale = FALSE
)

# Save results
saveRDS(plier_result, file = output_file_plier)

k is set to 1042

Removing 4 pathways with too few genes



[1] 55.53997
[1] "L2 is set to 55.5399692443312"
[1] "L1 is set to 27.7699846221656"


errorY (SVD based:best possible) = 0.6328

New L3 is 0.000804733010124613

New L3 is 0.000710174388842549

New L3 is 0.000710174388842549

New L3 is 0.000804733010124613

New L3 is 0.000804733010124613

New L3 is 0.000804733010124613

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.00103329763864764

New L3 is 0.000911881965554516

Bdiff is not decreasing

Bdiff is not decreasing

Bdiff is not decreasing

Bdiff is not decreasing

Bdiff is not decreasing

Bdiff is not decreasing

converged at  iteration 334 Bdiff is not decreasing

There are 216  LVs with AUC>0.70



# DelayedPLIER

In [17]:
# Export the normalized expression matrix to HDF5 so it can be reopened as a
# disk-backed DelayedArray; the row/column names are stored in a separate RDS
# file and re-attached when the matrix is read back.
output_file_delayedPLIER_hdf5 <- file.path(output_nb_path, 'counts.hdf5')
output_file_delayedPLIER_dim <- file.path(output_nb_path, 'dimnames.RDS')

# Remove the HDF5 file left by a previous run: writing a dataset whose name
# already exists in the file fails, which made this cell non-re-runnable.
# unlink() is a silent no-op when the file does not exist.
unlink(output_file_delayedPLIER_hdf5)

writeHDF5Array(expression_matrix_cm, filepath = output_file_delayedPLIER_hdf5, name = "count")
saveRDS(
  list(
    row.names = rownames(expression_matrix_cm),
    col.names = colnames(expression_matrix_cm)
  ),
  file = output_file_delayedPLIER_dim
)

<6683 x 17382> HDF5Matrix object of type "double":
              [,1]       [,2]       [,3] ...    [,17381]    [,17382]
   [1,]  1.4113228 -0.4424523  0.3365641   . -0.43709630 -0.09997518
   [2,]  5.8423464 -0.7106564  0.7796052   . -0.64629791  1.35986578
   [3,] -0.2272683 -0.5812718 -0.5171569   . -0.59093477 -0.49703849
   [4,]  1.6378622 -0.5300667  0.1609265   . -0.73800558  1.53935231
   [5,]  0.6301441  0.6176855 -0.4076144   .  0.19893678  0.28222509
    ...          .          .          .   .           .           .
[6679,] -0.2462670 -0.7701677 -0.5296683   .  -0.7600938  -0.2738677
[6680,] -0.1980982 -0.2400479 -0.1990823   .  -0.2451261  -0.2109961
[6681,]  2.1421737 -0.7290364  1.5123376   .  -0.9276022   0.7712268
[6682,] -0.6538122 -0.9657483 -0.6224588   .  -1.0976921  -0.7310973
[6683,] -0.7506101 -0.9781575 -0.6702703   .  -0.8675906  -0.6017146

In [118]:
# Reopen the expression matrix as a disk-backed DelayedArray and fit the model
# with the DelayedPLIER implementation.
setAutoRealizationBackend("HDF5Array") # see supportedRealizationBackends(), getRealizationBackend()

sce <- DelayedArray(seed = HDF5ArraySeed(filepath = output_file_delayedPLIER_hdf5, name = "count"))

# Re-attach the row/column names that were saved next to the HDF5 file
dim_names <- readRDS(output_file_delayedPLIER_dim)
rownames(sce) <- dim_names[["row.names"]]
colnames(sce) <- dim_names[["col.names"]]

# Replace missing values and keep only rows with non-zero standard deviation
sce[is.na(sce)] <- 0
expression_matrix_dp <- sce[which(DelayedMatrixStats::rowSds(sce) > 0), ]

# The prior matrix and the SVD were computed on the unfiltered matrix; stop
# early if the filter above dropped or reordered rows, because the inputs
# would then no longer describe the same genes.
if (!identical(rownames(expression_matrix_dp), rownames(all_paths_cm))) {
  stop(
    "Rows of expression_matrix_dp no longer match all_paths_cm; ",
    "recompute all_paths_cm and svdres on the filtered matrix",
    call. = FALSE
  )
}

# Assign arguments to variables
output_file_delayedPlier <- file.path(output_nb_path, 'gtex_comparison_delayedPlier.rds')
parameter_k <- 1 # multiplier applied to the automatically chosen k
frac <- 0.7

# Prepare output directory before the long-running fit
output_file_path <- dirname(output_file_delayedPlier)
dir.create(output_file_path, showWarnings = FALSE, recursive = TRUE)

# compute k from the matrix that is actually passed to the model
k <- num.pc(svdres) * 2
k <- min(k, floor(ncol(expression_matrix_dp) * 0.9))
k <- round(k * parameter_k, 0)
message("k is set to ", k)

# Run PLIER (with common genes).
# PLIER::PLIER forces the in-memory implementation, which fails on a
# DelayedMatrix with "Error in tcrossprod(x, y): requires numeric/complex
# matrix/vector arguments". Calling PLIER() unqualified lets a PLIER() defined
# in the global environment by the sourced DelayedPLIER funcs.R take
# precedence, and falls back to the attached package otherwise.
# NOTE(review): assumes funcs.R defines PLIER() with these arguments -- confirm.
delayedPlier_result <- PLIER(
  data = expression_matrix_dp,
  priorMat = all_paths_cm,
  svdres = svdres,
  k = k,
  frac = frac,
  scale = FALSE
)

# Save results
saveRDS(delayedPlier_result, file = output_file_delayedPlier)

<6683 x 17382> HDF5Matrix object of type "double":
              [,1]       [,2]       [,3] ...    [,17381]    [,17382]
   [1,]  1.4113228 -0.4424523  0.3365641   . -0.43709630 -0.09997518
   [2,]  5.8423464 -0.7106564  0.7796052   . -0.64629791  1.35986578
   [3,] -0.2272683 -0.5812718 -0.5171569   . -0.59093477 -0.49703849
   [4,]  1.6378622 -0.5300667  0.1609265   . -0.73800558  1.53935231
   [5,]  0.6301441  0.6176855 -0.4076144   .  0.19893678  0.28222509
    ...          .          .          .   .           .           .
[6679,] -0.2462670 -0.7701677 -0.5296683   .  -0.7600938  -0.2738677
[6680,] -0.1980982 -0.2400479 -0.1990823   .  -0.2451261  -0.2109961
[6681,]  2.1421737 -0.7290364  1.5123376   .  -0.9276022   0.7712268
[6682,] -0.6538122 -0.9657483 -0.6224588   .  -1.0976921  -0.7310973
[6683,] -0.7506101 -0.9781575 -0.6702703   .  -0.8675906  -0.6017146

k is set to 1042

Removing 4 pathways with too few genes



[1] 55.53997
[1] "L2 is set to 55.5399692443312"
[1] "L1 is set to 27.7699846221656"


ERROR: Error in tcrossprod(x, y): requires numeric/complex matrix/vector arguments


In [107]:
# Preview the first rows of the disk-backed expression matrix
head(expression_matrix_dp, n = 6L)

<6 x 17382> DelayedMatrix object of type "double":
         GTEX-1117F-0226-SM-5GZZ7 ...  GTEX-ZZPU-2726-SM-5NQ8O
GAS6                   1.41132283   .              -0.09997518
MMP14                  5.84234639   .               1.35986578
MARCKSL1              -0.22726833   .              -0.49703849
SPARC                  1.63786217   .               1.53935231
CTSD                   0.63014415   .               0.28222509
EPAS1                 -0.07935625   .               0.68432390

In [108]:
# Preview the first rows of the prior (pathway) matrix
head(all_paths_cm, n = 6L)

Unnamed: 0,IRIS_Bcell-Memory_IgG_IgA,IRIS_Bcell-Memory_IgM,IRIS_Bcell-naive,IRIS_CD4Tcell-N0,IRIS_CD4Tcell-Th1-restimulated12hour,IRIS_CD4Tcell-Th1-restimulated48hour,IRIS_CD4Tcell-Th2-restimulated12hour,IRIS_CD4Tcell-Th2-restimulated48hour,IRIS_CD8Tcell-N0,IRIS_DendriticCell-Control,⋯,KEGG_GNRH_SIGNALING_PATHWAY,KEGG_BASAL_TRANSCRIPTION_FACTORS,REACTOME_SYNTHESIS_OF_DNA,KEGG_HEMATOPOIETIC_CELL_LINEAGE,KEGG_T_CELL_RECEPTOR_SIGNALING_PATHWAY,PID_IL4_2PATHWAY,REACTOME_SIGNALING_BY_THE_B_CELL_RECEPTOR_BCR,PID_BCR_5PATHWAY,PID_TELOMERASEPATHWAY,PID_PI3KPLCTRKPATHWAY
GAS6,0,0,0,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0
MMP14,0,0,0,0,0,0,0,0,0,0,⋯,1,0,0,0,0,0,0,0,0,0
MARCKSL1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
SPARC,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
CTSD,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
EPAS1,0,0,0,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0


In [112]:
# Sanity-check the inputs handed to the DelayedPLIER run:
# dimensions of the DelayedMatrix expression data and of the prior matrix
dim(expression_matrix_dp)
dim(all_paths_cm)

# Inspect the structure (components and their sizes) of the precomputed SVD
str(svdres)

List of 3
 $ d: num [1:4345] 5017 3058 2698 2573 2253 ...
 $ u: num [1:6683, 1:4345] -0.01033 -0.01441 -0.00257 -0.01231 -0.0053 ...
 $ v: num [1:17382, 1:4345] -0.00908 0.00523 -0.00754 -0.00811 0.00894 ...
 - attr(*, "class")= chr "rsvd"


0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
-0.009084249,0.0048984083,0.003729253,-0.005068619,-0.001676492,0.0012167764,0.0004521597,-0.001731784,0.0028415248,-0.0108770568,⋯,0.010316521,-0.0018216236,0.009547465,0.0007294641,-0.010735104,-0.003858802,-0.001301917,-0.0065631243,0.01598244,0.005948169
0.005227597,-0.0006597834,0.007296205,0.005113093,0.00410684,0.0058876053,-0.0220267426,0.01199124,0.0042095739,-0.0094073135,⋯,0.005766039,0.0031255668,-0.017016444,0.0020608788,0.0178844464,0.010818435,0.008424042,-0.0046739341,-0.006138034,-0.0042348002
-0.007538556,0.0024068316,0.006820188,-0.006764531,0.001602638,0.0026827639,-0.0056995087,-0.003174828,-0.0118093426,-0.0031382752,⋯,0.008278355,0.0001931016,-0.005094602,0.0028290347,0.0009537677,0.00213248,0.012981176,-0.0209944783,-0.001496447,-0.0032482439
-0.00810876,0.0015281041,0.00459211,-0.007836804,0.002823339,0.0022566112,0.0031078898,-0.0001156762,0.0001972283,-0.0040141988,⋯,-0.012390217,-0.0031648826,0.009541483,0.0027030098,-0.0007984402,-0.012082119,0.010887267,-0.0022341602,-0.004085697,-0.0060873099
0.008938876,0.000691605,0.00466347,8.402255e-05,0.001085982,0.0026497992,0.0033395725,0.002563578,-0.0026726362,-0.0002996245,⋯,-0.006185586,-0.0048198469,0.003343766,-0.0021676098,0.0022386804,0.00348923,0.007948025,-0.0008856395,0.002323073,-0.0007892155
0.002806485,0.0052999336,0.003398989,-0.003305127,0.000603851,0.0004749406,0.0070639287,-2.017282e-05,-0.0012545485,-0.01068482,⋯,0.012219089,0.0194636303,0.011813677,0.0098170893,-0.0210587766,-0.002653762,0.006859786,-0.0036161516,-0.004182263,0.0163174848


In [None]:
# Preview the first rows of the Z component of the DelayedPLIER result
head(delayedPlier_result[["Z"]], n = 6L)

In [None]:
# Preview the first rows of the B component of the DelayedPLIER result
head(delayedPlier_result[["B"]], n = 6L)