# Create same model PLIER DelayedPLIER

Marc Subirana-Granés (2024)

Create a basic PLIER model to compare between the results between PLIER and DelayedPLIER

# Load libraries/modules

In [4]:
`%>%` <- dplyr::`%>%`
library(PLIER)
library(dplyr)
library(reticulate)
library(here)

# PLIER utils
source(here::here('scripts/plier_util.R'))

#delayedPLIER functions from repo
path_script_funcs = '/home/msubirana/Documents/pivlab/DelayedPLIER/funcs.R'
source(path_script_funcs)

# Load PLIER pathway and cell type data
data(bloodCellMarkersIRISDMAP)
data(svmMarkers)
data(canonicalPathways)

Loading required package: RColorBrewer

Loading required package: gplots


Attaching package: ‘gplots’


The following object is masked from ‘package:stats’:

    lowess


Loading required package: pheatmap

Loading required package: glmnet

Loading required package: Matrix

Loaded glmnet 4.1-8

Loading required package: knitr

Loading required package: rsvd

Loading required package: qvalue


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


here() starts at /home/msubirana/Documents/pivlab/plier_recount3

Loading required package: DelayedArray

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


Th

# Load data

In [5]:
# define output nb
output_nb_path = here('output/nbs/create_same_model_PLIER_DelayedPLIER')
dir.create(output_nb_path, showWarnings = FALSE)
expression_dataset_path <- here::here('output/gtex/GTEx_v8_gene_median_tpm.rds')

# Prepare data for all the models

In [7]:
# Load data
expression_dataset <- readRDS(expression_dataset_path)

# Remove gene ens id column and duplicate genes
expression_dataset <- subset(expression_dataset, select = -c(gene_ens_id))
expression_dataset <- expression_dataset[!duplicated(expression_dataset["gene_symbol"]),]

# Rename rows with gene symbols
rownames(expression_dataset) <- expression_dataset[,"gene_symbol"]

# Remove gene symbol column
expression_dataset <- subset(expression_dataset, select = -c(gene_symbol))

# Remove NA
expression_dataset = na.omit(expression_dataset)

# Convert to matrix
expression_matrix <- as.matrix(expression_dataset)

# Combine the pathway data from PLIER
all_paths <- PLIER::combinePaths(bloodCellMarkersIRISDMAP, svmMarkers, canonicalPathways)

# What genes are common to the pathway data and the expression matrix
cm_genes <- PLIER::commonRows(all_paths, expression_matrix)

# filter to common genes before row normalization to save on computation
expression_matrix_cm <- expression_matrix[cm_genes, ]

# Z-score normalization
expression_matrix_cm <- PLIER::rowNorm(expression_matrix_cm) 

# Remove NA
expression_matrix_cm=na.omit(expression_matrix_cm)

# What genes are common to the pathway data and the expression matrix
cm_genes <- PLIER::commonRows(all_paths, expression_matrix_cm)

# filter to common genes before row normalization to save on computation
expression_matrix_cm <- expression_matrix_cm[cm_genes, ]
all_paths_cm <- all_paths[cm_genes, ]

# PLIER preparation

In [8]:
ns=ncol(expression_matrix_cm)

message("Computing SVD")
if(ns>500){
    message("Using rsvd")
    set.seed(123456);
    svdres = BiocSingular::runRandomSVD(expression_matrix_cm, k = min(ns, max(200, ns/4)), center = F, scale = F)
} else {
    svdres=BiocSingular::runRandomSVD(expression_matrix_cm, k = min(ng, ns))
}

message("Done")

output_file_preplier <- file.path(output_nb_path, 'preplier_comparison_dp_p.rds')
plier_data_list <- list("expression_matrix_cm" = expression_matrix_cm,
                        "all_paths_cm" = all_paths[cm_genes, ],
                        "svdres" = svdres)

saveRDS(plier_data_list, file = output_file_preplier)

Computing SVD

Using rsvd

Done



# PLIER 

In [9]:
# Assign arguments to variables 
output_file_plier <- file.path(output_nb_path, 'PLIER_comparison_dp_p.rds')
parameter_k <- 1
frac <- 0.7

# Load data
expression_matrix_cm=plier_data_list$expression_matrix_cm
all_paths_cm=plier_data_list$all_paths_cm
svdres=plier_data_list$svdres

# compute k
k=num.pc(svdres)*2
k <- min(k, floor(ncol(expression_matrix_cm)*0.9))
k = k * parameter_k
k = round(k, 0)
message("k is set to ", k)

# Run PLIER (with common genes)
plier_result=PLIER::PLIER(data=expression_matrix_cm, priorMat=all_paths_cm , svdres=svdres, k=k, frac=frac, scale=FALSE)

# Prepare output directory
output_file_path=dirname(output_file)
dir.create(dirname(output_file_path), showWarnings = FALSE, recursive = TRUE)

# Save results
saveRDS(plier_result, file = output_file_plier)

k is set to 1042

Removing 4 pathways with too few genes



[1] 55.53993
[1] "L2 is set to 55.5399316449024"
[1] "L1 is set to 27.7699658224512"


errorY (SVD based:best possible) = 0.6327

New L3 is 0.000911881965554516

New L3 is 0.000804733010124613

New L3 is 0.000710174388842549

New L3 is 0.000804733010124613

New L3 is 0.000804733010124613

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.00103329763864764

New L3 is 0.000911881965554516

New L3 is 0.00103329763864764

Bdiff is not decreasing

Bdiff is not decreasing

Bdiff is not decreasing

New L3 is 0.000911881965554516

Bdiff is not decreasing

Bdiff is not decreasing

Bdiff is not decreasing

converged at  iteration 302 Bdiff is not decreasing

There are 216  LVs with AUC>0.70



ERROR: Error in h(simpleError(msg, call)): error in evaluating the argument 'path' in selecting a method for function 'dirname': object 'output_file' not found


# DelayedPLIER

In [10]:
output_file_preplier <- file.path(output_nb_path, 'preplier_comparison_dp_p.rds')
rds_preplier <- readRDS(output_file_preplier)
expression_matrix_cm <- rds_preplier$expression_matrix_cm
all_paths_cm <- rds_preplier$all_paths_cm
svdres <- rds_preplier$svdres

In [11]:
output_file_delayedPLIER_hdf5 <- file.path(output_nb_path, 'counts.hdf5')  
output_file_delayedPLIER_dim <- file.path(output_nb_path, 'dimnames.RDS') 

writeHDF5Array(expression_matrix_cm, filepath = output_file_delayedPLIER_hdf5, name = "count")
saveRDS(list(row.names = rownames(expression_matrix_cm) , col.names = colnames(expression_matrix_cm)), file = output_file_delayedPLIER_dim)

Can not create dataset. Object with name 'count' already exists.



ERROR: Error in H5Dcreate(loc$H5Identifier, dataset, tid, sid, dcpl = dcpl): HDF5. Dataset. Unable to initialize object.


In [18]:
# Check for correct PLIER (delayedPLIER repo) function
find("PLIER")
detach("package:PLIER", unload=TRUE)
message('detch PLIER')
find("PLIER")

ERROR: Error in detach("package:PLIER", unload = TRUE): invalid 'name' argument


In [None]:
setAutoRealizationBackend("HDF5Array") #supportedRealizationBackends(), getRealizationBackend()

sce <- DelayedArray(seed = HDF5ArraySeed(filepath = output_file_delayedPLIER_hdf5, name = "count"))
dimnamaes <- readRDS(output_file_delayedPLIER_dim)
rownames(sce) <- dimnamaes$row.names
colnames(sce) <- dimnamaes$col.names

sce[is.na(sce)] <- 0
expression_matrix_dp <- sce[which(DelayedMatrixStats::rowSds(sce) >0),]

# Assign arguments to variables 
output_file_delayedPlier <- file.path(output_nb_path, 'delayedPlier_comparison_dp_p.rds')
parameter_k <- 1
frac <- 0.7

# compute k
k=num.pc(svdres)*2
k <- min(k, floor(ncol(expression_matrix_cm)*0.9))
k = k * parameter_k
k = round(k, 0)
message("k is set to ", k)

# Run PLIER (with common genes)
delayedPlier_result=PLIER(data=expression_matrix_dp, priorMat=all_paths_cm , svdres=svdres, k=k, frac=frac, scale=FALSE)

# Prepare output directory
output_file_path=dirname(output_file_delayedPlier)
dir.create(dirname(output_file_delayedPlier), showWarnings = FALSE, recursive = TRUE)

# Save results
saveRDS(delayedPlier_result, file = output_file_delayedPlier)

k is set to 1042

Removing 4 pathways with too few genes



[1] 55.53993
[1] "L2 is set to 55.5399316449024"
[1] "L1 is set to 27.7699658224512"


errorY (SVD based:best possible) = 0.6327

New L3 is 0.000911881965554516

New L3 is 0.000804733010124613

New L3 is 0.000710174388842549

New L3 is 0.000804733010124613

New L3 is 0.000804733010124613

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.000911881965554516

New L3 is 0.00103329763864764

New L3 is 0.000911881965554516

New L3 is 0.00103329763864764

Bdiff is not decreasing

Bdiff is not decreasing

Bdiff is not decreasing



In [22]:
delayedPlier_result <- readRDS(output_file_delayedPlier)

In [23]:
head(delayedPlier_result$Z)

<6 x 1042> DelayedMatrix object of type "double":
                [,1]        [,2]        [,3] ...    [,1041]    [,1042]
GAS6     0.274305392 0.087087797 0.675887955   . 0.00000000 0.04389087
MMP14    0.000000000 0.000000000 0.000000000   . 0.00000000 0.00000000
MARCKSL1 0.000000000 0.000000000 0.063091941   . 0.00000000 0.00000000
SPARC    0.000000000 0.000000000 0.633009784   . 0.00000000 0.00000000
CTSD     0.026459878 0.000000000 0.011039844   . 0.00000000 0.00000000
EPAS1    0.000000000 0.009669602 0.078999045   . 0.07326930 0.00000000

In [24]:
head(delayedPlier_result$B)

<6 x 17382> DelayedMatrix object of type "double":
                                           GTEX-1117F-0226-SM-5GZZ7 ...
LV 1                                                     0.03453201   .
2,IRIS_Neutrophil-Resting                               -0.03266857   .
3,KEGG_REGULATION_OF_ACTIN_CYTOSKELETON                 -0.01924855   .
4,REACTOME_DNA_STRAND_ELONGATION                        -0.08761484   .
5,REACTOME_NEURONAL_SYSTEM                              -0.03663372   .
6,MIPS_39S_RIBOSOMAL_SUBUNIT_MITOCHONDRIAL              -0.10118090   .
                                            GTEX-ZZPU-2726-SM-5NQ8O
LV 1                                                    -0.02232031
2,IRIS_Neutrophil-Resting                                0.04354584
3,KEGG_REGULATION_OF_ACTIN_CYTOSKELETON                 -0.03377098
4,REACTOME_DNA_STRAND_ELONGATION                        -0.04456839
5,REACTOME_NEURONAL_SYSTEM                              -0.03198667
6,MIPS_39S_RIBOSOMAL_SUBUNIT_MITOCHON

In [26]:
plier_result <- readRDS(output_file_plier)

In [28]:
head(plier_result$Z)

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
GAS6,0.0,0,0.8088228,0.0,0.0,0.02247598,0.0,0,0.722732686,0.0,⋯,0.0,0.01289279,0.0,0.0,0.0,0.0,0,0.0,0.02679134,0.5149712
MMP14,0.0,0,0.0,0.03365089,0.04630009,0.0,0.0,0,0.631639811,0.0,⋯,0.0,0.08506112,0.1338525,0.0,0.0,0.0,0,0.006647826,0.0,0.3673954
MARCKSL1,0.01613378,0,0.0,0.0,0.0668888,0.066676153,0.0,0,0.007572313,0.0,⋯,0.09874767,0.0,0.0,0.0,0.00691434,0.008059198,0,0.0,0.08562033,0.2394095
SPARC,0.0,0,0.2178293,0.05884839,0.0,0.0,0.0,0,0.307728319,0.0,⋯,0.0,0.0,0.0,0.02233023,0.0,0.0,0,0.094854111,0.0,0.2220075
CTSD,0.36951508,0,0.4789879,0.11634386,0.0,0.004501709,0.0,0,1.338231068,0.006038829,⋯,0.0,0.02404874,0.0,0.0,0.03719229,0.0,0,0.0,0.0,0.3540167
EPAS1,0.0,0,0.0,0.0,0.0,0.019567018,0.07301501,0,0.139432989,0.0,⋯,0.0,0.0,0.0,1.31723287,0.75172429,0.0,0,0.10046577,0.0,0.1011839


In [29]:
head(plier_result$B)

Unnamed: 0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,⋯,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
"1,IRIS_Neutrophil-Resting",-0.06466991,-0.05444442,-0.11487165,-0.035270359,-0.016221577,-0.08795147,-0.043152199,-0.02652864,-0.01357084,-0.06115114,⋯,-0.015111653,-0.0081364322,0.01361394,0.0091803918,-0.003859179,-0.01732336,0.00898762,-0.027158754,0.008166765,-0.067802356
"2,IRIS_Neutrophil-Resting",-0.05571578,-0.01590762,0.01577313,0.020790523,-0.008301199,0.08083894,-0.017205464,-0.02094966,0.01108145,-0.01793904,⋯,0.004208666,0.0250979196,0.01624681,0.0136984738,0.025682792,0.02891395,0.03305504,0.016822676,-0.011734695,0.003665061
LV 3,-0.03838166,0.03482423,-0.02717656,0.082874772,0.01486442,-0.0379524,0.067458966,0.05625198,0.02319226,0.04187308,⋯,0.039603366,-0.0131135798,-0.0370263,0.0153230015,-0.048164293,-0.03551308,0.023533739,0.060811821,0.014446735,-0.010462315
"4,REACTOME_DNA_STRAND_ELONGATION",-0.04186157,-0.05852824,-0.0840443,-0.093036016,-0.044522459,-0.1094229,-0.074321773,0.07691609,-0.01977538,-0.05429195,⋯,-0.033256591,-0.0622496509,-0.08244244,-0.0331149318,-0.094837675,-0.01099475,0.350614733,-0.114707087,-0.032924232,-0.059023543
LV 5,-0.06491813,-0.02373363,-0.07026314,-0.002406268,0.014472661,-0.02067548,-0.002451937,-0.01588877,0.01729428,0.01426269,⋯,0.043720364,0.0366697386,0.02405436,0.0005016018,0.003232915,0.02730512,-0.005763948,0.001769326,-0.035416927,-0.016337502
"6,REACTOME_CELL_CYCLE",-0.05129524,-0.04315978,-0.09255232,-0.002334088,-0.055644603,-0.07307825,-0.013328589,-0.08143266,-0.04448419,-0.01763071,⋯,-0.035534652,-0.0002110482,-0.0546757,-0.0218612034,-0.013739038,-0.09074433,-0.033209628,-0.015577431,-0.039951759,-0.028047764


In [7]:
output_file_plier <- file.path(output_nb_path, 'PLIER_comparison_dp_p.rds')
plier_result <- readRDS(output_file_plier)

output_file_delayedPlier <- file.path(output_nb_path, 'delayedPlier_comparison_dp_p.rds')
delayedPlier_result <- readRDS(output_file_delayedPlier)

# Save as pickle

In [8]:
library(reticulate)

save_as_pickle <- function(object, filename, save_directory) {
  full_path <- file.path(save_directory, filename)
  py_save_object(r_to_py(object), full_path)
}

PLIER_model_to_pickle = function(PLIER_model, save_directory){
    
    # Check if the directory exists, create if it does not
    if (!dir.exists(save_directory)) {
      dir.create(save_directory, recursive = TRUE)
    }
    
    # Assuming gtex_tmp_1 is a list with various data types
    names_list <- names(PLIER_model)
    
    for (name in names_list) {
      element <- PLIER_model[[name]]
      if (is.matrix(element) || is.array(element)) {
        # Convert matrices/arrays to data frames before saving
        df <- as.data.frame(element)
        save_as_pickle(df, paste0(name, ".pkl"), save_directory)
      } else {
        # Save other data types directly
        save_as_pickle(element, paste0(name, ".pkl"), save_directory)
      }
    }  
}

In [32]:
PLIER_model_to_pickle(plier_result, file.path(output_nb_path, 'plier_result'))

In [16]:
plier_Z <- data.frame(plier_result$Z)
colnames(plier_Z) <- paste0('LV', 1:ncol(plier_Z))
head(plier_Z)

Unnamed: 0_level_0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,⋯,LV1033,LV1034,LV1035,LV1036,LV1037,LV1038,LV1039,LV1040,LV1041,LV1042
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
GAS6,0.0,0,0.8088228,0.0,0.0,0.02247598,0.0,0,0.722732686,0.0,⋯,0.0,0.01289279,0.0,0.0,0.0,0.0,0,0.0,0.02679134,0.5149712
MMP14,0.0,0,0.0,0.03365089,0.04630009,0.0,0.0,0,0.631639811,0.0,⋯,0.0,0.08506112,0.1338525,0.0,0.0,0.0,0,0.006647826,0.0,0.3673954
MARCKSL1,0.01613378,0,0.0,0.0,0.0668888,0.066676153,0.0,0,0.007572313,0.0,⋯,0.09874767,0.0,0.0,0.0,0.00691434,0.008059198,0,0.0,0.08562033,0.2394095
SPARC,0.0,0,0.2178293,0.05884839,0.0,0.0,0.0,0,0.307728319,0.0,⋯,0.0,0.0,0.0,0.02233023,0.0,0.0,0,0.094854111,0.0,0.2220075
CTSD,0.36951508,0,0.4789879,0.11634386,0.0,0.004501709,0.0,0,1.338231068,0.006038829,⋯,0.0,0.02404874,0.0,0.0,0.03719229,0.0,0,0.0,0.0,0.3540167
EPAS1,0.0,0,0.0,0.0,0.0,0.019567018,0.07301501,0,0.139432989,0.0,⋯,0.0,0.0,0.0,1.31723287,0.75172429,0.0,0,0.10046577,0.0,0.1011839


In [15]:
delayedPlier_Z <- data.frame(delayedPlier_result$Z)
colnames(delayedPlier_Z) <- paste0('LV', 1:ncol(delayedPlier_Z))
head(delayedPlier_Z)

Unnamed: 0_level_0,LV1,LV2,LV3,LV4,LV5,LV6,LV7,LV8,LV9,LV10,⋯,LV1033,LV1034,LV1035,LV1036,LV1037,LV1038,LV1039,LV1040,LV1041,LV1042
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
GAS6,0.27430539,0.087087797,0.67588796,0.006886257,0,0.039737414,0.0,0.148833698,0.62829993,0.0,⋯,0.0,0.0,0.20816,0.0,0.0,0.05558472,0,0.0,0.0,0.04389087
MMP14,0.0,0.0,0.0,0.059793633,0,0.0,0.1319655,0.0885056077,0.79200247,0.05847276,⋯,0.0,0.0,0.0,0.0,0.03387982,0.05303867,0,0.0,0.0,0.0
MARCKSL1,0.0,0.0,0.06309194,0.0,0,0.056461721,0.0,0.0173531588,0.00569385,0.0,⋯,0.1730694,0.0,0.0440362,0.004335352,0.0,0.0,0,0.0,0.0,0.0
SPARC,0.0,0.0,0.63300978,0.09392905,0,0.006519107,0.0,0.0150254065,0.31534125,0.10027196,⋯,0.0,0.0,0.0,0.0,0.10756563,0.0,0,0.0,0.0,0.0
CTSD,0.02645988,0.0,0.01103984,0.095672291,0,0.0,0.0,0.1255902669,1.51097641,0.0,⋯,0.0,0.0,0.0,0.177918538,0.0,0.0,0,0.1559392,0.0,0.0
EPAS1,0.0,0.009669602,0.07899905,0.0,0,0.0522011,0.1557273,0.0005632658,0.22821122,0.06532447,⋯,0.0,0.00571755,0.0,0.0,0.44298536,0.0,0,0.0,0.0732693,0.0


In [18]:
save_as_pickle(delayedPlier_Z, 'delayedPlier_Z.pkl', output_nb_path)

In [19]:
save_as_pickle(plier_Z, 'plier_Z.pkl', output_nb_path)