# Create Human Trisome Project PLIER model

# Load libraries/modules

In [1]:
library(biomaRt)
library(here)
library(biomaRt)
library(DESeq2)
library(tidyverse)
library(rtracklayer)
library(dplyr)
library(GenomicRanges)
library(ggpubr)
library(cowplot)
# load plier utils
source(here::here('scripts/plier_util.R'))
`%>%` <- dplyr::`%>%`
library(PLIER)

here() starts at /home/msubirana/Documents/pivlab/plier_recount3

Loading required package: S4Vectors

Loading required package: stats4

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min



Attaching package: ‘S4Vectors’


The following object is masked from ‘package:utils’:

    findMatches


The following objects are masked from ‘package:base’:

    expand.grid, I, unname


Loading required package: IRanges

Loading required p

# Load data

In [2]:
# Directory that receives every artefact written by this notebook.
# recursive = TRUE creates missing parent directories as well; without it a
# fresh checkout lacking 'output/nbs' would fail here, and showWarnings = FALSE
# would hide that failure until the first saveRDS() call.
output_nb_path <- here::here('output/nbs/create_human_trisome_project_plier_model/')
dir.create(output_nb_path, showWarnings = FALSE, recursive = TRUE)

# Load PLIER pathway and cell type data
data(bloodCellMarkersIRISDMAP)
data(svmMarkers)
data(canonicalPathways)
data(chemgenPathways)
data(oncogenicPathways)
data(xCell)
data(immunePathways)

# Project-generated prior matrix; it is combined with the gene sets above via
# PLIER::combinePaths() in the preprocessing cell.
chr21_pathway <- readRDS(here::here('output/pathways/chr21_pathway.rds'))

## GSE190125 data

In [8]:
# Read the tab-separated count table shipped with GSE190125, keeping the
# column names exactly as they appear in the file.
counts_matrix <- here::here('data/GSE190125/GSE190125_Counts_for_GEO.txt')
raw_gene_counts_GSE190125 <- read.table(
  counts_matrix,
  header = TRUE,
  sep = "\t",
  check.names = FALSE
)

# Drop the annotation columns that are not needed, spread raw_count into one
# column per SampleID (combinations absent from the file are filled with 0),
# then expose the gene identifiers as the two leading columns
# Geneid / GeneSymbol.
gene_counts_GSE190125 <- raw_gene_counts_GSE190125 %>%
  dplyr::select(-c(chr, gene_type)) %>%
  tidyr::pivot_wider(
    names_from = SampleID,
    values_from = raw_count,
    values_fill = list(raw_count = 0)
  ) %>%
  dplyr::rename(Geneid = EnsemblID, GeneSymbol = gene_name) %>%
  dplyr::relocate(Geneid, GeneSymbol)

head(gene_counts_GSE190125)

In [None]:
# filter T21 samples
path_metadata_GSE190125 <- here::here('data/GSE190125/metadata_GSE190125.csv')
metadata_GSE190125 <- read.csv(path_metadata_GSE190125)
# The two metadata columns are renamed positionally to Group / Sample.
colnames(metadata_GSE190125) <- c('Group', 'Sample')

# Identifiers of the samples whose Group is "T21". The column selection below
# depends on this vector, so it must be defined for the cell to run.
t21_samples <- metadata_GSE190125 %>%
  dplyr::filter(Group == "T21") %>%
  dplyr::pull(Sample)

# An empty vector would collapse to the pattern "", which matches every
# column and would silently disable the filter.
stopifnot(length(t21_samples) > 0)

# Keep the two gene identifier columns plus every count column whose name
# matches one of the T21 sample identifiers. dplyr::select is namespaced (as
# in the reshaping cell) because Bioconductor packages also export select().
gene_counts_T21 <- gene_counts_GSE190125 %>%
  dplyr::select(Geneid, GeneSymbol, matches(paste(t21_samples, collapse = "|")))

head(gene_counts_T21)

In [11]:
# Annotation file handed to the TPM helper.
gtf_path <- here::here('data/GCF_000001405.38_GRCh38.p12_genomic.gtf')

# tpm_normalization() is not defined in this notebook; it presumably comes
# from scripts/plier_util.R, which is sourced in the first cell.
# NOTE(review): presumably the GTF supplies gene lengths for the TPM
# computation -- confirm against the helper's implementation.
tpm_gene_counts_GSE190125 <- tpm_normalization(
  gene_counts_GSE190125,
  gtf_path
)
head(tpm_gene_counts_GSE190125)

[1m[22mJoining with `by = join_by(GeneSymbol)`


Unnamed: 0,HTP0001B2,HTP0005A3,HTP0012A2,HTP0015A4,HTP0017A4,HTP0018B3,HTP0019B2,HTP0022B2,HTP0023A2,HTP0025A3,⋯,HTP0664A,HTP0665A,HTP0666A,HTP0667A,HTP0668A,HTP0669B,HTP0672A,HTP0676A,HTP0706A,HTP0708A
A1BG,1.213548628,0.74541369,0.599729304,0.32731643,1.3535745,0.9487994,0.6912469,1.097659107,0.6766134,0.27325337,⋯,0.255005402,0.52592489,0.28917116,0.4338034,1.0180578,1.07863098,0.66926351,1.31704224,0.422119153,1.10667053
A1BG.AS1,38.222498659,24.92606386,26.427416614,12.07818145,24.1682646,20.46200787,29.3621799,24.934104324,37.2498378,16.60207704,⋯,32.137704564,37.67020602,19.36197036,23.5854179,29.6142852,31.45654523,17.50423671,26.70369761,22.246704141,31.47242852
A1CF,0.003766156,0.0,0.003579259,0.0,0.0,0.0,0.0,0.0,0.0,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.040175335,0.05952886,0.025454407,0.07095019,0.1066928,0.03079474,0.1616818,0.079310732,0.1199986,0.03769264,⋯,0.052763272,0.08370718,0.06838003,0.1077101,0.0200616,0.04734121,0.09231839,0.04844618,0.005822723,0.07964579
A2M.AS1,7.22482254,7.02529435,5.042429225,12.28791629,5.7335597,7.13868477,9.2958055,7.779616698,10.7898108,6.35470746,⋯,4.052397341,2.16390527,4.32314182,9.3822274,4.481468,6.95520729,2.50756773,9.9033065,3.730342348,8.72800822
A2ML1,0.0,0.0,0.004770096,0.0,0.0,0.01731259,0.0,0.005404598,0.0,0.0,⋯,0.008789078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004364659,0.0


# PLIER preprocess

In [12]:
# Assign arguments to variables
output_file <- file.path(output_nb_path, 'htp_plier_prep.rds')

# Load data
expression_matrix <- tpm_gene_counts_GSE190125

# Prepare output directory: create the directory that will contain
# output_file. (Calling dirname() a second time here would create only its
# parent and leave the actual target directory missing.)
output_file_path <- dirname(output_file)
dir.create(output_file_path, showWarnings = FALSE, recursive = TRUE)

# Combine the pathway data from PLIER with the project chr21 prior
all_paths <- PLIER::combinePaths(
  bloodCellMarkersIRISDMAP, svmMarkers, canonicalPathways, chemgenPathways,
  oncogenicPathways, xCell, immunePathways, chr21_pathway
)

# combine_allPaths_expressionMatrix() is not defined in this notebook
# (presumably scripts/plier_util.R); it returns a list from which the
# expression matrix and the prior matrix used from here on are taken.
output_combine_allPaths_expressionMatrix <- combine_allPaths_expressionMatrix(
  expression_matrix,
  all_paths
)

expression_matrix_cm <- output_combine_allPaths_expressionMatrix$expression_matrix
all_paths_cm <- output_combine_allPaths_expressionMatrix$all_paths

# compute rsvd/svd
# Number of components: at least 200 (or a quarter of the samples if that is
# larger), but never more than the number of samples.
ns <- ncol(expression_matrix_cm)
message("Computing SVD")
# Seed set immediately before the randomized SVD so the result is reproducible.
set.seed(123456)
svdres <- rsvd(expression_matrix_cm, k = min(ns, max(200, ns / 4)), q = 3)
message("Done")

# Save the expression matrix, the prior information matrix and svdres to be
# supplied to PLIER::PLIER.
# NOTE(review): the values printed by head(expression_matrix_cm) further down
# do not look z-scored, while PLIER is later called with scale = FALSE --
# confirm whether row normalisation is expected before this point.
plier_data_list <- list("expression_matrix_cm" = expression_matrix_cm,
                        "all_paths_cm" = all_paths_cm,
                        "svdres" = svdres)

saveRDS(plier_data_list, file = output_file)

Computing SVD

Done



In [13]:
# Print the dimensions of the expression matrix and of the prior matrix that
# were stored in plier_data_list.
dim(expression_matrix_cm)
dim(all_paths_cm)

In [14]:
# Preview the first rows of the expression matrix passed to PLIER.
head(expression_matrix_cm)

Unnamed: 0,HTP0001B2,HTP0005A3,HTP0012A2,HTP0015A4,HTP0017A4,HTP0018B3,HTP0019B2,HTP0022B2,HTP0023A2,HTP0025A3,⋯,HTP0664A,HTP0665A,HTP0666A,HTP0667A,HTP0668A,HTP0669B,HTP0672A,HTP0676A,HTP0706A,HTP0708A
GAS6,1.6560099,2.1402519,1.261899,1.796398,1.2777484,2.1355887,1.639557,2.66673,1.0397606,1.9595894,⋯,2.2467221,3.941508,1.5489576,1.4465901,3.290959,1.6876835,1.9312305,1.7675533,3.055264,0.9686226
MMP14,2.4953244,1.9930306,1.199696,2.5450903,1.6079692,2.3627318,3.089905,1.2012204,2.4844051,1.8453553,⋯,1.8506383,1.3945887,1.8237727,1.4166992,1.5172404,1.8680206,2.0237556,1.1416627,3.3442387,2.5316326
MARCKSL1,276.7641962,270.4265478,279.941407,258.5501581,334.024471,254.0435752,319.173642,395.2752429,331.1200619,288.8777938,⋯,391.0419482,259.6474733,490.4523672,430.3365181,266.8795789,326.3788623,309.2610658,286.272958,432.2270856,351.7063226
SPARC,3.6546822,57.871576,14.476117,25.5837818,26.2252392,4.2739706,47.820102,37.2642962,40.2125047,32.8440224,⋯,44.6884146,11.1299801,35.8952044,66.026055,13.0404064,25.0934466,8.4555687,6.9162273,29.9525006,23.0135713
CTSD,112.4979271,173.5282446,191.496797,176.9456269,217.5920308,160.519251,264.594733,201.8425316,194.9738538,172.2037147,⋯,224.833589,240.7303094,428.5332534,407.3607449,206.2864302,222.347196,115.9697302,161.9213684,303.9147606,149.5313723
EPAS1,0.5784673,0.5292743,0.733014,0.4310616,0.8043436,0.5605247,1.146536,0.5680423,0.2825346,0.5529601,⋯,0.3886176,0.2061834,0.2817493,0.5104546,0.3978614,0.5328227,0.6047124,0.4014235,0.6548923,0.1767059


In [15]:
# Preview the first rows of the prior (gene set membership) matrix.
head(all_paths_cm)

Unnamed: 0,IRIS_Bcell-Memory_IgG_IgA,IRIS_Bcell-Memory_IgM,IRIS_Bcell-naive,IRIS_CD4Tcell-N0,IRIS_CD4Tcell-Th1-restimulated12hour,IRIS_CD4Tcell-Th1-restimulated48hour,IRIS_CD4Tcell-Th2-restimulated12hour,IRIS_CD4Tcell-Th2-restimulated48hour,IRIS_CD8Tcell-N0,IRIS_DendriticCell-Control,⋯,chr21_q21.1,chr21_q21.2,chr21_q21.3,chr21_q22.11,chr21_q22.12,chr21_q22.2,chr21_q22.13,chr21_q22.3,chr21_p12,chr21
GAS6,0,0,0,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0
MMP14,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
MARCKSL1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
SPARC,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
CTSD,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
EPAS1,0,0,0,0,0,0,0,0,0,1,⋯,0,0,0,0,0,0,0,0,0,0


In [3]:
# Restore the PLIER inputs written by the preprocessing cell so the model
# fitting can start from the saved RDS file instead of recomputing the SVD.
plier_data_list <- readRDS(file.path(output_nb_path, 'htp_plier_prep.rds'))

# Unpack the three stored components into their own variables.
expression_matrix_cm <- plier_data_list[["expression_matrix_cm"]]
all_paths_cm <- plier_data_list[["all_paths_cm"]]
svdres <- plier_data_list[["svdres"]]

In [None]:
#parameter_k <- c(0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2)

# Grid explored below: multipliers applied to the baseline number of latent
# variables, and the values passed as PLIER's `frac` argument.
parameter_k <- c(1.25, 1.5, 1.75, 2)
frac <- c(0.25, 0.5, 0.7, 1)

# Load data (identical for every model, so it is done once, outside the loops)
expression_matrix_cm <- plier_data_list$expression_matrix_cm
all_paths_cm <- plier_data_list$all_paths_cm
svdres <- plier_data_list$svdres

# Baseline k: twice the value returned by num.pc(), capped at 90% of the
# number of columns of the expression matrix. It depends on neither loop
# variable, so it is computed once here.
base_k <- min(num.pc(svdres) * 2, floor(ncol(expression_matrix_cm) * 0.9))

for (param_k in parameter_k) {
  # Compute k
  # NOTE(review): the multiplier is applied after the 90% cap, so for
  # param_k > 1 the final k can exceed that cap (and possibly the number of
  # components held in svdres) -- confirm PLIER accepts this.
  k <- round(base_k * param_k, 0)

  for (f in frac) {
    output_file <- file.path(output_nb_path, paste0('htp_plier_model_k', param_k, '_frac', f, '.rds'))

    message("k is set to ", k)

    # Run PLIER
    plier_result <- PLIER::PLIER(data = expression_matrix_cm, priorMat = all_paths_cm, svdres = svdres, k = k, frac = f, scale = FALSE)

    # Save results
    saveRDS(plier_result, file = output_file)
  }
}

k is set to 195

Removing 437 pathways with too few genes



[1] 1810.413
[1] "L2 is set to 1810.41272823216"
[1] "L1 is set to 905.206364116078"


errorY (SVD based:best possible) = 10430

New L3 is 0.00463091873353325

New L3 is 0.00408677143846407

New L3 is 0.00408677143846407

New L3 is 0.00408677143846407

New L3 is 0.00408677143846407

New L3 is 0.00408677143846407

New L3 is 0.00408677143846407

