# GTEx model building with GenomicSuperSignature

## Libraries

In [None]:
library(here)
library(matrixStats)
library(factoextra)
library(cluster)
library(GenomicSuperSignature)
library(msigdbr)
library(fgsea)
library(dplyr)
library(tibble)

## Input

In [2]:
# Load the filtered GTEx expression table from the project's output folder
# and preview its first rows.
gtex_data <- here("output/gtex/df_gtex_fbm_filt.rds") |>
  readRDS()
head(gtex_data)

Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,⋯,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
WASH7P,3.2874723,2.2812531,3.0616034,3.5933538,2.1063483,2.6755901,3.6993295,4.1659119,3.4646683,3.7548875,⋯,1.3818371,1.708408,2.674913,1.7268312,1.789103,2.328549,1.5469562,1.9474791,0.9027296,1.663117
RP11-34P13.15,2.0755326,0.3210045,1.2363395,1.5165195,0.9458324,1.760008,1.4302853,2.9412941,1.6031219,2.5395311,⋯,2.1156992,3.018812,5.148934,3.1322478,2.537545,4.292782,3.3077199,2.1210154,0.7822408,2.867106
RP11-34P13.16,3.0021624,0.5819778,0.5661819,1.5134907,1.4745658,2.2097653,1.08882,3.6701605,2.304511,2.7911889,⋯,3.2237314,4.214125,5.580447,4.1667154,3.230818,5.26341,4.5014391,2.7152345,0.9684963,3.764474
RP11-34P13.18,3.5741015,2.4800066,3.4025858,3.4672795,1.8976277,2.4192692,4.1424134,3.7949357,3.0134623,3.6519127,⋯,1.75446,2.301295,2.827616,1.9500951,2.272023,2.955871,1.6780719,2.3152764,1.4076247,2.553361
AP006222.2,0.8335783,0.2752455,0.4659222,0.4257815,0.2309408,0.2118844,0.2096405,0.1629834,0.3923174,0.1650436,⋯,0.6360793,1.226509,1.14991,0.4867659,0.625177,1.531569,0.4006472,0.5525738,0.4739427,1.921817
MTND1P23,3.252779,5.0386996,3.8288346,2.3030501,3.257765,5.165108,2.9783787,2.5192903,4.2517191,3.3279747,⋯,6.4319572,3.848998,3.836934,3.6892992,4.092546,3.941106,3.3367118,3.9030383,4.6707271,3.879706


# PCA

In [17]:
# Model-building parameters.
study <- "GTEx"  # label used as the list key and as the PC column-name prefix
n <- 412         # number of leading principal components to keep
d <- 4           # divisor applied to the PC count when choosing the cluster count

In [4]:
# gtex_data holds genes in rows and samples in columns, so it is transposed
# to give prcomp() one row per sample; prcomp() runs with its default settings.
pca_res <- gtex_data |>
  as.matrix() |>
  t() |>
  prcomp()

In [6]:
# Start the per-study container with the loadings of the first n PCs and
# label those columns "<study>.PC<i>".
trainingData_PCA <- list()
trainingData_PCA[[study]] <- list(rotation = pca_res[["rotation"]][, 1:n])
colnames(trainingData_PCA[[study]][["rotation"]]) <- paste0(study, ".PC", 1:n)

In [7]:
# Squared standard deviations of the PCs, i.e. the variance carried by each.
eigs <- (pca_res[["sdev"]])^2

In [8]:
# Per-PC summary table (one column per PC): standard deviation, share of the
# total variance, and running cumulative share.
pca_summary <- local({
  total_variance <- sum(eigs)
  rbind(
    SD         = sqrt(eigs),
    Variance   = eigs / total_variance,
    Cumulative = cumsum(eigs) / total_variance
  )
})

In [9]:
# Store the summary columns of the first n PCs in the study's container.
trainingData_PCA[[study]][["variance"]] <- pca_summary[, 1:n]

In [10]:
# Label the variance columns "<study>.PC<i>".
colnames(trainingData_PCA[[study]][["variance"]]) <- paste0(study, ".PC", 1:n)

# Hierarchical Clustering

In [14]:
# allZ: retained loadings with one column per PC, stored as doubles.
# all:  its transpose (one row per PC), used as input for the distance step.
# NOTE(review): the variable `all` masks the name of base::all(); calls such
# as all(x) still resolve to the base function.
allZ <- trainingData_PCA[[study]][["rotation"]]
storage.mode(allZ) <- "double"
all <- t(allZ)

In [15]:
# Spearman-based distances between the rows of `all`.
res.dist <- factoextra::get_dist(x = all, method = "spearman")

In [18]:
# The cluster count is the number of rows in `all` divided by d, rounded to
# a whole number. The hierarchical tree (hclust, ward.D) is then cut into
# that many clusters.
k <- round(nrow(all) / d, digits = 0)
res.hcut <- factoextra::hcut(
  x         = res.dist,
  k         = k,
  hc_func   = "hclust",
  hc_method = "ward.D",
  hc_metric = "spearman"
)

In [20]:
# Build avgLoading: combine the PC loadings in allZ according to the cluster
# membership from the tree cut.
# `n` is passed explicitly so that the number of PCs recorded in the result
# matches the n used when the loadings were selected, rather than
# buildAvgLoading()'s default value.
# NOTE(review): assumes `n` is only recorded in the returned list and does not
# subset the input — confirm against the installed GenomicSuperSignature.
trainingData_PCclusters <- buildAvgLoading(
  allZ,
  k,
  n       = n,
  cluster = res.hcut$cluster
)

In [21]:
# Average silhouette width of every cluster, computed from the cluster labels
# and the distance object, then stored on the clustering result as `sw`.
cl <- trainingData_PCclusters[["cluster"]]
silh_res <- cluster::silhouette(x = cl, dist = res.dist)
cl_silh_width <- summary(silh_res)[["clus.avg.widths"]]
trainingData_PCclusters[["sw"]] <- cl_silh_width

# Final model

In [26]:
# One-row table, named after the study, whose single list column holds the
# PCA summary matrix.
trainingData_df <- DataFrame(
  PCAsummary = I(list(trainingData_PCA[[study]][["variance"]])),
  row.names  = study
)

In [27]:
# Assemble the PCAGenomicSignatures object: the averaged loadings become the
# "RAVindex" assay and the one-row summary table becomes the training data.
RAVmodel <- PCAGenomicSignatures(
  trainingData = trainingData_df,
  assays       = list(
    RAVindex = as.matrix(trainingData_PCclusters[["avgLoading"]])
  )
)

In [28]:
# Copy the clustering details into the model metadata (mirrors the
# multi-study build) and name the cluster sizes RAV1, RAV2, ...
metadata(RAVmodel) <- trainingData_PCclusters[c("cluster", "size", "k", "n")]
names(metadata(RAVmodel)[["size"]]) <- paste0("RAV", seq_len(ncol(RAVmodel)))

# Free-form gene-set label; change as needed.
geneSets(RAVmodel) <- "Custom"
# Studies contributing to each cluster, as returned by the clustering step.
studies(RAVmodel) <- trainingData_PCclusters[["studies"]]
silhouetteWidth(RAVmodel) <- trainingData_PCclusters[["sw"]]
updateNote(RAVmodel) <- paste0("Single-matrix GTEx model; PCs = ", n, ".")
metadata(RAVmodel)[["version"]] <- "0.1.0-single"

# Print the assembled model.
RAVmodel

class: PCAGenomicSignatures 
dim: 21613 103 
metadata(7): cluster size ... updateNote version
assays(1): RAVindex
rownames(21613): WASH7P RP11-34P13.15 ... MT-TT MT-TP
rowData names(0):
colnames(103): Cl103_01 (2/1) Cl103_02 (8/1) ... Cl103_102 (2/1)
  Cl103_103 (4/1)
colData names(2): studies silhouetteWidth
trainingData(1): PCAsummary
trainingData names(1): GTEx

In [37]:
# Gene sets for the enrichment step: human MSigDB category C2, reshaped into
# a named list that maps each gene-set name to its gene symbols.
msig_category <- "C2"
msig_df <- msigdbr(species = "Homo sapiens", category = msig_category)
pathways <- with(msig_df, split(gene_symbol, gs_name))

# Averaged-loading matrix. Its row names become the names of the ranking
# statistics below, so they must be present. The column names identify the
# individual RAVs.
RAVindex <- trainingData_PCclusters[["avgLoading"]] |>
  as.matrix()
stopifnot(!is.null(rownames(RAVindex)))
rav_names <- colnames(RAVindex)

# Turn a named numeric vector of gene-level statistics into a ranking vector.
#
# Steps: drop non-finite entries, collapse duplicated names to the entry with
# the largest absolute value (the first such entry on ties), then sort in
# decreasing order.
#
# v: named numeric vector.
# Returns a plain named numeric vector (no array `dim` attribute); it is
# empty when no finite value remains.
prep_ranks <- function(v) {
  stopifnot(is.numeric(v), !is.null(names(v)))
  v <- v[is.finite(v)]
  per_name <- split(v, names(v))
  strongest <- vapply(
    per_name,
    function(x) x[[which.max(abs(x))]],
    numeric(1)
  )
  sort(strongest, decreasing = TRUE)
}

# Run fgsea once per RAV, ranking genes by their averaged loading, and keep
# one result table per RAV (named by the RAV column names).
#
# fgsea estimates P-values by random sampling, so the RNG is seeded here to
# make the stored enrichment tables reproducible across runs.
set.seed(1)

gsea_list <- vector("list", length(rav_names))
names(gsea_list) <- rav_names

for (j in seq_along(rav_names)) {
  # Loadings of RAV j, named by gene, cleaned and sorted for fgsea.
  ranks <- prep_ranks(setNames(RAVindex[, j], rownames(RAVindex)))

  # Use the one-sided score type when every statistic has the same sign;
  # otherwise use the standard two-sided score.
  stype <- if (all(ranks >= 0)) {
    "pos"
  } else if (all(ranks <= 0)) {
    "neg"
  } else {
    "std"
  }

  # Most significant pathways first; ties broken by larger NES.
  res <- fgsea(
    pathways  = pathways,
    stats     = ranks,
    minSize   = 10,
    maxSize   = 5000,
    scoreType = stype
  ) |>
    arrange(padj, desc(NES)) |>
    as_tibble()

  gsea_list[[j]] <- res
}

# Store the per-RAV enrichment tables on the model, record which MSigDB
# category produced them, and append "+gsea" to the version string.
gsea(RAVmodel) <- gsea_list
metadata(RAVmodel)[["gsea_collection"]] <- msig_category
metadata(RAVmodel)[["version"]] <- paste0(
  metadata(RAVmodel)[["version"]],
  "+gsea"
)

“There were 28 pathways for which P-values were not calculated properly due to unbalanced (positive and negative) gene-level statistic values. For such pathways pval, padj, NES, log2err are set to NA. You can try to increase the value of the argument nPermSimple (for example set it nPermSimple = 10000)”
“For some pathways, in reality P-values are less than 1e-50. You can set the `eps` argument to zero for better estimation.”
“For some pathways, in reality P-values are less than 1e-50. You can set the `eps` argument to zero for better estimation.”
“For some pathways, in reality P-values are less than 1e-50. You can set the `eps` argument to zero for better estimation.”
“For some pathways, in reality P-values are less than 1e-50. You can set the `eps` argument to zero for better estimation.”
“There were 1 pathways for which P-values were not calculated properly due to unbalanced (positive and negative) gene-level statistic values. For such pathways pval, padj, NES, log2err are set to NA.

In [38]:
# Pull the "RAVindex" assay back out of the model and preview it as a
# data frame.
B <- assays(RAVmodel)[["RAVindex"]]
B_df <- B |>
  as.data.frame()
head(B_df)

Unnamed: 0_level_0,Cl103_01 (2/1),Cl103_02 (8/1),Cl103_03 (2/1),Cl103_04 (5/1),Cl103_05 (2/1),Cl103_06 (2/1),Cl103_07 (2/1),Cl103_08 (5/1),Cl103_09 (7/1),Cl103_10 (5/1),⋯,Cl103_94 (4/1),Cl103_95 (4/1),Cl103_96 (2/1),Cl103_97 (4/1),Cl103_98 (4/1),Cl103_99 (5/1),Cl103_100 (4/1),Cl103_101 (5/1),Cl103_102 (2/1),Cl103_103 (4/1)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
WASH7P,0.000869914,-0.0035749177,0.0029476923,-0.0026787767,0.0027500734,-0.0002387846,-0.0048089836,0.0038364528,0.0003791668,0.0009981635,⋯,-0.001787915,0.0003578746,-0.0005832962,0.0040817601,-0.001411053,-0.00208124,0.0001816317,0.0005391983,0.001557177,0.003603127
RP11-34P13.15,-0.0075502758,0.0010751687,0.0009739744,-0.0032808487,0.0111387793,0.0146036492,-0.0007845854,-0.0009492809,0.004970048,-0.0011622949,⋯,0.0004421878,-0.008726541,0.0110645077,0.0122221672,-0.006928927,0.0047774797,0.0032732918,0.0021879293,0.013354866,-0.004265338
RP11-34P13.16,-0.0074202422,0.0028494454,-0.0022909236,-0.0024943045,0.0114231988,0.0166943166,-0.0016625884,-0.0017790979,0.0033235212,-0.0011495053,⋯,-0.0014214585,-0.01047174,0.0112652156,0.0153935593,-0.01258905,0.0072055335,0.0042382958,0.0066718938,0.012464723,-0.00642105
RP11-34P13.18,0.0008493924,-0.0015053055,-0.0015763311,-0.0008138962,0.0039306403,0.0046075158,-0.0054816186,-8.6253e-05,0.0011749211,0.0023460318,⋯,0.0006688386,-0.000489875,0.0018164222,0.0070893039,-0.005717554,0.0042241904,-0.0030071179,0.0024588334,-0.00636949,-0.001617729
AP006222.2,-0.0008161517,-0.0008667556,8.83724e-05,-0.0023678468,0.0007571445,0.0013819781,0.0004960214,0.0005270534,0.0011334137,0.0004653769,⋯,-0.0014173048,0.004225186,0.0028545983,-0.0008303019,0.001161698,0.000968592,-0.000952604,0.0011771213,-0.00335135,-0.003028939
MTND1P23,0.0002052102,-0.012271416,0.0036115216,-0.0117400614,0.0010318294,-0.0292359129,0.0076769842,0.0124613085,-0.0198091259,-0.0085543305,⋯,0.010122949,5.703683e-05,-0.0120919581,0.0058164568,0.002709946,0.0004445082,-0.00061869,0.002757809,0.007278513,-0.008509986


In [47]:
library(GenomicSuperSignature)

# Expression input as a double-precision matrix (genes in rows, samples in
# columns) with row names forced to be unique.
expr <- gtex_data |>
  as.matrix()
storage.mode(expr) <- "double"
rownames(expr) <- make.unique(rownames(expr))

# Loadings stored in the model (genes in rows, RAVs in columns).
RAVindex <- as.matrix(assays(RAVmodel)[["RAVindex"]])
storage.mode(RAVindex) <- "double"

# Restrict both matrices to the shared genes, in the same (sorted) order.
common <- rownames(expr) |>
  intersect(rownames(RAVindex)) |>
  sort()
expr_c <- expr[common, , drop = FALSE]
RAVindex_c <- RAVindex[common, , drop = FALSE]

# RAV x sample scores: t(RAVindex_c) %*% expr_c.
# NOTE(review): expr is projected as-is (no gene-wise centering in this
# cell), whereas prcomp() centers its input by default when the loadings
# are derived — confirm that uncentered scores are intended.
B_RAVxSample <- crossprod(RAVindex_c, expr_c)

# Optional gene x sample reconstruction from the scores.
# NOTE(review): X_hat is a dense genes x samples matrix and is only used for
# the dimension check below — consider skipping it when memory is tight.
X_hat <- RAVindex_c %*% B_RAVxSample

# Dimension checks
dim(expr_c)        # genes x samples (input)
dim(RAVindex_c)    # genes x RAVs
dim(B_RAVxSample)  # RAVs x samples
dim(X_hat)         # genes x samples (reconstruction)


In [49]:
# Preview the first six rows of the score matrix and report its dimensions.
head(B_RAVxSample, n = 6L)
dim(B_RAVxSample)

Unnamed: 0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,⋯,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
Cl103_01 (2/1),235.42092,177.824397,239.7449,256.283375,189.732562,201.970396,251.648378,175.27062,214.219362,180.217961,⋯,202.63058,194.46731,207.14608,177.893278,226.1261,226.67663,133.989369,244.818671,179.563386,227.076513
Cl103_02 (8/1),12.86411,10.037433,12.04024,13.714875,9.418049,6.351568,18.321228,9.630842,11.065404,9.422213,⋯,11.78336,13.28835,13.58021,9.903368,13.38798,15.77753,6.267757,12.845496,9.145418,9.477471
Cl103_03 (2/1),-22.27538,8.278805,-27.51337,-35.27585,-19.753472,-38.12037,-34.897845,14.135122,-22.603801,21.592462,⋯,-14.80876,-24.96068,-29.87867,-16.664926,-15.88173,-22.91118,28.030285,-19.022171,9.344022,-29.514141
Cl103_04 (5/1),-10.63538,-24.615589,-12.12097,-9.066849,-9.115806,-4.837088,-8.825615,-9.565847,-7.581813,-3.694665,⋯,-10.63398,-15.28232,-10.18097,-6.848632,-10.20117,-16.91394,-8.576566,-9.838294,-22.702921,-6.594538
Cl103_05 (2/1),-69.92059,-62.762218,-69.64985,-61.946269,-52.719924,-72.337063,-53.234672,-70.523514,-64.860939,-72.208926,⋯,-58.383,-60.52823,-39.49516,-53.710738,-71.10295,-47.97906,-80.394418,-75.902548,-75.713397,-80.097618
Cl103_06 (2/1),41.31726,65.358315,47.19061,36.901868,39.151641,29.256938,41.984205,41.580664,32.490621,51.04787,⋯,44.17123,22.92701,32.33266,17.735377,36.63709,27.52369,29.241212,39.768066,64.788589,36.611757


In [50]:
# Folder for the exported results. recursive = TRUE also creates any missing
# parent folders, so a fresh checkout does not fail silently here.
# showWarnings = FALSE keeps reruns quiet when the folder already exists.
output_dir <- here("output/gtex/GenomicSuperSignature")
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

In [53]:
# Export the score matrix as CSV, with its row names as the first column.
# output_dir is already a full path built with here(), so the file name is
# appended with file.path() instead of being passed through here() a second
# time, which only works on versions that leave absolute paths untouched.
write.csv(
  B_RAVxSample,
  file  = file.path(output_dir, "gtex_B.csv"),
  quote = FALSE
)