# Project GTEx into ARCHS4 model

In [86]:
library(here)
library(dplyr)
library(hdf5r)
library(data.table)
library(tibble)
library(Rtsne)
library(ggplot2)
library(scales)

source(here("config.R"))

In [2]:
# Load the two ARCHS4 inputs from the project tree (paths resolved by here()):
# the PLIER2 model object and the trait table.
archs4_plier2 <- readRDS(here("data", "archs4", "model", "archs4", "archs4_PLIER2.rds"))
archs4_traits <- readRDS(here("data", "archs4", "traits", "archs4-phenoplier.rds"))

In [3]:
# Convert the model's B component to a base matrix, then to a data frame.
archs4_b <- archs4_plier2$B %>%
  as.matrix() %>%
  as.data.frame()

Loading required package: Matrix



In [None]:
# Collect the GEO accessions of the ARCHS4 samples whose single-cell
# probability is below 0.5 and cache them as an RDS file in the ARCHS4
# dataset folder.
source(here("config.R"))

output_dir <- config$ARCHS4$DATASET_FOLDER
# Resolve the destination first so that the directory being created is the
# one the file is actually written to.
samples_rds <- here(output_dir, 'samples_archs4.rds')
dir.create(dirname(samples_rds), showWarnings = FALSE, recursive = TRUE)

h5 <- H5File$new(here('data/archs4/human_gene_v2.5.h5'), mode = "r")
# tryCatch(finally = ) releases the HDF5 handle even when a read fails;
# on.exit() is not usable at the top level of a notebook cell.
tryCatch(
  {
    sc_samples <- h5[["/meta/samples/singlecellprobability"]]$read()
    id_samples <- h5[["/meta/samples/geo_accession"]]$read()
  },
  finally = h5$close_all()
)

# which() ignores NA, so only samples with a known probability < 0.5 are kept.
samples_idx <- which(sc_samples < 0.5)
filtered_ids <- id_samples[samples_idx]
saveRDS(filtered_ids, samples_rds)

In [5]:
# Sanity check: one sample accession per column of B.
length(filtered_ids) == ncol(archs4_b)

In [6]:
# Label the columns of B with the sample accessions. Stop on a length mismatch:
# assigning too few names to a data frame silently pads the rest with NA.
stopifnot(length(filtered_ids) == ncol(archs4_b))
colnames(archs4_b) <- filtered_ids

In [7]:
# Peek at the top-left 5 x 5 corner of B.
archs4_b[seq_len(5), seq_len(5)]

Unnamed: 0_level_0,GSM1000981,GSM1000982,GSM1000983,GSM1000984,GSM1000985
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
LV1,0.07398552,0.06549906,0.08115003,0.07502413,0.06739754
LV2,-0.03746596,-0.03999266,-0.03714191,-0.03785702,-0.03428144
LV3,-0.13033351,-0.13065868,-0.13684181,-0.14037599,-0.13831017
LV4,-0.08857614,-0.08829065,-0.08617926,-0.09295446,-0.09596845
LV5,0.04987766,0.04897318,0.04881013,0.05693444,0.05454549


In [None]:
# The model's gene identifiers are the row names of its Z matrix.
archs4_genes <- dimnames(archs4_plier2$Z)[[1L]]
head(archs4_genes, n = 6L)

## Load GTEx

In [24]:
# Download the GTEx expression archive once; later runs reuse the local copy.
url <- config$GTEx$URL
dest_dir <-  config$GTEx$DATASET_FOLDER
dest_gz  <- file.path(dest_dir, basename(url))

if (!file.exists(dest_gz)) {
  dir.create(dest_dir, recursive = TRUE, showWarnings = FALSE)
  # R's default download timeout (60 s) can be too short for large files, so
  # raise it for this transfer only and restore the previous value afterwards.
  old_opts <- options(timeout = max(3600, getOption("timeout")))
  status <- tryCatch(
    download.file(url, dest_gz, mode = "wb"),
    error = function(e) {
      # Remove the partial file so the next run does not mistake it for a
      # completed download.
      unlink(dest_gz)
      stop(e)
    },
    finally = options(old_opts)
  )
  # download.file() reports success as 0; some methods only warn on failure.
  if (!identical(as.integer(status), 0L)) {
    unlink(dest_gz)
    stop("Download failed with status ", status, ": ", url, call. = FALSE)
  }
  message("Downloaded to: ", dest_gz)
} else {
  message("File already exists, skipping download.")
}

# Parse the GTEx GCT expression table once and cache it as an RDS file.
exprs_path  <- file.path(config$GTEx$DATASET_FOLDER, 'GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_tpm.gct.gz')
# Resolve the cache path with here() because that is how the file is read
# back afterwards; checking and writing a differently resolved path could
# leave the reader without a file.
output_file <- here(config$GTEx$DATASET_FILE)

if (!file.exists(output_file)) {
  if (!file.exists(exprs_path)) {
    stop("GTEx expression file not found: ", exprs_path, call. = FALSE)
  }
  dir.create(dirname(output_file), recursive = TRUE, showWarnings = FALSE)
  # skip = 2 jumps over the first two lines of the .gct file;
  # check.names = FALSE keeps the hyphenated sample IDs intact as column names.
  exprs_data <- read.table(exprs_path, header = TRUE, sep = "\t", skip = 2, check.names = FALSE)
  saveRDS(exprs_data, output_file)
  message("File successfully written to: ", output_file)
} else {
  message("Output file already exists. Skipping.")
}

# Collapse rows that share a `Description` value by summing every numeric
# column within each group (this builds a new table; nothing is modified
# in place), then split the result into row labels and a numeric matrix.
gtex <- as.data.table(readRDS(here(config$GTEx$DATASET_FILE)))
aggregated_gtex <- gtex[, lapply(.SD, sum), by = Description, .SDcols = is.numeric]

# Column 1 is the grouping key; every remaining column is one sample.
gtex_data_mat <- as.matrix(aggregated_gtex[, -1])
samples <- colnames(gtex_data_mat)
genes <- aggregated_gtex$Description

File already exists, skipping download.

Output file already exists. Skipping.



In [25]:
# Attach the aggregated `Description` labels as the matrix row names.
row.names(gtex_data_mat) <- genes

## Filter genes

In [None]:
# Genes present in both the ARCHS4 model and GTEx, in ARCHS4 order; the GTEx
# matrix is then restricted to those rows in that same order.
common_genes <- intersect(archs4_genes, rownames(gtex_data_mat))
gtex_data_mat_filtered <- gtex_data_mat[
  match(common_genes, rownames(gtex_data_mat)), ,
  drop = FALSE
]

In [None]:
# This cell used to assign archs4_plier2$Z[common_genes, ] to
# `gtex_data_mat_filtered`, replacing the GTEx expression matrix with the
# ARCHS4 model's Z matrix, so a top-to-bottom run projected Z instead of GTEx.
# The ARCHS4 side is filtered into its own object elsewhere; here we only
# verify that the GTEx matrix is still the gene-filtered GTEx data.
stopifnot(
  length(common_genes) > 0,
  identical(rownames(gtex_data_mat_filtered), common_genes),
  identical(colnames(gtex_data_mat_filtered), colnames(gtex_data_mat))
)

In [34]:
# Copy the model and restrict the copy's Z matrix to the shared genes; the
# original `archs4_plier2` object is left untouched.
archs4_plier2_filtered <- archs4_plier2
archs4_plier2_filtered$Z <- archs4_plier2$Z[common_genes, , drop = FALSE]
archs4_Z_filtered <- archs4_plier2_filtered$Z

## Project GTEx into ARCHS4 model

In [10]:
library(PLIER2)

In [30]:
# Number of genes (rows) in the filtered GTEx matrix.
dim(gtex_data_mat_filtered)[1L]

In [35]:
# Number of genes (rows) in the filtered model's Z matrix.
dim(archs4_plier2_filtered$Z)[1L]

In [70]:
# Project the gene-filtered GTEx matrix onto the gene-filtered ARCHS4 model.
# NOTE(review): presumably projectPLIER expects both inputs to carry the same
# genes in the same row order (both were indexed by common_genes) — confirm
# against the PLIER2 documentation.
gtex_archs4_projection <- projectPLIER(archs4_plier2_filtered, gtex_data_mat_filtered)

In [71]:
# Show the first six rows of the projection.
head(gtex_archs4_projection, n = 6L)

Unnamed: 0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2426-SM-5EGGH,GTEX-1117F-2526-SM-5GZY6,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,⋯,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
LV 1,-10.589027,-14.64281971,-9.064855,-3.801207,-3.3356457,0.2029424,-7.247347,-4.876037,-2.0989028,-7.5754996,⋯,-10.2324669,-6.8742954,-7.588997,-7.1581058,-6.76621145,-9.0293,-4.476164,-10.187978,-11.564694,-6.427916
LV 2,3.459603,8.06594762,7.21145,3.122184,4.3657242,5.1479,4.830309,-1.273722,8.9023561,-1.3133951,⋯,0.9414166,5.4092998,5.420099,4.9922998,0.03035671,8.004829,-0.9540288,5.70842,9.940002,6.350229
LV 3,-14.334404,-0.04179137,-13.093453,-10.863356,0.1565231,-3.1737093,-12.930763,-1.861638,4.5570217,-1.0657303,⋯,-4.5867668,-1.4738001,-3.35543,-0.9476843,-5.51947902,-5.015832,-1.3551791,-16.900343,-0.37875,-2.109437
LV 4,5.664438,-3.71890002,7.659477,7.024201,5.1392727,-7.9123321,8.353613,-10.529646,-0.8117049,-1.6262067,⋯,7.7050968,1.7348322,3.140392,-1.2318803,4.10270176,8.842482,-17.1392495,9.392775,-10.255381,1.045551
LV 5,-2.526532,2.99495871,-4.124941,-3.495563,2.2371354,-2.1989246,1.256028,5.268567,-5.5482976,0.7153978,⋯,3.0901286,0.9475235,2.59256,0.590505,-0.26678222,1.393789,1.2546738,1.520363,4.730265,-1.910161
LV 6,57.109306,-2.5615532,36.543495,45.959927,10.2836796,26.5271033,60.323767,18.990593,15.0875426,29.5871176,⋯,9.4588626,5.7282534,6.560442,7.9495432,24.27239045,58.883947,2.7132251,44.400482,-4.458813,18.413333


## Clustering projection

In [72]:
# Work with the projection as a data frame from here on.
gtex_archs4_projection <- gtex_archs4_projection %>%
  as.data.frame()

In [73]:
# Read the GTEx sample-attribute table (tab-separated, header row present).
# Quote and comment handling are disabled so those characters stay part of
# the field values; fill = TRUE pads rows that have fewer fields.
gtex_meta <- read.table(
  file = here("data", "gtex", "GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt"),
  header = TRUE,
  sep = "\t",
  quote = "",
  comment.char = "",
  fill = TRUE
)

In [74]:
# Count how many projected samples have a matching SAMPID in the metadata.
table(is.element(names(gtex_archs4_projection), gtex_meta[["SAMPID"]]))


 TRUE 
17382 

In [89]:
# Show the first six metadata rows.
head(gtex_meta, n = 6L)

Unnamed: 0_level_0,SAMPID,SMATSSCR,SMCENTER,SMPTHNTS,SMRIN,SMTS,SMTSD,SMUBRID,SMTSISCH,SMTSPAX,⋯,SME1ANTI,SMSPLTRD,SMBSMMRT,SME1SNSE,SME1PCTS,SMRRNART,SME1MPRT,SMNUM5CD,SMDPMPRT,SME2PCTS
Unnamed: 0_level_1,<chr>,<int>,<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>,<int>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<int>,<dbl>
1,GTEX-1117F-0003-SM-58Q7G,,B1,,,Blood,Whole Blood,13756,1188,,⋯,,,,,,,,,,
2,GTEX-1117F-0003-SM-5DWSB,,B1,,,Blood,Whole Blood,13756,1188,,⋯,,,,,,,,,,
3,GTEX-1117F-0003-SM-6WBT7,,B1,,,Blood,Whole Blood,13756,1188,,⋯,,,,,,,,,,
4,GTEX-1117F-0011-R10a-SM-AHZ7F,,"B1, A1",,,Brain,Brain - Frontal Cortex (BA9),9834,1193,,⋯,,,,,,,,,,
5,GTEX-1117F-0011-R10b-SM-CYKQ8,,"B1, A1",,7.2,Brain,Brain - Frontal Cortex (BA9),9834,1193,,⋯,,,,,,,,,,
6,GTEX-1117F-0226-SM-5GZZ7,0.0,B1,"2 pieces, ~15% vessel stroma, rep delineated",6.8,Adipose Tissue,Adipose - Subcutaneous,2190,1214,1125.0,⋯,14648800.0,11999300.0,0.00315785,14669500.0,50.0354,0.00310538,0.99474,,0.0,50.1944


In [75]:
# Keep only tissues (SMTS) that have at least 200 rows in the metadata table.
tissues_to_keep <- names(which(table(gtex_meta$SMTS) >= 200))

gtex_meta_filtered <- dplyr::filter(gtex_meta, SMTS %in% tissues_to_keep)

# Per-tissue sample counts after filtering.
table(gtex_meta_filtered[["SMTS"]])


Adipose Tissue  Adrenal Gland          Blood   Blood Vessel    Bone Marrow 
          1327            275           3480           1473            217 
         Brain         Breast          Colon      Esophagus          Heart 
          3326            480            821           1582           1141 
         Liver           Lung         Muscle          Nerve       Pancreas 
           251            867           1132            722            360 
     Pituitary       Prostate           Skin         Spleen        Stomach 
           301            262           2014            260            381 
        Testis        Thyroid 
           406            812 

In [76]:
# Restrict the projection to samples from the retained tissues; any_of()
# skips metadata IDs that have no column in the projection.
gtex_archs4_projection <- dplyr::select(
  gtex_archs4_projection,
  any_of(gtex_meta_filtered$SAMPID)
)

## Filter LVs

In [77]:
# Keep the pathway/LV rows of the model summary with FDR < 0.05 and AUC > 0.7,
# then build the names ("LV<index>") of the LVs those rows refer to.
archs4_summary_sig <- archs4_plier2$summary %>%
  dplyr::filter(FDR < 0.05) %>%
  dplyr::filter(AUC > 0.7)

head(archs4_summary_sig)
dim(archs4_summary_sig)

# The column is literally named "LV index". `$LV` only worked through partial
# matching on a data.frame and would return NULL on a tibble (selecting no
# LVs downstream), so address the column by its exact name.
stopifnot("LV index" %in% names(archs4_summary_sig))
archs4_sig_lvs <- paste0('LV', archs4_summary_sig[["LV index"]])
head(archs4_sig_lvs)

Unnamed: 0_level_0,pathway,LV index,AUC,p-value,FDR
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<dbl>
1,Fibroblast Skin Mouse,4,0.7017548,0.0002561277,0.002694104
2,DCLK1+ Progenitor Cell Large Intestine Human,6,0.9450704,1.587106e-11,8.505215e-10
3,Fibroblast Skin Mouse,6,0.9121861,1.152203e-09,4.292596e-08
4,Fibrocartilage Chondrocyte Articular Cartilage Human,6,0.8203644,6.392203e-08,1.681793e-06
5,Leydig Precursor Cell Fetal Gonad Human,6,0.9048556,1.5249129999999998e-20,2.186854e-18
6,Medullary Cell Kidney Mouse,6,0.8333685,1.713542e-11,9.134705e-10


In [None]:
# Strip the spaces from the row names ("LV 1" becomes "LV1").
rownames(gtex_archs4_projection) <- gsub(
  " ", "", rownames(gtex_archs4_projection),
  fixed = TRUE
)
head(gtex_archs4_projection[, seq_len(5)])

Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,GTEX-1117F-3226-SM-5N9CT,GTEX-111CU-0126-SM-5GZWZ,⋯,GTEX-ZZPU-0926-SM-5GZYT,GTEX-ZZPU-1026-SM-5E457,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
LV1,-10.589027,-14.64281971,-9.064855,-3.801207,-3.3356457,0.2029424,-2.0989028,-7.5754996,-5.6685199,-6.3119247,⋯,-12.001647,-6.0974733,-10.2324669,-6.8742954,-7.588997,-7.1581058,-6.76621145,-10.187978,-11.564694,-6.427916
LV2,3.459603,8.06594762,7.21145,3.122184,4.3657242,5.1479,8.9023561,-1.3133951,10.8377067,7.754371,⋯,5.276786,5.3923146,0.9414166,5.4092998,5.420099,4.9922998,0.03035671,5.70842,9.940002,6.350229
LV3,-14.334404,-0.04179137,-13.093453,-10.863356,0.1565231,-3.1737093,4.5570217,-1.0657303,1.441761,-0.5003182,⋯,-1.952572,-5.7389424,-4.5867668,-1.4738001,-3.35543,-0.9476843,-5.51947902,-16.900343,-0.37875,-2.109437
LV4,5.664438,-3.71890002,7.659477,7.024201,5.1392727,-7.9123321,-0.8117049,-1.6262067,-0.1055743,0.6813731,⋯,3.803885,0.5608233,7.7050968,1.7348322,3.140392,-1.2318803,4.10270176,9.392775,-10.255381,1.045551
LV5,-2.526532,2.99495871,-4.124941,-3.495563,2.2371354,-2.1989246,-5.5482976,0.7153978,-11.6247845,-0.1748736,⋯,7.432149,0.5656654,3.0901286,0.9475235,2.59256,0.590505,-0.26678222,1.520363,4.730265,-1.910161
LV6,57.109306,-2.5615532,36.543495,45.959927,10.2836796,26.5271033,15.0875426,29.5871176,-4.8994037,0.2797221,⋯,-1.798534,12.2832666,9.4588626,5.7282534,6.560442,7.9495432,24.27239045,44.400482,-4.458813,18.413333


In [83]:
# Keep only the rows (LVs) whose name is in the significant-LV list; row
# order and row names are preserved.
gtex_archs4_projection_filtered <- gtex_archs4_projection[
  rownames(gtex_archs4_projection) %in% archs4_sig_lvs, ,
  drop = FALSE
]

In [84]:
# First six retained LVs, followed by the table's (rows, columns) size.
head(gtex_archs4_projection_filtered, n = 6L)
c(nrow(gtex_archs4_projection_filtered), ncol(gtex_archs4_projection_filtered))

Unnamed: 0_level_0,GTEX-1117F-0226-SM-5GZZ7,GTEX-1117F-0426-SM-5EGHI,GTEX-1117F-0526-SM-5EGHJ,GTEX-1117F-0626-SM-5N9CS,GTEX-1117F-0726-SM-5GIEN,GTEX-1117F-1326-SM-5EGHH,GTEX-1117F-2826-SM-5GZXL,GTEX-1117F-2926-SM-5GZYI,GTEX-1117F-3226-SM-5N9CT,GTEX-111CU-0126-SM-5GZWZ,⋯,GTEX-ZZPU-0926-SM-5GZYT,GTEX-ZZPU-1026-SM-5E457,GTEX-ZZPU-1126-SM-5N9CW,GTEX-ZZPU-1226-SM-5N9CK,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
LV4,5.664438,-3.7189,7.659477,7.024201,5.1392727,-7.9123321,-0.8117049,-1.6262067,-0.1055743,0.6813731,⋯,3.803885,0.5608233,7.705097,1.7348322,3.140392,-1.23188,4.102702,9.392775,-10.255381,1.045551
LV6,57.109306,-2.5615532,36.543495,45.959927,10.2836796,26.5271033,15.0875426,29.5871176,-4.8994037,0.2797221,⋯,-1.798534,12.2832666,9.458863,5.7282534,6.5604422,7.949543,24.27239,44.400482,-4.458813,18.413333
LV10,-12.357593,6.0309447,-6.331369,-13.107697,6.6758267,-0.8704793,-1.5362578,-2.0871119,9.0422606,6.0102113,⋯,9.566702,-2.8308538,3.976866,-0.1183651,3.403225,3.3905,-1.322968,-7.883405,9.423927,-9.42697
LV11,1.913863,18.8529745,-1.458616,-3.888647,24.5626785,-1.4206931,4.0589378,3.3194414,22.3400172,19.1905583,⋯,18.519646,10.8834511,17.94384,14.1542859,12.7934349,14.33388,6.436627,2.565905,17.576864,8.625206
LV13,12.663101,-0.6241795,11.823197,7.839718,4.1458186,7.2825359,3.2281302,-5.0525464,5.4352274,16.8851218,⋯,-2.402965,11.2052561,1.422812,21.5521889,0.5379011,3.727253,6.410183,11.610667,1.020207,7.14331
LV14,6.290361,-7.1444886,2.867832,13.755431,0.7846897,8.4420012,5.1620606,-0.6550316,2.3879174,3.4650653,⋯,5.917638,3.1768501,7.234432,4.1104308,-1.320076,3.109167,23.076713,11.831553,-6.562454,4.590573


In [91]:
# Write the filtered projection to CSV with the LV names as row names.
projection_csv <- here('output/archs4/gtex_archs4_projection_filtered.csv')
# Create the output folder first, as the other write steps in this notebook
# do; write.csv() fails when the directory does not exist.
dir.create(dirname(projection_csv), recursive = TRUE, showWarnings = FALSE)
write.csv(gtex_archs4_projection_filtered, projection_csv, row.names = TRUE, quote = FALSE)