- ACC (adrenocortical carcinoma)
- UVM (uveal melanoma)
- SKCM (skin cutaneous melanoma)
- LGG (brain lower grade glioma)
- GBM (glioblastoma)


# Setup

In [55]:
# Data manipulation and analysis
suppressPackageStartupMessages({
  library(dplyr)        # Data manipulation and transformation
  library(tidyverse)    # Collection of data science packages
  library(data.table)   # Fast data manipulation

  # Survival analysis
  library(survival)     # Core survival analysis functions
  library(survminer)    # Survival analysis visualization

  # Genomic data handling
  library(recount3)     # Access to RNA-seq data
  library(biomaRt)      # Access to genomic annotations
  library(TCGAbiolinks) # TCGA data access
  library(SummarizedExperiment) # Container for genomic data
  library(DESeq2)      # RNA-seq analysis
  library(GenomicFeatures) # Genomic feature handling
  library(rtracklayer)  # Import/export genomic tracks

  # Matrix operations
  library(matrixStats)  # Matrix calculations
  library(sparseMatrixStats) # Sparse matrix operations

  # Parallel processing
  library(parallel)     # Base R parallel processing
  library(BiocParallel) # Bioconductor parallel processing

  # Utilities
  library(httr)         # HTTP requests
  library(retry)        # Retry failed operations
  library(futile.logger) # Logging functionality
  library(viridis)      # Color palettes
})

In [56]:
# Set working directory
# Relative paths built later in the notebook (e.g. "../TCGA_clinical") are
# resolved against this folder, so it must exist on the machine running the
# notebook.
setwd("/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_SRRM3/TCGA/Data_exploration")


In [57]:
# Performance-related R settings, exported in a single call.
# NOTE(review): R documents these three variables as startup-time settings, so
# exporting them from an already running session may not change its
# behaviour -- confirm, or move them to .Renviron / the job script.
Sys.setenv(
  R_MAX_NUM_DLLS = 150,
  R_GC_MEM_GROW = 3,
  R_ENABLE_JIT = 3
)

# Number of workers/threads used throughout the notebook; the same value is
# exported to the OMP, OpenBLAS and MKL thread-count variables.
# Adjust it to the number of cores actually available to the session.
num_cores <- 4
Sys.setenv(
  OMP_NUM_THREADS = num_cores,
  OPENBLAS_NUM_THREADS = num_cores,
  MKL_NUM_THREADS = num_cores
)

# Echo the six variables back to confirm they were exported
Sys.getenv(c("R_MAX_NUM_DLLS", "R_GC_MEM_GROW", "R_ENABLE_JIT",
             "OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS", "MKL_NUM_THREADS"))

# Register a MulticoreParam backend with `num_cores` workers with BiocParallel
BiocParallel::register(
  BiocParallel::MulticoreParam(workers = num_cores)
)

# Analysis

In [58]:
# Analysis parameters ----
# TCGA project codes analysed in this notebook
cancer_types <- c("ACC", "UVM", "SKCM", "LGG", "GBM")
# NOTE(review): `expression_threshold` and `threshold` hold the same value;
# confirm whether both are needed by the cells that consume them.
expression_threshold <- 0.75
threshold <- 0.75
# Name of the sample-grouping scheme and the gene under study
grouping_method <- "quartile"
gene_name <- "SRRM3"

In [59]:
# Retrieve the table of projects known to recount3
projects <- recount3::available_projects()

2025-02-17 06:38:00.456419 caching file sra.recount_project.MD.gz.

2025-02-17 06:38:01.006828 caching file gtex.recount_project.MD.gz.

2025-02-17 06:38:01.569484 caching file tcga.recount_project.MD.gz.



In [60]:
# Print the distinct values of the two classification columns, in order
invisible(lapply(
  c("file_source", "project_type"),
  function(field) print(unique(projects[[field]]))
))

[1] "sra"  "gtex" "tcga"
[1] "data_sources"


In [61]:
# List the distinct project codes among rows whose file_source is "tcga"
print(unique(projects$project[projects$file_source == "tcga"]))

 [1] "ACC"  "BLCA" "BRCA" "CESC" "CHOL" "COAD" "DLBC" "ESCA" "GBM"  "HNSC"
[11] "KICH" "KIRC" "KIRP" "LAML" "LGG"  "LIHC" "LUAD" "LUSC" "MESO" "OV"  
[21] "PAAD" "PCPG" "PRAD" "READ" "SARC" "SKCM" "STAD" "TGCT" "THCA" "THYM"
[31] "UCEC" "UCS"  "UVM" 


In [62]:
# Keep the rows of `projects` that belong to the selected cancer types, come
# from the "tcga" source and are of type "data_sources". `which()` drops rows
# where the condition evaluates to NA, as `subset()` does.
project_info <- local({
  wanted <- projects$project %in% cancer_types &
    projects$file_source == "tcga" &
    projects$project_type == "data_sources"
  projects[which(wanted), , drop = FALSE]
})

In [63]:
# Display the selected project rows
project_info

Unnamed: 0_level_0,project,organism,file_source,project_home,project_type,n_samples
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<table[1d]>
8710,ACC,human,tcga,data_sources/tcga,data_sources,79
8718,GBM,human,tcga,data_sources/tcga,data_sources,175
8724,LGG,human,tcga,data_sources/tcga,data_sources,532
8735,SKCM,human,tcga,data_sources/tcga,data_sources,473
8742,UVM,human,tcga,data_sources/tcga,data_sources,80


In [64]:
# Build, for every cancer type, the path of its clinical download folder
# ("clinical.project-tcga-<lower-case code>.2025-02-12" under
# ../TCGA_clinical) and key the resulting list by the project code.
clinical_dirs <- setNames(
  as.list(
    file.path(
      "../TCGA_clinical",
      sprintf("clinical.project-tcga-%s.2025-02-12", tolower(cancer_types))
    )
  ),
  cancer_types
)
print(clinical_dirs)

$ACC
[1] "../TCGA_clinical/clinical.project-tcga-acc.2025-02-12"

$UVM
[1] "../TCGA_clinical/clinical.project-tcga-uvm.2025-02-12"

$SKCM
[1] "../TCGA_clinical/clinical.project-tcga-skcm.2025-02-12"

$LGG
[1] "../TCGA_clinical/clinical.project-tcga-lgg.2025-02-12"

$GBM
[1] "../TCGA_clinical/clinical.project-tcga-gbm.2025-02-12"



In [65]:
# Load the clinical tables for every cancer type.
#
# Five tab-separated files are read from each folder in `clinical_dirs`:
# clinical.tsv, follow_up.tsv, exposure.tsv, family_history.tsv and
# pathology_detail.tsv. Each result is a list of data frames carrying the
# names of `clinical_dirs`.

# Read `file_name` from every folder in `dirs`.
#
# file_name: name of the TSV file expected inside each folder.
# dirs:      named list/vector of folder paths (defaults to `clinical_dirs`).
# Returns a list of data frames named like `dirs`; the marker '-- is read as
# NA. Stops with the list of missing paths if any expected file is absent, so
# a missing download is reported before any table is parsed.
read_clinical_tables <- function(file_name, dirs = clinical_dirs) {
  paths <- vapply(dirs, function(d) file.path(d, file_name), character(1))
  missing_paths <- paths[!file.exists(paths)]
  if (length(missing_paths) > 0) {
    stop(
      "Missing clinical file(s): ", paste(missing_paths, collapse = ", "),
      call. = FALSE
    )
  }
  lapply(paths, function(path) {
    read.delim(path, sep = "\t", stringsAsFactors = FALSE, na.strings = "'--")
  })
}

clinical_data <- read_clinical_tables("clinical.tsv")
follow_up_data <- read_clinical_tables("follow_up.tsv")
exposure_data <- read_clinical_tables("exposure.tsv")
family_history_data <- read_clinical_tables("family_history.tsv")
pathology_details_data <- read_clinical_tables("pathology_detail.tsv")


In [66]:
# Summarise the list of clinical tables (one element per cancer type)
summary(clinical_data)


     Length Class      Mode
ACC  197    data.frame list
UVM  197    data.frame list
SKCM 197    data.frame list
LGG  197    data.frame list
GBM  197    data.frame list

# Clinical data

## ACC

In [67]:
# Names of the ACC clinical columns that hold at least one non-missing value
non_na_cols <- Filter(
  function(nm) !all(is.na(clinical_data[["ACC"]][[nm]])),
  names(clinical_data[["ACC"]])
)

# One output line per retained column: "<column> : <distinct values>"
for (col in non_na_cols) {
  distinct_values <- unique(clinical_data[["ACC"]][[col]])
  cat(col, ":", distinct_values, "\n")
}

case_id : 0304b12d-7640-4150-a581-2eea2b1f2ad5 075dbfd0-9cf4-4877-884f-ae858902c79e 0824b246-9fa2-4a8b-ad4c-1ffc7731bf7d 08e0d412-d4d8-4d13-b792-a4dd0bd9ec2b 09454ed6-64bc-4a35-af44-7c4344623d45 0acc28bc-d348-41df-8f49-bc0332eb9be4 0b73cbba-5520-4610-b649-912d76114033 11d8a9c2-14d8-4fc2-8388-386051e408b1 15fb07ae-101c-4553-891d-539cee89a5e9 199560cb-15e5-4276-865a-7d44d47c856c 1ae9cd13-73ef-4b1a-a734-4c465c73bc8e 1d5602a7-72e7-4dc6-b2db-2f778fcbf475 202f7a77-9e5a-4f12-8ffe-78ae5527a746 240dca72-76e0-4d57-978d-1242e6e7ed68 28153e3e-8515-4852-aaa0-247f86399f27 2f04e760-8a64-49f2-a1d6-3d390604745c 33c02ece-911b-4f45-b410-9df07e9b189e 344f7ea4-2bd4-4f2d-89f2-9fc7f572a3d6 388b8788-a635-4bc9-9b73-d204b2256747 3ed33021-8efd-420f-9a0b-e7420b586bcd 424a497a-48b9-4507-b234-c4fd08c8acad 42ef34cc-ffb4-432c-82bb-7aa56639ff51 42f69b2b-f3aa-4f4e-ac3a-149e14789927 48402d61-3a6b-4603-bfa0-b1a38a47d34d 4f845fea-e823-47d6-89fb-36ee478a43b8 4fbbbe43-3c0e-4475-bcf8-3745bd7195eb 56cdbba2-327e-4acd-a1eb-0f31

## UVM

In [68]:
# Names of the UVM clinical columns that hold at least one non-missing value
non_na_cols <- Filter(
  function(nm) !all(is.na(clinical_data[["UVM"]][[nm]])),
  names(clinical_data[["UVM"]])
)

# One output line per retained column: "<column> : <distinct values>"
for (col in non_na_cols) {
  distinct_values <- unique(clinical_data[["UVM"]][[col]])
  cat(col, ":", distinct_values, "\n")
}

case_id : 014d4b17-d683-438b-9287-33fcec82bf5a 01b18f83-da8a-4d3f-b722-f00fe7559c60 029cc59c-60df-4411-a044-3a2eeee7b110 15d19ccc-52b8-41f6-b1c1-2cc55691aed5 1a2ce30c-5891-4bf6-8c97-6d1f34d4fe48 1d00217f-28df-4d91-aeb9-0d9a83a907e6 1dbd7b0e-9234-47d9-bdca-5b17f5af619b 2112a42a-9b85-475a-a31b-4ce7751cdeb9 22e7816d-4be9-4f84-b487-baa6ca35452d 2bf47a88-df0a-4102-af2e-998c3d252b9c 31771082-c094-4749-a9c7-80b67762f4d1 339490ef-6cf5-4d05-81b1-5e339e95f678 367869f9-bbe7-4e57-a16f-f2b63f2de697 39af6e0f-908c-4b2b-97b8-a4d1966e47e1 39e494bc-7710-4f9f-98b0-0f509f7aef1a 3a161c85-2544-4283-a10d-e332c9dd1692 3eb7aded-8887-4c27-a464-1b50ec090133 411ddf79-fe92-44ad-8d10-eeb703f11be7 418152bd-7d33-4f0a-b890-2356b30d6f44 4464418c-ec35-4826-b3d5-152ac2a27a04 4480d290-5e8a-4289-8e3c-de087e0de412 4498044c-1607-403e-867e-befd7fad275a 4b9fd822-17aa-4a22-8e17-175a72931d5d 4e531546-56f1-4165-94a2-ee45abd5e0f7 5019e7f1-db46-4a07-9e7b-b6e5d1481b84 5383fb57-69c2-4b34-af7f-c67208efb2b5 56ed4ea2-18ce-48d4-bf72-9ffa

## SKCM

In [69]:
# Names of the SKCM clinical columns that hold at least one non-missing value
non_na_cols <- Filter(
  function(nm) !all(is.na(clinical_data[["SKCM"]][[nm]])),
  names(clinical_data[["SKCM"]])
)

# One output line per retained column: "<column> : <distinct values>"
for (col in non_na_cols) {
  distinct_values <- unique(clinical_data[["SKCM"]][[col]])
  cat(col, ":", distinct_values, "\n")
}

case_id : 0153f141-625e-4623-9f8a-296678002c63 015ba831-106b-4b84-9e8c-243a9eeeebf6 01ad975d-c2ed-4e4d-bd3b-c9512fc9073c 01cb0004-fc1e-4da5-9d27-f458f8d711ee 01cfbfae-f344-439d-aeab-a9e15d636325 021a9431-7fdc-42b8-9ff0-e17ded79ff1d 021d32b0-a94a-4dad-95f4-8eb0abd894bf 04add7f0-d212-486e-ae70-d41d8112f523 04c3d01d-a949-4dc7-9829-a8c99180dba0 04f7ecc8-4f6b-41ac-a984-9ea604698a21 05f4058d-3066-4d16-8320-cf92f122945f 06f09d1c-ba44-4cf9-a518-762d241a30db 0797caaf-18ad-41c6-820a-e96554d14b31 080bb1f8-f20c-4daa-9de3-b5782b69583c 088c296e-8e13-4dc8-8a99-b27f4b27f95e 08d2b181-2b6e-4754-bfa9-5a94a1f18526 0a4b780e-8143-4118-ad98-fd2a2a6678c3 0b1f4f52-12ec-44ab-8283-a502bdf48c3a 0bb8c284-412b-4bbc-a508-d0a21db376cd 0caa706a-251f-43a0-897a-111aec0acc82 0d68acef-b6a0-4a8b-9088-15b9103e6aaf 0dcd7af7-ea83-4131-821d-a17943919c1a 0e2af161-5a1e-47e1-a7ef-010e47b8cb40 0e597498-44ad-40ae-bb83-f1b27388ec0a 0eb0b477-1c7a-4416-895a-ea6463dbe847 0f1fb775-c4a2-4a1e-8b09-f935233cdaee 0f579238-5a11-4adc-9c80-572c

## LGG

In [70]:
# Names of the LGG clinical columns that hold at least one non-missing value
non_na_cols <- Filter(
  function(nm) !all(is.na(clinical_data[["LGG"]][[nm]])),
  names(clinical_data[["LGG"]])
)

# One output line per retained column: "<column> : <distinct values>"
for (col in non_na_cols) {
  distinct_values <- unique(clinical_data[["LGG"]][[col]])
  cat(col, ":", distinct_values, "\n")
}

case_id : 001ad307-4ad3-4f1d-b2fc-efc032871c7e 0061e9ef-df6a-414e-af64-c88f8192e2d3 0073a136-d5f4-4fd6-88f9-711768f2abc6 00b18ad6-ff70-41fe-8f4d-0763ac8f28b9 010aac75-3bfe-4bf2-b866-af0f2d92f125 0167cf11-74be-4701-ab9a-4e057d4bb545 019c5de9-74fd-48b7-95a3-227532e16c5a 01a13aba-74a4-4895-a5ad-e5119925c202 01c4ca12-ac08-41dd-9e4f-bfb0971688a2 02194c0c-e2c1-4ea7-994a-07b61b2575e2 029b33b1-7ea9-4b77-a615-7eb0e3ebd5e7 032096a6-1923-44b9-a1cb-181bed041c8e 033c1f48-9225-4159-b23b-7042272877da 048056cd-39dd-4134-a58b-70bebb3dd2ff 04d56bed-98fb-45eb-8c90-95e69264dfe2 04e6e843-7f31-43da-b111-280ce2bd1949 0536f465-1650-4411-823c-c660bc7f3adc 0557b788-5aed-4919-85eb-a503ad893d75 0611f5bc-89b5-44bd-b301-751faaadb561 0639b753-4099-4ca4-adb9-763069a614ef 0709dbcc-0828-4066-aa01-96f059e5fa97 081b6532-0ed4-445a-becb-cb2290f4854d 083c0b78-88c2-4ec1-8a81-a0792b16e2cc 0906ddde-02fa-417d-927b-1c400b189cac 09552b53-4393-4811-a671-a9ef6eff4790 0b283f95-dbd4-4c36-9201-2198a90ee0d6 0b54cda4-7568-481e-b0c6-2c5c

## GBM

In [71]:
# Names of the GBM clinical columns that hold at least one non-missing value
non_na_cols <- Filter(
  function(nm) !all(is.na(clinical_data[["GBM"]][[nm]])),
  names(clinical_data[["GBM"]])
)

# One output line per retained column: "<column> : <distinct values>"
for (col in non_na_cols) {
  distinct_values <- unique(clinical_data[["GBM"]][[col]])
  cat(col, ":", distinct_values, "\n")
}

case_id : 0078b0c4-68a9-483b-9aab-61156d263213 0133e584-111e-450a-b451-77a2799ef529 0164acd3-34db-4d35-b96c-936daad0ff22 01a92062-967a-4900-8dc7-a5ecd3b3f8e2 01c878a3-7c3d-457f-99fa-df255960a122 025a7401-a65d-4ea0-8b4e-0ba775b0322a 035b09b7-add1-456d-ae80-20e8a03f6d7b 0383769b-f54d-4293-a9dc-88b830d775dc 0389b35b-651b-4776-b12a-d315a100f47c 03e9a354-7c95-4484-9ead-ed04acce7cb6 0456a66b-c9b3-4eb1-9872-424a7ea712a2 04b4ebb4-6780-42d8-a980-711cd10cfe14 054f472f-98cb-4559-b2e2-b5f800fc8eef 0553e60e-3510-417d-af8a-75947ebe8ab6 05875fe7-ad0e-45e1-a880-9aece4fea3b9 05944575-8473-4ed0-9de8-8091018001c2 05d5da50-52fe-4342-9c15-9ee4c1d6662e 0628cb4a-c480-4b2f-bd2e-bb33e6994302 0645038d-8abe-4a79-8695-bf8824d33f67 06c427a1-8c2d-4003-b744-139114a04440 06e32525-6043-45a1-9fef-fa24ab219a1f 074fa012-a61e-4646-bb3c-34d16782a2ea 0798dbe2-1914-427c-a2fe-2a865d0d6eda 07a2c1da-040e-44ea-9a56-751a06761894 08505b91-58b8-4121-a8ea-b9832331c24a 0987f48e-9d58-47b5-a1a6-de704caf4ed5 09ddf661-25fd-4baa-a1e8-bf90