- ACC (adrenocortical carcinoma)
- UVM (uveal melanoma)
- SKCM (skin cutaneous melanoma)
- LGG (brain lower grade glioma)
- GBM (glioblastoma)


# Setup

In [2]:
# Data manipulation and analysis
suppressPackageStartupMessages({
  library(dplyr)        # Data manipulation and transformation
  library(tidyverse)    # Collection of data science packages
  library(data.table)   # Fast data manipulation

  # Survival analysis
  library(survival)     # Core survival analysis functions
  library(survminer)    # Survival analysis visualization

  # Genomic data handling
  library(recount3)     # Access to RNA-seq data
  library(biomaRt)      # Access to genomic annotations
  library(TCGAbiolinks) # TCGA data access
  library(SummarizedExperiment) # Container for genomic data
  library(DESeq2)      # RNA-seq analysis
  library(GenomicFeatures) # Genomic feature handling
  library(rtracklayer)  # Import/export genomic tracks

  # Matrix operations
  library(matrixStats)  # Matrix calculations
  library(sparseMatrixStats) # Sparse matrix operations

  # Parallel processing
  library(parallel)     # Base R parallel processing
  library(BiocParallel) # Bioconductor parallel processing

  # Utilities
  library(httr)         # HTTP requests
  library(retry)        # Retry failed operations
  library(futile.logger) # Logging functionality
  library(viridis)      # Color palettes
})

In [3]:
# Set working directory. Relative paths used elsewhere in this notebook
# (e.g. "../TCGA_clinical") are resolved against this location.
setwd("/beegfs/scratch/ric.broccoli/kubacki.michal/SRF_SRRM3/TCGA/Data_exploration")


In [4]:
# Performance-related environment variables.
# NOTE: R reads R_MAX_NUM_DLLS, R_GC_MEM_GROW and R_ENABLE_JIT when the
# session starts, so setting them here does not reconfigure the session that
# is already running; the values are only inherited by processes launched
# from it. To apply them to this session, export them before R starts
# (e.g. in .Renviron or in the job's shell script).
Sys.setenv(R_MAX_NUM_DLLS = 150)
Sys.setenv(R_GC_MEM_GROW = 3)
Sys.setenv(R_ENABLE_JIT = 3)

# Request the same JIT level for the current session explicitly, because the
# environment variable above is only honoured at startup. invisible() keeps
# the previous level returned by enableJIT() out of the cell output.
invisible(compiler::enableJIT(3))

# Number of cores used for the threading variables below and for the
# BiocParallel workers. Raise it (e.g. to 32, the figure mentioned for the
# shell script) when more cores are available.
num_cores <- 4
Sys.setenv(OMP_NUM_THREADS = num_cores)
Sys.setenv(OPENBLAS_NUM_THREADS = num_cores)
Sys.setenv(MKL_NUM_THREADS = num_cores)

# Verify the environment variables were set
Sys.getenv(c("R_MAX_NUM_DLLS", "R_GC_MEM_GROW", "R_ENABLE_JIT",
             "OMP_NUM_THREADS", "OPENBLAS_NUM_THREADS", "MKL_NUM_THREADS"))

# Register a multicore backend as the default for BiocParallel-aware functions
BiocParallel::register(BiocParallel::MulticoreParam(workers = num_cores))

# Analysis

In [5]:
# Analysis parameters.
# TCGA project codes to analyse; used to select recount3 projects and to
# build the clinical data directory names.
cancer_types <- c("ACC", "UVM", "SKCM", "LGG", "GBM")
# NOTE(review): both thresholds hold the same value; their use is not visible
# in this part of the notebook - confirm whether both are needed.
expression_threshold <- 0.75
threshold <- 0.75
grouping_method <- "quartile"
gene_name <- "SRRM3"

In [6]:
# Fetch the table of projects that recount3 can provide
projects <- recount3::available_projects()

2025-02-17 06:43:27.390837 caching file sra.recount_project.MD.gz.

2025-02-17 06:43:28.007464 caching file gtex.recount_project.MD.gz.

2025-02-17 06:43:28.617104 caching file tcga.recount_project.MD.gz.



In [7]:
# Distinct data sources present in the project table
print(unique(projects[["file_source"]]))
# Distinct project types present in the project table
print(unique(projects[["project_type"]]))

[1] "sra"  "gtex" "tcga"
[1] "data_sources"


In [8]:
# Project codes whose file source is TCGA
print(unique(projects$project[projects$file_source == "tcga"]))

 [1] "ACC"  "BLCA" "BRCA" "CESC" "CHOL" "COAD" "DLBC" "ESCA" "GBM"  "HNSC"
[11] "KICH" "KIRC" "KIRP" "LAML" "LGG"  "LIHC" "LUAD" "LUSC" "MESO" "OV"  
[21] "PAAD" "PCPG" "PRAD" "READ" "SARC" "SKCM" "STAD" "TGCT" "THCA" "THYM"
[31] "UCEC" "UCS"  "UVM" 


In [9]:
# Keep the TCGA "data_sources" entries for the cancer types of interest.
# which() drops rows where the condition is NA, matching subset() semantics.
project_info <- local({
  is_selected <- projects$project %in% cancer_types &
    projects$file_source == "tcga" &
    projects$project_type == "data_sources"
  projects[which(is_selected), , drop = FALSE]
})

In [10]:
# Display the selected project metadata (one row per cancer type)
project_info

Unnamed: 0_level_0,project,organism,file_source,project_home,project_type,n_samples
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<table[1d]>
8710,ACC,human,tcga,data_sources/tcga,data_sources,79
8718,GBM,human,tcga,data_sources/tcga,data_sources,175
8724,LGG,human,tcga,data_sources/tcga,data_sources,532
8735,SKCM,human,tcga,data_sources/tcga,data_sources,473
8742,UVM,human,tcga,data_sources/tcga,data_sources,80


In [11]:
# Build the clinical data directory for every cancer type in one vectorised
# call, then key the resulting list by cancer type.
clinical_dirs <- setNames(
  as.list(
    file.path(
      "../TCGA_clinical",
      sprintf("clinical.project-tcga-%s.2025-02-12", tolower(cancer_types))
    )
  ),
  cancer_types
)
print(clinical_dirs)

$ACC
[1] "../TCGA_clinical/clinical.project-tcga-acc.2025-02-12"

$UVM
[1] "../TCGA_clinical/clinical.project-tcga-uvm.2025-02-12"

$SKCM
[1] "../TCGA_clinical/clinical.project-tcga-skcm.2025-02-12"

$LGG
[1] "../TCGA_clinical/clinical.project-tcga-lgg.2025-02-12"

$GBM
[1] "../TCGA_clinical/clinical.project-tcga-gbm.2025-02-12"



In [12]:
# Read the clinical, follow-up, exposure, family-history and pathology-detail
# tables for each cancer type.

# Read one tab-separated table from every clinical directory.
#
# file_name: name of the file to read inside each directory.
# dirs:      list of directory paths; defaults to `clinical_dirs`.
#
# Returns a list of data frames with the same names as `dirs`. The string
# "'--" is treated as a missing value.
read_clinical_tables <- function(file_name, dirs = clinical_dirs) {
  lapply(dirs, function(dir) {
    read.delim(file.path(dir, file_name),
               sep = "\t", stringsAsFactors = FALSE, na.strings = "'--")
  })
}

clinical_data <- read_clinical_tables("clinical.tsv")
follow_up_data <- read_clinical_tables("follow_up.tsv")
exposure_data <- read_clinical_tables("exposure.tsv")
family_history_data <- read_clinical_tables("family_history.tsv")
pathology_details_data <- read_clinical_tables("pathology_detail.tsv")


In [13]:
# Overview of the loaded clinical tables: one row per cancer type, where
# Length is the number of columns in that data frame.
summary(clinical_data)


     Length Class      Mode
ACC  197    data.frame list
UVM  197    data.frame list
SKCM 197    data.frame list
LGG  197    data.frame list
GBM  197    data.frame list

# Clinical data

## ACC

In [14]:
# Names of the ACC columns holding at least one non-missing value
non_na_cols <- names(Filter(function(v) any(!is.na(v)), clinical_data$ACC))

# Show up to 10 distinct values per column; a trailing "..." marks columns
# that have more than 10 distinct values.
for (col in non_na_cols) {
  distinct_vals <- unique(clinical_data$ACC[[col]])
  line_end <- if (length(distinct_vals) > 10) "...\n" else "\n"
  cat(col, ":", head(distinct_vals, 10), line_end)
}

case_id : 0304b12d-7640-4150-a581-2eea2b1f2ad5 075dbfd0-9cf4-4877-884f-ae858902c79e 0824b246-9fa2-4a8b-ad4c-1ffc7731bf7d 08e0d412-d4d8-4d13-b792-a4dd0bd9ec2b 09454ed6-64bc-4a35-af44-7c4344623d45 0acc28bc-d348-41df-8f49-bc0332eb9be4 0b73cbba-5520-4610-b649-912d76114033 11d8a9c2-14d8-4fc2-8388-386051e408b1 15fb07ae-101c-4553-891d-539cee89a5e9 199560cb-15e5-4276-865a-7d44d47c856c ...
case_submitter_id : TCGA-OR-A5LL TCGA-OR-A5J7 TCGA-P6-A5OG TCGA-OR-A5J8 TCGA-OR-A5KB TCGA-OR-A5JL TCGA-OR-A5JP TCGA-OR-A5KS TCGA-OR-A5LT TCGA-OR-A5JQ ...
project_id : TCGA-ACC 
age_at_index : 75 30 45 66 61 36 40 72 57 26 ...
age_is_obfuscated : false 
country_of_residence_at_enrollment : France Brazil United States Australia Germany Canada 
days_to_birth : -27607 -11279 -16659 -24266 -22550 -13155 -14667 -26404 -21032 -9667 ...
days_to_death : 1613 490 383 579 NA 2405 552 922 423 391 ...
ethnicity : not reported hispanic or latino not hispanic or latino Unknown 
gender : female male 
race : not reported whit

## UVM

In [16]:
# Names of the UVM columns holding at least one non-missing value
non_na_cols <- names(Filter(function(v) any(!is.na(v)), clinical_data$UVM))

# Show up to 10 distinct values per column; a trailing "..." marks columns
# that have more than 10 distinct values.
for (col in non_na_cols) {
  distinct_vals <- unique(clinical_data$UVM[[col]])
  line_end <- if (length(distinct_vals) > 10) "...\n" else "\n"
  cat(col, ":", head(distinct_vals, 10), line_end)
}

case_id : 014d4b17-d683-438b-9287-33fcec82bf5a 01b18f83-da8a-4d3f-b722-f00fe7559c60 029cc59c-60df-4411-a044-3a2eeee7b110 15d19ccc-52b8-41f6-b1c1-2cc55691aed5 1a2ce30c-5891-4bf6-8c97-6d1f34d4fe48 1d00217f-28df-4d91-aeb9-0d9a83a907e6 1dbd7b0e-9234-47d9-bdca-5b17f5af619b 2112a42a-9b85-475a-a31b-4ce7751cdeb9 22e7816d-4be9-4f84-b487-baa6ca35452d 2bf47a88-df0a-4102-af2e-998c3d252b9c ...
case_submitter_id : TCGA-VD-A8KO TCGA-WC-AA9A TCGA-V4-A9E5 TCGA-V4-A9EY TCGA-VD-A8KM TCGA-VD-AA8Q TCGA-VD-AA8N TCGA-V3-A9ZY TCGA-VD-A8KI TCGA-RZ-AB0B ...
project_id : TCGA-UVM 
age_at_index : 51 70 66 65 71 86 54 68 47 76 ...
age_is_obfuscated : false 
cause_of_death : NA Cancer Related Unknown Not Cancer Related Not Reported 
country_of_residence_at_enrollment : United Kingdom United States France 
days_to_birth : -18844 -25641 -18948 -24187 -24047 -26285 -31475 -19894 -24966 -17514 ...
days_to_death : NA 453 636 1113 149 575 2360 1314 1581 82 ...
ethnicity : not reported not hispanic or latino Unknown hispa

## SKCM

In [15]:
# Names of the SKCM columns holding at least one non-missing value
non_na_cols <- names(Filter(function(v) any(!is.na(v)), clinical_data$SKCM))

# Show up to 10 distinct values per column; a trailing "..." marks columns
# that have more than 10 distinct values.
for (col in non_na_cols) {
  distinct_vals <- unique(clinical_data$SKCM[[col]])
  line_end <- if (length(distinct_vals) > 10) "...\n" else "\n"
  cat(col, ":", head(distinct_vals, 10), line_end)
}

case_id : 0153f141-625e-4623-9f8a-296678002c63 015ba831-106b-4b84-9e8c-243a9eeeebf6 01ad975d-c2ed-4e4d-bd3b-c9512fc9073c 01cb0004-fc1e-4da5-9d27-f458f8d711ee 01cfbfae-f344-439d-aeab-a9e15d636325 021a9431-7fdc-42b8-9ff0-e17ded79ff1d 021d32b0-a94a-4dad-95f4-8eb0abd894bf 04add7f0-d212-486e-ae70-d41d8112f523 04c3d01d-a949-4dc7-9829-a8c99180dba0 04f7ecc8-4f6b-41ac-a984-9ea604698a21 ...
case_submitter_id : TCGA-D3-A3ML TCGA-EE-A3AF TCGA-DA-A1I2 TCGA-EE-A29V TCGA-EE-A2ML TCGA-ER-A42L TCGA-HR-A2OG TCGA-XV-AB01 TCGA-FW-A3TV TCGA-D3-A2JO ...
project_id : TCGA-SKCM 
age_at_index : 70 48 45 85 35 49 59 54 57 50 ...
age_is_obfuscated : false true 
country_of_residence_at_enrollment : United States Australia Canada Russia Vietnam Ukraine Poland Germany United Kingdom Puerto Rico ...
days_to_birth : -25643 -17731 -16701 -31092 -13140 -17984 -21767 -20058 -20972 -18577 ...
days_to_death : 422 420 5370 787 6590 NA 472 423 1446 317 ...
ethnicity : not hispanic or latino not reported hispanic or latino U

## LGG

In [17]:
# Names of the LGG columns holding at least one non-missing value
non_na_cols <- names(Filter(function(v) any(!is.na(v)), clinical_data$LGG))

# Show up to 10 distinct values per column; a trailing "..." marks columns
# that have more than 10 distinct values.
for (col in non_na_cols) {
  distinct_vals <- unique(clinical_data$LGG[[col]])
  line_end <- if (length(distinct_vals) > 10) "...\n" else "\n"
  cat(col, ":", head(distinct_vals, 10), line_end)
}

case_id : 001ad307-4ad3-4f1d-b2fc-efc032871c7e 0061e9ef-df6a-414e-af64-c88f8192e2d3 0073a136-d5f4-4fd6-88f9-711768f2abc6 00b18ad6-ff70-41fe-8f4d-0763ac8f28b9 010aac75-3bfe-4bf2-b866-af0f2d92f125 0167cf11-74be-4701-ab9a-4e057d4bb545 019c5de9-74fd-48b7-95a3-227532e16c5a 01a13aba-74a4-4895-a5ad-e5119925c202 01c4ca12-ac08-41dd-9e4f-bfb0971688a2 02194c0c-e2c1-4ea7-994a-07b61b2575e2 ...
case_submitter_id : TCGA-HT-A614 TCGA-E1-5311 TCGA-VM-A8CF TCGA-HT-7677 TCGA-DB-5278 TCGA-DU-A5TY TCGA-HT-7604 TCGA-S9-A7IZ TCGA-DU-7009 TCGA-HT-7902 ...
project_id : TCGA-LGG 
age_at_index : 47 31 44 53 17 46 50 48 32 30 ...
age_is_obfuscated : false 
cause_of_death : NA Not Reported 
country_of_residence_at_enrollment : United States Germany Australia Russia NA Brazil Italy Vietnam 
days_to_birth : -17392 -11332 -16334 -19610 -6480 -16945 -18443 -17874 -11854 -11118 ...
days_to_death : NA 4084 1033 4695 4229 775 1183 954 3470 242 ...
ethnicity : not hispanic or latino Unknown hispanic or latino not reported

## GBM

In [19]:
# Names of the GBM columns holding at least one non-missing value
non_na_cols <- names(Filter(function(v) any(!is.na(v)), clinical_data$GBM))

# Show up to 10 distinct values per column; a trailing "..." marks columns
# that have more than 10 distinct values.
for (col in non_na_cols) {
  distinct_vals <- unique(clinical_data$GBM[[col]])
  line_end <- if (length(distinct_vals) > 10) "...\n" else "\n"
  cat(col, ":", head(distinct_vals, 10), line_end)
}

case_id : 0078b0c4-68a9-483b-9aab-61156d263213 0133e584-111e-450a-b451-77a2799ef529 0164acd3-34db-4d35-b96c-936daad0ff22 01a92062-967a-4900-8dc7-a5ecd3b3f8e2 01c878a3-7c3d-457f-99fa-df255960a122 025a7401-a65d-4ea0-8b4e-0ba775b0322a 035b09b7-add1-456d-ae80-20e8a03f6d7b 0383769b-f54d-4293-a9dc-88b830d775dc 0389b35b-651b-4776-b12a-d315a100f47c 03e9a354-7c95-4484-9ead-ed04acce7cb6 ...
case_submitter_id : TCGA-14-1034 TCGA-06-0133 TCGA-06-0140 TCGA-06-0171 TCGA-06-0402 TCGA-12-0819 TCGA-19-2621 TCGA-06-0194 TCGA-12-0619 TCGA-15-1447 ...
project_id : TCGA-GBM 
age_at_index : 60 64 86 65 71 49 83 37 36 43 ...
age_is_obfuscated : false NA 
country_of_residence_at_enrollment : NA United States Canada 
days_to_birth : -22029 -23402 -31566 -24085 -26059 -18160 -30429 -13852 -21920 -13326 ...
days_to_death : 485 435 6 399 8 754 33 142 1062 NA ...
ethnicity : not reported not hispanic or latino NA hispanic or latino 
gender : female male not reported NA 
race : not reported white black or african a