# Goals
**[Script]** Merge data across samples, add gene/transcript annotations, and [normalize data](https://bigomics.ch/blog/why-how-normalize-rna-seq-data/) to account for technical variability and to limit false positives and negatives
- Use VST for EDA (PCA), TPM for within sample comparisons, and CPM+TMM for across sample comparisons (DE)
    - **Counts (input for normalization):** Do not use directly, as counts are unnormalized
    - **VST (EDA):** Reduces the dependence of the variance on the mean
    - **TPM (within sample):** Corrects for gene length (more reads for longer genes) and sequencing depth
        - Log2TPM expression: <4 low, 4-6 intermediate, and >6 high expression
    - **CPM+TMM (across samples):** Corrects for both sequencing depth and RNA composition bias
    - **[TPM+TMM?](https://www.proteinatlas.org/about/assays+annotation#classification_rna) (across samples):** Renormalize to account for different (or filtered) counts, then apply TMM to correct for sample-level bias
        - **Use Cases:** heterogeneous datasets like GTEx or HPA, different tissues, nucleus vs. cytoplasm, polyA vs. rRNA
    - **z-score (across samples):** Removes the effect of outliers, usually highly expressed genes
    - **Log-transformation:** Reduces skewness --> more normal distribution; makes patterns more visible
        - Log scales are symmetrical (e.g. 8-fold up = 8-fold down)
    - **Linear transformation / Scaling:** Unlike logging, scaling does not change skewness (same distribution)

**[Overall]** Collect all metadata and expression data for downstream analysis (ex. to identify batch effects and outlier samples before DE analysis)

**[To Do]**
1. Add this script to the end of the nextflow pipeline so it runs automatically

# Packages

In [2]:
#####################
### Normalization ###
#####################
library("edgeR")
library("DESeq2")

#####################
### Data Cleaning ###
#####################
library("tidyverse")
library("data.table")
library("janitor")
library("openxlsx")

####################
### Session Info ###
####################
library("sessioninfo")

## Options

In [None]:
# Session-wide settings:
#   warn = 1            -> warnings are printed as they occur
#   repr.matrix.max.*   -> display limits (rows/cols) for tables shown in the notebook
options(
    warn = 1,
    repr.matrix.max.cols = 200,
    repr.matrix.max.rows = 200
)

## Functions

In [None]:
# Merge one column from several per-sample tables into a single wide table.
#
# Args:
#   file_paths: Character vector of delimited files readable by fread().
#   count_col:  Name (string) of the column to pull from every file.
#   merge_cols: Character vector with the key column(s) shared by all files.
#
# Returns:
#   A table holding `merge_cols` plus one column per input file. Each of those
#   columns is named after its sample, i.e. the file's base name with
#   everything from the first "." onwards removed. Files are combined with a
#   full join, so keys missing from a file yield NA for that sample.
#
# Raises:
#   An error when `file_paths` is empty, or when two files map to the same
#   sample name (which would otherwise silently produce mangled column names).
merge_counts = function(file_paths, count_col, merge_cols) {
    if (length(file_paths) == 0) {
        stop("No input files supplied to merge_counts()", call.=FALSE)
    }

    # Sample name = file name up to (not including) the first dot
    sample_names = sub("\\..*$", "", basename(file_paths))

    dup_names = unique(sample_names[duplicated(sample_names)])
    if (length(dup_names) > 0) {
        stop("Duplicate sample names derived from file names: ",
             paste(dup_names, collapse=", "), call.=FALSE)
    }

    # Read each file, keep only the key + value columns, and rename the value
    # column to the sample name so the join yields one column per sample
    list_of_dfs = map2(file_paths, sample_names, \(path, sample) {
        fread(path) %>%
            select(all_of(c(merge_cols, count_col))) %>%
            rename(!!sample := all_of(count_col))
    })

    df = reduce(list_of_dfs, full_join, by=merge_cols)
    return(df)
}

In [None]:
# Normalize a wide table of counts (one row per feature, one column per sample).
#
# Args:
#   df:     Data frame with one identifier column and numeric sample columns.
#   id_col: Name (string) of the identifier column (e.g. "gene_id").
#   method: One of
#             "CPM"    - log2 counts-per-million using TMM-adjusted (effective)
#                        library sizes, prior.count = 2
#             "VST"    - DESeq2 variance stabilizing transformation
#                        (nsub = 1000, parametric fit)
#             "zscore" - center and scale each sample column (mean 0, sd 1)
#
# Returns:
#   A plain data.frame with `id_col` first, followed by the transformed sample
#   columns in their original order.
#
# Raises:
#   An error for an unsupported `method` or a missing `id_col`.
#   NOTE: edgeR::DGEList() rejects NA counts, so "CPM" requires complete data.
normalize_counts = function(df, id_col, method) {
    supported = c("CPM", "VST", "zscore")
    if (length(method) != 1 || !method %in% supported) {
        stop("Error: Method must be one of 'CPM', 'VST', or 'zscore'")
    }

    # Plain data.frame so `[` behaves the same for tibble / data.table input
    df = as.data.frame(df)
    if (!id_col %in% names(df)) {
        stop("Column '", id_col, "' not found in df", call.=FALSE)
    }

    # Features x samples matrix with the identifiers as row names
    ids = as.character(df[[id_col]])
    counts = as.matrix(df[, setdiff(names(df), id_col), drop=FALSE])
    rownames(counts) = ids

    if (method == "CPM") {
        # cpm() on a bare matrix has no normalization factors to use, so
        # `normalized.lib.sizes=TRUE` would be a no-op. Build a DGEList and
        # compute TMM factors first so the effective library sizes are used.
        dge = edgeR::DGEList(counts=counts)
        dge = edgeR::calcNormFactors(dge, method="TMM")
        norm = edgeR::cpm(dge, normalized.lib.sizes=TRUE, log=TRUE, prior.count=2)
    } else if (method == "VST") {
        norm = DESeq2::vst(counts, blind=FALSE, nsub=1000, fitType="parametric")
    } else {
        # scale() works column-wise, i.e. per sample across all features
        norm = scale(counts, center=TRUE, scale=TRUE)
    }

    # Back to a data frame with the identifiers as the first column
    rownames(norm) = NULL
    df_norm = cbind(
        setNames(data.frame(ids, stringsAsFactors=FALSE), id_col),
        as.data.frame(norm))
    rownames(df_norm) = NULL
    return(df_norm)
}

# Parameters

## Inputs

In [None]:
# Project identifier; used to build every input and output path below
project_name = "GSE206529"

output_path = file.path("../../outputs", project_name)

# NOTE: list.files() interprets `pattern` as a regular expression, not a shell
# glob. Dots are therefore escaped and each suffix is anchored with `$` so only
# files that actually end in the expected suffix are picked up.

# STAR alignment summaries (one per sample)
star_path = file.path(output_path, "star_output")
star_log_files = list.files(star_path, full.names=TRUE, pattern="_Log\\.final\\.out$")

# RSEM quantification tables (gene- and isoform-level, one of each per sample)
rsem_path = file.path(output_path, "rsem_output")
rsem_gene_files = list.files(rsem_path, full.names=TRUE, pattern="\\.genes\\.results$")
rsem_transcript_files = list.files(rsem_path, full.names=TRUE, pattern="\\.isoforms\\.results$")

In [None]:
# Gene- and transcript-level annotation tables, read with fread() further down.
# Judging by the file names these are sorted BED files for Ensembl release 111
# (GRCh38) -- confirm if the reference is updated.
gene_ann_file = "/mnt/disks/resources/data/references/outputs/Homo_sapiens/ensembl/111/Homo_sapiens.GRCh38.111.genes.sorted.bed"
transcript_ann_file = "/mnt/disks/resources/data/references/outputs/Homo_sapiens/ensembl/111/Homo_sapiens.GRCh38.111.transcripts.sorted.bed"

## Outputs

In [None]:
# Output directories: merged per-tool tables and final deliverables.
# Both are created up front (including parents); existing directories are fine.
merged_path = file.path(output_path, "merged_files")
results_path = file.path("../../results", project_name)
walk(
    c(merged_path, results_path),
    \(dir) dir.create(dir, recursive=TRUE, showWarnings=FALSE)
)

# Raw-count CSVs. Every other CSV name is derived from these by swapping out
# the "raw_counts" part of the file name.
gene_counts_file = file.path(
    merged_path,
    paste0(project_name, "_genes_raw_counts.csv")
)
transcript_counts_file = file.path(
    merged_path,
    paste0(project_name, "_transcripts_raw_counts.csv")
)

# Multi-sheet Excel workbooks (ex. "Counts", "TPM", "Log2CPM")
gene_excel_file = file.path(
    results_path,
    paste0(project_name, "_gene_data.xlsx")
)
transcript_excel_file = file.path(
    results_path,
    paste0(project_name, "_transcript_data.xlsx")
)

# Merge STAR log files

In [None]:
# Fail early with a readable message instead of a cryptic reduce() error
if (length(star_log_files) == 0) {
    stop("No STAR '_Log.final.out' files found in ", star_path, call.=FALSE)
}

# Sample name = log file name without the STAR suffix. fixed() makes the
# suffix a literal string, so its dots are not treated as regex wildcards.
sample_names = basename(star_log_files) %>% str_remove(fixed("_Log.final.out"))

# Each log is parsed into a two-column table: the text left of the first "|"
# becomes "Metric", the text right of it becomes the sample's value column.
# Lines without a "|" are dropped.
list_of_dfs = map2(star_log_files, sample_names, \(log_file, sample) {
    log_lines = readLines(log_file)
    metric_lines = log_lines[str_detect(log_lines, "\\|")]

    str_split_fixed(metric_lines, "\\|", 2) %>%
        as_tibble(.name_repair="minimal") %>%
        setNames(c("Metric", sample)) %>%
        mutate(across(everything(), str_trim))
})

# One row per metric, one column per sample
df_star = reduce(list_of_dfs, full_join, by=c("Metric"))

star_merged_file = file.path(merged_path, paste0(project_name, "_star_mapping_metrics.csv"))
write.csv(df_star, star_merged_file, row.names=FALSE)

# Get gene/transcript annotations

In [None]:
# Transcript annotation: keep the ID/biotype/coordinate columns, drop duplicate
# rows, and turn empty strings into NA. clean_names() strips "#" from the
# header names before standardizing them.
df_transcript_ann = transcript_ann_file %>%
    fread() %>%
    clean_names(replace=c("#"="")) %>%
    select(
        transcript_id, transcript_biotype,
        gene_id, gene_name, gene_biotype,
        chr, start, end, strand
    ) %>%
    distinct() %>%
    mutate(across(where(is.character), \(x) na_if(x, "")))

# Gene annotation: same clean-up, with gene name and biotype looked up from
# the transcript table via gene_id.
df_gene_ann = gene_ann_file %>%
    fread() %>%
    clean_names(replace=c("#"="")) %>%
    left_join(
        distinct(df_transcript_ann, gene_id, gene_name, gene_biotype),
        by="gene_id"
    ) %>%
    select(gene_id, gene_name, gene_biotype, chr, start, end, strand) %>%
    distinct() %>%
    mutate(across(where(is.character), \(x) na_if(x, "")))

# Merge RSEM files
- **Counts:** rounded to nearest integer for integer-based tools (ex. DESeq2 requires integer counts; edgeR and DESeq2 model counts with a discrete probability distribution, the negative binomial)
    - Why are there decimals in RSEM's expected_counts? RSEM handles ambiguous read mapping by fractionally assigning reads to several genes
- **TPM:** RSEM normalizes counts by transcript effective length (based on IsoPct) then by library size aka sequencing depth (total reads per sample / 1M)
    - Why gene length? Since longer genes inherently produce more reads than shorter genes at equal expression level
    - Why sequencing depth? to account for unequal RNA input (e.g. extraction efficiency, degradation)
        - **Note:** If not all genes are present, sequencing depth cannot be calculated

In [None]:
# ---- Genes ----
# Expected counts: merged across samples, rounded to whole numbers, annotated
df_gene_counts = rsem_gene_files %>%
    merge_counts("expected_count", c("gene_id")) %>%
    mutate(across(where(is.numeric), \(x) round(x, digits=0)))
df_gene_counts_ann = df_gene_ann %>% right_join(df_gene_counts, by="gene_id")
write.csv(df_gene_counts_ann, file=gene_counts_file, row.names=FALSE)

# TPM: merged as-is (kept for QC), then log2(TPM + 1) for the saved table
df_gene_TPM = rsem_gene_files %>% merge_counts("TPM", c("gene_id"))
df_gene_Log2TPM = df_gene_TPM %>%
    mutate(across(where(is.numeric), \(x) log2(x + 1)))
df_gene_Log2TPM_ann = df_gene_ann %>% right_join(df_gene_Log2TPM, by="gene_id")
write.csv(
    df_gene_Log2TPM_ann,
    file=gsub("raw_counts", "Log2TPM", gene_counts_file),
    row.names=FALSE
)

# ---- Transcripts ----
# Expected counts: merged across samples, rounded to whole numbers, annotated
df_transcript_counts = rsem_transcript_files %>%
    merge_counts("expected_count", c("transcript_id")) %>%
    mutate(across(where(is.numeric), \(x) round(x, digits=0)))
df_transcript_counts_ann = df_transcript_ann %>%
    right_join(df_transcript_counts, by=c("transcript_id"))
write.csv(df_transcript_counts_ann, file=transcript_counts_file, row.names=FALSE)

# TPM: merged as-is (kept for QC), then log2(TPM + 1) for the saved table
df_transcript_TPM = rsem_transcript_files %>% merge_counts("TPM", c("transcript_id"))
df_transcript_Log2TPM = df_transcript_TPM %>%
    mutate(across(where(is.numeric), \(x) log2(x + 1)))
df_transcript_Log2TPM_ann = df_transcript_ann %>%
    right_join(df_transcript_Log2TPM, by=c("transcript_id"))
write.csv(
    df_transcript_Log2TPM_ann,
    file=gsub("raw_counts", "Log2TPM", transcript_counts_file),
    row.names=FALSE
)

# IsoPct: merged and annotated without further transformation
df_transcript_IsoPct = rsem_transcript_files %>% merge_counts("IsoPct", c("transcript_id"))
df_transcript_IsoPct_ann = df_transcript_ann %>%
    right_join(df_transcript_IsoPct, by=c("transcript_id"))
write.csv(
    df_transcript_IsoPct_ann,
    file=gsub("raw_counts", "IsoPct", transcript_counts_file),
    row.names=FALSE
)

In [None]:
# Report genes/transcripts whose ID has no entry in the annotation table.
# This shouldn't happen as everything is from the same GTF file; any hits are
# written to an "unannotated" CSV for inspection.
#
# The anti-joins are keyed explicitly on the ID column. Without `by`, dplyr
# would match on every shared column (IDs, names, biotypes, coordinates) and
# print a "Joining with ..." message on each run.
df_genes_unann = anti_join(df_gene_counts_ann, df_gene_ann, by="gene_id")
if (nrow(df_genes_unann) > 0) {
    write.csv(df_genes_unann, gsub("raw_counts","unannotated", gene_counts_file), row.names=FALSE)
}

df_transcripts_unann = anti_join(df_transcript_counts_ann, df_transcript_ann, by="transcript_id")
if (nrow(df_transcripts_unann) > 0) {
    write.csv(df_transcripts_unann, gsub("raw_counts","unannotated", transcript_counts_file), row.names=FALSE)
}

print(paste(nrow(df_genes_unann), "genes not annotated from GTF"))
print(paste(nrow(df_transcripts_unann), "transcripts not annotated from GTF"))

# Normalize data

## CPM + TMM + Log2
- [Trimmed mean of M-values (TMM)](https://bioconductor.org/packages/release/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf) normalization adjusts library sizes by trimming extreme values and weighting observations, thereby enabling accurate comparison of gene expression **across samples**
    - Extreme values, such as highly expressed genes, can negatively impact normalization (trimmed for this reason)
    - Assumes majority of genes are not differentially expressed between samples (true in almost all cases)
    - TMMwsp: more stable performance when the counts have a high proportion of zeros
    - `edgeR::normLibSizes`: What does effective library size mean?
        - <1: Few high count genes are lowering other counts --> library size scaled down to scale counts up
        - \>1: Many low count genes --> library size scaled up to scale counts down

In [None]:
# Log2-CPM tables produced by normalize_counts(method="CPM"), then annotated
# and saved next to the raw counts (file name: raw_counts -> Log2CPM)

# Genes
df_gene_Log2CPM = df_gene_counts %>%
    normalize_counts(id_col="gene_id", method="CPM")
df_gene_Log2CPM_ann = df_gene_ann %>%
    right_join(df_gene_Log2CPM, by="gene_id")
write.csv(
    df_gene_Log2CPM_ann,
    file=gsub("raw_counts", "Log2CPM", gene_counts_file),
    row.names=FALSE
)

# Transcripts
df_transcript_Log2CPM = df_transcript_counts %>%
    normalize_counts(id_col="transcript_id", method="CPM")
df_transcript_Log2CPM_ann = df_transcript_ann %>%
    right_join(df_transcript_Log2CPM, by="transcript_id")
write.csv(
    df_transcript_Log2CPM_ann,
    file=gsub("raw_counts", "Log2CPM", transcript_counts_file),
    row.names=FALSE
)

## VST
- Variance Stabilizing Transformation (VST)
    - Variance is stabilized across the range of mean values
        - Makes data more homoscedastic (constant variance along the range of mean values)
    - Also normalizes with respect to library size
        - Like voom, vst is useful when there is a large difference in library sizes
    - Can improve detection of differentially expressed genes and reduce false-positives

In [None]:
# VST tables produced by normalize_counts(method="VST"), which fits the
# dispersion trend on 1000 features (faster) with a parametric fit.
# Results are annotated and saved (file name: raw_counts -> VST).

# Genes
df_gene_VST = df_gene_counts %>%
    normalize_counts(id_col="gene_id", method="VST")
df_gene_VST_ann = df_gene_ann %>%
    right_join(df_gene_VST, by="gene_id")
write.csv(
    df_gene_VST_ann,
    file=gsub("raw_counts", "VST", gene_counts_file),
    row.names=FALSE
)

# Transcripts
df_transcript_VST = df_transcript_counts %>%
    normalize_counts(id_col="transcript_id", method="VST")
df_transcript_VST_ann = df_transcript_ann %>%
    right_join(df_transcript_VST, by="transcript_id")
write.csv(
    df_transcript_VST_ann,
    file=gsub("raw_counts", "VST", transcript_counts_file),
    row.names=FALSE
)

## z-score
- **z-score (across samples):** Removes the effect of outliers, usually highly expressed genes
    - Z score equation: z_score = (value - mean(values)) / sd(values)
        - Modified Z score uses median instead of mean to further alleviate outlier effect
    - **Note:** Use sparingly as Z-score normalization is considered "drastic" (changes read distribution and removes info. about low/high genes)
    - **Use Cases:** Gene signature score (all genes equal for scoring), PCA (an outlier sample indicates transcriptome wide differences that affect all genes, not just the highest expressed)

In [None]:
# z-score tables produced by normalize_counts(method="zscore"), then annotated
# and saved (file name: raw_counts -> zscore)

# Genes
df_gene_zscore = df_gene_counts %>%
    normalize_counts(id_col="gene_id", method="zscore")
df_gene_zscore_ann = df_gene_ann %>%
    right_join(df_gene_zscore, by="gene_id")
write.csv(
    df_gene_zscore_ann,
    file=gsub("raw_counts", "zscore", gene_counts_file),
    row.names=FALSE
)

# Transcripts
df_transcript_zscore = df_transcript_counts %>%
    normalize_counts(id_col="transcript_id", method="zscore")
df_transcript_zscore_ann = df_transcript_ann %>%
    right_join(df_transcript_zscore, by="transcript_id")
write.csv(
    df_transcript_zscore_ann,
    file=gsub("raw_counts", "zscore", transcript_counts_file),
    row.names=FALSE
)

# Write to Excel

In [None]:
# One worksheet per gene-level table; the list names become the sheet names
gene_list = list(
    Counts  = df_gene_counts_ann,
    Log2TPM = df_gene_Log2TPM_ann,
    Log2CPM = df_gene_Log2CPM_ann,
    VST     = df_gene_VST_ann,
    zscore  = df_gene_zscore_ann
)

# Header row styling: bold black text, left-aligned, pale yellow fill
hs = createStyle(
    fontColour="black",
    fgFill="#FFFFCC",
    halign="left",
    textDecoration="bold"
)

# Freeze the header row and the 7 annotation columns (data starts at column 8)
write.xlsx(
    x=gene_list,
    file=gene_excel_file,
    headerStyle=hs,
    borders="all",
    firstActiveRow=2,
    firstActiveCol=8,
    withFilter=TRUE
)

In [None]:
# One worksheet per transcript-level table; the list names become the sheet names
transcript_list = list(
    Counts  = df_transcript_counts_ann,
    Log2TPM = df_transcript_Log2TPM_ann,
    Log2CPM = df_transcript_Log2CPM_ann,
    VST     = df_transcript_VST_ann,
    zscore  = df_transcript_zscore_ann,
    IsoPct  = df_transcript_IsoPct_ann
)

# Header row styling: bold black text, left-aligned, pale yellow fill
hs = createStyle(
    fontColour="black",
    fgFill="#FFFFCC",
    halign="left",
    textDecoration="bold"
)

# Freeze the header row and the 9 annotation columns (data starts at column 10)
write.xlsx(
    x=transcript_list,
    file=transcript_excel_file,
    headerStyle=hs,
    borders="all",
    firstActiveRow=2,
    firstActiveCol=10,
    withFilter=TRUE
)

# QC

In [None]:
# Sanity check: per-sample TPM totals, expressed in millions (expected: 1)
round(colSums(select(df_gene_TPM, -gene_id), na.rm=TRUE), 0) / 1e6
round(colSums(select(df_transcript_TPM, -transcript_id), na.rm=TRUE), 0) / 1e6

In [None]:
# Sanity check: every sample column should have mean 0 and sd 1 after z-scoring

# Genes
round(colMeans(select(df_gene_zscore, -gene_id), na.rm=TRUE), 3)
round(matrixStats::colSds(as.matrix(select(df_gene_zscore, -gene_id)), na.rm=TRUE), 3)

# Transcripts
round(colMeans(select(df_transcript_zscore, -transcript_id), na.rm=TRUE), 3)
round(matrixStats::colSds(as.matrix(select(df_transcript_zscore, -transcript_id)), na.rm=TRUE), 3)

# Session info

In [3]:
# Record the R/OS platform and the versions of the attached packages
si = session_info(pkgs="attached", to_file=FALSE)
si$platform %>% as_tibble() %>% select(version, os, system, date)
si$packages %>% as_tibble() %>% select(package, loadedversion)

version,os,system,date
<chr>,<chr>,<chr>,<chr>
R version 4.4.0 (2024-04-24),Ubuntu 22.04.4 LTS,"x86_64, linux-gnu",2025-08-12


package,loadedversion
<chr>,<chr>
Biobase,2.64.0
BiocGenerics,0.50.0
data.table,1.15.0
DESeq2,1.44.0
dplyr,1.1.4
edgeR,4.2.2
forcats,1.0.0
GenomeInfoDb,1.40.1
GenomicRanges,1.56.1
ggplot2,3.5.1
