## Differential expression with edgeR

Here we use the edgeR package to calculate differential expression for a number of group contrasts. We use a generalized linear model (GLM) to fit each gene across all conditions, allowing us to leverage dispersion information from the entire dataset rather than estimating it separately for each pair of groups.

### Set up environment
Load libraries, metadata, expression counts, and transcript annotations.

In [None]:
suppressPackageStartupMessages(require(dplyr))
suppressPackageStartupMessages(require(ggplot2))
suppressPackageStartupMessages(require(edgeR))
suppressPackageStartupMessages(require(DESeq2))
suppressPackageStartupMessages(require(EDASeq))

In [None]:
# Run the shared data-loading script. The objects used further down
# (e.g. `meta`) are presumably created by it -- confirm in src/load_datasets.r.
source('src/load_datasets.r')

# Preview the first ten rows of the sample metadata
meta %>% head(10)

Sum counts over gene symbols, as in part 1 & 2

In [None]:
# Sum transcript-level TPM to one value per gene symbol and sample, then
# attach the sample metadata (natural join on the columns shared with `meta`).
tpm_gene <- tpm_tall %>%
    inner_join(annot, by=c('target_id'='gencode_tx')) %>%
    group_by(hugo_symbol, Name) %>%
    summarize(tpm = sum(tpm, na.rm=TRUE)) %>%
    inner_join(meta)

# Stack matrix to tall table
counts_tall <- counts_mat %>%
    as.data.frame %>%
    mutate(gencode_tx = rownames(counts_mat)) %>%
    gather(Name, counts, starts_with('GS-'))

# Summarize by gene
gene_counts_tall <- counts_tall %>%
    inner_join(annot, by='gencode_tx') %>%
    group_by(hugo_symbol, Name) %>%
    summarize(counts = sum(counts))

# Pivot back to a wide gene-by-sample table.
# ungroup first so that dropping hugo_symbol below does not re-add it as a
# grouping column.
gene_counts_wide <- gene_counts_tall %>%
    ungroup %>%
    spread(Name, counts)

# Convert to a matrix BEFORE attaching gene symbols as row names: setting row
# names on a tibble is deprecated and they are not reliably kept, which would
# leave the count matrix (and every downstream result table) without gene names.
gene_counts <- gene_counts_wide %>%
    select(-hugo_symbol) %>%
    as.matrix
rownames(gene_counts) <- gene_counts_wide$hugo_symbol

# Put samples in metadata order. as.character() guards against `Name` being a
# factor, which would index columns by integer code rather than by name.
gene_counts <- gene_counts[, as.character(meta$Name)]

head(tpm_gene)
gene_counts[1:3,1:3]

### Filter gene list
Require at least N samples to have more than 5 reads for a gene to be used; below, N is set to all samples but one. This ensures that no group of 3 replicates is all zeros in the differential expression analysis.

In [None]:
# Keep only the rows of a count table that are expressed in enough samples.
#
# Args:
#   expr:      numeric matrix (or data frame) with features in rows and
#              samples in columns.
#   n_reads:   a sample counts as expressed when its value is strictly
#              greater than this threshold.
#   n_samples: minimum number of expressed samples required to keep a row.
#
# Returns:
#   `expr` restricted to the rows that pass. The result keeps its
#   two-dimensional shape even when zero or one row passes.
filter_genes <- function(expr, n_reads=5, n_samples=2) {
    # Missing counts are treated as "not expressed" rather than being counted
    # towards the threshold.
    n_expressed <- rowSums(expr > n_reads, na.rm=TRUE)
    keep <- n_expressed >= n_samples
    # drop=FALSE prevents a single surviving row from collapsing to a vector
    return(expr[keep, , drop=FALSE])
}
                    
# Every sample but one must exceed the read threshold
N <- ncol(gene_counts) - 1

# filter prevents zeros in one group
filtered <- filter_genes(gene_counts, n_samples=N)

# Bundle the filtered counts with the sample metadata
set_filtered <- filtered %>%
    as.matrix %>%
    EDASeq::newSeqExpressionSet(phenoData=meta)

### Design comparison matrix and contrasts
We predefine a list of contrasts, using columns of a design matrix.

In [None]:
# One indicator column per level of Description, no intercept
design <- model.matrix(~ 0 + Description, data = meta)

# Label the rows by sample name
dimnames(design)[[1]] <- meta$Name

design %>% head(3)

In [None]:
# Named contrast vectors; each has one weight per column of the design matrix.
contrasts <- list()
contrasts$contrast_1 <- c(-1, 1, 0)  # group 2 vs group 1
contrasts$contrast_2 <- c(-1, 0, 1)  # group 3 vs group 1

### Differential expression plots and top hits

In [None]:
# Fit an edgeR GLM to the counts stored in an expression set.
#
# Args:
#   eset:   object whose counts() accessor returns a feature-by-sample matrix.
#   design: design matrix; its row names select (and order) the sample columns.
#
# Returns:
#   The object produced by glmFit().
calc_edger <- function(eset, design) {
    sample_ids <- rownames(design)
    dge <- DGEList(counts=counts(eset)[, sample_ids])

    # Upper-quartile normalisation, then common and tagwise dispersion estimates
    dge <- calcNormFactors(dge, method="upperquartile")
    dge <- estimateGLMCommonDisp(dge, design)
    dge <- estimateGLMTagwiseDisp(dge, design)

    glmFit(dge, design)
}

# Run a likelihood-ratio test for one contrast and return the ranked result table.
#
# Args:
#   fit:      fitted model, as returned by calc_edger().
#   contrast: numeric contrast vector passed to glmLRT().
#   n:        maximum number of rows to request from topTags().
#
# Returns:
#   The topTags() table with an extra character column `hugo_symbol`
#   copied from the table's row names.
get_contrast_hits <- function(fit, contrast, n) {
    lrt_result <- glmLRT(fit, contrast=contrast)
    hits <- topTags(lrt_result, n=n)$table
    hits[['hugo_symbol']] <- as.character(rownames(hits))
    hits
}

# Print an MA plot and a volcano plot for one contrast.
#
# Args:
#   top:   result table with columns logCPM, logFC and FDR.
#   label: title used for both plots.
#
# Side effects: sets the notebook plot size options and prints two plots.
plot_contrast <- function(top, label) {
    options(repr.plot.width=8, repr.plot.height=3)

    # Same TRUE/FALSE colour mapping for both plots
    sig_palette <- c(`TRUE`='red', `FALSE`='black')

    # MA plot: fold change against abundance, with a smoothed trend line
    ma_plot <- ggplot(top, aes(x=logCPM, y=logFC)) +
        geom_point(aes(color=(FDR < 0.05 & abs(logFC) > 1)), alpha=0.1, size=2) +
        geom_smooth(span=.2, se=FALSE) +
        scale_color_manual(values = sig_palette) +
        ggtitle(label)
    print(ma_plot)

    # Volcano plot: significance against fold change, point size by abundance
    volcano_plot <- ggplot(top, aes(x=logFC, y=-log10(FDR), size=logCPM, color=FDR < 0.05)) +
        geom_point(alpha=0.1) +
        scale_color_manual(values = sig_palette) +
        ggtitle(label)
    print(volcano_plot)
}


In [None]:
# Fit the model once; every contrast below is tested against this fit
fit <- calc_edger(eset=set_filtered, design=design)

### Build GSEA preranked datasets
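GSEA works best with fold-change ranked data from the entire list of genes. Counts were already collapsed to gene symbols above (summed across transcripts), so each gene has a single fold change; for each contrast we write every gene, ranked by log fold change, to a `.rnk` file.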

In [None]:
# Write fold change data to file

# write.table() does not create missing directories, so make sure the output
# directory exists first (no-op when it is already there).
dir.create('results', showWarnings=FALSE, recursive=TRUE)

for (label in names(contrasts)) {
    # Request every gene so the ranking covers the full filtered set
    top <- get_contrast_hits(fit, contrasts[[label]], nrow(set_filtered))

    # Two-column, tab-separated, headerless file ordered by decreasing logFC
    top %>%
        select(hugo_symbol, logFC) %>%
        arrange(desc(logFC)) %>%
        write.table(file=paste0('results/', label, '_diff_genes_fc_summarized.rnk'),
                    row.names=FALSE, col.names=FALSE, quote=FALSE, sep='\t')
}

## Plot global differential expression

In [None]:
# Draw the MA and volcano plots for each predefined contrast
for (label in names(contrasts)) {
    # Full result table (one row per filtered gene) for this contrast
    top <- get_contrast_hits(fit, contrast=contrasts[[label]], n=nrow(set_filtered))
    plot_contrast(top, label)
}

## Show top 20 genes from each comparison

In [None]:
# Print top hits for all contrasts

# write.csv() does not create missing directories, so make sure the output
# directory exists first (no-op when it is already there).
dir.create('results', showWarnings=FALSE, recursive=TRUE)

for (label in names(contrasts)) {
    top <- get_contrast_hits(fit, contrasts[[label]], nrow(set_filtered))

    # Get all logFC and FDR q-values
    degs <- top %>%
        select(hugo_symbol, logFC, logCPM, FDR)

    # Print top 20 to notebook (hugo_symbol is dropped from the printed view only)
    cat('Top 20 genes for', label, ':\n')
    print(degs %>% select(-hugo_symbol) %>% head(20))
    cat('\n')

    # write the full table, including hugo_symbol, to file
    write.csv(degs, file=paste0('results/', label, '_gene_hits.csv'), quote=FALSE, row.names=FALSE)
}

## Individual gene plots
Below are plots of top differentially expressed genes or various genes of interest, across all treatment conditions

In [None]:
# Plot log2(TPM + 1) of a single gene across the treatment conditions.
#
# Args:
#   symbol: gene symbol to look up in the `hugo_symbol` column of the
#           global `tpm_gene` table.
#
# Returns:
#   A ggplot object: jittered points by Timepoint, coloured by Concentration,
#   one facet per Treatment.
#
# Side effects: sets the notebook plot size options.
plot_gene <- function(symbol) {
    options(repr.plot.width=6, repr.plot.height=2)

    gene_tpm <- filter(tpm_gene, hugo_symbol == symbol)

    ggplot(gene_tpm, aes(x=Timepoint, y=log2(tpm + 1), color=Concentration)) +
        geom_jitter(position=position_jitter(width=0.2, height=0)) +
        facet_wrap(~Treatment) +
        ggtitle(symbol)
}

## Heatmap of gene set expression across all conditions
We use the `pheatmap` library to plot annotated expression heatmaps across conditions

In [None]:
library(pheatmap)

Read in list of genes from gene sets of interest

In [None]:
# Location of the gene list used for the heatmap
fpath <- 'data/gene_list.csv'

# The file is read without a header row; the first column (V1) is taken as
# the vector of gene symbols.
tgt_genes <- read.csv(fpath, header=FALSE, stringsAsFactors=FALSE)$V1

# Preview; head() does not pad with NA when the list has fewer than 5 entries
head(tgt_genes, 5)
head(tpm_gene)

Set up metadata and expression matrix for heatmap

In [None]:
# Order the samples by experimental condition; this order becomes the column
# order of the heatmap matrix.
meta_sorted <- meta %>%
    arrange(Treatment, Concentration, Timepoint, Replicate)

# Gene-by-sample table of log2(TPM + 1), restricted to the genes of interest
pathway_mat <- tpm_gene %>%
    filter(hugo_symbol %in% tgt_genes) %>%
    mutate(log2_tpm = log2(tpm + 1)) %>%
    select(hugo_symbol, Name, log2_tpm) %>%
    spread(Name, log2_tpm) %>%
    ungroup
rnames <- pathway_mat$hugo_symbol
pathway_mat <- pathway_mat %>%
    select(starts_with('GS-')) %>%
    as.matrix
rownames(pathway_mat) <- rnames

# as.character(): a factor used as an index selects by integer code, not by
# name. drop=FALSE keeps a matrix even when only one gene matched.
pathway_mat <- pathway_mat[, as.character(meta_sorted$Name), drop=FALSE] # rearrange
colnames(pathway_mat) <- meta_sorted$Description # re-label

head(meta_sorted)
# Preview the top-left corner (at most 5 x 5, so small matrices do not error)
pathway_mat[seq_len(min(5, nrow(pathway_mat))),
            seq_len(min(5, ncol(pathway_mat))), drop=FALSE]
cat('Dimension of pathway gene matrix:\n')
dim(pathway_mat)

In [None]:
# Centre and scale each gene (row) across samples. scale() operates on
# columns, hence the transpose before and after.
scaled_mat <- t(scale(t(pathway_mat)))

# Drop rows containing NA/NaN after scaling (e.g. rows with zero variance)
na_rows <- apply(scaled_mat, 1, function(row) any(is.na(row)))
# drop=FALSE keeps a matrix even when a single row remains
scaled_mat <- scaled_mat[!na_rows, , drop=FALSE]

# Preview the top-left corner (at most 5 x 5, so small matrices do not error)
scaled_mat[seq_len(min(5, nrow(scaled_mat))),
           seq_len(min(5, ncol(scaled_mat))), drop=FALSE]
cat('Dimension after removing NA rows:\n')
dim(scaled_mat)

In [None]:
# Notebook plot size for the heatmap
options(repr.plot.width=8, repr.plot.height=6)

# Rows are clustered; columns keep the condition order set up earlier and row
# labels are hidden. FALSE is spelled out because `F` is an ordinary variable
# that can be reassigned.
pheatmap(scaled_mat, show_rownames=FALSE, cluster_cols=FALSE)