# Chromosome Y Gene Differential Expression in Blood (AMP-PD) and Brain (NABEC)
- **Author(s)** - Frank Grenn
- **Quick Description:** Differential expression of chromosome Y genes between blood (AMP-PD) and brain (NABEC) samples, with the major Y haplogroup included as a covariate.

In [None]:
library("edgeR")
library(data.table)
# Load the tidyverse
library(tidyverse)
library(dplyr)


In [None]:
# Root working directory for this analysis; the WRKDIR-relative input and
# output paths below are built from it with paste0().
WRKDIR <- "/PATH/chrY"

## 1) Read and process AMP-PD chrY blood expression data from featureCounts

In [None]:
# Load the AMP-PD chrY featureCounts table as a plain data.frame; gene
# identifiers are in the `Geneid` column.
amp_counts <- as.data.frame(
  fread(paste0(WRKDIR, "/expression/amppd_chrY_featureCounts.csv"))
)
print(dim(amp_counts))

In [None]:
# Strip the version suffix (the first "." and everything after it) from each
# Geneid and report three counts: all ids, unique ids, unique ids after
# stripping. The three numbers must agree if stripping creates no duplicates.
s <- gsub("\\..*", "", amp_counts$Geneid)
print(length(amp_counts$Geneid))
print(length(unique(amp_counts$Geneid)))
print(length(unique(s)))

In [None]:
# Overwrite Geneid with the version-less ids. The ids are used as row names
# further down, which must be unique, so fail here with a clear message
# instead of later with an opaque row-name error.
s <- gsub("\\..*", "", amp_counts$Geneid)
if (anyDuplicated(s) > 0) {
  stop("Stripping the version suffix produced duplicated AMP-PD Geneid values",
       call. = FALSE)
}
amp_counts$Geneid <- s

In [None]:
amp_counts[1:5,1:5]

In [None]:
# Use the gene ids as row names, then drop the now-redundant Geneid column so
# that only the per-sample count columns remain.
rownames(amp_counts) <- amp_counts$Geneid
# A logical mask is used instead of -which(): -which() selects *zero* columns
# when nothing matches. drop = FALSE keeps a data.frame even if only a single
# sample column is left.
counts <- amp_counts[, !(names(amp_counts) %in% "Geneid"), drop = FALSE]

In [None]:
dim(counts)

In [None]:
# Tag every AMP-PD sample column with a "_blood" suffix so sample ids stay
# distinguishable from the brain samples after the two tables are merged.
amppd_exp_final <- counts
colnames(amppd_exp_final) <- paste0(colnames(amppd_exp_final), "_blood")
print(dim(amppd_exp_final))
print(amppd_exp_final[1:5, 1:5])

In [None]:
# Read the AMP-PD chrY sample metadata and add the two columns used later to
# join with the expression matrix: a "<fid>_blood" sample key and the tissue.
amppd_haplos <- fread(
  paste0(WRKDIR, "/output_male_hemizygous_only_het_filter_run/chrY_meta.csv")
)
amppd_haplos$id_tissue <- paste0(amppd_haplos$fid, "_blood")
amppd_haplos$tissue <- "blood"
print(head(amppd_haplos))

In [None]:
# Keep samples whose pheno is not -9, whose genetic_carrier is 0 and whose
# InfPop is "EUROPE".
keep_sample <- (amppd_haplos$pheno != -9) &
  (amppd_haplos$genetic_carrier == 0) &
  (amppd_haplos$InfPop == "EUROPE")
amppd_haplos <- amppd_haplos[keep_sample, ]
print(dim(amppd_haplos))


In [None]:
# Dimensions before restricting both tables to their shared samples.
print(dim(amppd_exp_final))
print(dim(amppd_haplos))

In [None]:
# Restrict the expression matrix and the metadata to the samples present in
# both. drop = FALSE keeps amppd_exp_final a data.frame even when only one
# sample matches; otherwise it collapses to a vector and the colnames() call
# on the next line returns NULL.
amppd_exp_final <- amppd_exp_final[
  , colnames(amppd_exp_final) %in% amppd_haplos$id_tissue, drop = FALSE
]
amppd_haplos <- amppd_haplos[
  amppd_haplos$id_tissue %in% colnames(amppd_exp_final),
]
print(dim(amppd_exp_final))
print(dim(amppd_haplos))


## 2) Read and process NABEC chrY brain expression data

In [None]:
# Load the NABEC chrY count matrix; gene identifiers are in the `Geneid`
# column.
nabec_counts <- fread("/PATH/quants_chrY_default_ref_matrix.csv")
print(dim(nabec_counts))
print(nabec_counts[1:4,1:4])

In [None]:
# Strip the version suffix (the first "." and everything after it) from each
# Geneid and report three counts: all ids, unique ids, unique ids after
# stripping. The three numbers must agree if stripping creates no duplicates.
s <- gsub("\\..*", "", nabec_counts$Geneid)
print(length(nabec_counts$Geneid))
print(length(unique(nabec_counts$Geneid)))
print(length(unique(s)))

In [None]:
# Overwrite Geneid with the version-less ids. The ids are used as row names
# further down, which must be unique, so fail here with a clear message
# instead of later with an opaque row-name error.
s <- gsub("\\..*", "", nabec_counts$Geneid)
if (anyDuplicated(s) > 0) {
  stop("Stripping the version suffix produced duplicated NABEC Geneid values",
       call. = FALSE)
}
nabec_counts$Geneid <- s

In [None]:
# Convert to a plain data.frame and use the gene ids as row names.
nabec_exp_final <- as.data.frame(nabec_counts)
rownames(nabec_exp_final) <- nabec_exp_final$Geneid

In [None]:
# Drop the Geneid column with a logical mask. The previous -which() form
# selected *zero* columns when Geneid was absent, so re-running this cell
# silently emptied the whole table. drop = FALSE keeps a data.frame even with
# a single sample column.
nabec_exp_final <- nabec_exp_final[
  , !(names(nabec_exp_final) %in% "Geneid"), drop = FALSE
]
# Tag every NABEC sample column with a "_brain" suffix.
colnames(nabec_exp_final) <- paste0(colnames(nabec_exp_final), "_brain")
print(dim(nabec_exp_final))
# R indices start at 1; a 0 index is silently ignored, so 1:5 / 1:7 select the
# same cells the former 0:5 / 0:7 did.
print(nabec_exp_final[1:5, 1:7])

In [None]:
# Read the NABEC haplogroup table, append "fctx" to each new_id, and add the
# "<new_id>_brain" sample key plus the tissue label used for joining later.
nabec_haplos <- as.data.frame(
  fread(paste0(WRKDIR, "/output_nabec/nabec_haplos.csv"))
)
nabec_haplos[["new_id"]] <- paste0(nabec_haplos[["new_id"]], "fctx")
nabec_haplos[["id_tissue"]] <- paste0(nabec_haplos[["new_id"]], "_brain")
nabec_haplos[["tissue"]] <- "brain"
print(dim(nabec_haplos))
print(head(nabec_haplos))

In [None]:
# Dimensions before restricting both tables to their shared samples.
print(dim(nabec_exp_final))
print(dim(nabec_haplos))

In [None]:
# Restrict the expression matrix and the metadata to the samples present in
# both. drop = FALSE keeps nabec_exp_final a data.frame even when only one
# sample matches; otherwise it collapses to a vector and the colnames() call
# on the next line returns NULL.
nabec_exp_final <- nabec_exp_final[
  , colnames(nabec_exp_final) %in% nabec_haplos$id_tissue, drop = FALSE
]
nabec_haplos <- nabec_haplos[
  nabec_haplos$id_tissue %in% colnames(nabec_exp_final),
]
print(dim(nabec_exp_final))
print(dim(nabec_haplos))

## 3) Setup data for DE analysis

#### combine expression data

In [None]:
# Preview both expression tables before combining them.
print(dim(amppd_exp_final))
print(amppd_exp_final[1:5, 1:5])
print(dim(nabec_exp_final))
print(nabec_exp_final[1:5, 1:5])


In [None]:
# Join the blood and brain tables on their row names (gene ids). merge() keeps
# only genes present in both and returns the key in a "Row.names" column.
print(dim(amppd_exp_final))
print(dim(nabec_exp_final))
merged <- merge(
  x = amppd_exp_final,
  y = nabec_exp_final,
  by = "row.names"
)
print(dim(merged))
print(merged[1:5, 1:5])

In [None]:
# Move the merge key back into the row names and drop the helper column so
# exp_final holds only per-sample count columns.
exp_final <- merged
rownames(exp_final) <- exp_final$Row.names
# A logical mask is used instead of -which(): -which() selects *zero* columns
# when "Row.names" is absent (e.g. if this step is applied twice).
exp_final <- exp_final[, names(exp_final) != "Row.names", drop = FALSE]
print(dim(merged))
print(dim(exp_final))

In [None]:
exp_final[1:4,1:4]

#### combine sample data

In [None]:
# NABEC sample sheet reduced to sample id, major haplogroup and tissue, with
# new_id renamed to the shared column name "id".
print(dim(nabec_haplos))
nabec_temp <- setNames(
  dplyr::select(nabec_haplos, "new_id", "yhaplo_haplo_major", "tissue"),
  c("id", "yhaplo_haplo_major", "tissue")
)
print(dim(nabec_temp))
print(head(nabec_temp))



In [None]:
# Same three columns for AMP-PD, with fid renamed to "id".
print(dim(amppd_haplos))
amppd_temp <- setNames(
  dplyr::select(
    as.data.frame(amppd_haplos), "fid", "yhaplo_haplo_major", "tissue"
  ),
  c("id", "yhaplo_haplo_major", "tissue")
)
print(dim(amppd_temp))
print(head(amppd_temp))

In [None]:

# Stack the two sample sheets (blood first, then brain) and build the
# "<id>_<tissue>" key that matches the expression column names.
meta <- rbind(amppd_temp, nabec_temp)
meta[["id_tissue"]] <- paste0(meta[["id"]], "_", meta[["tissue"]])
print(dim(meta))
print(head(meta))

In [None]:
# Keep only the metadata rows that have a matching expression column, and key
# the resulting data.frame by the sample id.
has_expression <- meta$id_tissue %in% colnames(exp_final)
exp_meta <- as.data.frame(meta[has_expression, ])
rownames(exp_meta) <- exp_meta$id_tissue
print(dim(meta))
print(dim(exp_final))
print(dim(exp_meta))

In [None]:
# Sample counts per tissue.
table(exp_meta[["tissue"]])

In [None]:
# Sample counts per major haplogroup.
table(exp_meta[["yhaplo_haplo_major"]])

In [None]:
# Major haplogroup counts within each tissue: brain first, then blood.
print(table(exp_meta$yhaplo_haplo_major[exp_meta$tissue == "brain"]))
print(table(exp_meta$yhaplo_haplo_major[exp_meta$tissue == "blood"]))

In [None]:
# Select the expression columns in exactly the order of exp_meta's rows.
# all_of() marks the id vector as external to the data and errors on any
# missing column; passing a bare external vector to select() is deprecated.
exp_final_subset <- exp_final %>%
  dplyr::select(dplyr::all_of(exp_meta$id_tissue))
print(dim(exp_final_subset))
print(dim(exp_meta))

In [None]:
# Any all-zero genes to remove? Compare dimensions before and after.
print(dim(exp_final_subset))
print(dim(exp_final_subset[rowSums(exp_final_subset) > 0, , drop = FALSE]))

In [None]:
# Drop genes with no counts in any sample.
exp_final_subset <- exp_final_subset[
  rowSums(exp_final_subset) > 0, , drop = FALSE
]
print(dim(exp_final_subset))

In [None]:
# The count columns and the sample sheet must be in the same order before
# they are passed to DGEList together; a mismatch would attach the wrong
# tissue/haplogroup to samples. Print the check result and stop if it fails.
samples_aligned <- identical(colnames(exp_final_subset), rownames(exp_meta))
print(samples_aligned)
stopifnot(samples_aligned)

## 4) edgeR for differential expression

In [None]:
# Bundle counts and sample sheet into a DGEList, grouping samples by tissue.
dge <- DGEList(
  counts = exp_final_subset,
  group = exp_meta$tissue,
  samples = exp_meta
)


In [None]:
# No-intercept design: one column per tissue group, plus the major Y
# haplogroup as a covariate.
design <- model.matrix(
  ~ 0 + group + yhaplo_haplo_major,
  data = dge$samples
)

In [None]:
# Remove lowly expressed genes.
# (An earlier manual filter, rowSums(cpm(...) > 10) >= 2, was replaced by
# filterByExpr() with the design matrix.)
# NOTE: the name `keep` is shown in the table() header, so it is left as is.
keep <- filterByExpr(dge, design = design)
print(table(keep))
print(dim(dge))
# keep.lib.sizes = FALSE recomputes library sizes from the retained genes.
dge <- dge[keep, , keep.lib.sizes = FALSE]
print(dim(dge))

In [None]:
# Distribution of average log-CPM across the retained genes. The variable
# name is used by hist() for the title and axis label.
AveLogCPM <- aveLogCPM(dge)
hist(AveLogCPM)

In [None]:
# Compute normalization factors and show the updated sample table.
dge <- calcNormFactors(dge)
print(head(dge$samples))

In [None]:
# MDS plot of the samples; point shape and colour encode the tissue group.
pch <- c(0, 1)
colors <- c("red", "blue")
sample_group <- dge$samples$group
plotMDS(
  dge,
  col = colors[sample_group],
  pch = pch[sample_group]
)

legend(
  "topleft",
  legend = levels(sample_group),
  pch = pch,
  col = colors,
  ncol = 2
)

In [None]:
# Mean-difference plot for the first sample.
plotMD(dge, column = 1)

In [None]:
# Estimate dispersions for the design above (robust = TRUE).
dge <- estimateDisp(dge, design, robust=TRUE)

In [None]:
# Plot the biological coefficient of variation from the estimated dispersions.
plotBCV(dge)

In [None]:
# Fit the quasi-likelihood GLM for the same design and preview the
# per-gene coefficients.
fit <- glmQLFit(dge, design, robust=TRUE)
head(fit$coefficients)

In [None]:
# Plot the quasi-likelihood dispersions of the fit.
plotQLDisp(fit)

In [None]:
# Summary of the prior degrees of freedom stored on the fit.
summary(fit$df.prior)

In [None]:
# Show the design matrix; its column names are used to build the contrast.
design

In [None]:
# Contrast blood minus brain, built from the design's group columns, so a
# positive logFC corresponds to the blood side of the contrast.
contr.matrix <- makeContrasts(bloodvsbrain = groupblood-groupbrain,levels = colnames(design))
print(contr.matrix)

In [None]:
# Quasi-likelihood F-test for the blood-vs-brain contrast.
res <- glmQLFTest(fit, contrast=contr.matrix)

In [None]:
# List the components of the test result object.
names(res)

In [None]:
# Full result table for all tested genes (n = Inf), as ordered by topTags().
topTagGene <- topTags(res,n=Inf)
print(head(topTagGene$table))

In [None]:
# Classify each gene with decideTestsDGE() defaults and tabulate the calls.
is.de <- decideTestsDGE(res)
summary(is.de)

In [None]:
# Mean-difference plot of the test result, highlighting the calls above.
plotMD(res, status=is.de)

#### filter by logFC

In [None]:
# Distribution of the log fold change across all tested genes.
summary(topTags(res, n = Inf)$table$logFC)

In [None]:
# Default: FDR < 0.05 with no logFC filter. Negative logFC first, then
# positive.
dim(subset(topTagGene$table, FDR < 0.05 & logFC < 0))
dim(subset(topTagGene$table, FDR < 0.05 & logFC > 0))

In [None]:
# Repeat with a fold-change cutoff; log2(cutoff) is the threshold applied on
# the logFC scale.
cutoff <- 4
print(log2(cutoff))
is.de <- decideTestsDGE(res, lfc = log2(cutoff))
print(summary(is.de))



# Same counts taken directly from the result table: above +log2(cutoff) first,
# then below -log2(cutoff).
print(dim(subset(topTagGene$table, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(topTagGene$table, FDR < 0.05 & logFC < -log2(cutoff))))

#### merge the per-tissue raw count means into the topTagGene$table results

In [None]:
# Start from the full test result table.
results <- topTagGene$table
print(head(results))

In [None]:
# Mean raw count per gene over the NABEC (brain) samples. all_of() marks the
# id vector as external to the data; passing a bare external vector to
# select() is deprecated.
nabec_exp <- exp_final_subset %>%
  dplyr::select(dplyr::all_of(nabec_haplos$id_tissue))
nabec_exp$nabec_raw_means <- rowMeans(nabec_exp)

# Attach the means to the results by gene id (row names), restore the row
# names from merge()'s "Row.names" key column, and re-sort by FDR.
results <- merge(
  x = results,
  y = nabec_exp[, "nabec_raw_means", drop = FALSE],
  by = "row.names"
)
rownames(results) <- results$Row.names
# A logical mask is used instead of -which(): -which() selects *zero* columns
# when nothing matches.
results <- results[, names(results) != "Row.names", drop = FALSE]
results <- results[order(results$FDR), ]
print(dim(results))
print(head(results))

In [None]:
# Mean raw count per gene over the AMP-PD (blood) samples.
amppd_exp <- exp_final_subset %>%
  dplyr::select(dplyr::all_of(amppd_haplos$id_tissue))
amppd_exp$amppd_raw_means <- rowMeans(amppd_exp)

# Attach the means to the results in the same way as above.
results <- merge(
  x = results,
  y = amppd_exp[, "amppd_raw_means", drop = FALSE],
  by = "row.names"
)
rownames(results) <- results$Row.names
results <- results[, names(results) != "Row.names", drop = FALSE]
results <- results[order(results$FDR), ]
print(dim(results))
print(head(results))

In [None]:
# Genes passing FDR < 0.05 with logFC above +log2(cutoff).
sig_up <- subset(results, FDR < 0.05 & logFC > log2(cutoff))
print(dim(sig_up))


print(head(sig_up))
# Genes passing FDR < 0.05 with logFC below -log2(cutoff).
sig_down <- subset(results, FDR < 0.05 & logFC < -log2(cutoff))
print(dim(sig_down))
print(head(sig_down))

In [None]:
# Save the results table (with gene ids as row names) under WRKDIR/expression.
# NOTE(review): the final cell of this notebook writes merged_results to this
# exact same path, so this file is overwritten when the notebook is run to the
# end -- confirm that is intended.
write.csv(results, paste0(WRKDIR,"/expression/brain_blood_haplo_featureCounts_edgeR_diff_exp_results.csv"), row.names=TRUE)

## 5) Re-run the DE analysis with glmTreat to test against a logFC threshold

In [None]:
# Re-test the same contrast on the same fit with glmTreat(), using
# log2(cutoff) as the logFC threshold (cutoff is set in an earlier cell).
treat <- glmTreat(fit,contrast = contr.matrix, lfc = log2(cutoff))

In [None]:
# Plot the quasi-likelihood dispersions carried on the glmTreat result.
plotQLDisp(treat)

In [None]:
# Full result table of the thresholded test for all genes (n = Inf).
topTagGene_treat <- topTags(treat,n=Inf)
head(topTagGene_treat$table)

In [None]:
# Classify genes from the thresholded test; this overwrites the earlier is.de.
is.de <- decideTestsDGE(treat)
summary(is.de)

In [None]:
# Mean-difference plot of the thresholded test, highlighting the calls above.
plotMD(treat, status=is.de)

#### merge the thresholded-test values with the results from the unthresholded test

In [None]:
# Result table of the thresholded (glmTreat) test.
results_treat <- topTagGene_treat$table
print(head(results_treat))

In [None]:
# Keep the three columns to carry over and prefix them so they do not clash
# with the same-named columns of the unthresholded results.
treat_merge_vals <- results_treat[, c("unshrunk.logFC", "PValue", "FDR")]
names(treat_merge_vals) <- paste0("threshold_test_", names(treat_merge_vals))
print(head(treat_merge_vals))

In [None]:
# Join the unthresholded results with the thresholded-test columns on gene id
# (row names), then sort by FDR and, within ties, by the thresholded FDR.
merged_results <- merge(x = results, y = treat_merge_vals, by = "row.names")
row_order <- with(merged_results, order(FDR, threshold_test_FDR))
merged_results <- merged_results[row_order, ]

# Restore gene ids as row names from merge()'s key column ...
rownames(merged_results) <- merged_results[["Row.names"]]

# ... and drop that key column, which merge() places first.
merged_results <- merged_results[, -1]


print(dim(results))
print(dim(treat_merge_vals))
print(dim(merged_results))
print(head(merged_results))

In [None]:
# Unthresholded test: FDR < 0.05 and logFC above +log2(cutoff) ...
std_up <- subset(merged_results, FDR < 0.05 & logFC > log2(cutoff))
print(dim(std_up))

print(head(std_up))
# ... and below -log2(cutoff).
std_down <- subset(merged_results, FDR < 0.05 & logFC < -log2(cutoff))
print(dim(std_down))
print(head(std_down))

In [None]:
# Thresholded test: threshold_test_FDR < 0.05 and logFC above +log2(cutoff) ...
treat_up <- subset(
  merged_results,
  threshold_test_FDR < 0.05 & logFC > log2(cutoff)
)
print(dim(treat_up))

print(head(treat_up))
# ... and below -log2(cutoff).
treat_down <- subset(
  merged_results,
  threshold_test_FDR < 0.05 & logFC < -log2(cutoff)
)
print(dim(treat_down))
print(head(treat_down))

In [None]:
# Save the merged table (unthresholded results plus the threshold_test_*
# columns, gene ids as row names). This uses the same path as the earlier
# write.csv(results, ...) call and therefore replaces that file.
write.csv(merged_results, paste0(WRKDIR,"/expression/brain_blood_haplo_featureCounts_edgeR_diff_exp_results.csv"), row.names=TRUE)