# Chromosome Y Gene Differential Expression in Haplogroups
- **Author(s)** - Frank Grenn
- **Date Started** - August 2021
- **Quick Description:** Differential expression of chromosome Y genes in AMPPD data, first between cases and controls and then between major Y haplogroups.

In [None]:
library("edgeR")
library(data.table)
library(dplyr)



In [None]:
# Root working directory; every input and output path below is built from it with paste0().
# NOTE(review): "$PATH" looks like a placeholder for a real directory — R does not
# expand shell variables inside string literals, so set this before running.
WRKDIR <- "$PATH/chrY"

## Get data

In [None]:
# Read the chrY featureCounts table into a plain data.frame and report its size.
counts_path <- paste0(WRKDIR, "/expression/amppd_chrY_featureCounts.csv")
amp_counts <- as.data.frame(fread(counts_path))
print(dim(amp_counts))

In [None]:
# Strip the version suffix (the first "." and everything after it) from each
# Geneid, then print three counts: all IDs, unique versioned IDs, and unique
# unversioned IDs. Equal numbers mean dropping the version creates no duplicates.
s <- gsub("\\..*", "", amp_counts$Geneid)
for (n_ids in list(length(amp_counts$Geneid),
                   length(unique(amp_counts$Geneid)),
                   length(unique(s)))) {
  print(n_ids)
}

In [None]:
# Replace the versioned gene IDs with their unversioned form.
amp_counts$Geneid <- gsub(pattern = "\\..*", replacement = "", x = amp_counts$Geneid)

In [None]:
# Preview the top-left 5 x 5 corner of the table.
amp_counts[seq_len(5), seq_len(5)]

In [None]:
# Use the gene IDs as row names, then keep every column except Geneid.
rownames(amp_counts) <- amp_counts$Geneid
# A logical mask is used instead of -which(...): when nothing matches,
# -which() yields -integer(0) and silently selects zero columns. drop = FALSE
# keeps the result a data.frame even if only one sample column remains.
counts <- amp_counts[, !(names(amp_counts) %in% "Geneid"), drop = FALSE]

In [None]:
# Dimensions of the count table after removing the Geneid column.
dim(counts)

In [None]:
# Continue under a separate name so `counts` stays untouched.
amppd_exp_final <- counts
# (disabled) tag the sample columns with a tissue suffix:
#colnames(amppd_exp_final) <- paste0(colnames(amppd_exp_final),"_blood")
print(dim(amppd_exp_final))
print(amppd_exp_final[seq_len(5), seq_len(5)])

In [None]:
# Load the per-sample metadata (fread returns a data.table) and add two
# tissue columns: an "<fid>_blood" identifier and a constant "blood" label.
meta_path <- paste0(WRKDIR, "/output_male_hemizygous_only_het_filter_run/chrY_meta.csv")
amppd_haplos <- fread(meta_path)
amppd_haplos$id_tissue <- paste(amppd_haplos$fid, "blood", sep = "_")
amppd_haplos$tissue <- "blood"
print(head(amppd_haplos))

In [None]:
# Keep samples that satisfy all three conditions: phenotype is not the -9
# code, genetic_carrier is 0, and inferred population is "EUROPE".
amppd_haplos <- amppd_haplos[
  (amppd_haplos$pheno != -9) &
    (amppd_haplos$genetic_carrier == 0) &
    (amppd_haplos$InfPop == "EUROPE"),
]
print(dim(amppd_haplos))

In [None]:
# Compare the expression table size with the filtered metadata size.
print(dim(amppd_exp_final))
print(dim(amppd_haplos))

In [None]:
# Restrict both tables to the samples they share: first the expression
# columns whose name is an fid in the metadata, then the metadata rows whose
# fid is one of the remaining expression columns.
amppd_exp_final <- amppd_exp_final[, colnames(amppd_exp_final) %in% amppd_haplos$fid]
amppd_haplos <- amppd_haplos[amppd_haplos$fid %in% colnames(amppd_exp_final), ]
# Label the metadata rows by sample ID.
rownames(amppd_haplos) <- amppd_haplos$fid
print(dim(amppd_exp_final))
print(dim(amppd_haplos))

In [None]:
# Drop genes with a total count of zero across the retained samples,
# printing the table size before and after.
print(dim(amppd_exp_final))
has_counts <- rowSums(amppd_exp_final) > 0
print(dim(amppd_exp_final[has_counts, ]))
amppd_exp_final <- amppd_exp_final[has_counts, ]

In [None]:
# Put the expression columns in the same order as the metadata rows (by fid).
final_exp <- dplyr::select(amppd_exp_final, amppd_haplos$fid)
print(dim(final_exp))
print(dim(amppd_haplos))

In [None]:
# Confirm that expression columns and metadata rows are in the same sample order.
# The comparison uses the fid column itself rather than rownames(amppd_haplos):
# amppd_haplos comes from fread(), so it is a data.table, and data.table does
# not maintain row names across operations.
samples_aligned <- identical(colnames(final_exp), as.character(amppd_haplos$fid))
print(samples_aligned)
# DGEList() pairs counts and sample covariates by position, so a mismatch would
# silently invalidate every result below — stop instead of carrying on.
stopifnot(samples_aligned)

### Case control differential expression with edgeR

In [None]:
# Bundle counts and sample metadata into a DGEList, grouping samples by phenotype.
dge <- DGEList(
  counts = final_exp,
  samples = amppd_haplos,
  group = amppd_haplos$pheno
)

In [None]:
# No-intercept design: one coefficient per phenotype group, adjusted for
# baseline age and major haplogroup.
design <- model.matrix(~ 0 + group + AGE_BASELINE + ltrack_haplo_major, data = dge$samples)
# Unadjusted alternative (disabled):
#design <- model.matrix(~0+group,data = dge$samples)

In [None]:
# filter out low expressed genes

# Earlier manual CPM filter, kept for reference (disabled):
#print(dim(dge))
#isexpr <- rowSums(cpm(final_exp) > 10) >= 2
#dge <- dge[isexpr,]
#print(dim(dge))

# Let edgeR pick the genes to keep for this design, report how many pass,
# and recompute library sizes from the retained genes.
keep <- filterByExpr(dge, design = design)
print(table(keep))
print(dim(dge$counts))
dge <- dge[keep, , keep.lib.sizes = FALSE]
print(dim(dge$counts))

In [None]:
# Number of samples in each phenotype group.
summary(dge$samples$group)

In [None]:
# Distribution of average log-CPM over the retained genes.
AveLogCPM <- aveLogCPM(dge)
hist(AveLogCPM)

In [None]:
# Compute normalisation factors and show the updated sample table.
dge <- calcNormFactors(dge)
print(head(dge$samples))

In [None]:
# MDS plot of the samples: one plotting symbol and colour per phenotype group
# (the group factor indexes both vectors).
pch <- c(0, 1)
colors <- c("red", "blue")
plotMDS(
  dge,
  col = colors[dge$samples$group],
  pch = pch[dge$samples$group]
)

legend("topleft", legend = levels(dge$samples$group), pch = pch, col = colors, ncol = 2)

In [None]:
# Mean-difference plot for the first sample.
plotMD(dge, column = 1)

In [None]:
# Estimate dispersions for the design (robust option on).
dge <- estimateDisp(dge, design = design, robust = TRUE)

In [None]:
# Biological coefficient of variation plot.
plotBCV(dge)

In [None]:
# Fit the quasi-likelihood GLM and preview the coefficients.
fit <- glmQLFit(dge, design = design, robust = TRUE)
head(fit$coefficients)

In [None]:
# Quasi-likelihood dispersion plot.
plotQLDisp(fit)

In [None]:
# Summary of the prior degrees of freedom from the fit.
summary(fit$df.prior)

In [None]:
# Show the design matrix; its column names are what the contrast below refers to.
design

In [None]:
# Contrast: coefficient of group 2 minus coefficient of group 1.
contr.matrix <- makeContrasts(
  casevsControl = group2 - group1,
  levels = colnames(design)
)
print(contr.matrix)

In [None]:
# Quasi-likelihood F-test for the contrast.
res <- glmQLFTest(fit, contrast = contr.matrix)

In [None]:
# Full ranked result table (all genes).
topTagGene <- topTags(res, n = Inf)
print(head(topTagGene$table))

In [None]:
# Up / down / not-significant calls with the default settings.
is.de <- decideTestsDGE(res)
summary(is.de)

In [None]:
# Mean-difference plot of the test, highlighting the calls above.
plotMD(res, status = is.de)

In [None]:
#default, no logFC filter
# Table dimensions for FDR < 0.05 genes with negative, then positive, logFC.
dim(subset(topTagGene$table, FDR < 0.05 & logFC < 0))
dim(subset(topTagGene$table, FDR < 0.05 & logFC > 0))

#try with a logFC cutoff
# `cutoff` is a fold change; the tests below use log2(cutoff) on the logFC scale.
cutoff <- 4
print(log2(cutoff))
is.de <- decideTestsDGE(res, lfc = log2(cutoff))
print(summary(is.de))



# Same counts taken directly from the table: up beyond the cutoff, then down.
print(dim(subset(topTagGene$table, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(topTagGene$table, FDR < 0.05 & logFC < -log2(cutoff))))


In [None]:
# Peek at the sample metadata used to split the samples below.
head(amppd_haplos)

In [None]:
# Start from the full case-vs-control test table.
results <- topTagGene$table
#print(head(results))

# Mean raw count per gene over the pheno == 2 samples.
case_exp <- dplyr::select(final_exp, amppd_haplos[amppd_haplos$pheno == 2, ]$fid)
case_exp$case_means <- rowMeans(case_exp)

# Attach the means by gene ID (row names), move the IDs from the Row.names
# column back into the row names, and re-sort by FDR.
results <- merge(x = results, y = dplyr::select(case_exp, "case_means"), by = "row.names")
rownames(results) <- results$Row.names
results <- results[, names(results) != "Row.names"]
results <- results[order(results$FDR), ]
#print(dim(results))
#print(head(results))

# Same again for the pheno == 1 samples.
control_exp <- dplyr::select(final_exp, amppd_haplos[amppd_haplos$pheno == 1, ]$fid)
control_exp$control_means <- rowMeans(control_exp)

results <- merge(x = results, y = dplyr::select(control_exp, "control_means"), by = "row.names")
rownames(results) <- results$Row.names
results <- results[, names(results) != "Row.names"]
results <- results[order(results$FDR), ]
print(dim(results))
print(head(results))


print("check with logFC cutoff:")
# FDR < 0.05 genes beyond the fold-change cutoff: up first, then down.
print(dim(subset(results, FDR < 0.05 & logFC > log2(cutoff))))


print(head(subset(results, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(results, FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(results, FDR < 0.05 & logFC < -log2(cutoff))))

#write.csv(results, paste0(WRKDIR,"/expression/amppd_case_control_diff_exp_edgeR_results.csv"), row.names=TRUE)

In [None]:


# (re)Run DE analysis with glmTreat to filter for logFC

# Test the same contrast relative to a log2(cutoff) fold-change threshold.
treat <- glmTreat(fit, contrast = contr.matrix, lfc = log2(cutoff))

topTagGene_treat <- topTags(treat, n = Inf)
head(topTagGene_treat$table)

is.de <- decideTestsDGE(treat)
summary(is.de)

# merge with the p values before thresholded testing

results_treat <- topTagGene_treat$table

# Keep only the thresholded-test statistics and prefix them so they do not
# collide with the same-named columns already in `results`.
treat_merge_vals <- dplyr::select(results_treat, "unshrunk.logFC", "PValue", "FDR")
names(treat_merge_vals) <- paste0("threshold_test_", names(treat_merge_vals))
print(head(treat_merge_vals))

# Join on gene ID (row names), sort by FDR then thresholded FDR, and move the
# IDs from the leading Row.names column back into the row names.
merged_results <- merge(x = results, y = treat_merge_vals, by = "row.names")
merged_results <- merged_results[order(merged_results$FDR, merged_results$threshold_test_FDR), ]
rownames(merged_results) <- merged_results$Row.names
merged_results <- merged_results[, -1]


print(dim(results))
print(dim(treat_merge_vals))
print(dim(merged_results))
print(head(merged_results))


# Unthresholded FDR < 0.05 and beyond the fold-change cutoff: up, then down.
print(dim(subset(merged_results, FDR < 0.05 & logFC > log2(cutoff))))
print(head(subset(merged_results, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(merged_results, FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(merged_results, FDR < 0.05 & logFC < -log2(cutoff))))



# Same selections using the thresholded-test FDR.
print(dim(subset(merged_results, threshold_test_FDR < 0.05 & logFC > log2(cutoff))))
print(head(subset(merged_results, threshold_test_FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(merged_results, threshold_test_FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(merged_results, threshold_test_FDR < 0.05 & logFC < -log2(cutoff))))


write.csv(merged_results, paste0(WRKDIR,"/expression/amppd_case_control_diff_exp_edgeR_results.csv"), row.names=TRUE)

### Major Haplogroup differential expression with edgeR

In [None]:
# Sample count per major haplogroup.
table(amppd_haplos$ltrack_haplo_major)
#only use E, G, I, J and R because present in >40 samples

In [None]:
# Rebuild the DGEList from the unfiltered counts, now grouping by major haplogroup.
dge <- DGEList(
  counts = final_exp,
  samples = amppd_haplos,
  group = amppd_haplos$ltrack_haplo_major
)

In [None]:
# No-intercept design: one coefficient per haplogroup, adjusted for phenotype.
design <- model.matrix(~ 0 + group + pheno, data = dge$samples)

In [None]:
# filter out low expressed genes

# Earlier manual CPM filter, kept for reference (disabled):
#print(dim(dge))
#isexpr <- rowSums(cpm(final_exp) > 10) >= 2
#dge <- dge[isexpr,]
#print(dim(dge))

# Let edgeR pick the genes to keep for the haplogroup design, report how many
# pass, and recompute library sizes from the retained genes.
keep <- filterByExpr(dge, design = design)
print(table(keep))
print(dim(dge$counts))
dge <- dge[keep, , keep.lib.sizes = FALSE]
print(dim(dge$counts))

In [None]:
# Distribution of average log-CPM over the retained genes.
AveLogCPM <- aveLogCPM(dge)
hist(AveLogCPM)

In [None]:
# Compute normalisation factors and show the updated sample table.
dge <- calcNormFactors(dge)
print(head(dge$samples))

In [None]:
# MDS plot of the samples by haplogroup. Twelve symbol/colour pairs are
# defined (symbols alternate 0, 1); the group factor indexes both vectors.
pch <- rep(c(0, 1), times = 6)
colors <- c(
  "red", "blue", "green", "yellow", "black", "grey",
  "violet", "cyan", "brown", "orange", "pink", "darkgreen"
)
plotMDS(
  dge,
  col = colors[dge$samples$group],
  pch = pch[dge$samples$group]
)

legend("topleft", legend = levels(dge$samples$group), pch = pch, col = colors, ncol = 2)

In [None]:
# Mean-difference plot for the first sample.
plotMD(dge, column = 1)

In [None]:
# Estimate dispersions for the haplogroup design (robust option on).
dge <- estimateDisp(dge, design = design, robust = TRUE)

In [None]:
# Biological coefficient of variation plot.
plotBCV(dge)

In [None]:
# Fit the quasi-likelihood GLM and preview the coefficients.
fit <- glmQLFit(dge, design = design, robust = TRUE)
head(fit$coefficients)

In [None]:
# Quasi-likelihood dispersion plot.
plotQLDisp(fit)

In [None]:
# Summary of the prior degrees of freedom from the fit.
summary(fit$df.prior)

## Major haplogroup specific DE analyses

### R

In [None]:
#groupA,groupB,groupC,groupE,groupG,groupH,groupI,groupJ,groupL,groupN,groupO,groupQ,groupR,groupT

#E, G, I, J and R
# Contrast: haplogroup R minus the unweighted mean of the 11 other group
# coefficients named in the expression.
contr.matrix <- makeContrasts(
  RvsEverything = groupR -
    (groupA + groupC + groupE + groupG + groupH + groupI +
       groupJ + groupL + groupN + groupQ + groupT) / 11,
  levels = colnames(design)
)
print(contr.matrix)

# Quasi-likelihood F-test, full ranked table, default calls, MD plot.
res <- glmQLFTest(fit, contrast = contr.matrix)

topTagGene <- topTags(res, n = Inf)
print(head(topTagGene$table))

is.de <- decideTestsDGE(res)
summary(is.de)

plotMD(res, status = is.de)

In [None]:
#default, no logFC filter
# Table dimensions for FDR < 0.05 genes with negative, then positive, logFC.
dim(subset(topTagGene$table, FDR < 0.05 & logFC < 0))
dim(subset(topTagGene$table, FDR < 0.05 & logFC > 0))

#try with a logFC cutoff
# `cutoff` is a fold change; the tests below use log2(cutoff) on the logFC scale.
cutoff <- 4
print(log2(cutoff))
is.de <- decideTestsDGE(res, lfc = log2(cutoff))
print(summary(is.de))



# Same counts taken directly from the table: up beyond the cutoff, then down.
print(dim(subset(topTagGene$table, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(topTagGene$table, FDR < 0.05 & logFC < -log2(cutoff))))



In [None]:
# Start from the full R-vs-rest test table.
results <- topTagGene$table
#print(head(results))

# Mean raw count per gene over the haplogroup R samples.
haplo_specific_exp <- dplyr::select(final_exp, amppd_haplos[amppd_haplos$ltrack_haplo_major == "R", ]$fid)
haplo_specific_exp$haplo_R_raw_means <- rowMeans(haplo_specific_exp)

# Attach the means by gene ID (row names), move the IDs from the Row.names
# column back into the row names, and re-sort by FDR.
results <- merge(x = results, y = dplyr::select(haplo_specific_exp, "haplo_R_raw_means"), by = "row.names")
rownames(results) <- results$Row.names
results <- results[, names(results) != "Row.names"]
results <- results[order(results$FDR), ]
#print(dim(results))
#print(head(results))

# Same again for every sample outside haplogroup R.
haplo_specific_exp <- dplyr::select(final_exp, amppd_haplos[amppd_haplos$ltrack_haplo_major != "R", ]$fid)
haplo_specific_exp$haplo_not_R_raw_means <- rowMeans(haplo_specific_exp)

results <- merge(x = results, y = dplyr::select(haplo_specific_exp, "haplo_not_R_raw_means"), by = "row.names")
rownames(results) <- results$Row.names
results <- results[, names(results) != "Row.names"]
results <- results[order(results$FDR), ]
print(dim(results))
print(head(results))


print("check with logFC cutoff:")
# FDR < 0.05 genes beyond the fold-change cutoff: up first, then down.
print(dim(subset(results, FDR < 0.05 & logFC > log2(cutoff))))


print(head(subset(results, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(results, FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(results, FDR < 0.05 & logFC < -log2(cutoff))))

#write.csv(results, paste0(WRKDIR,"/expression/amppd_haplogroup_R_diff_exp_edgeR_results.csv"), row.names=TRUE)

In [None]:
# (re)Run DE analysis with glmTreat to filter for logFC

# Test the same contrast relative to a log2(cutoff) fold-change threshold.
treat <- glmTreat(fit, contrast = contr.matrix, lfc = log2(cutoff))

topTagGene_treat <- topTags(treat, n = Inf)
head(topTagGene_treat$table)

is.de <- decideTestsDGE(treat)
summary(is.de)

# merge with the p values before thresholded testing

results_treat <- topTagGene_treat$table

# Keep only the thresholded-test statistics and prefix them so they do not
# collide with the same-named columns already in `results`.
treat_merge_vals <- dplyr::select(results_treat, "unshrunk.logFC", "PValue", "FDR")
names(treat_merge_vals) <- paste0("threshold_test_", names(treat_merge_vals))
print(head(treat_merge_vals))

# Join on gene ID (row names), sort by FDR then thresholded FDR, and move the
# IDs from the leading Row.names column back into the row names.
merged_results <- merge(x = results, y = treat_merge_vals, by = "row.names")
merged_results <- merged_results[order(merged_results$FDR, merged_results$threshold_test_FDR), ]
rownames(merged_results) <- merged_results$Row.names
merged_results <- merged_results[, -1]


print(dim(results))
print(dim(treat_merge_vals))
print(dim(merged_results))
print(head(merged_results))


# Unthresholded FDR < 0.05 and beyond the fold-change cutoff: up, then down.
print(dim(subset(merged_results, FDR < 0.05 & logFC > log2(cutoff))))
print(head(subset(merged_results, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(merged_results, FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(merged_results, FDR < 0.05 & logFC < -log2(cutoff))))



# Same selections using the thresholded-test FDR.
print(dim(subset(merged_results, threshold_test_FDR < 0.05 & logFC > log2(cutoff))))
print(head(subset(merged_results, threshold_test_FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(merged_results, threshold_test_FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(merged_results, threshold_test_FDR < 0.05 & logFC < -log2(cutoff))))


write.csv(merged_results, paste0(WRKDIR,"/expression/amppd_haplogroup_R_diff_exp_edgeR_results.csv"), row.names=TRUE)

### E

In [None]:
#groupA,groupB,groupC,groupE,groupG,groupH,groupI,groupJ,groupL,groupN,groupO,groupQ,groupR,groupT

#E, G, I, J and R
# Contrast: haplogroup E minus the unweighted mean of the 11 other group
# coefficients named in the expression.
contr.matrix <- makeContrasts(
  EvsEverything = groupE -
    (groupA + groupC + groupG + groupH + groupI + groupJ +
       groupL + groupN + groupQ + groupR + groupT) / 11,
  levels = colnames(design)
)
print(contr.matrix)

# Quasi-likelihood F-test, full ranked table, default calls, MD plot.
res <- glmQLFTest(fit, contrast = contr.matrix)

topTagGene <- topTags(res, n = Inf)
print(head(topTagGene$table))

is.de <- decideTestsDGE(res)
summary(is.de)

plotMD(res, status = is.de)

In [None]:
#default, no logFC filter
# Table dimensions for FDR < 0.05 genes with negative, then positive, logFC.
dim(subset(topTagGene$table, FDR < 0.05 & logFC < 0))
dim(subset(topTagGene$table, FDR < 0.05 & logFC > 0))

#try with a logFC cutoff
# `cutoff` is a fold change; the tests below use log2(cutoff) on the logFC scale.
cutoff <- 4
print(log2(cutoff))
is.de <- decideTestsDGE(res, lfc = log2(cutoff))
print(summary(is.de))



# Same counts taken directly from the table: up beyond the cutoff, then down.
print(dim(subset(topTagGene$table, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(topTagGene$table, FDR < 0.05 & logFC < -log2(cutoff))))

In [None]:
# Start from the full E-vs-rest test table.
results <- topTagGene$table
#print(head(results))

# Mean raw count per gene over the haplogroup E samples.
haplo_specific_exp <- dplyr::select(final_exp, amppd_haplos[amppd_haplos$ltrack_haplo_major == "E", ]$fid)
haplo_specific_exp$haplo_E_raw_means <- rowMeans(haplo_specific_exp)

# Attach the means by gene ID (row names), move the IDs from the Row.names
# column back into the row names, and re-sort by FDR.
results <- merge(x = results, y = dplyr::select(haplo_specific_exp, "haplo_E_raw_means"), by = "row.names")
rownames(results) <- results$Row.names
results <- results[, names(results) != "Row.names"]
results <- results[order(results$FDR), ]
#print(dim(results))
#print(head(results))

# Same again for every sample outside haplogroup E.
haplo_specific_exp <- dplyr::select(final_exp, amppd_haplos[amppd_haplos$ltrack_haplo_major != "E", ]$fid)
haplo_specific_exp$haplo_not_E_raw_means <- rowMeans(haplo_specific_exp)

results <- merge(x = results, y = dplyr::select(haplo_specific_exp, "haplo_not_E_raw_means"), by = "row.names")
rownames(results) <- results$Row.names
results <- results[, names(results) != "Row.names"]
results <- results[order(results$FDR), ]
print(dim(results))
print(head(results))


print("check with logFC cutoff:")
# FDR < 0.05 genes beyond the fold-change cutoff: up first, then down.
print(dim(subset(results, FDR < 0.05 & logFC > log2(cutoff))))


print(head(subset(results, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(results, FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(results, FDR < 0.05 & logFC < -log2(cutoff))))


In [None]:
# (re)Run DE analysis with glmTreat to filter for logFC

# Test the same contrast relative to a log2(cutoff) fold-change threshold.
treat <- glmTreat(fit, contrast = contr.matrix, lfc = log2(cutoff))

topTagGene_treat <- topTags(treat, n = Inf)
head(topTagGene_treat$table)

is.de <- decideTestsDGE(treat)
summary(is.de)

# merge with the p values before thresholded testing

results_treat <- topTagGene_treat$table

# Keep only the thresholded-test statistics and prefix them so they do not
# collide with the same-named columns already in `results`.
treat_merge_vals <- dplyr::select(results_treat, "unshrunk.logFC", "PValue", "FDR")
names(treat_merge_vals) <- paste0("threshold_test_", names(treat_merge_vals))
print(head(treat_merge_vals))

# Join on gene ID (row names), sort by FDR then thresholded FDR, and move the
# IDs from the leading Row.names column back into the row names.
merged_results <- merge(x = results, y = treat_merge_vals, by = "row.names")
merged_results <- merged_results[order(merged_results$FDR, merged_results$threshold_test_FDR), ]
rownames(merged_results) <- merged_results$Row.names
merged_results <- merged_results[, -1]


print(dim(results))
print(dim(treat_merge_vals))
print(dim(merged_results))
print(head(merged_results))


# Unthresholded FDR < 0.05 and beyond the fold-change cutoff: up, then down.
print(dim(subset(merged_results, FDR < 0.05 & logFC > log2(cutoff))))
print(head(subset(merged_results, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(merged_results, FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(merged_results, FDR < 0.05 & logFC < -log2(cutoff))))



# Same selections using the thresholded-test FDR.
print(dim(subset(merged_results, threshold_test_FDR < 0.05 & logFC > log2(cutoff))))
print(head(subset(merged_results, threshold_test_FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(merged_results, threshold_test_FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(merged_results, threshold_test_FDR < 0.05 & logFC < -log2(cutoff))))


write.csv(merged_results, paste0(WRKDIR,"/expression/amppd_haplogroup_E_diff_exp_edgeR_results.csv"), row.names=TRUE)

### G

In [None]:
#groupA,groupB,groupC,groupE,groupG,groupH,groupI,groupJ,groupL,groupN,groupO,groupQ,groupR,groupT

#E, G, I, J and R
# Contrast: haplogroup G minus the unweighted mean of the 11 other group
# coefficients named in the expression.
contr.matrix <- makeContrasts(
  GvsEverything = groupG -
    (groupA + groupC + groupE + groupH + groupI + groupJ +
       groupL + groupN + groupQ + groupR + groupT) / 11,
  levels = colnames(design)
)
print(contr.matrix)

# Quasi-likelihood F-test, full ranked table, default calls, MD plot.
res <- glmQLFTest(fit, contrast = contr.matrix)

topTagGene <- topTags(res, n = Inf)
print(head(topTagGene$table))

is.de <- decideTestsDGE(res)
summary(is.de)

plotMD(res, status = is.de)

In [None]:
#default, no logFC filter
# Table dimensions for FDR < 0.05 genes with negative, then positive, logFC.
dim(subset(topTagGene$table, FDR < 0.05 & logFC < 0))
dim(subset(topTagGene$table, FDR < 0.05 & logFC > 0))

#try with a logFC cutoff
# `cutoff` is a fold change; the tests below use log2(cutoff) on the logFC scale.
cutoff <- 4
print(log2(cutoff))
is.de <- decideTestsDGE(res, lfc = log2(cutoff))
print(summary(is.de))



# Same counts taken directly from the table: up beyond the cutoff, then down.
print(dim(subset(topTagGene$table, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(topTagGene$table, FDR < 0.05 & logFC < -log2(cutoff))))

In [None]:
# Start from the full G-vs-rest test table.
results <- topTagGene$table
#print(head(results))

# Mean raw count per gene over the haplogroup G samples.
haplo_specific_exp <- dplyr::select(final_exp, amppd_haplos[amppd_haplos$ltrack_haplo_major == "G", ]$fid)
haplo_specific_exp$haplo_G_raw_means <- rowMeans(haplo_specific_exp)

# Attach the means by gene ID (row names), move the IDs from the Row.names
# column back into the row names, and re-sort by FDR.
results <- merge(x = results, y = dplyr::select(haplo_specific_exp, "haplo_G_raw_means"), by = "row.names")
rownames(results) <- results$Row.names
results <- results[, names(results) != "Row.names"]
results <- results[order(results$FDR), ]
#print(dim(results))
#print(head(results))

# Same again for every sample outside haplogroup G.
haplo_specific_exp <- dplyr::select(final_exp, amppd_haplos[amppd_haplos$ltrack_haplo_major != "G", ]$fid)
haplo_specific_exp$haplo_not_G_raw_means <- rowMeans(haplo_specific_exp)

results <- merge(x = results, y = dplyr::select(haplo_specific_exp, "haplo_not_G_raw_means"), by = "row.names")
rownames(results) <- results$Row.names
results <- results[, names(results) != "Row.names"]
results <- results[order(results$FDR), ]
print(dim(results))
print(head(results))


print("check with logFC cutoff:")
# FDR < 0.05 genes beyond the fold-change cutoff: up first, then down.
print(dim(subset(results, FDR < 0.05 & logFC > log2(cutoff))))


print(head(subset(results, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(results, FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(results, FDR < 0.05 & logFC < -log2(cutoff))))


In [None]:
# (re)Run DE analysis with glmTreat to filter for logFC

# Test the same contrast relative to a log2(cutoff) fold-change threshold.
treat <- glmTreat(fit, contrast = contr.matrix, lfc = log2(cutoff))

topTagGene_treat <- topTags(treat, n = Inf)
head(topTagGene_treat$table)

is.de <- decideTestsDGE(treat)
summary(is.de)

# merge with the p values before thresholded testing

results_treat <- topTagGene_treat$table

# Keep only the thresholded-test statistics and prefix them so they do not
# collide with the same-named columns already in `results`.
treat_merge_vals <- dplyr::select(results_treat, "unshrunk.logFC", "PValue", "FDR")
names(treat_merge_vals) <- paste0("threshold_test_", names(treat_merge_vals))
print(head(treat_merge_vals))

# Join on gene ID (row names), sort by FDR then thresholded FDR, and move the
# IDs from the leading Row.names column back into the row names.
merged_results <- merge(x = results, y = treat_merge_vals, by = "row.names")
merged_results <- merged_results[order(merged_results$FDR, merged_results$threshold_test_FDR), ]
rownames(merged_results) <- merged_results$Row.names
merged_results <- merged_results[, -1]


print(dim(results))
print(dim(treat_merge_vals))
print(dim(merged_results))
print(head(merged_results))


# Unthresholded FDR < 0.05 and beyond the fold-change cutoff: up, then down.
print(dim(subset(merged_results, FDR < 0.05 & logFC > log2(cutoff))))
print(head(subset(merged_results, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(merged_results, FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(merged_results, FDR < 0.05 & logFC < -log2(cutoff))))



# Same selections using the thresholded-test FDR.
print(dim(subset(merged_results, threshold_test_FDR < 0.05 & logFC > log2(cutoff))))
print(head(subset(merged_results, threshold_test_FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(merged_results, threshold_test_FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(merged_results, threshold_test_FDR < 0.05 & logFC < -log2(cutoff))))


write.csv(merged_results, paste0(WRKDIR,"/expression/amppd_haplogroup_G_diff_exp_edgeR_results.csv"), row.names=TRUE)

### I

In [None]:
#groupA,groupB,groupC,groupE,groupG,groupH,groupI,groupJ,groupL,groupN,groupO,groupQ,groupR,groupT

#E, G, I, J and R
# Contrast: haplogroup I minus the unweighted mean of the 11 other group
# coefficients named in the expression.
contr.matrix <- makeContrasts(
  IvsEverything = groupI -
    (groupA + groupC + groupE + groupG + groupH + groupJ +
       groupL + groupN + groupQ + groupR + groupT) / 11,
  levels = colnames(design)
)
print(contr.matrix)

# Quasi-likelihood F-test, full ranked table, default calls, MD plot.
res <- glmQLFTest(fit, contrast = contr.matrix)

topTagGene <- topTags(res, n = Inf)
print(head(topTagGene$table))

is.de <- decideTestsDGE(res)
summary(is.de)

plotMD(res, status = is.de)

In [None]:
#default, no logFC filter
# Table dimensions for FDR < 0.05 genes with negative, then positive, logFC.
dim(subset(topTagGene$table, FDR < 0.05 & logFC < 0))
dim(subset(topTagGene$table, FDR < 0.05 & logFC > 0))

#try with a logFC cutoff
# `cutoff` is a fold change; the tests below use log2(cutoff) on the logFC scale.
cutoff <- 4
print(log2(cutoff))
is.de <- decideTestsDGE(res, lfc = log2(cutoff))
print(summary(is.de))



# Same counts taken directly from the table: up beyond the cutoff, then down.
print(dim(subset(topTagGene$table, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(topTagGene$table, FDR < 0.05 & logFC < -log2(cutoff))))

In [None]:
# Start from the full I-vs-rest test table.
results <- topTagGene$table
#print(head(results))

# Mean raw count per gene over the haplogroup I samples.
haplo_specific_exp <- dplyr::select(final_exp, amppd_haplos[amppd_haplos$ltrack_haplo_major == "I", ]$fid)
haplo_specific_exp$haplo_I_raw_means <- rowMeans(haplo_specific_exp)

# Attach the means by gene ID (row names), move the IDs from the Row.names
# column back into the row names, and re-sort by FDR.
results <- merge(x = results, y = dplyr::select(haplo_specific_exp, "haplo_I_raw_means"), by = "row.names")
rownames(results) <- results$Row.names
results <- results[, names(results) != "Row.names"]
results <- results[order(results$FDR), ]
#print(dim(results))
#print(head(results))

# Same again for every sample outside haplogroup I.
haplo_specific_exp <- dplyr::select(final_exp, amppd_haplos[amppd_haplos$ltrack_haplo_major != "I", ]$fid)
haplo_specific_exp$haplo_not_I_raw_means <- rowMeans(haplo_specific_exp)

results <- merge(x = results, y = dplyr::select(haplo_specific_exp, "haplo_not_I_raw_means"), by = "row.names")
rownames(results) <- results$Row.names
results <- results[, names(results) != "Row.names"]
results <- results[order(results$FDR), ]
print(dim(results))
print(head(results))


print("check with logFC cutoff:")
# FDR < 0.05 genes beyond the fold-change cutoff: up first, then down.
print(dim(subset(results, FDR < 0.05 & logFC > log2(cutoff))))


print(head(subset(results, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(results, FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(results, FDR < 0.05 & logFC < -log2(cutoff))))


In [None]:
# (re)Run DE analysis with glmTreat to filter for logFC

# Test the same contrast relative to a log2(cutoff) fold-change threshold.
treat <- glmTreat(fit, contrast = contr.matrix, lfc = log2(cutoff))

topTagGene_treat <- topTags(treat, n = Inf)
head(topTagGene_treat$table)

is.de <- decideTestsDGE(treat)
summary(is.de)

# merge with the p values before thresholded testing

results_treat <- topTagGene_treat$table

# Keep only the thresholded-test statistics and prefix them so they do not
# collide with the same-named columns already in `results`.
treat_merge_vals <- dplyr::select(results_treat, "unshrunk.logFC", "PValue", "FDR")
names(treat_merge_vals) <- paste0("threshold_test_", names(treat_merge_vals))
print(head(treat_merge_vals))

# Join on gene ID (row names), sort by FDR then thresholded FDR, and move the
# IDs from the leading Row.names column back into the row names.
merged_results <- merge(x = results, y = treat_merge_vals, by = "row.names")
merged_results <- merged_results[order(merged_results$FDR, merged_results$threshold_test_FDR), ]
rownames(merged_results) <- merged_results$Row.names
merged_results <- merged_results[, -1]


print(dim(results))
print(dim(treat_merge_vals))
print(dim(merged_results))
print(head(merged_results))


# Unthresholded FDR < 0.05 and beyond the fold-change cutoff: up, then down.
print(dim(subset(merged_results, FDR < 0.05 & logFC > log2(cutoff))))
print(head(subset(merged_results, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(merged_results, FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(merged_results, FDR < 0.05 & logFC < -log2(cutoff))))



# Same selections using the thresholded-test FDR.
print(dim(subset(merged_results, threshold_test_FDR < 0.05 & logFC > log2(cutoff))))
print(head(subset(merged_results, threshold_test_FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(merged_results, threshold_test_FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(merged_results, threshold_test_FDR < 0.05 & logFC < -log2(cutoff))))


write.csv(merged_results, paste0(WRKDIR,"/expression/amppd_haplogroup_I_diff_exp_edgeR_results.csv"), row.names=TRUE)

### J

In [None]:
#groupA,groupB,groupC,groupE,groupG,groupH,groupI,groupJ,groupL,groupN,groupO,groupQ,groupR,groupT

#E, G, I, J and R
# Contrast: haplogroup J minus the unweighted mean of the 11 other group
# coefficients named in the expression.
contr.matrix <- makeContrasts(
  JvsEverything = groupJ -
    (groupA + groupC + groupE + groupG + groupH + groupI +
       groupL + groupN + groupQ + groupR + groupT) / 11,
  levels = colnames(design)
)
print(contr.matrix)

# Quasi-likelihood F-test, full ranked table, default calls, MD plot.
res <- glmQLFTest(fit, contrast = contr.matrix)

topTagGene <- topTags(res, n = Inf)
print(head(topTagGene$table))

is.de <- decideTestsDGE(res)
summary(is.de)

plotMD(res, status = is.de)

In [None]:
#default, no logFC filter
# Table dimensions for FDR < 0.05 genes with negative, then positive, logFC.
dim(subset(topTagGene$table, FDR < 0.05 & logFC < 0))
dim(subset(topTagGene$table, FDR < 0.05 & logFC > 0))

#try with a logFC cutoff
# `cutoff` is a fold change; the tests below use log2(cutoff) on the logFC scale.
cutoff <- 4
print(log2(cutoff))
is.de <- decideTestsDGE(res, lfc = log2(cutoff))
print(summary(is.de))



# Same counts taken directly from the table: up beyond the cutoff, then down.
print(dim(subset(topTagGene$table, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(topTagGene$table, FDR < 0.05 & logFC < -log2(cutoff))))

In [None]:
# Build the J-vs-rest result table: the test statistics plus the mean raw
# count per gene inside and outside haplogroup J.
results <- topTagGene$table
#print(head(results))

# Mean raw count per gene over the haplogroup J samples.
haplo_specific_exp <- final_exp %>% dplyr::select(amppd_haplos[amppd_haplos$ltrack_haplo_major =='J',]$fid)
haplo_specific_exp$haplo_J_raw_means <- rowMeans(haplo_specific_exp)

# Attach the means by gene ID (row names), move the IDs from the Row.names
# column back into the row names, and re-sort by FDR.
results <- merge(x = results, y = haplo_specific_exp %>% select("haplo_J_raw_means"),by.x = 0, by.y = 0)
rownames(results) <- results$'Row.names'
results <-results[,-which(names(results) %in% c("Row.names"))]
results <- results[order(results$"FDR"),]
#print(dim(results))
#print(head(results))

# Same again for every sample outside haplogroup J.
haplo_specific_exp <- final_exp %>% dplyr::select(amppd_haplos[amppd_haplos$ltrack_haplo_major !='J',]$fid)
haplo_specific_exp$haplo_not_J_raw_means <- rowMeans(haplo_specific_exp)

results <- merge(x = results, y = haplo_specific_exp %>% select("haplo_not_J_raw_means"),by.x = 0, by.y = 0)
rownames(results) <- results$'Row.names'
results <-results[,-which(names(results) %in% c("Row.names"))]
results <- results[order(results$"FDR"),]
print(dim(results))
print(head(results))


print("check with logFC cutoff:")
# FDR < 0.05 genes beyond the fold-change cutoff: up first, then down.
print(dim(results[which(results$FDR<0.05 & (results$logFC) > log2(cutoff)),]))


print(head(results[which(results$FDR<0.05 & (results$logFC) > log2(cutoff)),]))
print(dim(results[which(results$FDR<0.05 & (results$logFC) < -log2(cutoff)),]))
print(head(results[which(results$FDR<0.05 & (results$logFC) < -log2(cutoff)),]))

# Disabled: the glmTreat cell that follows writes merged_results to this same
# path, so this intermediate file was always overwritten. The haplogroup R and
# case/control sections keep the equivalent line commented out as well.
#write.csv(results, paste0(WRKDIR,"/expression/amppd_haplogroup_J_diff_exp_edgeR_results.csv"), row.names=TRUE)

In [None]:
# (re)Run DE analysis with glmTreat to filter for logFC

# Test the same contrast relative to a log2(cutoff) fold-change threshold.
treat <- glmTreat(fit, contrast = contr.matrix, lfc = log2(cutoff))

topTagGene_treat <- topTags(treat, n = Inf)
head(topTagGene_treat$table)

is.de <- decideTestsDGE(treat)
summary(is.de)

# merge with the p values before thresholded testing

results_treat <- topTagGene_treat$table

# Keep only the thresholded-test statistics and prefix them so they do not
# collide with the same-named columns already in `results`.
treat_merge_vals <- dplyr::select(results_treat, "unshrunk.logFC", "PValue", "FDR")
names(treat_merge_vals) <- paste0("threshold_test_", names(treat_merge_vals))
print(head(treat_merge_vals))

# Join on gene ID (row names), sort by FDR then thresholded FDR, and move the
# IDs from the leading Row.names column back into the row names.
merged_results <- merge(x = results, y = treat_merge_vals, by = "row.names")
merged_results <- merged_results[order(merged_results$FDR, merged_results$threshold_test_FDR), ]
rownames(merged_results) <- merged_results$Row.names
merged_results <- merged_results[, -1]


print(dim(results))
print(dim(treat_merge_vals))
print(dim(merged_results))
print(head(merged_results))


# Unthresholded FDR < 0.05 and beyond the fold-change cutoff: up, then down.
print(dim(subset(merged_results, FDR < 0.05 & logFC > log2(cutoff))))
print(head(subset(merged_results, FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(merged_results, FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(merged_results, FDR < 0.05 & logFC < -log2(cutoff))))



# Same selections using the thresholded-test FDR.
print(dim(subset(merged_results, threshold_test_FDR < 0.05 & logFC > log2(cutoff))))
print(head(subset(merged_results, threshold_test_FDR < 0.05 & logFC > log2(cutoff))))
print(dim(subset(merged_results, threshold_test_FDR < 0.05 & logFC < -log2(cutoff))))
print(head(subset(merged_results, threshold_test_FDR < 0.05 & logFC < -log2(cutoff))))


write.csv(merged_results, paste0(WRKDIR,"/expression/amppd_haplogroup_J_diff_exp_edgeR_results.csv"), row.names=TRUE)