# Chromosome Y Gene Differential Expression between NABEC Y-PAR Masked and Unmasked Data
- **Author(s)** - Frank Grenn
- **Date Started** - August 2021
- **Quick Description:** Differential gene expression between NABEC data quantified against a reference genome with the Y-PARs masked and against a reference genome with the Y-PARs unmasked (included).

In [None]:
library(data.table)
library(dplyr)
library(edgeR)


In [None]:
# Base working directory; used at the end of the notebook to build the path of
# the results CSV.
# NOTE(review): "$PATH" looks like a redacted placeholder (R does not expand it
# inside a string literal) — replace with the real location before running.
WRKDIR <- "$PATH/chrY"

## Get sample list

In [None]:
# List the entries of the PAR-masked quantification directory and keep only
# those whose name matches "fctx" (presumably frontal cortex — confirm).
samples <- grep(
  "fctx",
  list.files("$PATH/quants_PAR_masked"),
  value = TRUE
)

## Get PAR masked data

In [None]:
# Read the PAR-masked count matrix into a plain data.frame and index its rows
# by the Geneid column so that later merges can join on row names.
counts_PAR_mask <- fread("$PATH/quants_PAR_masked_matrix.csv") %>%
  as.data.frame()
rownames(counts_PAR_mask) <- counts_PAR_mask$Geneid

# Quick look at the size and the top-left corner of the matrix.
counts_PAR_mask %>% dim() %>% print()
print(counts_PAR_mask[seq_len(5), seq_len(5)])



## Get PAR unmasked data

In [None]:
# Read the default-reference (PAR unmasked) count matrix into a plain
# data.frame and index its rows by the Geneid column.
counts_default_ref <- fread("$PATH/quants_default_ref_matrix.csv") %>%
  as.data.frame()
rownames(counts_default_ref) <- counts_default_ref$Geneid

# Quick look at the size and the top-left corner of the matrix.
counts_default_ref %>% dim() %>% print()
print(counts_default_ref[seq_len(5), seq_len(5)])


## Get sample data and combine masked and unmasked datasets

In [None]:

# Load the sample information (phenotype) table. Its new_id column carries the
# sample identifiers that are matched against the count matrices below.
covs <- fread("$PATH/sample_info_new_id.txt")
covs %>% dim() %>% print()
covs %>% head() %>% print()

In [None]:
# Compare how many samples the phenotype table and the quantification
# directory contain.
print(length(covs$new_id))
print(length(samples))

# Phenotype rows whose sample was actually quantified.
has_quant <- covs$new_id %in% samples
dim(covs[which(has_quant)])

# Gender labels present, and the table size for each of the two used below.
print(unique(covs$Gender))

print(dim(covs[covs$Gender == "male"]))
print(dim(covs[covs$Gender == "female"]))

# Keep only quantified male samples for the chrY comparison.
male_covs <- covs[which(has_quant & covs$Gender == "male"), ]
print(dim(male_covs))
print(head(male_covs))


head(male_covs)

In [None]:
# Restrict both count matrices to the columns belonging to male samples.
# `drop = FALSE` keeps the result a data.frame even if only a single male
# column matches; without it the subset collapses to a bare vector and the
# colnames()/merge() steps that follow would fail.

# Male sample columns present in the PAR-masked matrix.
male_cols <- colnames(counts_PAR_mask)[colnames(counts_PAR_mask) %in% male_covs$new_id]
counts_PAR_mask_male <- counts_PAR_mask[, male_cols, drop = FALSE]
print(dim(counts_PAR_mask_male))

# Male sample columns present in the default-reference matrix.
male_cols <- colnames(counts_default_ref)[colnames(counts_default_ref) %in% male_covs$new_id]
counts_default_male <- counts_default_ref[, male_cols, drop = FALSE]
print(dim(counts_default_male))


In [None]:

# Tag every sample column with the reference it was quantified against so the
# two matrices can be merged without column-name clashes. The head of each
# matrix is printed before and after renaming.
print(head(counts_PAR_mask_male))
colnames(counts_PAR_mask_male) <- paste(
  colnames(counts_PAR_mask_male), "PAR_mask",
  sep = "_"
)
print(head(counts_PAR_mask_male))

print(head(counts_default_male))
colnames(counts_default_male) <- paste(
  colnames(counts_default_male), "default",
  sep = "_"
)
print(head(counts_default_male))


In [None]:
# Outer-join the two male count matrices on their row names (gene IDs).
# merge() returns the join key as a leading "Row.names" column, which is
# restored as the row names and then removed from the data columns.
final_exp <- merge(
  counts_PAR_mask_male,
  counts_default_male,
  by = "row.names",
  all = TRUE
)
rownames(final_exp) <- final_exp$Row.names
print(dim(final_exp))
print(head(final_exp))
final_exp <- final_exp[, 2:ncol(final_exp)]

In [None]:

# Duplicate the male sample metadata once per reference. The sample IDs get the
# same suffixes as the count columns, and `masked` records which reference the
# row describes.
mask_covs <- male_covs
mask_covs$masked <- "PAR_masked"
mask_covs$new_id <- paste0(mask_covs$new_id, "_PAR_mask")

def_covs <- male_covs
def_covs$masked <- "default"
def_covs$new_id <- paste0(def_covs$new_id, "_default")

# Stack both copies: PAR-masked rows first, default rows second.
final_covs <- rbind(mask_covs, def_covs)
rownames(final_covs) <- final_covs$new_id
print(dim(final_covs))
print(head(final_covs))
print(tail(final_covs))

# Drop genes whose counts sum to zero across every column.
has_counts <- rowSums(final_exp) > 0
print("how many after removing all zero rows")
print(dim(final_exp[has_counts, ]))
final_exp <- final_exp[has_counts, ]

In [None]:
# Put the count columns in the same order as the rows of the sample metadata.
final_exp <- dplyr::select(final_exp, final_covs$new_id)
print(dim(final_exp))
print(dim(final_covs))

# Sanity check: the count-matrix column names must equal the metadata row
# names, element by element and in the same order.
print("check if data cols are same as meta data rows")
cols_match_rows <- colnames(final_exp) == rownames(final_covs)
print(all(cols_match_rows))

## Differential expression analysis with edgeR

In [None]:
# Bundle counts and sample metadata into an edgeR DGEList; the grouping factor
# is the reference used for quantification ("PAR_masked" vs "default").
dge <- DGEList(
  counts = final_exp,
  samples = final_covs,
  group = final_covs$masked
)


In [None]:
# Build a no-intercept design matrix with one column per level of `group`
# (referred to later as groupPAR_masked and groupdefault).
# NOTE(review): every sample appears twice in the data (once per reference),
# but the design has no per-sample term, so the two quantifications of the same
# sample are treated as independent — confirm this is intended.
design <- model.matrix(~0+group,data = dge$samples)


In [None]:
# Remove lowly expressed genes.
# An earlier manual rule (CPM > 10 in at least 2 samples, applied via
# rowSums(cpm(final_exp) > 10) >= 2) is no longer used; edgeR's filterByExpr()
# decides which genes to keep given the design.
keep <- filterByExpr(dge, design = design)
print(table(keep))

# Gene/sample dimensions before and after filtering; library sizes are
# recomputed from the retained genes.
print(dim(dge$counts))
dge <- dge[keep, , keep.lib.sizes = FALSE]
print(dim(dge$counts))

In [None]:
# Compute the average log CPM of each retained gene and plot its distribution
# as a histogram.
AveLogCPM <- aveLogCPM(dge)
hist(AveLogCPM)

In [None]:
# Compute normalisation factors with calcNormFactors() defaults and show the
# first rows of the updated sample table.
dge <- calcNormFactors(dge)
print(head(dge$samples))

In [None]:
# MDS plot of all samples. Point shape and colour are looked up by the group
# factor, so both encode the reference used for quantification.
pch <- c(0, 1)
colors <- c("red", "blue")
sample_group <- dge$samples$group
plotMDS(dge, col = colors[sample_group], pch = pch[sample_group])

legend(
  "topleft",
  legend = levels(sample_group),
  pch = pch,
  col = colors,
  ncol = 2
)

In [None]:
# Mean-difference plot for the first sample column of the DGEList.
plotMD(dge, column=1)

In [None]:
# Estimate dispersions for the design above, with robust=TRUE passed to
# estimateDisp().
dge <- estimateDisp(dge, design, robust=TRUE)

In [None]:
# Plot the dispersion estimates stored in `dge` (edgeR's BCV plot).
plotBCV(dge)

In [None]:
# Fit the quasi-likelihood GLM for the design (robust=TRUE) and show the first
# rows of the fitted coefficients.
fit <- glmQLFit(dge, design, robust=TRUE)
head(fit$coefficients)

In [None]:
# Plot the quasi-likelihood dispersions of the fitted model.
plotQLDisp(fit)

In [None]:
# Summarise the prior degrees of freedom stored on the fit.
summary(fit$df.prior)

In [None]:
# Inspect the first rows of the sample table carried by the DGEList.
head(dge$samples)

In [None]:
# Inspect the first rows of the design matrix; its column names are the ones
# used when building the contrast.
head(design)

In [None]:
# Define the single contrast of interest: PAR-masked minus default reference.
# A positive logFC therefore means higher expression in the PAR-masked
# quantification.
contr.matrix <- makeContrasts(
  maskedvsdefault = groupPAR_masked - groupdefault,
  levels = colnames(design)
)
print(contr.matrix)

In [None]:
# Quasi-likelihood F-test for the PAR-masked vs default contrast.
res <- glmQLFTest(fit, contrast=contr.matrix)

In [None]:
# Collect the test results for every gene (n=Inf, i.e. no truncation) and
# show the first rows of the results table.
topTagGene <- topTags(res,n=Inf)
head(topTagGene$table)

In [None]:
# Classify each gene's test outcome with decideTestsDGE() defaults and
# tabulate the classes.
is.de <- decideTestsDGE(res)
summary(is.de)

In [None]:
# Mean-difference plot of the test results, with points marked by their
# decideTestsDGE() status.
plotMD(res, status=is.de)

In [None]:
# List the components available on the test result object.
names(res)

#### filter by logFC

In [None]:
# Distribution (five-number summary plus mean) of the log fold changes across
# all tested genes.
summary(topTags(res,n=Inf)$table$logFC)

In [None]:
# Without any logFC threshold: size of the FDR < 0.05 gene set, first for
# negative logFC (lower in PAR-masked), then for positive logFC.
fdr_hit <- topTagGene$table$FDR < 0.05
dim(topTagGene$table[which(fdr_hit & topTagGene$table$logFC < 0), ])
dim(topTagGene$table[which(fdr_hit & topTagGene$table$logFC > 0), ])

In [None]:
# Repeat the DE calls while additionally requiring a minimum fold change.
# `cutoff` is on the fold-change scale; the tests use its log2.
cutoff <- 4
print(log2(cutoff))
is.de <- decideTestsDGE(res, lfc = log2(cutoff))
print(summary(is.de))

# Size of the FDR < 0.05 gene sets above / below the logFC threshold.
passes_fdr <- topTagGene$table$FDR < 0.05
gene_lfc <- topTagGene$table$logFC
print(dim(topTagGene$table[which(passes_fdr & gene_lfc > log2(cutoff)), ]))
print(dim(topTagGene$table[which(passes_fdr & gene_lfc < -log2(cutoff)), ]))

#### merge the group raw count means to the topTagGene$table

In [None]:
# Start the results table from the untresholded test output; the group count
# means are merged onto it in the following cells.
results <- topTagGene$table
print(head(results))

In [None]:
# Attach the mean raw count of the PAR-masked quantification (male samples
# only) to every tested gene.
# select() is namespace-qualified, matching its earlier use in this notebook,
# so that a select() exported by another attached package cannot shadow
# dplyr's; all_of() makes the selection by an external character vector
# explicit and strict.
temp_mask_exp <- counts_PAR_mask %>%
  dplyr::select(dplyr::all_of(male_covs$new_id))
print(dim(temp_mask_exp))
temp_mask_exp$PAR_masked_means <- rowMeans(temp_mask_exp)

# Join on row names (gene IDs); merge() returns the key as "Row.names".
results <- merge(
  x = results,
  y = temp_mask_exp %>% dplyr::select("PAR_masked_means"),
  by.x = 0,
  by.y = 0
)
rownames(results) <- results$Row.names
# Drop the helper column with a logical mask: unlike `-which(...)`, this cannot
# silently discard every column if the name is ever absent.
results <- results[, names(results) != "Row.names", drop = FALSE]
results <- results[order(results$FDR), ]
print(dim(results))
print(head(results))

In [None]:
# Attach the mean raw count of the default-reference (PAR unmasked)
# quantification (male samples only) to every tested gene.
# The "ummask"/"ummasked" spelling of the variable and output column is kept
# unchanged because the column name ends up in the exported results.
# select() is namespace-qualified, matching its earlier use in this notebook,
# so that a select() exported by another attached package cannot shadow
# dplyr's; all_of() makes the selection by an external character vector
# explicit and strict.
temp_ummask_exp <- counts_default_ref %>%
  dplyr::select(dplyr::all_of(male_covs$new_id))
print(dim(temp_ummask_exp))
temp_ummask_exp$PAR_ummasked_means <- rowMeans(temp_ummask_exp)

# Join on row names (gene IDs); merge() returns the key as "Row.names".
results <- merge(
  x = results,
  y = temp_ummask_exp %>% dplyr::select("PAR_ummasked_means"),
  by.x = 0,
  by.y = 0
)
rownames(results) <- results$Row.names
# Drop the helper column with a logical mask: unlike `-which(...)`, this cannot
# silently discard every column if the name is ever absent.
results <- results[, names(results) != "Row.names", drop = FALSE]
results <- results[order(results$FDR), ]
print(dim(results))
print(head(results))

In [None]:
# Genes passing FDR < 0.05 whose logFC exceeds the fold-change threshold,
# first in the positive direction and then in the negative direction.
up_rows <- which(results$FDR < 0.05 & results$logFC > log2(cutoff))
print(dim(results[up_rows, ]))
print(head(results[up_rows, ]))

down_rows <- which(results$FDR < 0.05 & results$logFC < -log2(cutoff))
print(dim(results[down_rows, ]))
print(head(results[down_rows, ]))

In [None]:
# Show the full table of genes with FDR < 0.05 and logFC above the threshold.
(results[which(results$FDR<0.05 & (results$logFC) > log2(cutoff)),])

## (re)Run DE analysis with glmTreat to filter for logFC

In [None]:
# Re-test the same contrast with glmTreat(), passing log2(cutoff) as the
# logFC threshold to the test itself instead of filtering afterwards.
treat <- glmTreat(fit,contrast = contr.matrix, lfc = log2(cutoff))

In [None]:
# NOTE(review): `treat` is the glmTreat() result rather than the glmQLFit()
# object; presumably this redraws the same QL dispersion plot as for `fit` —
# confirm.
plotQLDisp(treat)

In [None]:
# Collect the thresholded-test results for every gene (n=Inf) and show the
# first rows.
topTagGene_treat <- topTags(treat,n=Inf)
head(topTagGene_treat$table)

In [None]:
# Classify the thresholded-test outcomes (this overwrites the earlier `is.de`)
# and tabulate the classes.
is.de <- decideTestsDGE(treat)
summary(is.de)

In [None]:
# Mean-difference plot of the thresholded test, with points marked by their
# decideTestsDGE() status.
plotMD(treat, status=is.de)

#### merge with the p values before thresholded testing

In [None]:
# Results table of the thresholded (glmTreat) test, one row per gene.
results_treat <- topTagGene_treat$table
print(head(results_treat))

In [None]:
# Keep only the columns of the thresholded test that add information to the
# main results, and prefix them so they cannot clash with the untresholded
# PValue/FDR columns after merging.
# select() is namespace-qualified, matching its earlier use in this notebook,
# so that a select() exported by another attached package cannot shadow
# dplyr's.
treat_merge_vals <- results_treat %>%
  dplyr::select("unshrunk.logFC", "PValue", "FDR")
colnames(treat_merge_vals) <- paste0("threshold_test_", colnames(treat_merge_vals))
print(head(treat_merge_vals))

In [None]:
# Join the untresholded results with the thresholded-test columns on row names
# (gene IDs), then sort by the untresholded FDR, breaking ties with the
# thresholded FDR.
merged_results <- merge(x = results, y = treat_merge_vals, by = "row.names")
row_order <- order(merged_results$FDR, merged_results$threshold_test_FDR)
merged_results <- merged_results[row_order, ]

# merge() returns the join key as the leading "Row.names" column: restore it as
# the row names and drop it from the data columns.
rownames(merged_results) <- merged_results$Row.names
merged_results <- merged_results[, 2:ncol(merged_results)]

# Dimensions of both inputs and of the merged table, plus a preview.
print(dim(results))
print(dim(treat_merge_vals))
print(dim(merged_results))
print(head(merged_results))

In [None]:
# Using the untresholded FDR: genes with FDR < 0.05 and logFC beyond the
# fold-change threshold, positive direction first, then negative.
up_rows <- which(
  merged_results$FDR < 0.05 & merged_results$logFC > log2(cutoff)
)
print(dim(merged_results[up_rows, ]))
print(head(merged_results[up_rows, ]))

down_rows <- which(
  merged_results$FDR < 0.05 & merged_results$logFC < -log2(cutoff)
)
print(dim(merged_results[down_rows, ]))
print(head(merged_results[down_rows, ]))

In [None]:
# Using the thresholded-test FDR: genes with threshold_test_FDR < 0.05 and
# logFC beyond the fold-change threshold, positive direction first, then
# negative.
up_rows <- which(
  merged_results$threshold_test_FDR < 0.05 &
    merged_results$logFC > log2(cutoff)
)
print(dim(merged_results[up_rows, ]))
print(head(merged_results[up_rows, ]))

down_rows <- which(
  merged_results$threshold_test_FDR < 0.05 &
    merged_results$logFC < -log2(cutoff)
)
print(dim(merged_results[down_rows, ]))
print(head(merged_results[down_rows, ]))

In [None]:
# Export the merged results, with gene IDs as row names, into the
# "expression" sub-directory of the working directory.
out_csv <- file.path(WRKDIR, "expression", "requant_diff_exp_edgeR_results.csv")
write.csv(merged_results, file = out_csv, row.names = TRUE)