# Quality control of samples and probes
Author: Jose Jaime Martinez-Magana

This script performs sample- and probe-level quality control of DNA methylation data from Illumina 450K arrays.

In [None]:
# open an interactive shell on the "interactive" partition with 32G of memory
srun --pty --mem=32G --partition=interactive bash
# make conda available in this shell
module load miniconda
# switch to the conda environment used by this analysis
conda activate ewas_saliva

In [None]:
# start an R session (this line is typed at the shell prompt)
# progress messages in this script are echoed with paste()/paste0()
R
paste("Start loading data")
# attach the packages used below, in the same order as before
# minfi reads the idat files
library("minfi")
library("EpiSCORE")
library("EpiDISH")
# magrittr supplies the %>% pipe and extract2()
library("magrittr")
library("plyr")
library("dplyr")
library("wateRmelon")

# folder that holds the raw idat files (searched recursively later on)
idat_p <- "/vast/palmer/scratch/montalvo-ortiz/jjm262/epigenomics/databases/raw"
# csv file with the phenotype information
pheno_p <- "/vast/palmer/scratch/montalvo-ortiz/jjm262/epigenomics/pheno/pheno_450k_ses_file_v02062023.csv"
# output prefix: file extensions are appended when the results are saved
out_p <- "/vast/palmer/scratch/montalvo-ortiz/jjm262/epigenomics/databases/qced/qced_data_v02062023"
# echo the input and output locations together with a time stamp
paste0(
  "Start analysis of data:", Sys.time(), "---",
  "###Analysis path[", idat_p, "]###"
)
paste0(
  "QCed data will be saved to:", Sys.time(), "---",
  "###Analysis path[", out_p, "]###"
)

In [None]:
# build the target sheet needed to read the arrays and load the idat files
# only keep real idat files (optionally gzipped); the former unanchored
# pattern "idat" also matched unrelated names such as "validation.csv"
files <- list.files(idat_p, recursive = TRUE, pattern = "\\.idat(\\.gz)?$")
# stop with a clear message instead of failing later inside data.frame()
if (length(files) == 0) {
  stop("No idat files found in: ", idat_p, call. = FALSE)
}
# file names are expected to look like <slide>_<array>_<channel>.idat;
# the first two "_"-separated fields are the slide and the array position
name_parts <- strsplit(files, "_")
slide <- vapply(name_parts, `[[`, character(1), 1)
array <- vapply(name_parts, `[[`, character(1), 2)
SampleID <- paste0(slide, "_", array)
# Basename is the file path without the channel suffix
targets <- file.path(idat_p, SampleID)
samplesheet <- data.frame(Sentrix_ID = slide,
                          Sentrix_Position = array,
                          Sample_name = SampleID,
                          Pool_ID = seq_along(SampleID),
                          Sample_Plate = seq_along(SampleID),
                          Sample_Well = seq_along(SampleID),
                          Basename = targets)
# every sample has two files (red and green channel), so each Basename
# appears twice; keep one row per sample
samplesheet <- samplesheet[!duplicated(samplesheet$Basename), ]
# list that collects every result of the pipeline
out <- list()
# read the idat files with minfi
paste0("Start reading idat files:", Sys.time())
out$rgset <- read.metharray.exp(targets = samplesheet,
                                extended = TRUE,
                                verbose = TRUE)
paste0("End reading idat files:", Sys.time())

In [None]:
# load the phenotype sheet and restrict both data sets to shared samples
pheno <- read.csv(pheno_p)
# the ID column identifies the samples; copy it to SampleID
pheno$SampleID <- pheno$ID
# sample names of the arrays that were loaded from the idat path
idat_samples <- rownames(out$rgset@colData)
paste0("Checking if samples in  phenofile are in idat files")
paste0("Revise your phenofile if you see samples in FALSE")
in_idat <- pheno$SampleID %in% idat_samples
missing_samples <- pheno$SampleID[!in_idat]
paste0("The following samples are missing")
missing_samples
paste0("Warning: this script will subset the samples to include only those found in the phenofile")
# keep only the arrays that have phenotype information
common_samples <- pheno$SampleID[in_idat]
out$rgset <- subsetByLoci(rgSet = out$rgset[, common_samples])
f_samples <- rownames(out$rgset@colData)
f_samples1 <- length(f_samples)
paste0("A total of: ", f_samples1, " will be keep for further analysis")
# subset pheno to the common samples
pheno <- pheno[pheno$SampleID %in% common_samples, ]
# put the pheno rows in the same order as the rgset columns;
# match(x, table) returns positions in `table`, so the rgset sample names
# must be the first argument (the reversed call gave the inverse permutation)
pheno_or <- pheno[match(f_samples, pheno$SampleID), ]
# use the slide_array sample id as row name
rownames(pheno_or) <- pheno_or$SampleID
out$pheno <- pheno_or

In [None]:
# predict sex with minfi and drop samples whose prediction disagrees
# with the reported sex (pheno column `gender`)
# the array has to be mapped to the genome before calling getSex()
out$predicted_sex <- getSex(mapToGenome(out$rgset))
# align the predictions to the pheno rows by sample name
# NOTE(review): assumes getSex() returns one row per rgset column, in
# column order -- the original code relied on the same assumption
sex_idx <- match(rownames(out$pheno), colnames(out$rgset))
sex_call <- out$predicted_sex$predictedSex[sex_idx]
# recode the prediction: 0 = female, 1 = male
out$pheno$predicted_sex2 <- ifelse(sex_call == "F", 0, 1)
paste0("Warning, this script expects that sex is codify as 0 for females and 1 for males")
# the recoded prediction lives in the pheno table; the former code read a
# non-existent column `predictedSex2` from the getSex() output, which
# returned NULL and made every comparison below empty
out$pheno$predicted_sex <- out$pheno$predicted_sex2
# a sample passes only when both values are present and equal; samples with
# a missing reported or predicted sex cannot be verified and are reported
# together with the mismatches instead of producing NA indices
sex_ok <- !is.na(out$pheno$predicted_sex) &
  !is.na(out$pheno$gender) &
  out$pheno$predicted_sex == out$pheno$gender
sex_mismatch <- out$pheno$SampleID[!sex_ok]
# keep the removed samples in the output
out$sex_mismatch <- sex_mismatch
non_sex_mismatch <- out$pheno$SampleID[sex_ok]
paste0("Warning removing: ", sex_mismatch, " for sex mismatch")
paste0("Removing sex mismatched samples from rgset")
out$rgset <- subsetByLoci(rgSet = out$rgset[, non_sex_mismatch])
paste0("Removing sex mismatched samples from phenofile")
out$pheno <- out$pheno[rownames(out$pheno) %in% non_sex_mismatch, ]

In [None]:
# probe lists for the 450k array taken from
# https://github.com/sirselim/illumina450k_filtering
# non-specific probes (csv with a TargetID column)
cre_450k <- read.csv(
  file = "/gpfs/gibbs/project/montalvo-ortiz/jjm262/analysis/epigenomics/ses/databases/illumina450k_filtering/48639-non-specific-probes-Illumina450k.csv"
)
# multi-mapping probes (headerless table, probe id in the first column)
mul_450k <- read.table(
  file = "/gpfs/gibbs/project/montalvo-ortiz/jjm262/analysis/epigenomics/ses/databases/illumina450k_filtering/HumanMethylation450_15017482_v.1.1_hg19_bowtie_multimap.txt",
  header = FALSE
)
# one de-duplicated vector with every probe to exclude
exclude_probes <- unique(c(cre_450k$TargetID, mul_450k$V1))

In [None]:
# remove low bead-count and low quality probes and samples, based on the
# RAMWAS pipeline: https://github.com/andreyshabalin/ramwas
# TRUE where an address was measured with fewer than 3 beads
lb <- getNBeads(out$rgset) < 3
pi1 <- getProbeInfo(out$rgset, type = "I")
pi2 <- getProbeInfo(out$rgset, type = "II")
# type I probes use two addresses, type II probes one; a probe is excluded
# when more than 1% of the samples have a low bead count
# drop = FALSE keeps the matrix shape when there is a single sample
ex1 <- pi1$Name[rowMeans(lb[pi1$AddressA, , drop = FALSE] |
                           lb[pi1$AddressB, , drop = FALSE]) > 0.01]
ex2 <- pi2$Name[rowMeans(lb[pi2$AddressA, , drop = FALSE]) > 0.01]
exclude_bds <- unique(c(ex1, ex2))
# low detection p-value: TRUE where detection failed (p > 0.01)
hp <- detectionP(out$rgset) > 0.01
# probes failing in more than 1% of the samples
exclude_hpv <- rownames(hp)[rowMeans(hp) > 0.01]
# samples with less than 1% failed probes are kept
keep_samples <- colMeans(hp) < 0.01
paste0("Removing samples and bad cpg sites from rgset")
out$rgset <- subsetByLoci(
    rgSet = out$rgset[, keep_samples],
    excludeLoci = c(exclude_probes, exclude_bds, exclude_hpv))
# add excluded probes to output
out$excluded_probes <- unique(c(exclude_probes, exclude_bds, exclude_hpv))
paste0("Removing samples from phenofile")
in_rgset <- rownames(out$pheno) %in% colnames(out$rgset)
bad_quality_samples <- out$pheno$SampleID[!in_rgset]
# add bad quality samples to output
out$bad_quality_samples <- bad_quality_samples
non_bad_quality_samples <- out$pheno$SampleID[in_rgset]
paste0("Warning removing: ", bad_quality_samples, " for bad quality samples")
# keep the pheno rows of the remaining samples, in rgset column order;
# the former code indexed the unfiltered global `pheno` with positions
# in the rgset, which selected unrelated rows and dropped the row names
# and the predicted sex columns stored in out$pheno
out$pheno <- out$pheno[match(colnames(out$rgset), rownames(out$pheno)), ,
                       drop = FALSE]

In [None]:
# normalization
# convert the filtered rgset with preprocessRaw() and then fix outliers
out$rgsetraw <- out$rgset %>%
  preprocessRaw() %>%
  fixMethOutliers()
# BMIQ normalization of the outlier-fixed data
out$rgsetraw_bmiq <- BMIQ(out$rgsetraw)

In [None]:
# cell-type proportion estimates from the BMIQ-normalized data
# 1) EpiDISH: hierarchical estimation with two reference matrices
out$cell_epidish <- hepidish(
  beta.m = out$rgsetraw_bmiq,
  ref1.m = centEpiFibIC.m,
  ref2.m = centBloodSub.m,
  h.CT.idx = 3,
  method = "RPC"
)
# 2) EpiSCORE: first build the gene-level DNAm matrix ...
avDNAm.m <- constAvBetaTSS(beta.m = out$rgsetraw_bmiq, type = "450k")
# ... then run the weighted RPC against the skin reference
out$cell_episcore <- wRPC(
  avDNAm.m,
  mrefSkin.m,
  useW = TRUE,
  wth = 0.4,
  maxit = 100
)

In [None]:
# principal components of the control probes, based on the RAMWAS
# pipeline: https://github.com/andreyshabalin/ramwas
# every control type listed in the array manifest
control_type <- unique(getManifest(out$rgset)@data$TypeControl$Type)
# addresses of the control probes of those types
control_set <- getControlAddress(out$rgset, controlType = control_type)
# stack the red and green intensities of the control probes (rows)
probe_data <- rbind(
  getRed(out$rgset)[control_set, ],
  getGreen(out$rgset)[control_set, ]
)
# center each row on its mean before the PCA
data <- probe_data - rowMeans(probe_data)
out$eig <- prcomp(data)

In [None]:
# per-sample methylated / unmethylated QC summary from minfi::getQC()
out[["covariates_umm"]] <- getQC(out[["rgsetraw"]])

In [None]:
# write the results to disk, using out_p as the file prefix
# the complete result list goes into a single rds file
saveRDS(out, file = paste0(out_p, ".rds"))
# tab-separated text version of the BMIQ-normalized beta matrix for glint
write.table(
  out$rgsetraw_bmiq,
  file = paste0(out_p, "_for_glint.txt"),
  quote = FALSE,
  sep = "\t"
)