From a0b318ac9bd639fe98d55814c962e10a29ee5584 Mon Sep 17 00:00:00 2001 From: piyalkarum Date: Mon, 29 Apr 2024 14:54:07 +0200 Subject: [PATCH] new functions and bug fixes --- DESCRIPTION | 4 +- NEWS.md | 2 + R/detect.R | 2 +- R/raw_process.R | 66 +++++++++++++++-------- docs/404.html | 2 +- docs/LICENSE.html | 2 +- docs/articles/index.html | 2 +- docs/articles/rCNV.html | 4 +- docs/authors.html | 2 +- docs/index.html | 2 +- docs/news/index.html | 4 +- docs/pkgdown.yml | 2 +- docs/reference/ADnorm.html | 2 +- docs/reference/ADtable.html | 2 +- docs/reference/Rplot003.png | Bin 9383 -> 8824 bytes docs/reference/ad.correct.html | 2 +- docs/reference/allele.freq.html | 2 +- docs/reference/allele.info.html | 2 +- docs/reference/alleleINF.html | 2 +- docs/reference/cnv.html | 2 +- docs/reference/cpm.normal.html | 2 +- docs/reference/depthVsSample.html | 2 +- docs/reference/dup.plot.html | 2 +- docs/reference/dup.validate.html | 2 +- docs/reference/dupGet.html | 2 +- docs/reference/exportVCF.html | 2 +- docs/reference/get.miss.html | 2 +- docs/reference/gt.format.html | 2 +- docs/reference/h.zygosity.html | 2 +- docs/reference/hetTgen.html | 2 +- docs/reference/index.html | 2 +- docs/reference/maf.html | 23 ++++++-- docs/reference/norm.fact.html | 2 +- docs/reference/power.bias.html | 2 +- docs/reference/readVCF.html | 2 +- docs/reference/relatedness.html | 2 +- docs/reference/sig.hets.html | 2 +- docs/reference/sim.als.html | 2 +- docs/reference/vcf.stat.html | 2 +- docs/reference/vst.html | 2 +- docs/reference/vstPermutation-3.png | Bin 28875 -> 26209 bytes docs/reference/vstPermutation.html | 79 ++++++++++++++++------------ docs/search.json | 2 +- man/maf.Rd | 18 +++++-- tests/testthat/Rplots.pdf | Bin 0 -> 806807 bytes tests/testthat/test-ad.R | 2 +- tests/testthat/test-allele.R | 2 +- tests/testthat/test-allele_tests.R | 2 +- tests/testthat/test-cpm.R | 2 +- tests/testthat/test-depthVsSample.R | 2 +- tests/testthat/test-dupGet.R | 2 +- tests/testthat/test-maf.R | 2 +- tests/testthat/test-readVCF.R | 2 +- tests/testthat/test-sim.R | 2 +- tests/testthat/test-vcf.R | 2 +- 55 files changed, 177 insertions(+), 111 deletions(-) create mode 100644 tests/testthat/Rplots.pdf diff --git a/DESCRIPTION b/DESCRIPTION index b8d9b6a..4065f2c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: rCNV Type: Package Title: Detect Copy Number Variants from SNPs Data -Version: 1.2.9000 -Date: 2023-08-01 +Version: 1.3.9000 +Date: 2024-04-30 Language: en-US Authors@R: c(person(given="Piyal",family="Karunarathne", email="piyalkarumail@yahoo.com", role = c("aut", "cre"),comment = c(ORCID = "0000-0002-1934-145X")),person(given="Qiujie",family="Zhou", email="qiujie.zhou@ebc.uu.se", role = c("aut"),comment = c(ORCID = "0000-0001-7351-2371")),person("Klaus", "Schliep", email="klaus.schliep@gmail.com", role = c("aut"), comment = c(ORCID = "0000-0003-2941-0161")),person(given="Pascal", family="Milesi", role = "aut",comment = c(ORCID = "0000-0001-8580-4291"))) Maintainer: Piyal Karunarathne diff --git a/NEWS.md b/NEWS.md index cbb25f4..f386063 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # rCNV 1.3.0 (third update) * vstPermutation function added +* maf modified to remove multi-allelic sites +* FIT correction added # rCNV 1.2.0 (second update) * relatedness function optimized diff --git a/R/detect.R b/R/detect.R index 58b6dcb..3b6dd7f 100644 --- a/R/detect.R +++ b/R/detect.R @@ -219,7 +219,7 @@ dupGet<-function(data,test=c("z.het","z.05","z.all","chi.het","chi.05","chi.all" #data check data<-as.data.frame(data) if(!any(colnames(data)=="propHet")){ - stop("please provide the data with the output of allele.info()") + stop("please provide the output of allele.info()") } else { if(!(any(colnames(data)=="eH.pval"))) { ht<-sig.hets(data,plot=F,verbose=verbose) diff --git a/R/raw_process.R b/R/raw_process.R index 61d98c1..0db9a49 100644 --- a/R/raw_process.R +++ b/R/raw_process.R @@ -133,37 +133,59 @@ gg<-function(x){ #' a bi-allelic matrix when loci are multi-allelic #' #' @param h.table allele depth table generated from the function \code{hetTgen} -#' @param AD logical. If TRUE a allele depth table similar to \code{hetTgen} -#' output will be returns; If \code{FALSE}, individual AD values per SNP will be +#' @param drop.multi logical. If TRUE, all the sites with more than two allele will be +#' dropped. If FALSE \code{default}, the the two alleles with the highest allele +#' frequencies will be retained. See details for more information. +#' @param AD logical. If TRUE, an allele depth table similar to \code{hetTgen} +#' output will be returned; If \code{FALSE}, individual AD values per SNP will be #' returned in a list. #' @param verbose logical. Show progress #' #' @return A data frame or a list of minimum allele frequency removed allele depth #' +#' @details +#' This function either drops all sites with more than two alleles or keep alleles \code{drop.multi=TRUE} +#' with the highest allele frequencies in the case of multi-allelic sites \code{drop.multi=FALSE}. +#' Note that, in the latter case, all sites will be retained, essentially assuming that multi-allelic sites +#' are also bi-allelic by dropping the alleles with minimum frequencies. +#' This function can only be used on the output of the \code{hetTgen()}. If you need to remove the multi-allelic sites +#' from the vcf, use \code{rCNV:::non_bi_rm()} function. We recommend using other programs (e.g., vcftools, GATK, etc.) for removing multiallelic sites faster. +#' +#' #' @author Piyal Karunarathne #' #' @examples #' \dontrun{mf<-maf(ADtable)} #' #' @export -maf<-function(h.table,AD=TRUE,verbose=TRUE){ - htab<-h.table[,-c(1:3)] - if(verbose){ - glt<-apply_pb(htab,1,function(X){ - gg<-do.call(rbind,lapply(X,function(x){as.numeric(strsplit(x,",")[[1]])})) - while(ncol(gg)>2){gg<-gg[,-which.min(colMeans(proportions(gg,1),na.rm=T))]} - if(AD){return(paste0(gg[,1],",",gg[,2]))}else{return(gg)} - }) - }else{ - glt<-apply(htab,1,function(X){ - gg<-do.call(rbind,lapply(X,function(x){as.numeric(strsplit(x,",")[[1]])})) - if(ncol(gg)>2)gg<-gg[,-which.min(colMeans(proportions(gg,1),na.rm=T))] - if(AD){return(paste0(gg[,1],",",gg[,2]))}else{return(gg)} - }) +maf<-function(h.table,drop.multi=FALSE,AD=TRUE,verbose=TRUE){ + h.table<-data.frame(h.table) + warning(paste0("There are ",sum(nchar(h.table$ALT)>1),"multi-allelic sites")) + if(drop.multi){ + h.table<-h.table[-which(nchar(h.table$ALT)>1),] # drop multi-allelic sites + message(paste0("\n",sum(nchar(h.table$ALT)>1),"multi-allelic sites were removed")) + return(h.table) + } else { # drop alleles with maf when multi-allelic + htab<-h.table[,-c(1:4)] + if(verbose){ + glt<-apply_pb(htab,1,function(X){suppressWarnings({ + gg<-do.call(rbind,lapply(X,function(x){as.numeric(strsplit(x,",")[[1]])})) + while(ncol(gg)>2){gg<-gg[,-which.min(colMeans(proportions(gg,1),na.rm=T))]} + if(AD){return(paste0(gg[,1],",",gg[,2]))}else{return(gg)} + }) + }) + }else{ + glt<-apply(htab,1,function(X){ + gg<-do.call(rbind,lapply(X,function(x){as.numeric(strsplit(x,",")[[1]])})) + if(ncol(gg)>2)gg<-gg[,-which.min(colMeans(proportions(gg,1),na.rm=T))] + if(AD){return(paste0(gg[,1],",",gg[,2]))}else{return(gg)} + }) + } + glt<-data.frame(h.table[,1:4],t(glt)) + colnames(glt)<-colnames(h.table) + message("Minimum frequency alleles were removed in multi-allelic sites") + return(glt) } - glt<-data.frame(h.table[,1:3],t(glt)) - colnames(glt)<-colnames(h.table) - return(glt) } @@ -225,7 +247,7 @@ hetTgen <- function(vcf,info.type=c("AD","AD-tot","GT","GT-012","GT-AB","DP"),ve if(inherits(vcf,"list")){vcf<-vcf$vcf} if(inherits(vcf,"data.frame")){vcf<-data.table::data.table(vcf)} if(any(nchar(vcf$ALT)>1)){ - warning("vcf file contains multi-allelic variants: only bi-allelic SNPs allowed\nUse maf() to remove non-bi-allilic snps") + warning("vcf file contains multi-allelic variants: \nonly bi-allelic SNPs allowed\nUse maf() to remove non-bi-allilic snps or drop minumum frequency alleles") } if(inherits(vcf,"list")){vcf<-vcf$vcf} @@ -512,9 +534,9 @@ ad.correct<-function(het.table,gt.table=NULL,odd.correct=TRUE,verbose=TRUE){ x<-gt.table[,n] Y<-data.frame(do.call(cbind,data.table::tstrsplit(X,","))) y<-which(x=="0/0" & Y$X2>0) - rr<-range(Y$X2[y])#range of depth in miss classified snps + rr<-range(Y$X2[y])#range of depth in mis classified snps Y[y,]<-0 - ll<-length(y)#number of miss classifications + ll<-length(y)#number of mis classifications out<-paste0(Y$X1,",",Y$X2) return(out) }) diff --git a/docs/404.html b/docs/404.html index 7308685..3e057dd 100644 --- a/docs/404.html +++ b/docs/404.html @@ -31,7 +31,7 @@ rCNV - 1.2.9000 + 1.3.9000