meyer-lab-cshl · HannahVMeyer · Jul 7, 2020 · May 27, 2020 · May 29, 2020 · May 29, 2020
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: plinkQC
 Type: Package
 Title: Genotype Quality Control with 'PLINK'
-Version: 0.3.1
+Version: 0.3.2
 Authors@R:
     person("Hannah", "Meyer", email = "hannah.v.meyer@gmail.com",
     role = c("aut", "cre"), comment = c(ORCID = "0000-0003-4564-0899"))
@@ -43,5 +43,5 @@ SystemRequirements:
     plink (1.9)
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 6.1.1
+RoxygenNote: 7.1.0
 VignetteBuilder: knitr
diff --git a/NEWS.md b/NEWS.md
@@ -1,46 +1,60 @@
+# plinkQC 0.3.2
+## minor changes
+* Add checks and tests in `evaluate_check_ancestry` for missing non-reference samples
+[5c03971](https://github.com/meyer-lab/plinkQC/commit/5c03971b92341b5a8cb26a1d5f2e37d637f5cbd7)
+* Clarified tutorials:
+  * Use R's internal `file.copy` instead of the system `cp` command so the tutorials run on Windows: [6c91ef3](https://github.com/meyer-lab/plinkQC/commit/6c91ef3368f601658967f88bc20db94a28307fce),
+  fixes [#26](https://github.com/meyer-lab/plinkQC/issues/26) 
+  * Add additional filter for A->T, C->G variants as suggested in [#24](https://github.com/meyer-lab/plinkQC/issues/24); [11e0375](https://github.com/meyer-lab/plinkQC/commit/11e03756852215915310a836d2e73dbcedf12bfc)
+* Fixed plotting issues in PCA plot of ancestry check:
+  * allow for supplying names of European reference population [ae09e64](https://github.com/meyer-lab/plinkQC/commit/ae09e64ee2eaeb8337b60b24b126a9c049d6315f)
+  * provide argument to specify number of populations per legend row [c7fe85d](https://github.com/meyer-lab/plinkQC/commit/c7fe85d555c256bf8be3843803ff1cc445f7898f)
+  * Ensure correct ordering of population colors when reference population is not HapMap [86275b7](https://github.com/meyer-lab/plinkQC/commit/86275b7c281835681f38ab16672db61c288cfed9)
+
 # plinkQC 0.3.1
 ## minor changes
-* Fixed dead links in vignettes (caused by migration of repository): da987d8f225aa6aca0596b9c4f6a2484b102bdb6
-* Added note about chrY in Hapmap data (vignette): e8afbb9842ed9421461a8114ac0a00f7955cf0c0
+
+* Fixed dead links in vignettes (caused by migration of repository): [da987d8](https://github.com/meyer-lab/plinkQC/commit/da987d8f225aa6aca0596b9c4f6a2484b102bdb6)
+* Added note about chrY in Hapmap data (vignette): [e8afbb9](https://github.com/meyer-lab/plinkQC/commit/e8afbb9842ed9421461a8114ac0a00f7955cf0c0)
 * Added note about recommended use of plink1.9 (vs 2.0):
-b69d3d71d23e9b161176a635fcb2b5a2b524591f
+[b69d3d7](https://github.com/meyer-lab/plinkQC/commit/b69d3d71d23e9b161176a635fcb2b5a2b524591f)
 
 # plinkQC 0.3.0
 ## major changes
 * Relationship filter can handle more complicated relationship scenarios as
-  observed in plant genotype sets (fixed #11)
+  observed in plant genotype sets (fixed [#11](https://github.com/meyer-lab/plinkQC/issues/11))
 * code moved to the meyer-lab repository: https://github.com/meyer-lab-cshl/plinkQC
 
 # plinkQC 0.2.3
 ## major changes
 * Enable return of overview plots as ggplot object
-  (fixes #6 in ab2e840f0f22ccdeb5317475698aa58e7eecd345 and
-  101e74e318752b2f8038330245d875f6afd5b57b)
+  (fixes [#6](https://github.com/meyer-lab/plinkQC/issues/6) in [ab2e840](https://github.com/meyer-lab/plinkQC/commit/ab2e840f0f22ccdeb5317475698aa58e7eecd345) and
+  [101e74e](https://github.com/meyer-lab/plinkQC/commit/101e74e318752b2f8038330245d875f6afd5b57b))
 * Relationship filter now deals with more complicated relationship scenarios as
-  observed in plant genotype sets (fixed #11)
+  observed in plant genotype sets (addresses [#11](https://github.com/meyer-lab/plinkQC/issues/11))
 
 ## minor changes
 * give user option to choose maf threshold for relatedness filtering (relates to
-  #3)
+   [#3](https://github.com/meyer-lab/plinkQC/issues/3))
 
 ## bug fixes
-* Include check for zero related individuals fixing #3 in
-  1445a88d41d985e73ffdd161144229683f8352cd
+* Include check for zero related individuals fixing [#3](https://github.com/meyer-lab/plinkQC/issues/3) in
+  [1445a88](https://github.com/meyer-lab/plinkQC/commit/1445a88d41d985e73ffdd161144229683f8352cd)
 * Include check in case all samples fail perIndividual QC in
-   894acc1fa03dadfe0ad2028888142171bcc641eb and
-   04642246d18ed4eaac5b9d9a6931d1ecb08308e8)
+   [894acc1](https://github.com/meyer-lab/plinkQC/commit/894acc1fa03dadfe0ad2028888142171bcc641eb) and
+   [0464224](https://github.com/meyer-lab/plinkQC/commit/04642246d18ed4eaac5b9d9a6931d1ecb08308e8)
 * Include checks for diagonal derived relationship estimates, and estimate data
-  containing only related individuals; fixes #11
+  containing only related individuals; addresses [#11](https://github.com/meyer-lab/plinkQC/issues/11)
 * Fix command for genotype conversion in 1000Genomes vignette, addressing issue
-  #10
+   [#10](https://github.com/meyer-lab/plinkQC/issues/10)
 * fix missing rownames error for overviewPerIndividualQC, when relatedness check
-  was included (issue #16, fc7a38b1f2b345d9c6c5d69f5dcf0bc57a857f62)
-* fix vignette mismatch (issue #16, 09dcd59e77178b35747aae81a5c1988712e20de9)
+  was included (issue [#16](https://github.com/meyer-lab/plinkQC/issues/16), [fc7a38b](https://github.com/meyer-lab/plinkQC/commit/fc7a38b1f2b345d9c6c5d69f5dcf0bc57a857f62))
+* fix vignette mismatch (issue [#16](https://github.com/meyer-lab/plinkQC/issues/16), [09dcd59](https://github.com/meyer-lab/plinkQC/commit/09dcd59e77178b35747aae81a5c1988712e20de9))
 
 # plinkQC 0.2.2
 ## minor changes
 * Fix IDs written to fail.IDs file: previous versions wrote IID,IID, now fixed
-to FID, IID (fixes #2).
+to FID, IID (fixes [#2](https://github.com/meyer-lab/plinkQC/issues/2)).
 
 # plinkQC 0.2.1
 ## minor changes

diff --git a/R/individualQC.R b/R/individualQC.R
@@ -916,11 +916,13 @@ check_relatedness <- function(indir, name, qcdir=indir, highIBDTh=0.1875,
 
 check_ancestry <- function(indir, name, qcdir=indir, prefixMergedDataset,
                            europeanTh=1.5,
+                           refPopulation=c("CEU", "TSI"),
                            refSamples=NULL, refColors=NULL,
                            refSamplesFile=NULL, refColorsFile=NULL,
                            refSamplesIID="IID", refSamplesPop="Pop",
                            refColorsColor="Color", refColorsPop="Pop",
                            studyColor="#2c7bb6",
+                           legend_labels_per_row=6,
                            run.check_ancestry=TRUE,
                            interactive=FALSE, verbose=verbose,
                            path2plink=NULL, showPlinkOutput=TRUE) {
@@ -934,6 +936,7 @@ check_ancestry <- function(indir, name, qcdir=indir, prefixMergedDataset,
         fail <- evaluate_check_ancestry(qcdir=qcdir, indir=indir, name=name,
                                         prefixMergedDataset=prefixMergedDataset,
                                         europeanTh=europeanTh,
+                                        refPopulation=refPopulation,
                                         refSamples=refSamples,
                                         refColors=refColors,
                                         refSamplesFile=refSamplesFile,
@@ -943,6 +946,7 @@ check_ancestry <- function(indir, name, qcdir=indir, prefixMergedDataset,
                                         refColorsColor=refColorsColor,
                                         refColorsPop=refColorsPop,
                                         studyColor=studyColor,
+                                        legend_labels_per_row=legend_labels_per_row,
                                         interactive=interactive)
         return(fail)
 }
@@ -1789,6 +1793,9 @@ run_check_ancestry <- function(indir, prefixMergedDataset,
 #' considered to be of European descent and samples outside this radius of
 #' non-European descent. The radius is computed as the maximum Euclidean distance
 #' of European reference samples to the centre of European reference samples.
+#' @param refPopulation [vector] Vector with population identifiers of European
+#' reference population. Identifiers have to correspond to population IDs
+#' [refColorsPop] in refColorsFile/refColors.
 #' @param refSamples [data.frame] Dataframe with sample identifiers
 #' [refSamplesIID] corresponding to IIDs in prefixMergedDataset.eigenvec and
 #' population identifier [refSamplesPop] corresponding to population IDs
@@ -1820,6 +1827,7 @@ run_check_ancestry <- function(indir, prefixMergedDataset,
 #' IDs in refColors/refColorsFile.
 #' @param studyColor [character] Colour to be used for study population if plot
 #' is TRUE.
+#' @param legend_labels_per_row [integer] Number of population names per row in the legend of the PCA plot.
 #' @param interactive [logical] Should plots be shown interactively? When
 #' choosing this option, make sure you have X-forwarding/graphical interface
 #' available for interactive plotting. Alternatively, set interactive=FALSE and
@@ -1849,6 +1857,8 @@ evaluate_check_ancestry <- function(indir, name, prefixMergedDataset,
                                     refSamplesIID="IID", refSamplesPop="Pop",
                                     refColorsColor="Color", refColorsPop="Pop",
                                     studyColor="#2c7bb6",
+                                    refPopulation=c("CEU", "TSI"),
+                                    legend_labels_per_row=6,
                                     interactive=FALSE) {
 
     prefix <- makepath(indir, name)
@@ -1866,11 +1876,20 @@ evaluate_check_ancestry <- function(indir, name, prefixMergedDataset,
     if (!file.exists(paste(out, ".eigenvec", sep=""))){
         stop("plink --pca output file: ", out, ".eigenvec does not exist.")
     }
-    testNumerics(numbers=europeanTh, positives=europeanTh)
+    testNumerics(numbers=c(europeanTh, legend_labels_per_row),
+                 positives=c(europeanTh, legend_labels_per_row))
     pca_data <- data.table::fread(paste(out, ".eigenvec", sep=""),
                                   stringsAsFactors=FALSE, data.table=FALSE)
     colnames(pca_data) <- c("FID", "IID", paste("PC",1:(ncol(pca_data)-2),
                                                 sep=""))
+    if (!any(samples$IID %in% pca_data$IID)) {
+        stop("There are no ", prefix, ".fam samples in the prefixMergedDataset")
+    }
+    if (!all(samples$IID %in% pca_data$IID)) {
+        stop("Not all ", prefix, ".fam samples are present in the",
+             "prefixMergedDataset")
+    }
+
     #pca_data$IID <- as.character(pca_data$IID)
     #pca_data$FID <- as.character(pca_data$FID)
 
@@ -1926,6 +1945,12 @@ evaluate_check_ancestry <- function(indir, name, prefixMergedDataset,
              refColors; missing population codes: ", paste(missing,
                                                            collapse=","))
     }
+    if (!all(refPopulation %in% refColors$Pop)) {
+        missing <- refPopulation[!refPopulation %in% refColors$Pop]
+        stop("Not all refPopulation populations found in population code of
+             refColors; missing population codes: ", paste(missing,
+                                                           collapse=","))
+    }
     refSamples <- merge(refSamples, refColors, by="Pop", all.X=TRUE)
 
     ## Combine pca data and population information ####
@@ -1936,14 +1961,15 @@ evaluate_check_ancestry <- function(indir, name, prefixMergedDataset,
         stop("There are samples in the prefixMergedDataset that cannot be found
              in refSamples or ", prefix, ".fam")
     }
-    data_all <- data_all[order(data_all$Pop, decreasing=FALSE),]
 
-    refColors <- rbind(refColors, c(name, studyColor))
-    data_all$Color <- as.factor(data_all$Color)
-    data_all$Pop <- factor(data_all$Pop, levels=refColors$Pop)
+    colors <-  dplyr::select_(data_all, ~Pop, ~Color)
+    colors <- colors[!duplicated(colors$Pop),]
+    colors <- colors[order(colors$Color),]
+    colors$Pop <- factor(colors$Pop, levels=unique(colors$Pop))
+    data_all$Pop <- factor(data_all$Pop, levels=levels(colors$Pop))
 
     ## Find mean coordinates and distances of reference Europeans ####
-    all_european <- dplyr::filter_(data_all, ~Pop %in% c("CEU", "TSI"))
+    all_european <- dplyr::filter_(data_all, ~Pop %in% refPopulation)
     euro_pc1_mean <- mean(all_european$PC1)
     euro_pc2_mean <- mean(all_european$PC2)
 
@@ -1959,17 +1985,17 @@ evaluate_check_ancestry <- function(indir, name, prefixMergedDataset,
     non_europeans <- dplyr::filter_(data_name, ~euclid_dist >
                                         (max_euclid_dist * europeanTh))
     fail_ancestry <- dplyr::select_(non_europeans, ~FID, ~IID)
+    legend_rows <- round(nrow(colors)/legend_labels_per_row)
     p_ancestry <- ggplot()
-    p_ancestry <- p_ancestry + geom_point(data=data_all,
-                                          aes_string(x='PC1', y='PC2',
-                                                     color='Pop')) +
+    p_ancestry <- p_ancestry +
+        geom_point(data=data_all,
+                   aes_string(x='PC1', y='PC2', color='Pop')) +
         geom_point(data=dplyr::filter_(data_all, ~Pop != name),
-                   aes_string(x='PC1', y='PC2',
-                              color='Pop'),
+                   aes_string(x='PC1', y='PC2', color='Pop'),
                    size=1) +
-        scale_color_manual(values=refColors$Color,
+        scale_color_manual(values=colors$Color,
                            name="Population") +
-        guides(color=guide_legend(nrow=2, byrow=TRUE)) +
+        guides(color=guide_legend(nrow=legend_rows, byrow=TRUE)) +
         ggforce::geom_circle(aes(x0=euro_pc1_mean, y0=euro_pc2_mean,
                                  r=(max_euclid_dist * europeanTh))) +
         ggtitle("PCA on combined reference and study genotypes") +

diff --git a/inst/extdata/data.fail-IBD.IDs b/inst/extdata/data.fail-IBD.IDs
diff --git a/man/check_ancestry.Rd b/man/check_ancestry.Rd
diff --git a/man/check_het_and_miss.Rd b/man/check_het_and_miss.Rd
diff --git a/man/check_hwe.Rd b/man/check_hwe.Rd
diff --git a/man/check_maf.Rd b/man/check_maf.Rd
diff --git a/man/check_relatedness.Rd b/man/check_relatedness.Rd