R/do_eggla_gwas.R

#' Perform GWAS using PLINK2 (and BCFtools)
#'
#' Format VCF file(s) by filtering out all variants
#' not satisfying "--min-alleles 2 --max-alleles 2 --types snps"
#' and setting IDs (if no annotation file using VEP is provided)
#' with "%CHROM:%POS:%REF:%ALT" (see https://samtools.github.io/bcftools/).
#' GWAS is performed on the formatted VCF file(s) by PLINK2 software
#' (https://www.cog-genomics.org/plink/2.0).
#'
#' @param data Path to the phenotypes stored as a CSV file.
#' @param results_zip Path to the zip archives generated by `run_eggla()`.
#' @param id_column Name of the column where sample/individual IDs are stored.
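#' @param traits One or multiple traits, *i.e.*, columns' names (or regular expressions
#'   matching whole columns' names) from `data` or from the derived parameters
#'   stored in `results_zip`, to be analysed separately.
#' @param covariates One or several covariates, *i.e.*, columns' names (or regular expressions
#'   matching whole columns' names) from `data`, to be used.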
#' @param vcfs Path to the "raw" VCF file(s) containing
#'   the genotypes of the individuals to be analysed.
#' @param path Directory in which computation will occur and where output files will be saved.
#' @param vep Path to the VEP annotation file to be used to set variants RSIDs and add gene SYMBOL, etc.
#' @param bin_path A named list containing the paths to the PLINK2 and BCFtools binaries.
#'   For PLINK2, a URL to the binary can be provided (see https://www.cog-genomics.org/plink/2.0).
#' @param threads Number of threads to be used by PLINK2.
#' @param quiet A logical indicating whether to suppress the output.
#'
#' @import data.table
#' @importFrom stats as.formula p.adjust
#' @importFrom utils download.file unzip
#' @importFrom rlang hash
# @importFrom future.apply future_lapply
#'
#' @return Path to results file.
#'
#' @export
#'
#' @examples
#' if (interactive()) {
#'   data("bmigrowth")
#'   fwrite(
#'     x = bmigrowth,
#'     file = file.path(tempdir(), "bmigrowth.csv")
#'   )
#'   results_archives <- run_eggla(
#'     data = fread(file.path(tempdir(), "bmigrowth.csv")),
#'     id_variable = "ID",
#'     age_days_variable = NULL,
#'     age_years_variable = "age",
#'     weight_kilograms_variable = "weight",
#'     height_centimetres_variable = "height",
#'     sex_variable = "sex",
#'     covariates = NULL,
#'     male_coded_zero = FALSE,
#'     random_complexity = 1,
#'     parallel = FALSE,
#'     parallel_n_chunks = 1,
#'     working_directory = tempdir()
#'   )
#'   do_eggla_gwas(
#'     data = file.path(tempdir(), "bmigrowth.csv"),
#'     results_zip = results_archives,
#'     id_column = "ID",
#'     traits = c("slope_.*", "auc_.*"),
#'     covariates = c("sex"),
#'     vcfs = list.files(
#'       path = file.path(tempdir(), "vcf"),
#'       pattern = "\\.vcf$|\\.vcf.gz$",
#'       full.names = TRUE
#'     ),
#'     path = tempdir(),
#'     vep = NULL,
#'     bin_path = list(
#'       bcftools = "/usr/bin/bcftools",
#'       plink2 = "/usr/bin/plink2"
#'     ),
#'     threads = 1
#'   )
#' }
do_eggla_gwas <- function(
  data,
  results_zip,
  id_column,
  traits,
  covariates,
  vcfs,
  path,
  vep = NULL,
  bin_path = list(
    bcftools = "/usr/bin/bcftools",
    plink2 = "/usr/bin/plink2"
  ),
  threads = 1,
  quiet = FALSE
) {
  INFO <- TEST <- P <- NULL # no visible binding for global variable from data.table

  # Create the output directory first, so that normalizePath() resolves an
  # existing location instead of warning about a missing one.
  dir.create(
    path = path,
    recursive = TRUE,
    mode = "0775",
    showWarnings = FALSE
  )
  path <- normalizePath(path)
  results_zip <- normalizePath(results_zip)

  # ---- Binaries ------------------------------------------------------------
  # When PLINK2 is given as a URL to a zip archive, the binary is downloaded
  # once into `path` and re-used on subsequent calls.
  if (grepl("^http.*\\.zip$", bin_path[["plink2"]])) {
    plink_local <- sprintf("%s/plink2", path)
    if (!file.exists(plink_local)) {
      zip_file <- sprintf("%s/plink2.zip", path)
      try(
        expr = {
          utils::download.file(url = bin_path[["plink2"]], destfile = zip_file)
          utils::unzip(
            zipfile = zip_file,
            exdir = path,
            files = "plink2"
          )
          unlink(zip_file)
          Sys.chmod(plink_local, "0777")
        },
        silent = TRUE
      )
      if (!file.exists(plink_local)) {
        stop(
          "Error downloading PLINK2 binary. ",
          "Please check the download URL at https://www.cog-genomics.org/plink/2.0."
        )
      }
    }
    # Always switch to the local binary, including when it was downloaded by
    # a previous call (otherwise the URL itself would be executed below).
    bin_path[["plink2"]] <- plink_local
  }

  plink_version <- try(
    expr = system(sprintf("%s --version", bin_path[["plink2"]]), intern = TRUE),
    silent = TRUE
  )
  if (inherits(plink_version, "try-error")) stop("Please check PLINK binary path!")

  bcftools_version <- try(
    expr = system(sprintf("%s --version", bin_path[["bcftools"]]), intern = TRUE)[1],
    silent = TRUE
  )
  if (inherits(bcftools_version, "try-error")) stop("Please check BCFTools binary path!")

  # ---- Phenotypes ----------------------------------------------------------
  # Derived slopes and AUCs are extracted from each archive into `path`,
  # merged on "egg_id" and removed again once read.
  derived <- data.table::rbindlist(lapply(
    X = results_zip,
    path = path,
    FUN = function(izip, path) {
      derived_files <- c("derived-slopes.csv", "derived-aucs.csv")
      on.exit(unlink(file.path(path, derived_files)), add = TRUE)
      utils::unzip(
        zipfile = izip,
        files = derived_files,
        exdir = path
      )
      data.table::merge.data.table(
        x = data.table::fread(file.path(path, "derived-slopes.csv")),
        y = data.table::fread(file.path(path, "derived-aucs.csv")),
        by = "egg_id"
      )
    }
  ))
  data.table::setnames(x = derived, old = "egg_id", new = id_column)

  dt <- data.table::merge.data.table(
    x = data.table::fread(data),
    y = derived,
    by = id_column
  )

  traits <- unique(traits)
  covariates <- unique(covariates)
  if (length(traits) == 0) stop("At least one trait must be provided!")

  # Resolve user-supplied names/regular expressions to whole column names.
  # An empty set of patterns must resolve to no column at all: pasting zero
  # patterns yields "", which would otherwise match every column.
  match_columns <- function(patterns, columns) {
    if (length(patterns) == 0) return(character(0))
    grep(
      pattern = paste(sprintf("^%s$", patterns), collapse = "|"),
      x = columns,
      value = TRUE
    )
  }

  # Keep the ID column (exact name) plus the requested traits/covariates,
  # then drop incomplete and duplicated rows.
  keep_columns <- names(dt)[
    names(dt) == id_column |
      grepl(
        pattern = paste(sprintf("^%s$", unique(c(traits, covariates))), collapse = "|"),
        x = names(dt)
      )
  ]
  dt <- dt[
    j = unique(stats::na.exclude(.SD)),
    .SDcols = keep_columns
  ]

  data.table::setnames(x = dt, old = id_column, new = "#IID")

  candidate_columns <- setdiff(names(dt), "#IID")
  traits <- match_columns(traits, candidate_columns)
  covariates <- match_columns(covariates, candidate_columns)
  if (length(traits) == 0) {
    stop("None of the requested traits could be found in the data!")
  }

  # The formula is only used to derive a reproducible basename for the
  # intermediate files (through its hash).
  formula <- stats::as.formula(sprintf(
    fmt = "`%s` ~ %s",
    paste(traits, collapse = "` + `"),
    if (length(covariates) > 0) paste(covariates, collapse = " + ") else "1"
  ))

  tmpdir <- file.path(tempdir(), "gwas_plink2")
  dir.create(path = tmpdir, recursive = TRUE, mode = "0777")
  on.exit(unlink(tmpdir, recursive = TRUE), add = TRUE)

  sex_covariate <- grep("^sex", covariates, value = TRUE, ignore.case = TRUE)
  if (length(sex_covariate) > 1) {
    stop(sprintf(
      "Only one column containing \"sex\" can exist in the model, not %s: \"%s\"",
      length(sex_covariate),
      paste(sex_covariate, collapse = '", "')
    ))
  }

  # A trait with exactly two distinct values, one of them 0, is rejected
  # (see the error message below for the expected coding).
  binary_wrongly_coded <- traits[vapply(
    X = traits,
    FUN = function(trait) {
      values <- dt[[trait]]
      data.table::uniqueN(values) == 2 && 0 %in% values
    },
    FUN.VALUE = logical(1)
  )]

  if (length(binary_wrongly_coded) > 0) {
    stop(
      "Binary traits must be coded as 1 or 2!\n",
      sprintf("Please check: \"%s\"", paste(binary_wrongly_coded, collapse = '", "'))
    )
  }

  # ---- PLINK2 input files --------------------------------------------------
  basename_file <- file.path(tmpdir, rlang::hash(formula))

  data.table::fwrite(
    x = dt[j = unique(.SD), .SDcols = "#IID"],
    file = sprintf("%s.samples", basename_file),
    sep = " ",
    col.names = FALSE
  )

  data.table::fwrite(
    x = dt[j = unique(.SD), .SDcols = c("#IID", traits)],
    file = sprintf("%s.pheno", basename_file),
    sep = " "
  )

  covariates_not_sex <- setdiff(covariates, sex_covariate)
  if (length(covariates_not_sex) > 0) {
    data.table::fwrite(
      x = dt[j = unique(.SD), .SDcols = c("#IID", covariates_not_sex)],
      file = sprintf("%s.cov", basename_file),
      sep = " "
    )
  }

  if (length(sex_covariate) > 0) {
    sex_levels <- unique(dt[[sex_covariate]])
    if (length(sex_levels) == 2 && 0 %in% sex_levels) {
      warning(
        "Sex must be coded: '1'/'M'/'m' = male, '2'/'F'/'f' = female, 'NA'/'0' = missing! ",
        "'0' have been recoded as '2', i.e., female."
      )
      dt[
        j = c(sex_covariate) := lapply(.SD, function(x) c("0" = 2, "1" = 1)[as.character(x)]),
        .SDcols = sex_covariate
      ]
    }
    data.table::fwrite(
      x = data.table::setnames(
        x = data.table::copy(dt)[j = unique(.SD), .SDcols = c("#IID", sex_covariate)],
        old = sex_covariate,
        new = "SEX"
      ),
      file = sprintf("%s.sex", basename_file),
      sep = " "
    )
  }

  # ---- Per-VCF formatting and regression -----------------------------------
  if (!quiet) message("Formatting VCFs and performing PLINK2 regression ...")
  # Use future.apply when installed, plain lapply() otherwise.
  if (nzchar(system.file(package = "future.apply"))) {
    eggla_lapply <- function(X, basename_file, vep_file, bin_path, FUN) {
      future.apply::future_lapply(
        X = X,
        basename_file = basename_file,
        vep_file = vep_file,
        bin_path = bin_path,
        future.globals = FALSE,
        future.packages = "data.table",
        FUN = FUN
      )
    }
  } else {
    eggla_lapply <- function(X, basename_file, vep_file, bin_path, FUN) {
      lapply(
        X = X,
        basename_file = basename_file,
        vep_file = vep_file,
        bin_path = bin_path,
        FUN = FUN
      )
    }
  }

  list_results <- eggla_lapply(
    X = vcfs,
    basename_file = basename_file,
    vep_file = vep,
    bin_path = bin_path,
    FUN = function(vcf, basename_file, vep_file, bin_path) {
      # Quote file paths so that spaces or shell metacharacters do not break
      # the commands; "~" is expanded first because quoting disables the
      # shell's own tilde expansion.
      quote_path <- function(x) {
        shQuote(
          path.expand(x),
          type = if (.Platform$OS.type == "windows") "cmd" else "sh"
        )
      }

      samples_file <- sprintf("%s.samples", basename_file)
      covar_file <- sprintf("%s.cov", basename_file)
      sex_file <- sprintf("%s.sex", basename_file)
      pheno_file <- sprintf("%s.pheno", basename_file)

      vcf_file <- sprintf("%s__%s", basename_file, basename(vcf))
      # Strip a trailing ".vcf" or ".vcf.gz" to build the output prefix.
      results_file <- sub("\\.vcf(\\.gz)?$", "", vcf_file)

      cmd <- paste(
        bin_path[["bcftools"]],
          "+fill-tags", quote_path(vcf),
       "|",
        bin_path[["bcftools"]],
          "view",
          # "--min-af 0.05",
          # "--exclude 'INFO/INFO < 0.8'",
          "--min-alleles 2 --max-alleles 2 --types snps",
          "--force-samples",
          "--samples-file", quote_path(samples_file)
      )

      if (!is.null(vep_file) && file.exists(vep_file)) {
        # Annotate with VEP, use the rsid as ID and fall back to
        # CHROM:POS:REF:ALT for variants still lacking an ID.
        cmd <- paste(
          cmd,
          "|",
          bin_path[["bcftools"]],
            "annotate",
            "--annotations", quote_path(vep_file),
            "--header-lines", quote_path(sub("_formatted\\.tsv\\.gz$", ".header", vep_file)),
            "--columns CHROM,POS,Gene,Symbol,rsid",
          "|",
          bin_path[["bcftools"]],
            "annotate",
            "--set-id '%INFO/rsid'",
          "|",
          bin_path[["bcftools"]],
            "annotate",
            "--set-id +'%CHROM:%POS:%REF:%ALT'",
            "--output-type z --output", quote_path(vcf_file)
        )
      } else {
        cmd <- paste(
          cmd,
          "|",
          bin_path[["bcftools"]],
            "annotate",
            "--set-id '%CHROM:%POS:%REF:%ALT'",
            "--output-type z --output", quote_path(vcf_file)
        )
      }

      bcftools_status <- system(cmd)
      if (bcftools_status != 0) {
        stop(sprintf(
          "BCFtools failed (exit status %s) while formatting \"%s\"!",
          bcftools_status, vcf
        ))
      }

      plink_status <- system(paste(c(
        bin_path[["plink2"]],
        "--vcf", quote_path(vcf_file), "dosage=DS",
        "--mach-r2-filter",
        "--threads", threads,
        "--glm",
        # Only request sex as a covariate when sex information is supplied.
        if (file.exists(sex_file)) "sex",
        if (file.exists(covar_file)) c("--covar", quote_path(covar_file)) else "allow-no-covars",
        if (file.exists(samples_file)) c("--keep", quote_path(samples_file)),
        if (file.exists(sex_file)) c("--update-sex", quote_path(sex_file)),
        if (file.exists(pheno_file)) c("--pheno", quote_path(pheno_file)),
        "--covar-variance-standardize",
        "--silent",
        "--out", quote_path(results_file)
      ), collapse = " "))

      # Variant-level annotation read back from the formatted VCF
      # (genotypes dropped), with the leading "#" removed from "#CHROM".
      annot <- data.table::setnames(
        x = data.table::fread(
          cmd = paste(bin_path[["bcftools"]], "view --drop-genotypes", quote_path(vcf_file)),
          skip = "#CHROM"
        ),
        old = function(x) sub("^#", "", x)
      )

      if (any(grepl("^INFO$", names(annot)))) {
        # Expand the "key=value" pairs of INFO into one column per key;
        # "." and "-" are turned into missing values.
        annot <- annot[
          j = list(
            .SD,
            data.table::rbindlist(
              l = lapply(
                X = strsplit(INFO, ";"),
                FUN = function(x) {
                  all_fields <- strsplit(x, "=")
                  out <- data.table::transpose(all_fields[lengths(all_fields) > 1])
                  data.table::setnames(x = data.table::setDT(do.call("rbind.data.frame", out[-1])), old = out[[1]])
                }
              ),
              use.names = TRUE,
              fill = TRUE
            )[
              j = lapply(.SD, function(x) {
                xout <- as.character(x)
                data.table::fifelse(
                  test = xout %in% c(".", "-"),
                  yes = NA_character_,
                  no = xout
                )
              })
            ]
          ),
          .SDcols = !intersect(c("INFO", "QUAL", "FILTER"), names(annot))
        ]
      }

      if (length(qual_filter_cols <- intersect(c("QUAL", "FILTER"), names(annot))) > 0) {
        annot <- annot[j = .SD, .SDcols = !c(qual_filter_cols)]
      }

      # Remove the ".SD." prefix introduced by list(.SD, ...) above.
      data.table::setnames(
        x = annot,
        old = function(x) sub("^\\.SD\\.\\.*", "", x)
      )

      # Collect the "<prefix>.<trait>.glm.<model>" files written by PLINK2 and
      # label each one by what follows the prefix, so that dots in the VCF
      # name or in the temporary path cannot leak into `trait_model`.
      glm_prefix <- paste0(basename(results_file), ".")
      glm_files <- list.files(path = dirname(results_file), full.names = TRUE)
      glm_suffix <- substring(basename(glm_files), nchar(glm_prefix) + 1L)
      is_glm <- startsWith(basename(glm_files), glm_prefix) & grepl("\\.glm\\.", glm_suffix)
      glm_files <- stats::setNames(glm_files[is_glm], glm_suffix[is_glm])

      if (length(glm_files) == 0) {
        stop(sprintf(
          "PLINK2 (exit status %s) did not produce any association results for \"%s\"!",
          plink_status, vcf
        ))
      }

      # fill = TRUE: result files may not all share the same columns.
      results <- data.table::setnames(
        x = data.table::rbindlist(
          l = lapply(X = glm_files, FUN = data.table::fread),
          idcol = "trait_model",
          use.names = TRUE,
          fill = TRUE
        ),
        old = function(x) sub("^#", "", x)
      )[TEST %in% "ADD" & !is.na(P), -c("TEST")]

      output_results_file <- sprintf("%s.results.gz", results_file)

      data.table::fwrite(
        x = data.table::merge.data.table(
          x = results,
          y = annot,
          by = c("CHROM", "POS", "ID", "REF", "ALT"), # intersect(names(results), names(annot))
          all.x = TRUE
        ),
        file = output_results_file
      )
      if (!quiet) message(sprintf("Results written in \"%s\"", output_results_file))

      output_results_file
    }
  )

  # ---- Aggregation ---------------------------------------------------------
  if (!quiet) message("Aggregating PLINK2 results ...")

  results_file <- file.path(path, "gwas.csv.gz")

  # A grouped `:=` only recycles length-one values, hence the covariates are
  # collapsed into a single label before being stored.
  covariates_label <- paste(covariates, collapse = " + ")

  data.table::fwrite(
    x = data.table::setcolorder(
      x = data.table::rbindlist(
        l = lapply(list_results, data.table::fread),
        use.names = TRUE,
        fill = TRUE
      )[
        j = `:=`(
          FDR = stats::p.adjust(P, method = "BH"),
          Bonferroni = stats::p.adjust(P, method = "bonferroni"),
          covariates = covariates_label
        ),
        by = "trait_model"
      ][order(P)],
      neworder = c("trait_model", "covariates")
    ),
    file = results_file
  )

  if (!quiet) message(sprintf("Writing results to \"%s\"!", results_file))

  # Record the versions of the software used alongside the results.
  writeLines(
    text = c(R.version.string, plink_version, bcftools_version),
    con = file.path(path, "gwas_software.txt")
  )

  invisible(results_file)
}