# 44. Batch heritability with GCTA and SLURM

In [1]:
# submit_heritability_jobs.R

# Load necessary libraries
library(data.table)
library(stringr)
library(tools)
library(bsseq)

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:data.table’:

    first, second


The following object is masked from ‘package:utils’:

    findMatches


The following objects are masked from ‘package:base’:

    expand.grid, I, unname

In [2]:
# Function to generate SLURM script
#
# Builds the text of an sbatch script that runs the heritability R script with
# the supplied arguments. Nothing is written to disk here; the caller decides
# what to do with the returned text and suggested path.
#
# Arguments:
#   args                         Named list of scalar values. Each element is
#                                rendered as --name='value' on the Rscript
#                                command line. args$num_cores also selects the
#                                CPU request: NULL or "all" -> --exclusive,
#                                anything else -> --cpus-per-task=<value>.
#   tag_pt1, tag_pt2             Joined as "<tag_pt1>-<tag_pt2>" to name the
#                                job, its SLURM output file and the script.
#   partition                    Value for #SBATCH --partition (only written
#                                when cluster_specific_parameters is TRUE).
#   acct                         Currently unused: the "#SBATCH -A" line is
#                                commented out below.
#   mem_per_cpu                  Value for #SBATCH --mem-per-cpu.
#   cluster_specific_parameters  Whether to emit the partition line.
#   nodes, ntasks_per_node       Emitted only when both are non-NULL.
#   time                         Value for #SBATCH --time.
#   module_load_conda            Kept for interface compatibility. The script
#                                has always contained "module load conda"
#                                regardless of this flag; TRUE used to emit the
#                                line twice. It is now emitted exactly once.
#   run_script_path              Path to the R script to execute; must exist.
#
# Returns: list(script = <script text>, path = "slurm_scripts/<tags>.sh").
# Raises:  an error if run_script_path does not exist.
generate_slurm_script <- function(args, tag_pt1, tag_pt2, partition, acct, mem_per_cpu = "4G",
                                  cluster_specific_parameters = TRUE, nodes = NULL,
                                  ntasks_per_node = NULL, time = "24:00:00",
                                  module_load_conda = FALSE, run_script_path) {
  cpus_per_task <- if (is.null(args$num_cores) || args$num_cores == "all") {
    "#SBATCH --exclusive\n"
  } else {
    paste0("#SBATCH --cpus-per-task=", args$num_cores, "\n")
  }

  # Always set mem_per_cpu flag, even in exclusive mode
  mem_allocation <- paste0("#SBATCH --mem-per-cpu=", mem_per_cpu, "\n")

  # Construct argument string with proper quoting
  args_string <- paste0("--", names(args), "=", shQuote(as.character(args)), collapse = " ")

  # Absolute path to run_heritability.R
  run_script_abs_path <- normalizePath(run_script_path, mustWork = FALSE)
  if (!file.exists(run_script_abs_path)) {
    stop(paste("run_heritability.R not found at path:", run_script_abs_path))
  }

  # The logged command already contains single-quoted values, so the whole
  # message is shell-quoted as one unit; otherwise the inner quotes would end
  # the echo string early and the log would not match what is executed.
  exec_message <- paste0(
    "Executing Rscript with arguments: Rscript ", run_script_abs_path, " ", args_string
  )

  slurm_script <- paste0(
    "#!/bin/bash\n",
    if (cluster_specific_parameters) paste0(
      "#SBATCH --partition=", partition, "\n"
      #"#SBATCH -A ", acct, "\n"
    ),
    if (!is.null(nodes) && !is.null(ntasks_per_node)) paste0(
      "#SBATCH --nodes=", nodes, "\n",
      "#SBATCH --ntasks-per-node=", ntasks_per_node, "\n"
    ),
    cpus_per_task,
    mem_allocation,
    "#SBATCH --output=slurm_output_", tag_pt1, "-", tag_pt2, ".out\n",
    "#SBATCH --job-name=", tag_pt1, "-", tag_pt2, "\n",
    "#SBATCH --time=", time, "\n",
    "echo 'Activating Conda environment.'\n",
    # Emitted once, independent of module_load_conda (see header comment).
    "module load conda\n",
    "conda activate mwas\n",
    "echo ", shQuote(exec_message, type = "sh"), "\n",
    "Rscript ", shQuote(run_script_abs_path), " ", args_string, "\n"
  )

  list(script = slurm_script, path = paste0("slurm_scripts/", tag_pt1, "-", tag_pt2, ".sh"))
}

# ---- User settings ---------------------------------------------------------

# Overwrite flag (set by user)
overwrite <- FALSE

# Scheduler settings handed to generate_slurm_script(); adjust per cluster.
cluster_specific_parameters <- TRUE
partition <- "shared"
acct <- "jhu152"
time <- "48:00:00"
nodes <- 1
ntasks_per_node <- 1
module_load_conda <- FALSE     # Set to TRUE if you need to load Conda module

# Chromosome whose chunks will be submitted
chr_of_interest <- 6

# ---- Paths -----------------------------------------------------------------

# The working directory is taken to be the project root.
project_dir <- normalizePath(getwd())
scripts_dir <- file.path(project_dir, "scripts")
slurm_scripts_dir <- file.path(project_dir, "slurm_scripts")
heritability_results_dir <- file.path(project_dir, "heritability_results")
run_script_path <- "run_heritability.R"  # Adjust if located elsewhere
matched_df_path <- file.path(project_dir, "09.5-OUT_matched_SNP_meth_cov_chunked_JHPCE.csv")

# Make sure the output folders are present (no complaint if they already are).
invisible(lapply(
  c(slurm_scripts_dir, heritability_results_dir),
  dir.create,
  showWarnings = FALSE
))

# ---- Input table -----------------------------------------------------------

if (!file.exists(matched_df_path)) {
  stop(paste("Matched data file not found at path:", matched_df_path))
}
matched_df <- fread(matched_df_path)
matched_df <- matched_df[order(matched_df$SNP_data), ]

# Rewrite the path prefixes stored in the table from the /expanse/... layout
# to the /dcs04/... layout.
matched_df$SNP_data <- gsub(
  pattern = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/",
  replacement = "/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/",
  x = matched_df$SNP_data
)
matched_df$methylation_data <- gsub(
  pattern = "/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/",
  replacement = "/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/",
  x = matched_df$methylation_data
)
matched_df$cov_file <- gsub(
  pattern = "/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates/",
  replacement = "/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/",
  x = matched_df$cov_file
)
matched_df$modified_methylation_data <- gsub(
  pattern = "/expanse/lustre/projects/jhu152/naglemi/",
  replacement = "/dcs04/lieber/statsgen/mnagle/",
  x = matched_df$modified_methylation_data
)

# Keep only the rows for the chromosome of interest
matched_df_chr6 <- matched_df[Chr == chr_of_interest]

# For initial testing, uncomment to keep only the first 100 rows
#test_run <- TRUE
#if (test_run) {
#  matched_df_chr6 <- matched_df_chr6[1:100]
#}

# ---- Submission log --------------------------------------------------------

# Empty, typed table that the submission loop appends one record per job to.
job_log <- data.table(
  row = integer(0),
  tag_pt1 = character(0),
  tag_pt2 = character(0),
  script_path = character(0),
  sbatch_output = character(0),
  status = character(0)
)

In [None]:
# Loop over each row in matched_df_chr6 to submit jobs
#
# For every row: build the argument list, generate an sbatch script, write it
# to disk, submit it, and append one record to job_log. After a successful
# submission the loop pauses before the next one to avoid flooding the
# scheduler. The log is written to CSV once the loop finishes.

# Pause between successful submissions (seconds); loop-invariant.
sleeptime <- 301
n_jobs <- nrow(matched_df_chr6)

# seq_len() yields an empty sequence for an empty table (1:0 would not).
for (i in seq_len(n_jobs)) {

  # Extract variables from the current row
  row <- matched_df_chr6[i]
  snp_data_path <- row$SNP_data
  methylation_data_path <- row$methylation_data
  cov_file <- row$cov_file
  modified_methylation_data <- row$modified_methylation_data
  chr <- row$Chr
  chunk_start <- row$chunk_start
  chunk_end <- row$chunk_end
  region <- row$region  # Assuming 'region' corresponds to the desired naming

  # Prepare output directory for this job
  outdir <- file.path(heritability_results_dir, paste0("heritability_", region))
  dir.create(outdir, recursive = TRUE, showWarnings = FALSE)

  # Define constant arguments
  constant_args <- list(
    Chr = chr,
    SNP_data = snp_data_path,
    methylation_data = methylation_data_path,
    cov_file = cov_file,
    modified_methylation_data = modified_methylation_data,
    wind = "10000",  # Adjust as needed
    chunk_start = chunk_start,
    chunk_end = chunk_end,
    gwas = paste0(dirname(snp_data_path), "/"),    # **IMPORTANT:** Adjust to your GWAS path
    gcta = "/dcs04/lieber/statsgen/shizhong/software/gcta/gcta-1.94.1-linux-kernel-3-x86_64/gcta-1.94.1",     # **IMPORTANT:** Adjust to your GCTA executable path
    num_cores = 1                # Adjust if needed
    # outdir is intentionally not passed to the run script
  )

  # Generate tags for the job: data file stems plus a submission timestamp
  snp_base <- file_path_sans_ext(basename(snp_data_path))
  meth_base <- file_path_sans_ext(basename(methylation_data_path))
  datetime_str <- format(Sys.time(), "%Y%m%d-%H%M%S")
  tag_pt1 <- paste(snp_base, meth_base, "herit", sep = "-")
  tag_pt2 <- datetime_str

  # Set the tags in constant_args
  constant_args$tag_pt1 <- tag_pt1
  constant_args$tag_pt2 <- tag_pt2

  # Define output file path
  output_file <- file.path(outdir, paste0(tag_pt1, "-", tag_pt2, "_heritability_results.RDS"))

  # tag_pt2 is the current timestamp, so output_file itself cannot exist yet
  # and testing it directly never skips anything. Instead, look for a result
  # from an earlier run: same tag_pt1, any timestamp, same suffix.
  # NOTE(review): assumes run_heritability.R writes its results into `outdir`
  # using this naming scheme -- confirm against that script.
  if (!overwrite) {
    prior_results <- list.files(outdir)
    prior_results <- prior_results[
      startsWith(prior_results, paste0(tag_pt1, "-")) &
        endsWith(prior_results, "_heritability_results.RDS")
    ]
    if (length(prior_results) > 0) {
      message("File ", file.path(outdir, prior_results[1]),
              " already exists. Skipping job ", i, "...")
      next
    }
  }

  # Generate SLURM script; on failure keep the condition object so the
  # message can be logged without assigning across scopes.
  script_info <- tryCatch(
    generate_slurm_script(
      args = constant_args,
      tag_pt1 = tag_pt1,
      tag_pt2 = tag_pt2,
      partition = partition,
      acct = acct,
      mem_per_cpu = "4G",
      cluster_specific_parameters = cluster_specific_parameters,
      nodes = nodes,
      ntasks_per_node = ntasks_per_node,
      time = time,
      module_load_conda = module_load_conda,
      run_script_path = run_script_path
    ),
    error = function(e) e
  )

  # If script generation failed, log and skip
  if (inherits(script_info, "error")) {
    error_message <- conditionMessage(script_info)
    warning(paste("Error generating SLURM script for job", i, ":", error_message))
    job_log <- rbind(job_log, data.table(
      row = i,
      tag_pt1 = tag_pt1,
      tag_pt2 = tag_pt2,
      script_path = NA_character_,
      sbatch_output = paste("Error generating script:", error_message),
      status = "Failed"
    ))
    next
  }

  # Write the SLURM script to a file
  writeLines(script_info$script, script_info$path)

  # Submit the job using sbatch and capture the output. The path is quoted
  # because system2() hands its arguments to a shell, and a failure to launch
  # sbatch at all is recorded as a failed job instead of aborting the loop.
  sbatch_result <- tryCatch(
    system2("sbatch", args = shQuote(script_info$path), stdout = TRUE, stderr = TRUE),
    error = function(e) paste("sbatch could not be run:", conditionMessage(e))
  )

  # sbatch may return zero, one or several lines; any() keeps the condition
  # scalar, and the lines are collapsed so each job adds exactly one log row.
  status <- if (any(grepl("^Submitted batch job", sbatch_result))) {
    "Submitted"
  } else {
    "Failed"
  }
  sbatch_text <- paste(sbatch_result, collapse = "\n")

  # Log the job submission details
  job_log <- rbind(job_log, data.table(
    row = i,
    tag_pt1 = tag_pt1,
    tag_pt2 = tag_pt2,
    script_path = script_info$path,
    sbatch_output = sbatch_text,
    status = status
  ))

  # Inform the user
  message(paste0("Submitted job ", i, " with tag ", tag_pt1, "-", tag_pt2, ". Status: ", status))

  # If submission failed, skip the sleep
  if (status == "Failed") {
    next
  }

  # Throttle submissions; there is nothing to wait for after the last row.
  if (i < n_jobs) {
    message(paste0("Sleeping for ", sleeptime, " seconds to avoid overloading the scheduler."))
    Sys.sleep(sleeptime)
    cat("\n")
  }
}

# Save the job submission log
job_log_path <- file.path(slurm_scripts_dir, "job_submission_log.csv")
fwrite(job_log, file = job_log_path)

message("Job submission for Chr ", chr_of_interest, " completed. Check '", job_log_path, "' for details.")


Submitted job 1 with tag libd_chr6-chr6_AA-herit-20241008-215138. Status: Submitted

Sleeping for 301 seconds to avoid overloading the scheduler.






Submitted job 2 with tag libd_chr6-chr6_AA-herit-20241008-215639. Status: Submitted

Sleeping for 301 seconds to avoid overloading the scheduler.






Submitted job 3 with tag libd_chr6-chr6_AA-herit-20241008-220141. Status: Submitted

Sleeping for 301 seconds to avoid overloading the scheduler.






Submitted job 4 with tag libd_chr6-chr6_AA-herit-20241008-220642. Status: Submitted

Sleeping for 301 seconds to avoid overloading the scheduler.






Submitted job 5 with tag libd_chr6-chr6_AA-herit-20241008-221143. Status: Submitted

Sleeping for 301 seconds to avoid overloading the scheduler.

