# Prep SLURM commands

In [1]:
# Print the current working directory (the setwd() call further down changes it).
getwd()

In [None]:
library(data.table)
library(stringr)

# Move up one directory. The relative paths used later in this script
# ("slurm_scripts/", "scripts/CLI.R", "./output_EXPANSE_a2_<region>/") are
# resolved against the directory selected here.
setwd("../")

#' Generate Chunk Ranges with a Fixed Number of Chunks
#'
#' Splits the inclusive range `start`..`end` into exactly `num_chunks`
#' contiguous, non-overlapping chunks. Chunk sizes differ by at most one:
#' when the range length is not a multiple of `num_chunks`, the leading
#' chunks each receive one extra element. This guarantees that every chunk
#' is non-empty and that the last chunk ends exactly at `end`.
#'
#' @param start The starting point of the range to be divided (single value).
#' @param end The ending point of the range to be divided (single value,
#'   `end >= start`).
#' @param num_chunks The number of chunks to produce. Must be a whole number
#'   between 1 and the number of elements in the range.
#'
#' @return A numeric matrix with 2 rows and `num_chunks` columns. Each column
#'   describes one chunk: row 1 holds the chunk start and row 2 the chunk end.
#' @examples
#' chunk_fixed_n(1, 100, 5)
#' chunk_fixed_n(1, 10, 4) # columns: 1-3, 4-6, 7-8, 9-10
#' @export
chunk_fixed_n <- function(start, end, num_chunks) {
  if (length(start) != 1 || length(end) != 1 || length(num_chunks) != 1) {
    stop("start, end and num_chunks must be single values")
  }
  if (anyNA(c(start, end, num_chunks)) || start > end ||
        num_chunks < 1 || num_chunks != floor(num_chunks)) {
    stop("Invalid arguments: ensure start <= end and num_chunks is a whole number >= 1")
  }

  total <- end - start + 1
  if (num_chunks > total) {
    stop("num_chunks cannot exceed the number of elements in the range")
  }

  # Give every chunk the floor of the even share, then hand the leftover
  # elements out one by one to the first `remainder` chunks.
  base_size <- total %/% num_chunks
  remainder <- total %% num_chunks
  chunk_sizes <- base_size + (seq_len(num_chunks) <= remainder)

  chunk_ends <- start + cumsum(chunk_sizes) - 1
  chunk_starts <- chunk_ends - chunk_sizes + 1

  # deparse.level = 0 keeps the matrix free of row names.
  rbind(chunk_starts, chunk_ends, deparse.level = 0)
}

#' Generate Chunk Ranges with a Fixed Chunk Size
#'
#' Splits the inclusive range `start`..`end` into consecutive chunks of at
#' most `max_chunk_size` elements. The number of chunks is derived from the
#' range length; only the last chunk may be shorter than `max_chunk_size`.
#'
#' @param start The starting point of the range to be divided (single value).
#' @param end The ending point of the range to be divided (single value,
#'   `end >= start`).
#' @param max_chunk_size The maximum size that each chunk can have (> 0).
#'
#' @return A numeric matrix with 2 rows and one column per chunk. Row 1 holds
#'   the chunk starts and row 2 the chunk ends. The last chunk may be smaller
#'   than `max_chunk_size` to fit the range.
#' @examples
#' chunk_fixed_size(1, 100, 20)
#' @export
chunk_fixed_size <- function(start, end, max_chunk_size) {
  # `!= 1` (rather than `> 1`) also rejects zero-length input, which would
  # otherwise surface as a cryptic error from the comparison below.
  if (length(start) != 1 || length(end) != 1) {
    stop("start and end must be single values")
  }
  if (length(max_chunk_size) != 1 ||
        anyNA(c(start, end, max_chunk_size)) ||
        start > end || max_chunk_size <= 0) {
    stop("Invalid arguments: ensure start <= end and max_chunk_size > 0")
  }

  num_chunks <- ceiling((end - start + 1) / max_chunk_size)

  # Build all boundaries at once; the final end is clamped to `end`.
  chunk_starts <- start + (seq_len(num_chunks) - 1) * max_chunk_size
  chunk_ends <- pmin(chunk_starts + max_chunk_size - 1, end)

  # deparse.level = 0 keeps the matrix free of row names.
  rbind(chunk_starts, chunk_ends, deparse.level = 0)
}

#' Build the Text of a SLURM Batch Script that Runs scripts/CLI.R
#'
#' Every element of `args` is rendered as a `--name=value` command-line flag
#' for `Rscript scripts/CLI.R`, followed by `--tag=<tag_pt1>-<tag_pt2>`.
#' Scalar numeric values are written in fixed (non-scientific) notation so
#' that, for example, `100000` becomes `--chunk2=100000` rather than
#' `--chunk2=1e+05`.
#'
#' @param args A named list or one-row data frame of CLI arguments. If it has
#'   no `num_cores` element, or `num_cores` is `"all"`, the job requests
#'   `--exclusive`; otherwise `--cpus-per-task=<num_cores>` is used.
#' @param tag_pt1,tag_pt2 Tag components. They are joined with `-` and used
#'   for the job name, the SLURM output file name, the `--tag` CLI argument
#'   and the returned script path.
#' @param partition,acct SLURM partition and account; only written when
#'   `cluster_specific_parameters` is `TRUE`.
#' @param mem_per_cpu Value for `--mem-per-cpu` (always written, even in
#'   exclusive mode).
#' @param cluster_specific_parameters Whether to emit the partition/account
#'   directives.
#' @param nodes,ntasks_per_node Values for `--nodes` / `--ntasks-per-node`;
#'   the two directives are written only when both are non-`NULL`.
#' @param time Value for `--time`.
#' @param module_load_conda Whether to emit `module load conda` before
#'   `conda activate mwas`.
#'
#' @return A list with `script` (the script text as a single string) and
#'   `path` (`"slurm_scripts/<tag_pt1>-<tag_pt2>.sh"`). Nothing is written
#'   to disk by this function.
generate_slurm_script <- function(args, tag_pt1, tag_pt2, partition, acct,
                                  mem_per_cpu = "2G",
                                  cluster_specific_parameters = TRUE,
                                  nodes = NULL, ntasks_per_node = NULL,
                                  time = "24:00:00",
                                  module_load_conda = FALSE) {
  # `[[` avoids the partial name matching that `$` performs.
  num_cores <- args[["num_cores"]]
  cpus_per_task <- if (is.null(num_cores) || num_cores == "all") {
    "#SBATCH --exclusive\n"
  } else {
    paste0("#SBATCH --cpus-per-task=", num_cores, "\n")
  }

  # Always set mem_per_cpu flag, even in exclusive mode
  mem_allocation <- paste0("#SBATCH --mem-per-cpu=", mem_per_cpu, "\n")

  # Render one argument value. Scalar numbers are forced into fixed notation;
  # everything else keeps the conversion paste() would apply to a list element.
  format_arg_value <- function(value) {
    if (is.numeric(value) && length(value) == 1L && !is.na(value)) {
      format(value, scientific = FALSE, trim = TRUE, digits = 15)
    } else {
      as.character(list(value))
    }
  }
  arg_values <- vapply(args, format_arg_value, character(1), USE.NAMES = FALSE)

  tag <- paste0(tag_pt1, "-", tag_pt2)
  args_string <- paste0("--", names(args), "=", arg_values, collapse = " ")
  args_string <- paste0(args_string, " --tag=", tag)

  cluster_lines <- if (cluster_specific_parameters) {
    paste0(
      "#SBATCH --partition=", partition, "\n",
      "#SBATCH -A ", acct, "\n"
    )
  } else {
    ""
  }
  node_lines <- if (!is.null(nodes) && !is.null(ntasks_per_node)) {
    paste0(
      "#SBATCH --nodes=", nodes, "\n",
      "#SBATCH --ntasks-per-node=", ntasks_per_node, "\n"
    )
  } else {
    ""
  }
  module_line <- if (module_load_conda) "module load conda\n" else ""

  slurm_script <- paste0(
    "#!/bin/bash\n",
    cluster_lines,
    node_lines,
    cpus_per_task,
    mem_allocation,
    "#SBATCH --output=slurm_output_", tag, ".out\n",
    "#SBATCH --job-name=", tag, "\n",
    "#SBATCH --time=", time, "\n",
    module_line,
    "conda activate mwas\n",
    "echo 'Executing Rscript with arguments: Rscript scripts/CLI.R ", args_string, "'\n",
    "Rscript scripts/CLI.R ", args_string, "\n"
  )

  list(script = slurm_script, path = paste0("slurm_scripts/", tag, ".sh"))
}

#' Retrieve Job Names and States from sacct
#'
#' Runs `sacct --format=JobName%200,State --noheader` and parses each output
#' line into its first two whitespace-separated fields. The `%200` width
#' asks sacct for a 200-character JobName column so that long job names are
#' not truncated.
#'
#' Parsing works on trimmed lines, so it does not depend on whether sacct
#' pads the first column on the left or on the right. Only the first word of
#' the state is kept (e.g. `"CANCELLED"` for `"CANCELLED by 1234"`).
#'
#' @return A data frame with character columns `JobName` and `State`, one row
#'   per non-empty sacct output line (zero rows when sacct prints nothing).
get_job_info <- function() {
  # Use system2 to call sacct and capture output (one element per line)
  sacct_lines <- system2(
    "sacct",
    args = c("--format=JobName%200,State", "--noheader"),
    stdout = TRUE
  )

  # Drop padding and blank lines, then split each line into fields.
  sacct_lines <- trimws(sacct_lines)
  sacct_lines <- sacct_lines[nzchar(sacct_lines)]
  fields <- strsplit(sacct_lines, "\\s+")

  # Extract the i-th field of every line; NA when a line is too short.
  nth_field <- function(i) {
    vapply(
      fields,
      function(tokens) if (length(tokens) >= i) tokens[[i]] else NA_character_,
      character(1)
    )
  }

  data.frame(
    JobName = nth_field(1L),
    State = nth_field(2L),
    stringsAsFactors = FALSE
  )
}

# Cluster-specific parameters
# These globals are passed to generate_slurm_script() by the submission loop
# further down. `partition` and `acct` are only written into the script when
# `cluster_specific_parameters` is TRUE.
cluster_specific_parameters <- TRUE # Example condition
acct <- "jhu152"
time <- "48:00:00"
partition <- "shared"

# When TRUE, generate_slurm_script() adds "module load conda" before
# "conda activate mwas".
module_load_conda <- FALSE

# The --nodes / --ntasks-per-node directives are emitted only when both
# values are non-NULL.
nodes <- 1
ntasks_per_node <- 1

# Overwrite flag (set by user)
# When FALSE, the submission loop skips any chunk whose output directory
# already contains a matching rds result file.
overwrite <- FALSE

# Disabled experiments with the chunking helpers, kept for reference:
# chunk_ranges <- chunk_fixed_size(files$first_meth_index_with_SNP_coverage[i],
#                                  files$last_meth_index_with_SNP_coverage[i],
#                                  1000)
                         

# chunk_ranges <- chunk_ranges[1:2,1:2]

#chunk_ranges <- chunk_fixed_n(1, 10000, 5)

# Print the working directory that the relative path below is resolved against.
getwd()

# Table of SNP / methylation / covariate file pairings. The columns used by
# this script are SNP_data, methylation_data, cov_file,
# modified_methylation_data, chunk_start and chunk_end.
matched_df <- fread("../CpGWAS/scripts/09.5-OUT_matched_SNP_meth_cov_chunked_JHPCE.csv")

# Process rows in SNP-file order.
matched_df <- matched_df[order(matched_df$SNP_data), ]

dim(matched_df)

# NOTE(review): second identical dim() call; looks like notebook residue.
dim(matched_df)

# Rewrite the path prefixes stored in the table (/dcs04/lieber/statsgen/...)
# to the /expanse/lustre/projects/jhu152/naglemi/... layout.
# NOTE(review): presumably this maps JHPCE paths onto the Expanse cluster;
# confirm the directory layouts match.
matched_df$SNP_data <- gsub("/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/", "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/", matched_df$SNP_data)
matched_df$methylation_data <- gsub("/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/", "/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/", matched_df$methylation_data)
matched_df$cov_file <- gsub("/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/", "/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates/", matched_df$cov_file)

# Unlike the three columns above, this one replaces only the user-level
# prefix, so whatever follows "mnagle/" in the stored path is kept as is.
matched_df$modified_methylation_data <- gsub("/dcs04/lieber/statsgen/mnagle/", "/expanse/lustre/projects/jhu152/naglemi/", matched_df$modified_methylation_data)

# Submit one SLURM job per row of matched_df (one row = one chunk) for every
# combination of the varying parameters below. A chunk is skipped when
# `overwrite` is FALSE and its output directory already holds an .rds file
# whose name contains the chunk's tag.

# Varying parameters (identical for every chunk, so defined once up front)
cv_eval_modes <- c("dynamic")
cores_per_alphas <- c("all") # use NA to signify the default value should be used
num_cores_options <- c(1)

# Pause between submissions, in seconds (job submission rate limiting)
sleeptime <- 61

# seq_len() yields an empty sequence for an empty table, unlike 1:nrow().
for (i in seq_len(nrow(matched_df))) {
  # The region is the path component between "pheno/" and "/out" in the
  # methylation data path; each region gets its own output directory.
  region <- str_split_fixed(
    str_split_fixed(matched_df$methylation_data[i], "pheno/", 2)[, 2],
    "/out", 2
  )[, 1]
  outdir <- paste0("./output_EXPANSE_a2_", region, "/")
  if (!dir.exists(outdir)) dir.create(outdir)

  chunk_size <- matched_df$chunk_end[i] - matched_df$chunk_start[i] + 1

  # Constant Arguments Setup
  # chunk1/chunk2 are passed as 1..chunk_size rather than the absolute
  # chunk_start/chunk_end values.
  # NOTE(review): presumably the methInput_rds_path file already holds only
  # this chunk, so indices are relative to it; confirm against scripts/CLI.R.
  constant_args_df <- data.frame(
    outdir = outdir,
    snp_data_path = matched_df$SNP_data[i],
    methylation_data_path = matched_df$methylation_data[i],
    cov = matched_df$cov_file[i],
    verbose = TRUE,
    lambda_choice = "1se",
    alphas = 0.5,
    allow_inefficient_parallelization = FALSE,
    n_fold = 5,
    window_sizes = "10000",
    #window_sizes = "500000",
    save_evaluation_results_each_fold = FALSE,
    save_glmnet_object = FALSE,
    omit_folds_with_na_r = TRUE,
    methInput_rds_path = matched_df$modified_methylation_data[i],
    chunk1 = 1,
    chunk2 = chunk_size
  )

  # Loop through each combination
  for (cv_eval_mode in cv_eval_modes) {
    for (cores_per_alpha in cores_per_alphas) {
      for (num_cores in num_cores_options) {
        # Update constant_args_df for the current combination
        constant_args_df$cv_eval_mode <- cv_eval_mode
        constant_args_df$num_cores <- num_cores
        if (!is.na(cores_per_alpha)) {
          constant_args_df$cores_per_alpha <- cores_per_alpha
        } else {
          # Dropping the column omits the flag so the CLI default applies.
          constant_args_df$cores_per_alpha <- NULL
        }

        # Generate tags
        snp_base <- tools::file_path_sans_ext(basename(constant_args_df$snp_data_path))
        meth_base <- tools::file_path_sans_ext(basename(constant_args_df$methylation_data_path))
        cores_tag <- if (is.na(cores_per_alpha)) {
          "defaultcore"
        } else {
          paste0(cores_per_alpha, "corepera")
        }
        tag_pt1 <- paste(
          snp_base,
          meth_base,
          format(matched_df$chunk_start[i], scientific = FALSE),
          format(matched_df$chunk_end[i], scientific = FALSE),
          cv_eval_mode,
          paste0(num_cores, "corestotal"),
          cores_tag,
          sep = "-"
        )
        tag_pt2 <- format(Sys.time(), "%Y%m%d-%H%M%S")

        # Look for existing results of this chunk: .rds files whose name
        # contains the tag. The tag is matched literally (not as a regex) and
        # the extension is anchored so directory names cannot match by accident.
        output_files <- list.files(path = outdir, full.names = TRUE)
        has_tag <- grepl(tag_pt1, basename(output_files), fixed = TRUE)
        is_rds <- grepl("\\.rds$", output_files)
        existing_files <- output_files[has_tag & is_rds]

        if (!overwrite && length(existing_files) > 0) {
          #message("File with tag ", tag_pt1, " already exists. Skipping...")
          next
        }

        # Progress indicator: row index of the chunk being submitted
        print(i)

        # A sacct-based check for already queued jobs with the same tag (via
        # get_job_info()) used to sit here; it is currently disabled.

        # Generate and print SLURM script
        script_info <- generate_slurm_script(args = constant_args_df,
                                             tag_pt1 = tag_pt1,
                                             tag_pt2 = tag_pt2,
                                             partition = partition,
                                             acct = acct,
                                             mem_per_cpu = "2G",
                                             cluster_specific_parameters = cluster_specific_parameters,
                                             nodes = nodes, ntasks_per_node = ntasks_per_node,
                                             time = time, module_load_conda = module_load_conda)

        cat(script_info$script)

        # Make sure the script directory exists before writing into it.
        dir.create(dirname(script_info$path), recursive = TRUE, showWarnings = FALSE)
        writeLines(script_info$script, script_info$path)

        # Submit the SLURM job using the sbatch command. The path is
        # shell-quoted, and a failed submission is reported instead of ignored.
        submit_status <- system2("sbatch", args = shQuote(script_info$path))
        if (submit_status != 0) {
          warning("sbatch exited with status ", submit_status, " for ",
                  script_info$path, call. = FALSE, immediate. = TRUE)
        }

        # Implement job submission limits and intervals if necessary
        message(paste0("Sleeping for ", sleeptime, " seconds"))
        Sys.sleep(sleeptime)
        cat("\n")
      }
    }
  }
}

In [19]:
# Echo the SLURM partition value set earlier in this script (interactive check).
partition

In [20]:
# Echo the SLURM account value set earlier in this script (interactive check).
acct