# Prepare sample methylation subset for benchmarking

In [1]:
#!/usr/bin/env Rscript
#profvis({

#options(error = recover)
#options(error = traceback)

# Record the wall-clock time at script start. Nothing in this section reads
# `start_time`; presumably a later step uses it to report elapsed runtime
# (TODO confirm).
start_time <- Sys.time()  # Start time capture

library(CpGWAS)
library(optparse)

# Run configuration, written as a literal list (no command-line parsing is
# done in this section). Element order is kept stable because the list is
# printed verbatim when `verbose` is TRUE.
args <- list(
  # --- Output location and site chunk bounds ---
  outdir = "./output/",
  chunk1 = 7801,
  chunk2 = 8000,

  # --- Input files ---
  snp_data_path = "/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr1.pgen",
  methylation_data_path = "/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr1_all.rda",
  cov = "/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_dlpfc.csv",

  # --- Logging ---
  verbose = TRUE,

  # --- Model tuning grid ---
  lambda_choice = "1se",
  alphas = seq(from = 0, to = 1, by = 0.25),

  # --- Parallelism ("all" for num_cores is resolved to a core count below) ---
  cores_per_alpha = "all",
  num_cores = "all", # future::availableCores(),
  allow_inefficient_parallelization = FALSE,

  # --- Cross-validation and window sizes ---
  n_fold = 5,
  window_sizes = seq(from = 2000, to = 10000, by = 2000),

  # --- Run tag: timestamp at the moment the list is built ---
  tag = format(Sys.time(), "%Y%m%d-%H%M%S"),

  # --- Result persistence and evaluation mode ---
  save_evaluation_results_each_fold = FALSE,
  save_glmnet_object = FALSE,
  cv_eval_mode = "dynamic",
  omit_folds_with_na_r = TRUE,

  # Left unset on purpose here: with no cached RDS path, the
  # MethylationInput object is built from scratch further down.
  # methInput_rds_path = "~/data/chr1_AA_methylation_10k_samples.rds",
  maf = 0.01
)


#saveRDS(args, file = file.path(args$outdir, paste0(args$tag, "-args.rds")))
#args <- readRDS("output/libd_chr1-chr1_AA-static-1core-20240129-123107-args.rds")
# Validate required inputs first, so a misconfigured run fails before anything
# is printed or created on disk.
if (is.null(args$snp_data_path) || is.null(args$methylation_data_path)) {
  stop("Paths to both SNP and methylation data are required.")
}

# "all" is a placeholder that is replaced by the core count reported by
# future::availableCores(). identical() keeps the condition a safe scalar
# even if num_cores is NULL or a vector.
if (identical(args$num_cores, "all")) {
  args$num_cores <- future::availableCores()
}

# Echo the resolved configuration for the run log.
if (isTRUE(args$verbose)) {
  print(args)
}

# Create the output directory, including any missing parent directories.
# dir.create() only warns on failure, so turn a failed creation into an error
# instead of letting later writes fail somewhere less obvious.
if (!dir.exists(args$outdir) &&
    !dir.create(args$outdir, recursive = TRUE)) {
  stop("Could not create output directory: ", args$outdir)
}

In [5]:
# Load the methylation .rda into the current environment. load() restores
# objects under the names they were saved with and returns those names, so
# check explicitly that `BSobj2` (used by the following steps) is present
# rather than failing later with "object 'BSobj2' not found".
if (!file.exists(args$methylation_data_path)) {
  stop("Methylation data file not found: ", args$methylation_data_path)
}
loaded_objects <- load(args$methylation_data_path)
if (!"BSobj2" %in% loaded_objects) {
  stop(
    "Expected object 'BSobj2' in ", args$methylation_data_path,
    " but found: ", paste(loaded_objects, collapse = ", ")
  )
}

In [6]:
#' Randomly subsample methylation sites from a MethylationInput object
#'
#' Draws `num_sites` distinct site indices without replacement and subsets
#' both the `methylations` slot (by column) and the `methylations_positions`
#' slot to those sites, in sampled order.
#'
#' @param object A `MethylationInput` object.
#' @param num_sites Number of sites to keep; a single number greater than 0
#'   and no larger than `length(object@methylations_positions)`.
#' @param seed Optional RNG seed. When not NULL, `set.seed(seed)` is called
#'   before sampling so the draw is reproducible. When NULL, the current RNG
#'   state is used.
#' @return The input object with `methylations` and `methylations_positions`
#'   reduced to the sampled sites.
#'
#' The generic declares `seed = NULL` as well: in S4, a method's default for
#' an argument is only used when the generic also has some default for it.
setGeneric("sampleMethylationSites", function(object, num_sites, seed = NULL) {
  standardGeneric("sampleMethylationSites")
})

setMethod(
  "sampleMethylationSites",
  "MethylationInput",
  function(object, num_sites, seed = NULL) {
    total_sites <- length(object@methylations_positions)

    # Reject anything that is not a single usable count within range.
    if (!is.numeric(num_sites) || length(num_sites) != 1L ||
        is.na(num_sites) || num_sites <= 0 || num_sites > total_sites) {
      stop("The number of sites must be greater than 0 and less than or equal to the total number of methylation sites.")
    }

    # Seed only on request; the caller's seed is the one that takes effect.
    if (!is.null(seed)) {
      set.seed(seed)
    }

    # sample.int() draws from 1..total_sites directly, avoiding the
    # 1:length(x) construction.
    selected_indices <- sample.int(total_sites, size = num_sites,
                                   replace = FALSE)

    # Subset methylations and methylations_positions using selected_indices;
    # drop = FALSE keeps the matrix shape even when a single site is selected.
    object@methylations <- object@methylations[, selected_indices, drop = FALSE]
    object@methylations_positions <- object@methylations_positions[selected_indices]

    object
  }
)


In [7]:
# Report the estimated in-memory size of the loaded BSobj2 object (auto-printed
# at top level).
object.size(BSobj2)

17754792 bytes

In [8]:
# Pt. 2: Initialize (or load) MethylationInput object -------------------------------
#
# If args$methInput_rds_path is set and the file exists, rebuild the object
# from that cached RDS for the chunk1..chunk2 site range. Otherwise build a new
# object from BSobj2 spanning every site, subsample 10,000 sites with a fixed
# seed, and save the result to disk.

if (!is.null(args$methInput_rds_path) && file.exists(args$methInput_rds_path)) {
  if (args$verbose) {
    message("Loading MethylationInput object from RDS file: ", args$methInput_rds_path)
  }
  methInput <- reinitializeMethylationInput(rds_path = args$methInput_rds_path,
                                            snp_data_path = args$snp_data_path,
                                            start_site = args$chunk1,
                                            end_site = args$chunk2,
                                            no_cores = args$num_cores)
} else {
  if (args$verbose) {
    message("Creating new MethylationInput object")
  }
  # Span all sites (not just chunk1..chunk2) so the random subset below is
  # drawn from the full set of sites in BSobj2.
  methInput <- new("MethylationInput",
                   BSseq_obj = BSobj2,
                   snp_data_path = args$snp_data_path,
                   cov_path = args$cov,
                   start_site = 1, #args$chunk1,
                   end_site = length(BSobj2@rowRanges@seqnames), #args$chunk2,
                   no_cores = args$num_cores)
  # Drop references to the large inputs so their memory can be reclaimed.
  # `means` and `sds` presumably come from the loaded .rda as well (TODO
  # confirm); assigning NULL creates them if they did not exist.
  BSobj2 <- means <- sds <- NULL
  # Spell out `num_sites` in full rather than relying on partial argument
  # matching from `n`.
  methInput <- sampleMethylationSites(methInput, num_sites = 10000, seed = 42)
  saveRDS(methInput, "/dcs04/lieber/statsgen/mnagle/mwas/chr1_dfplc_all_methylation_10k_samples_a3.rds")
}

Creating new MethylationInput object

