In [1]:
# Version to run in console

files_skipped <- 0
files_not_skipped <- 0

library(CpGWAS)
library(data.table)
setDTthreads(6)

library(stringr)
library(optparse)
library(doParallel)
library(foreach)

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



In [2]:
# library(future.apply)
# plan(multicore)

In [3]:
cl <- makeCluster(6)  # Leave one core free for other tasks
registerDoParallel(cl)

In [4]:
chr_list <- 1:22

this_chr <- 9

machine <- "notlocal"

print(paste0("chr: ", this_chr))
opt <- list(genome_file_index = this_chr,
            data_file = "/expanse/lustre/projects/jhu152/naglemi/mwas/CpGWAS/scripts/12-OUT_matched_SNP_meth_cov_outputs.csv",
            cores = 1)

#if(opt$cores > 1) registerDoParallel(opt$cores)

# Load genome files
genome_files <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas_flipped",
                           pattern = "EUR", full.names = TRUE)
genome_files <- genome_files[grepl("allele", genome_files)]
genome_files <- genome_files[grepl("a2", genome_files)]

genome_files <- genome_files[grepl("pvar", genome_files)]

genome_files <- data.table(path = genome_files, Chr = NA)

genome_files$Chr <- str_split_fixed(genome_files$path, "chr", 2)[, 2]
genome_files$Chr <- gsub("\\.pvar", "", genome_files$Chr)

genome_files$Chr <- as.integer(genome_files$Chr)
genome_files <- genome_files[order(genome_files$Chr), ]

df <- fread(opt$data_file)

if(machine == "local"){
  df$path <- gsub("..//output_EXPANSE_a2", "~/data/stage1_output_a3/output_EXPANSE_dt", df$path)
}


df$path <- gsub("_a2_", "_dt_", df$path)
df$path <- gsub("\\.rds", "_dt_combined.csv", df$path)

print("Starting genome file processing")
# Process the specified genome file
g <- opt$genome_file_index
print(paste("Processing genome file index:", g))

paths <- list(
  pvar_path = genome_files[g]$path,
  pgen_path = gsub("pvar", "pgen", genome_files[g]$path),
  psam_path = gsub("pvar", "psam", genome_files[g]$path)
)

chr <- as.numeric(gsub("\\.pvar", "",
                       stringr::str_split_fixed(paths$pvar_path, "chr", 2)[, 2]))

my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)
setkey(my_SNPs$pvar_dt, POS)
df_this_chr <- df[which(df$Chr == genome_files[g]$Chr), ]


summary_stats_list <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas", pattern = "alleleprocessed", full.names = TRUE)
summary_stats_list <- summary_stats_list[grepl("a3", summary_stats_list)]

# Pre-load all summary stats files into a list and clean/standardize column names
summary_stats_data <- lapply(summary_stats_list, function(path) {
  cols_to_load <- c("CHR", "BP", "SNP", "A1", "A2", "Z", "SE", "PVAL")
  stats <- suppressWarnings(data.table::fread(path, select = cols_to_load))
  
  # Rename column if needed (uncomment if column name needs changing from "#CHROM" to "CHR")
  # setnames(stats, "#CHROM", "CHR")
  
  # Set key for efficient subsetting
  setkey(stats, BP)
  
  # Subset to only include rows where 'CHR' matches the 'chr' variable
  stats <- stats[CHR == chr]
  
  # Clean and standardize column names if needed
  # clean_and_standardize_colnames(stats)
  
  return(stats)
})

summary_stats_data <- lapply(summary_stats_data, function(stats) stats[`CHR` == genome_files[g]$Chr])

print("Loaded SNP data")
print("Files for this Chr:")
print(nrow(df_this_chr))

[1] "chr: 9"
[1] "Starting genome file processing"
[1] "Processing genome file index: 9"
[1] "Loaded SNP data"
[1] "Files for this Chr:"
[1] 486


In [5]:
# for(j in 1:nrow(df_this_chr)){
# #for(j in 1:3){
#   print(paste0("File number: ", j))
  
#   start_time <- Sys.time()  # Start timing the loop iteration
  
#   if (grepl("empty", df_this_chr$path[j])) {
#     message(paste0("No model for ", df_this_chr$path[j]))
#     next
#   }
  
#   all_files_exist <- TRUE
#   outnames <- vector("character", length(summary_stats_list))
  
#   # Check if output files already exist
#   for (k in 1:length(summary_stats_list)) {
#     outnames[k] <- gsub("\\.rds$", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])
#     if (!file.exists(outnames[k])) {
#       all_files_exist <- FALSE
#       break  # Exit the loop early if any file does not exist
#     }
#   }
  
#   # Load the CSV file
#   load_csv_start <- Sys.time()
#   my_csv <- fread(df_this_chr$path[j])
#   cgs <- levels(factor(my_csv$cg))
#   cgs <- as.integer(cgs)
  
#   setkey(my_csv,
#          #cg,
#          features)
  
#   my_csv_list <- split(my_csv, by = "cg")
  
#   # Ensure that 'cgs' is ordered and unique if it's not already
#   cgs <- unique(cgs[order(cgs)])
  
#   #print(paste("Loaded csv file:", df_this_chr$path[j]))
#   load_csv_end <- Sys.time()
#   #print(paste("Time to load CSV:", load_csv_end - load_csv_start))
  
#   # Process each summary statistics file
#   for (k in 1:length(summary_stats_list)) {
#     process_start <- Sys.time()
    
#     outname <- gsub("_dt_combined.csv", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])

#     summary_stats <- summary_stats_data[[k]]
    
#     matching_bps <- unique(my_csv$features)

#     # Subset summary_stats to only include rows with BPs in the matching_bps vector
#     summary_stats_sub <- summary_stats[BP %in% matching_bps]
    
#     MWASmodels <- vector("list", length(cgs))
    
    
#     # Now, use this pre-structured data in your loop to eliminate the bottleneck
#     for (i in seq_len(length(cgs))) {
#       print(i)
#       print(Sys.time())
#       this_cg <- cgs[i]
#       this_cg_data <- my_csv_list[[as.character(this_cg)]]
#       if (!is.null(this_cg_data)) {
#         MWASmodels[[i]] <- process_model_csv(this_cg_data, my_SNPs, summary_stats_sub)
#         MWASmodels[[i]]['bp'] <- MWASmodels[[i]]['bp'] <- this_cg
#       }
#     }
  
    
#     MWASmodels <- Filter(Negate(is.null), MWASmodels)  # Remove NULL entries
#     results_end <- Sys.time()
#     # print(paste("Time to create MWAS results:", results_end - results_start))
    
#     # Save the results
#     save_start <- Sys.time()
#     #saveRDS(results, outname)
    
#     df <- do.call(rbind, MWASmodels)
#     df <- as.data.frame(df)
    
#     outname_csv <- gsub("rds", "csv", outname)
#     fwrite(df, outname_csv)
#     save_end <- Sys.time()
#     #print(paste("Saved results to:", outname))
#     #print(paste("Time to save results:", save_end - save_start))
    
#     process_end <- Sys.time()
#     #print(paste("Total time to process summary stats file", k, ":", process_end - process_start))
#   }
  
#   loop_end_time <- Sys.time()  # End timing the loop iteration
#   print(paste("Total time for file number", j, ":", loop_end_time - start_time))
# }

In [10]:
Sys.time()

[1] "2024-08-29 06:46:31 PDT"

In [11]:
library(foreach)
library(doParallel)
library(data.table)

# Set up parallel backend
n_cores <- 4
cl <- makeCluster(n_cores)
registerDoParallel(cl)

# Export entire environment
foreach(j = 1:10, .packages = c("data.table", "CpGWAS"), .export = ls()) %dopar% {
    
  my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)
  setkey(my_SNPs$pvar_dt, POS)
  print(paste0("File number: ", j))
  
  start_time <- Sys.time()

  if (grepl("empty", df_this_chr$path[j])) {
    message(paste0("No model for ", df_this_chr$path[j]))
    next
  }
  
  all_files_exist <- TRUE
  outnames <- vector("character", length(summary_stats_list))

  for (k in 1:length(summary_stats_list)) {
    outnames[k] <- gsub("\\.rds$", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])
    if (!file.exists(outnames[k])) {
      all_files_exist <- FALSE
      break
    }
  }

  load_csv_start <- Sys.time()
  my_csv <- fread(df_this_chr$path[j])
  cgs <- unique(as.integer(my_csv$cg))
  setkey(my_csv, features)
  my_csv_list <- split(my_csv, by = "cg")
  load_csv_end <- Sys.time()

  for (k in 1:length(summary_stats_list)) {
    process_start <- Sys.time()

    outname <- gsub("_dt_combined.csv", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])
    summary_stats <- summary_stats_data[[k]]
    matching_bps <- unique(my_csv$features)
    summary_stats_sub <- summary_stats[BP %in% matching_bps]

    MWASmodels <- vector("list", length(cgs))

    for (i in seq_len(length(cgs))) {
      this_cg_data <- my_csv_list[[as.character(cgs[i])]]
      if (!is.null(this_cg_data)) {
        MWASmodels[[i]] <- process_model_csv(this_cg_data, my_SNPs, summary_stats_sub)
        MWASmodels[[i]]['bp'] <- cgs[i]
      }
    }

    MWASmodels <- Filter(Negate(is.null), MWASmodels)
    
    df <- do.call(rbind, MWASmodels)
    df <- as.data.frame(df)
    outname_csv <- gsub("rds", "csv", outname)
    fwrite(df, outname_csv)

    process_end <- Sys.time()
  }
  
  loop_end_time <- Sys.time()
  print(paste("Total time for file number", j, ":", loop_end_time - start_time))
}

# Stop the cluster
stopCluster(cl)

“already exporting variable(s): df, df_this_chr, my_SNPs, paths, summary_stats_data, summary_stats_list”


In [12]:
Sys.time() # started 9:42am

[1] "2024-08-29 06:52:52 PDT"

In [13]:
Sys.time()

[1] "2024-08-29 06:56:42 PDT"

In [14]:
library(foreach)
library(doParallel)
library(data.table)

# Set up parallel backend
n_cores <- 4
cl <- makeCluster(n_cores)
registerDoParallel(cl)

# Export entire environment
foreach(j = 1:10, .packages = c("data.table", "CpGWAS"), .export = ls()) %dopar% {
    
  my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)
  setkey(my_SNPs$pvar_dt, POS)
  print(paste0("File number: ", j))
  
  start_time <- Sys.time()

  if (grepl("empty", df_this_chr$path[j])) {
    message(paste0("No model for ", df_this_chr$path[j]))
    next
  }
  
  all_files_exist <- TRUE
  outnames <- vector("character", length(summary_stats_list))

  for (k in 1:length(summary_stats_list)) {
    outnames[k] <- gsub("\\.rds$", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])
    if (!file.exists(outnames[k])) {
      all_files_exist <- FALSE
      break
    }
  }

  load_csv_start <- Sys.time()
  my_csv <- fread(df_this_chr$path[j])
  cgs <- unique(as.integer(my_csv$cg))
  setkey(my_csv, features)
  my_csv_list <- split(my_csv, by = "cg")
  load_csv_end <- Sys.time()

  for (k in 1:length(summary_stats_list)) {
    process_start <- Sys.time()

    outname <- gsub("_dt_combined.csv", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])
    summary_stats <- summary_stats_data[[k]]
    matching_bps <- unique(my_csv$features)
    summary_stats_sub <- summary_stats[BP %in% matching_bps]

    MWASmodels <- vector("list", length(cgs))

    for (i in seq_len(length(cgs))) {
      this_cg_data <- my_csv_list[[as.character(cgs[i])]]
      if (!is.null(this_cg_data)) {
        MWASmodels[[i]] <- process_model_csv(this_cg_data, my_SNPs, summary_stats_sub)
        MWASmodels[[i]]['bp'] <- cgs[i]
      }
    }

    MWASmodels <- Filter(Negate(is.null), MWASmodels)
    
    df <- do.call(rbind, MWASmodels)
    df <- as.data.frame(df)
    outname_csv <- gsub("rds", "csv", outname)
    fwrite(df, outname_csv)

    process_end <- Sys.time()
  }
  
  loop_end_time <- Sys.time()
  print(paste("Total time for file number", j, ":", loop_end_time - start_time))
}

# Stop the cluster
stopCluster(cl)

“already exporting variable(s): df, df_this_chr, my_SNPs, paths, summary_stats_data, summary_stats_list”


ERROR: Error in unserialize(socklist[[n]]): error reading from connection


In [None]:
Sys.time() # started 9:42am

In [None]:
my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)
setkey(my_SNPs$pvar_dt, POS)

In [4]:
summary_stats_list

In [None]:
files_skipped <- 0
files_not_skipped <- 0

library(CpGWAS)
library(data.table)
library(stringr)
library(optparse)
library(doParallel)
library(foreach)

chr_list <- 1:22

for(this_chr in chr_list){
print(paste0("chr: ", this_chr))
opt <- list(genome_file_index = this_chr,
            data_file = "/expanse/lustre/projects/jhu152/naglemi/mwas/CpGWAS/scripts/12-OUT_matched_SNP_meth_cov_outputs.csv",
            cores = 1)

#if(opt$cores > 1) registerDoParallel(opt$cores)

# Load genome files
genome_files <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas_flipped",
                           pattern = "EUR", full.names = TRUE)
genome_files <- genome_files[grepl("allele", genome_files)]
genome_files <- genome_files[grepl("a2", genome_files)]

genome_files <- genome_files[grepl("pvar", genome_files)]

genome_files <- data.table(path = genome_files, Chr = NA)

genome_files$Chr <- str_split_fixed(genome_files$path, "chr", 2)[, 2]
genome_files$Chr <- gsub("\\.pvar", "", genome_files$Chr)

genome_files$Chr <- as.integer(genome_files$Chr)
genome_files <- genome_files[order(genome_files$Chr), ]

df <- fread(opt$data_file)
df$path <- gsub("_a2_", "_dt_", df$path)
df$path <- gsub("\\.rds", "_dt_combined.csv", df$path)

print("Starting genome file processing")
# Process the specified genome file
g <- opt$genome_file_index
print(paste("Processing genome file index:", g))

paths <- list(
  pvar_path = genome_files[g]$path,
  pgen_path = gsub("pvar", "pgen", genome_files[g]$path),
  psam_path = gsub("pvar", "psam", genome_files[g]$path)
)

chr <- as.numeric(gsub("\\.pvar", "",
            stringr::str_split_fixed(paths$pvar_path, "chr", 2)[, 2]))

my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)
setkey(my_SNPs$pvar_dt, `#CHROM`, POS)
df_this_chr <- df[which(df$Chr == genome_files[g]$Chr), ]


summary_stats_list <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas", pattern = "alleleprocessed", full.names = TRUE)
summary_stats_list <- summary_stats_list[grepl("a3", summary_stats_list)]

# Pre-load all summary stats files into a list and clean/standardize column names
summary_stats_data <- lapply(summary_stats_list, function(path) {
   stats <- suppressWarnings(data.table::fread(path))
   
   # Rename column if needed (uncomment if column name needs changing from "#CHROM" to "CHR")
   # setnames(stats, "#CHROM", "CHR")
   
   # Set key for efficient subsetting
   setkey(stats, SNP, CHR)
   
   # Subset to only include rows where 'CHR' matches the 'chr' variable
   stats <- stats[CHR == chr]

   # Clean and standardize column names if needed
   # clean_and_standardize_colnames(stats)
   
   return(stats)
})

summary_stats_data <- lapply(summary_stats_data, function(stats) stats[`CHR` == genome_files[g]$Chr])

print("Loaded SNP data")
print("Files for this Chr:")
print(nrow(df_this_chr))

for(j in 1:nrow(df_this_chr)){
#for(j in 1:10){
  print(paste0("File number: ", j))
  
  start_time <- Sys.time()  # Start timing the loop iteration

  if (grepl("empty", df_this_chr$path[j])) {
    message(paste0("No model for ", df_this_chr$path[j]))
    next
  }

  all_files_exist <- TRUE
  outnames <- vector("character", length(summary_stats_list))

  # Check if output files already exist
  for (k in 1:length(summary_stats_list)) {
    outnames[k] <- gsub("\\.rds$", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])
    if (!file.exists(outnames[k])) {
      all_files_exist <- FALSE
      break  # Exit the loop early if any file does not exist
    }
  }

  #if (all_files_exist) {
  #  print(paste("All output files already exist for", df_this_chr$path[j], "- Skipping"))
  #  next  # Skip to the next file if all outputs already exist
  #}

  # Load the CSV file
  load_csv_start <- Sys.time()
  my_csv <- fread(df_this_chr$path[j])
  setkey(my_csv, cg)
  cgs <- levels(factor(my_csv$cg))
  #print(paste("Loaded csv file:", df_this_chr$path[j]))
  load_csv_end <- Sys.time()
  #print(paste("Time to load CSV:", load_csv_end - load_csv_start))

  # Process each summary statistics file
  for (k in 1:length(summary_stats_list)) {
    process_start <- Sys.time()
    
    outname <- gsub("_dt_combined.csv", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])
    if(file.exists(outname)){
        files_skipped <- files_skipped + 1
        next
    } else {
        files_not_skipped <- files_not_skipped + 1
    }
    summary_stats <- summary_stats_data[[k]]

    MWASmodels <- vector("list", length(cgs))

    # Process each CpG site
    for (i in seq_along(cgs)) {
    #for(i in 1:10){
      this_cg <- cgs[i]
      
      process_cg_start <- Sys.time()

      this_cg_data <- my_csv[cg == this_cg]
      MWASmodels[[i]] <- process_model_csv(this_cg_data, my_SNPs, summary_stats)

      if(is.null(MWASmodels[[i]])){
        next
      }
      
      MWASmodels[[i]]['bp'] <- this_cg

      process_cg_end <- Sys.time()
  #    print(paste("Time to process CpG:", this_cg, ":", process_cg_end - process_cg_start))
    }

    # Create MWAS results
    results_start <- Sys.time()
    results <- MWASresults(MWASmodels, paths$pvar_path, paths$pgen_path, paths$psam_path, summary_stats_list[[k]], df_this_chr$path[j])
    results@MWASmodels <- Filter(Negate(is.null), results@MWASmodels)  # Remove NULL entries
    results_end <- Sys.time()
   # print(paste("Time to create MWAS results:", results_end - results_start))
    
    # Save the results
    save_start <- Sys.time()
    #saveRDS(results, outname)
    
    df <- do.call(rbind, results@MWASmodels)
    df <- as.data.frame(df)

    outname_csv <- gsub("rds", "csv", outname)
    fwrite(df, outname_csv)
    save_end <- Sys.time()
    #print(paste("Saved results to:", outname))
    #print(paste("Time to save results:", save_end - save_start))

    process_end <- Sys.time()
    #print(paste("Total time to process summary stats file", k, ":", process_end - process_start))
  }
  
  loop_end_time <- Sys.time()  # End timing the loop iteration
  print(paste("Total time for file number", j, ":", loop_end_time - start_time))
}
}

opt <- list(genome_file_index = this_chr,
            data_file = "/expanse/lustre/projects/jhu152/naglemi/mwas/CpGWAS/scripts/12-OUT_matched_SNP_meth_cov_outputs.csv",
            cores = 1)

#if(opt$cores > 1) registerDoParallel(opt$cores)

# Load genome files
genome_files <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas_flipped",
                           pattern = "EUR", full.names = TRUE)
genome_files <- genome_files[grepl("allele", genome_files)]
genome_files <- genome_files[grepl("a2", genome_files)]

genome_files <- genome_files[grepl("pvar", genome_files)]

genome_files <- data.table(path = genome_files, Chr = NA)

genome_files$Chr <- str_split_fixed(genome_files$path, "chr", 2)[, 2]
genome_files$Chr <- gsub("\\.pvar", "", genome_files$Chr)

genome_files$Chr <- as.integer(genome_files$Chr)
genome_files <- genome_files[order(genome_files$Chr), ]

df <- fread(opt$data_file)
df$path <- gsub("_a2_", "_dt_", df$path)
df$path <- gsub("\\.rds", "_dt_combined.csv", df$path)

print("Starting genome file processing")
# Process the specified genome file
g <- opt$genome_file_index
print(paste("Processing genome file index:", g))

paths <- list(
  pvar_path = genome_files[g]$path,
  pgen_path = gsub("pvar", "pgen", genome_files[g]$path),
  psam_path = gsub("pvar", "psam", genome_files[g]$path)
)

chr <- as.numeric(gsub("\\.pvar", "",
            stringr::str_split_fixed(paths$pvar_path, "chr", 2)[, 2]))

my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)
setkey(my_SNPs$pvar_dt, `#CHROM`, POS)
df_this_chr <- df[which(df$Chr == genome_files[g]$Chr), ]


summary_stats_list <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas", pattern = "alleleprocessed", full.names = TRUE)
summary_stats_list <- summary_stats_list[grepl("a3", summary_stats_list)]

# Pre-load all summary stats files into a list and clean/standardize column names
summary_stats_data <- lapply(summary_stats_list, function(path) {
   stats <- suppressWarnings(data.table::fread(path))
   
   # Rename column if needed (uncomment if column name needs changing from "#CHROM" to "CHR")
   # setnames(stats, "#CHROM", "CHR")
   
   # Set key for efficient subsetting
   setkey(stats, SNP, CHR)
   
   # Subset to only include rows where 'CHR' matches the 'chr' variable
   stats <- stats[CHR == chr]

   # Clean and standardize column names if needed
   # clean_and_standardize_colnames(stats)
   
   return(stats)
})

summary_stats_data <- lapply(summary_stats_data, function(stats) stats[`CHR` == genome_files[g]$Chr])

print("Loaded SNP data")
print("Files for this Chr:")
print(nrow(df_this_chr))

for(j in 1:nrow(df_this_chr)){
#for(j in 1:10){
  print(paste0("File number: ", j))
  
  start_time <- Sys.time()  # Start timing the loop iteration

  if (grepl("empty", df_this_chr$path[j])) {
    message(paste0("No model for ", df_this_chr$path[j]))
    next
  }

  all_files_exist <- TRUE
  outnames <- vector("character", length(summary_stats_list))

  # Check if output files already exist
  for (k in 1:length(summary_stats_list)) {
    outnames[k] <- gsub("\\.rds$", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])
    if (!file.exists(outnames[k])) {
      all_files_exist <- FALSE
      break  # Exit the loop early if any file does not exist
    }
  }

  #if (all_files_exist) {
  #  print(paste("All output files already exist for", df_this_chr$path[j], "- Skipping"))
  #  next  # Skip to the next file if all outputs already exist
  #}

  # Load the CSV file
  load_csv_start <- Sys.time()
  my_csv <- fread(df_this_chr$path[j])
  setkey(my_csv, cg)
  cgs <- levels(factor(my_csv$cg))
  #print(paste("Loaded csv file:", df_this_chr$path[j]))
  load_csv_end <- Sys.time()
  #print(paste("Time to load CSV:", load_csv_end - load_csv_start))

  # Process each summary statistics file
  for (k in 1:length(summary_stats_list)) {
    process_start <- Sys.time()
    
    outname <- gsub("_dt_combined.csv", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])
    #if(file.exists(outname)) next
    summary_stats <- summary_stats_data[[k]]

    MWASmodels <- vector("list", length(cgs))

    # Process each CpG site
    for (i in seq_along(cgs)) {
    #for(i in 1:10){
      this_cg <- cgs[i]
      
      process_cg_start <- Sys.time()

      this_cg_data <- my_csv[cg == this_cg]
      MWASmodels[[i]] <- process_model_csv(this_cg_data, my_SNPs, summary_stats)

      if(is.null(MWASmodels[[i]])){
        next
      }
      
      MWASmodels[[i]]['bp'] <- this_cg

      process_cg_end <- Sys.time()
  #    print(paste("Time to process CpG:", this_cg, ":", process_cg_end - process_cg_start))
    }

    # Create MWAS results
    results_start <- Sys.time()
    results <- MWASresults(MWASmodels, paths$pvar_path, paths$pgen_path, paths$psam_path, summary_stats_list[[k]], df_this_chr$path[j])
    results@MWASmodels <- Filter(Negate(is.null), results@MWASmodels)  # Remove NULL entries
    results_end <- Sys.time()
   # print(paste("Time to create MWAS results:", results_end - results_start))
    
    # Save the results
    save_start <- Sys.time()
    #saveRDS(results, outname)
    
    df <- do.call(rbind, results@MWASmodels)
    df <- as.data.frame(df)

    outname_csv <- gsub("rds", "csv", outname)
    fwrite(df, outname_csv)
    save_end <- Sys.time()
    #print(paste("Saved results to:", outname))
    #print(paste("Time to save results:", save_end - save_start))

    process_end <- Sys.time()
    #print(paste("Total time to process summary stats file", k, ":", process_end - process_start))
  }
  
  loop_end_time <- Sys.time()  # End timing the loop iteration
  print(paste("Total time for file number", j, ":", loop_end_time - start_time))
}

Version without timing

In [None]:
                             
# for(j in 1:nrow(df_this_chr)){
#   print(paste0("File number: ", j))
#   if (grepl("empty", df_this_chr$path[j])) {
#     message(paste0("no model for ", df_this_chr$path[j]))
#     next
#   }

#   all_files_exist <- TRUE
#   outnames <- vector("character", length(summary_stats_list))

#   for (k in 1:length(summary_stats_list)) {
#     outnames[k] <- gsub("\\.rds$", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])
#     if (!file.exists(outnames[k])) {
#       all_files_exist <- FALSE
#       break  # Exit the loop early if any file does not exist
#     }
#   }

#   #if (all_files_exist) {
#    # print(paste("All output files already exist for", df_this_chr$path[j], "- Skipping"))
#    # next  # Skip to the next file if all outputs already exist
#   #}
#   #
#   my_csv <- fread(df_this_chr$path[j])
#   setkey(my_csv, cg)
#   cgs <- levels(factor(my_csv$cg))
#   print(paste("Loaded csv file:", df_this_chr$path[j]))

#   for (k in 1:length(summary_stats_list)) {
#     outname <- gsub("_dt_combined.csv", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])
#     #if(file.exists(outname)) next
#     summary_stats <- summary_stats_data[[k]]

#     MWASmodels <- vector("list", length(cgs))
# #    if (is.null(summary_stats)) {
# #      summary_stats <- suppressWarnings(fread(summary_stats_list[[k]]))
# #      summary_stats <- clean_and_standardize_colnames(summary_stats)
# #    }

#     # for (i in seq_along(my_rds@models)) {
#     #   this_MethylationBase <- my_rds@models[[i]]
#     #   if(length(this_MethylationBase@snpWeights) == 0){
#     #     next
#     #   }
#     #   MWASmodels[[i]] <- process_model(this_MethylationBase, my_SNPs, summary_stats)
#     #   MWASmodels[[i]]['bp'] <- this_MethylationBase@methylationPosition
#     #   #pb$tick()
#     # }

#     for (i in seq_along(cgs)) {
#       this_cg <- cgs[i]
      
#       this_cg_data <- my_csv[cg == this_cg]
#       MWASmodels[[i]] <- process_model_csv(this_cg_data, my_SNPs, summary_stats)

#       if(is.null(MWASmodels[[i]])){
#         next
#       }
      
#       MWASmodels[[i]]['bp'] <- MWASmodels[[i]]['bp'] <- this_cg
#       #pb$tick()
#     }

#     results <- MWASresults(MWASmodels, paths$pvar_path, paths$pgen_path, paths$psam_path, summary_stats_list[[k]], df_this_chr$path[j])

    
#     # results <- foreach(i = seq_along(my_rds@models), .packages = c("CpGWAS", "data.table"), .combine='c') %dopar% {
#     # this_MethylationBase <- my_rds@models[[i]]
#     # if (length(this_MethylationBase@snpWeights) == 0) {
#     #   return(NULL)  # Return NULL for models that should be skipped
#     # }
#     # MWASmodel <- process_model(this_MethylationBase, my_SNPs, summary_stats)
#     # MWASmodel['bp'] <- this_MethylationBase@methylationPosition
#     # return(MWASmodel)
#     # }
    
#     results@MWASmodels <- Filter(Negate(is.null), results@MWASmodels)  # Remove NULL entries
    
#     saveRDS(results, outname)

#     #filtered_models <- Filter(Negate(is.null), results@MWASmodels)
#     df <- do.call(rbind, results@MWASmodels)
#     df <- as.data.frame(df)

#     outname_csv <- gsub("rds", "csv", outname)
#     fwrite(df, outname_csv)
    
#     print(paste("Saved results to:", outname))
#   }
# }