# Retry running stage 2 MWAS

In [None]:
library(CpGWAS)
library(data.table)
library(stringr)
library(optparse)
library(doParallel)
library(foreach)

# Run configuration: index of the genome file to process, the CSV that is
# read into `df`, and the number of parallel workers.
opt <- list(
  genome_file_index = 21,
  data_file = "/expanse/lustre/projects/jhu152/naglemi/mwas/CpGWAS/scripts/12-OUT_matched_SNP_meth_cov_outputs.csv",
  cores = 6
)

# A parallel backend is only registered when more than one core is requested.
if (opt$cores > 1) {
  registerDoParallel(opt$cores)
}

# Input table; its `Chr` and `path` columns are used further down.
df <- fread(opt$data_file)

# Load genome files: list the EUR files whose names contain both "allele" and
# "pvar", then build a (path, Chr) table sorted by chromosome number.
genome_files <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas_flipped",
                           pattern = "EUR", full.names = TRUE)
genome_files <- genome_files[grepl("allele", genome_files) &
                               grepl("pvar", genome_files)]

genome_files <- data.table(path = genome_files)

# Chromosome = whatever follows the first "chr" in the path, with the ".pvar"
# extension removed. The dot is escaped so that only a literal ".pvar" is
# stripped (same pattern as the `chr` extraction later in the script).
genome_files$Chr <- str_split_fixed(genome_files$path, "chr", 2)[, 2]
genome_files$Chr <- gsub("\\.pvar", "", genome_files$Chr)

# Non-numeric chromosome labels become NA here (with a coercion warning).
genome_files$Chr <- as.integer(genome_files$Chr)
genome_files <- genome_files[order(genome_files$Chr), ]



print("Starting genome file processing")

# Row of `genome_files` selected for this run
g <- opt$genome_file_index
print(paste("Processing genome file index:", g))

# The .pgen and .psam paths are derived from the .pvar path by replacing the
# text "pvar".
pvar_file <- genome_files[g]$path
paths <- list(
  pvar_path = pvar_file,
  pgen_path = gsub("pvar", "pgen", pvar_file),
  psam_path = gsub("pvar", "psam", pvar_file)
)

# Chromosome number parsed from the text after the first "chr" in the path.
chr_token <- stringr::str_split_fixed(paths$pvar_path, "chr", 2)[, 2]
chr <- as.numeric(gsub("\\.pvar", "", chr_token))

# Load the SNP data and key its variant table by chromosome and position.
my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)
setkey(my_SNPs$pvar_dt, `#CHROM`, POS)

# Rows of the input table that belong to the selected chromosome.
df_this_chr <- df[which(df$Chr == genome_files[g]$Chr), ]

summary_stats_list <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas_flipped", pattern = "alleleprocessed", full.names = TRUE)

# Pre-load every summary stats file, keeping only the rows for the chromosome
# being processed (`chr`), so the main loop never has to re-read them.
summary_stats_data <- lapply(summary_stats_list, function(path) {
  stats <- suppressWarnings(data.table::fread(path))

  # Rename column if needed (uncomment if column name needs changing from "#CHROM" to "CHR")
  # setnames(stats, "#CHROM", "CHR")

  # Subset first, then key: keying sorts the table, so doing it on the
  # single-chromosome subset avoids sorting the full file. `chr` is parsed
  # from the same .pvar path as genome_files[g]$Chr, so the second filtering
  # pass the script used to run afterwards was redundant and is dropped.
  stats <- stats[CHR == chr]
  setkey(stats, SNP, CHR)

  # Clean and standardize column names if needed
  # clean_and_standardize_colnames(stats)

  stats
})

print("Loaded SNP data")
print("Files for this Chr:")
print(nrow(df_this_chr))
                             

# Stage 2 driver. For every model file listed for this chromosome and every
# summary stats file, run process_model() on each model in parallel, then save
# the results as .rds and .csv next to the model file. Outputs that already
# exist are skipped, so the loop can be re-run after an interruption.
for (j in seq_len(nrow(df_this_chr))) {
  if (j %% 10 == 0) print(j)
  print(paste0("File number: ", j))

  model_path <- df_this_chr$path[j]
  if (grepl("empty", model_path)) {
    message(paste0("no model for ", model_path))
    next
  }

  # One output per summary stats file:
  #   <model path without .rds>_<summary stats basename>_results.rds
  # Built once here and reused below instead of being recomputed per k.
  outnames <- vapply(summary_stats_list, function(stats_path) {
    gsub("\\.rds$",
         paste0("_", basename(tools::file_path_sans_ext(stats_path)), "_results.rds"),
         model_path)
  }, character(1), USE.NAMES = FALSE)

  if (all(file.exists(outnames))) {
    print(paste("All output files already exist for", model_path, "- Skipping"))
    next  # Skip to the next file if all outputs already exist
  }

  # A file that cannot be read is reported and skipped, not fatal.
  my_rds <- tryCatch(
    readRDS(model_path),
    error = function(e) {
      message("ALERT!!! Error reading RDS file: ", e$message)
      NULL  # NULL signals failure to the check below
    }
  )
  if (is.null(my_rds)) {
    next
  }

  print(paste("Loaded RDS file:", model_path))

  for (k in seq_along(summary_stats_list)) {
    outname <- outnames[k]
    if (file.exists(outname)) next
    summary_stats <- summary_stats_data[[k]]

    # Every iteration returns a one-element list so that .combine = 'c'
    # yields exactly one entry per model. Models without SNP weights, or for
    # which process_model() returns NULL, get an NA placeholder that still
    # carries the methylation position. The single parallel pass replaces an
    # additional serial re-computation that also deleted list elements by
    # assigning NULL into them.
    MWASmodels <- foreach(this_MethylationBase = my_rds@models,
                          .packages = c("data.table", "CpGWAS"),
                          .combine = 'c',
                          .export = c("my_SNPs", "summary_stats")) %dopar% {
      bp <- this_MethylationBase@methylationPosition

      model <- if (length(this_MethylationBase@snpWeights) == 0) {
        NULL
      } else {
        process_model(this_MethylationBase, my_SNPs, summary_stats)
      }

      if (is.null(model)) {
        list(list(z = NA, p = NA, n = NA, bp = bp))
      } else {
        model['bp'] <- bp
        list(model)
      }
    }

    # NOTE(review): `results` was never created because this constructor call
    # had been commented out; it is reinstated exactly as it appeared in the
    # comment. Confirm the argument order against the CpGWAS package.
    results <- MWASresults(MWASmodels, paths$pvar_path, paths$pgen_path, paths$psam_path, summary_stats_list[[k]], model_path)
    results@MWASmodels <- Filter(Negate(is.null), results@MWASmodels)  # Remove NULL entries

    saveRDS(results, outname)

    # Use a separate name so the input table `df` is not overwritten.
    results_df <- as.data.frame(do.call(rbind, results@MWASmodels))

    # Swap only the trailing extension; a plain "rds" -> "csv" substitution
    # would also rewrite any "rds" occurring elsewhere in the path.
    outname_csv <- sub("\\.rds$", ".csv", outname)
    fwrite(results_df, outname_csv)

    print(paste("Saved results to:", outname))
  }
}

In [None]:
# New approach: build the models with the parallel foreach only (no sequential re-run)

In [None]:
# Parallel-only stage 2 driver. For every model file listed for this
# chromosome and every summary stats file, run process_model() on each model
# via foreach, then save the results as .rds and .csv next to the model file.
# Outputs that already exist are skipped.
for (j in seq_len(nrow(df_this_chr))) {
  if (j %% 10 == 0) print(j)
  print(paste0("File number: ", j))

  model_path <- df_this_chr$path[j]
  if (grepl("empty", model_path)) {
    message(paste0("no model for ", model_path))
    next
  }

  # One output per summary stats file:
  #   <model path without .rds>_<summary stats basename>_results.rds
  # Built once here and reused below instead of being recomputed per k.
  outnames <- vapply(summary_stats_list, function(stats_path) {
    gsub("\\.rds$",
         paste0("_", basename(tools::file_path_sans_ext(stats_path)), "_results.rds"),
         model_path)
  }, character(1), USE.NAMES = FALSE)

  if (all(file.exists(outnames))) {
    print(paste("All output files already exist for", model_path, "- Skipping"))
    next  # Skip to the next file if all outputs already exist
  }

  # A file that cannot be read is reported and skipped, not fatal.
  my_rds <- tryCatch(
    readRDS(model_path),
    error = function(e) {
      message("ALERT!!! Error reading RDS file: ", e$message)
      NULL  # NULL signals failure to the check below
    }
  )
  if (is.null(my_rds)) {
    next
  }

  print(paste("Loaded RDS file:", model_path))

  for (k in seq_along(summary_stats_list)) {
    outname <- outnames[k]
    if (file.exists(outname)) next
    summary_stats <- summary_stats_data[[k]]

    # Every iteration returns a one-element list so that .combine = 'c'
    # yields exactly one entry per model. A bare list(z =, p =, n =, bp =)
    # would be flattened by c() into four separate top-level elements.
    MWASmodels <- foreach(this_MethylationBase = my_rds@models,
                          .packages = c("data.table", "CpGWAS"),
                          .combine = 'c',
                          .export = c("my_SNPs", "summary_stats")) %dopar% {
      bp <- this_MethylationBase@methylationPosition

      model <- if (length(this_MethylationBase@snpWeights) == 0) {
        NULL
      } else {
        process_model(this_MethylationBase, my_SNPs, summary_stats)
      }

      if (is.null(model)) {
        # NA placeholder that still records the methylation position
        list(list(z = NA, p = NA, n = NA, bp = bp))
      } else {
        model['bp'] <- bp
        list(model)
      }
    }

    # NOTE(review): `results` was used below without ever being created in
    # this loop. The constructor call is taken from the commented-out line in
    # the earlier version of this loop; confirm the argument order against the
    # CpGWAS package.
    results <- MWASresults(MWASmodels, paths$pvar_path, paths$pgen_path, paths$psam_path, summary_stats_list[[k]], model_path)
    results@MWASmodels <- Filter(Negate(is.null), results@MWASmodels)  # Remove NULL entries

    saveRDS(results, outname)

    # Use a separate name so the input table `df` is not overwritten.
    results_df <- as.data.frame(do.call(rbind, results@MWASmodels))

    # Swap only the trailing extension; a plain "rds" -> "csv" substitution
    # would also rewrite any "rds" occurring elsewhere in the path.
    outname_csv <- sub("\\.rds$", ".csv", outname)
    fwrite(results_df, outname_csv)

    print(paste("Saved results to:", outname))
  }
}

In [None]:
Sys.time() # Started at 4:17 (note originally read "4L17", presumably a typo; confirm)

In [None]:
# Print the call stack of the most recent uncaught error.
traceback()

In [None]:
# Estimate memory usage for key objects.
# object.size() returns an "object_size" value, and that class survives
# arithmetic; its print method always labels the number as bytes. The sizes
# are therefore converted to plain numerics first, so the MB figure printed
# at the end is not shown with a "bytes" label.
mem_my_SNPs <- as.numeric(object.size(my_SNPs))
mem_summary_stats <- as.numeric(object.size(summary_stats))

# Average size of one model; vapply guarantees a numeric vector even when
# there are no models.
mem_this_MethylationBase <- mean(
  vapply(my_rds@models, function(m) as.numeric(object.size(m)), numeric(1))
)

# Calculate total memory required per core (bytes)
mem_per_core <- mem_my_SNPs + mem_summary_stats + mem_this_MethylationBase

# Convert to MB
mem_per_core_MB <- mem_per_core / (1024^2)

# Print memory usage per core in MB
print(mem_per_core_MB)


In [None]:
# Number of entries in the MWASmodels slot of `results`.
length(results@MWASmodels)

In [None]:
# List the attribute names of `results` (presumably its slots plus class; confirm).
names(attributes(results))

In [None]:
# Preview `results`.
# NOTE(review): `results` is accessed with `@` elsewhere in this file, so
# head() may not be meaningful on it; confirm, or inspect a slot instead.
head(results)

In [None]:
# Preview the first rows of `summary_stats` as last assigned in the main loop.
head(summary_stats)