# Retry running stage 2 MWAS

In [1]:
library(CpGWAS)
library(data.table)
library(stringr)
library(optparse)
library(doParallel)
library(foreach)

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



In [2]:
# Launch the worker processes and register them as the backend that
# foreach's %dopar% operator dispatches to.
n_workers <- 120
cl <- makeCluster(n_workers)
registerDoParallel(cl)

In [None]:
# Stage 2 MWAS driver.
#
# For each chromosome index 1..22 this:
#   1. picks the genome (pvar/pgen/psam) file set at that row of the
#      chromosome-sorted file table,
#   2. selects the manifest rows (`df`) whose `Chr` matches that file,
#   3. loads every "alleleprocessed" summary-statistics table, restricted to
#      that chromosome,
#   4. splits the manifest rows into chunks and processes the chunks in
#      parallel with foreach/%dopar% on the already-registered backend.
#
# For every model file (`path` column) and every summary-statistics file, a
# `<model>_<stats>_results-par.rds` file and a matching `.csv` are written
# next to the model file. Outputs that already exist are skipped, so the loop
# can be re-run to resume an interrupted job.

mwas_data_file <- "/expanse/lustre/projects/jhu152/naglemi/mwas/CpGWAS/scripts/12-OUT_matched_SNP_meth_cov_outputs.csv"
gwas_dir <- "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas_flipped"

# Loop-invariant inputs: none of these depend on the chromosome, so read and
# list them once instead of once per chromosome.
df <- fread(mwas_data_file)

# Table of genome files, one row per chromosome, sorted by chromosome number.
# The chromosome is parsed from whatever follows "chr" in the path.
genome_files <- list.files(gwas_dir, pattern = "EUR", full.names = TRUE)
genome_files <- genome_files[grepl("allele", genome_files)]
genome_files <- genome_files[grepl("pvar", genome_files)]
genome_files <- data.table(path = genome_files, Chr = NA)
genome_files$Chr <- str_split_fixed(genome_files$path, "chr", 2)[, 2]
# fixed = TRUE: strip the literal ".pvar" (an unescaped "." is a regex wildcard).
genome_files$Chr <- gsub(".pvar", "", genome_files$Chr, fixed = TRUE)
genome_files$Chr <- as.integer(genome_files$Chr)
genome_files <- genome_files[order(genome_files$Chr), ]

summary_stats_list <- list.files(gwas_dir, pattern = "alleleprocessed",
                                 full.names = TRUE)

# Suffix that replaces ".rds" in a model path to form each output name,
# one suffix per summary-statistics file.
out_suffixes <- paste0(
  "_",
  basename(tools::file_path_sans_ext(summary_stats_list)),
  "_results-par.rds"
)

for (chr in 1:22) {
  opt <- list(genome_file_index = chr,
              data_file = mwas_data_file,
              cores = 120)

  print("Starting genome file processing")
  g <- opt$genome_file_index
  print(paste("Processing genome file index:", g))

  # `g` is a row position in the sorted table, not a chromosome number; guard
  # against a missing or unparseable genome file instead of failing later.
  if (g > nrow(genome_files) || is.na(genome_files$Chr[g])) {
    warning("No usable genome file at index ", g, " - skipping", call. = FALSE)
    next
  }

  this_chr <- genome_files$Chr[g]
  pvar_path <- genome_files$path[g]
  paths <- list(
    pvar_path = pvar_path,
    pgen_path = gsub("pvar", "pgen", pvar_path),
    psam_path = gsub("pvar", "psam", pvar_path)
  )

  df_this_chr <- df[which(df$Chr == this_chr), ]
  if (nrow(df_this_chr) == 0) {
    # cut() below cannot bin an empty sequence.
    message("No manifest rows for chromosome ", this_chr, " - skipping")
    next
  }

  # Summary statistics for this chromosome only, in the same order as
  # `summary_stats_list`. The full tables are deliberately not cached across
  # chromosomes to keep memory bounded.
  summary_stats_data <- lapply(summary_stats_list, function(path) {
    stats <- suppressWarnings(data.table::fread(path))
    setkey(stats, SNP, CHR)
    stats[CHR == this_chr]
  })

  # Split the rows into at most `opt$cores` contiguous, non-empty chunks.
  df_parts <- split(
    df_this_chr,
    cut(seq_len(nrow(df_this_chr)), opt$cores, labels = FALSE)
  )

  # Parallel processing: one foreach task per chunk.
  foreach(part_idx = seq_along(df_parts),
          .packages = c("data.table", "CpGWAS")) %dopar% {
    df_part <- df_parts[[part_idx]]

    # Load SNPs within each worker
    my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path,
                                   paths$psam_path)

    for (j in seq_len(nrow(df_part))) {
      model_path <- df_part$path[j]

      if (j %% 10 == 0) print(j)
      print(paste0("File number: ", j))
      if (grepl("empty", model_path)) {
        message(paste0("no model for ", model_path))
        next
      }

      # One output file per summary-statistics file, named after the model.
      outnames <- vapply(
        out_suffixes,
        function(suffix) sub("\\.rds$", suffix, model_path),
        character(1),
        USE.NAMES = FALSE
      )

      if (all(file.exists(outnames))) {
        print(paste("All output files already exist for", model_path,
                    "- Skipping"))
        next
      }

      # Load my_rds within each worker
      my_rds <- tryCatch({
        readRDS(model_path)
      }, error = function(e) {
        message("ALERT!!! Error reading RDS file: ", e$message)
        return(NULL)
      })

      if (is.null(my_rds)) {
        next
      }

      print(paste("Loaded RDS file:", model_path))

      for (k in seq_along(summary_stats_list)) {
        outname <- outnames[k]
        if (file.exists(outname)) next
        summary_stats <- summary_stats_data[[k]]

        MWASmodels <- vector("list", length(my_rds@models))

        for (i in seq_along(my_rds@models)) {
          this_MethylationBase <- my_rds@models[[i]]
          if (length(this_MethylationBase@snpWeights) == 0) {
            next
          }
          # Keep the result in a local first: `lst[[i]] <- NULL` deletes the
          # element (shrinking the list), after which reading `lst[[i]]` can
          # fail with "subscript out of bounds".
          model_result <- process_model(this_MethylationBase, my_SNPs,
                                        summary_stats)
          if (is.null(model_result)) {
            next
          }
          model_result["bp"] <- this_MethylationBase@methylationPosition
          MWASmodels[[i]] <- model_result
        }

        MWASmodels <- Filter(Negate(is.null), MWASmodels)
        # Record the path of the model actually processed: `j` indexes this
        # worker's chunk (`df_part`), not the whole-chromosome table.
        results <- MWASresults(MWASmodels, paths$pvar_path, paths$pgen_path,
                               paths$psam_path, summary_stats_list[[k]],
                               model_path)
        saveRDS(results, outname)

        # Named `results_df` so the manifest `df` is not shadowed.
        results_df <- as.data.frame(do.call(rbind, results@MWASmodels))

        # Swap only the trailing extension; a bare gsub("rds", "csv", ...)
        # would also rewrite "rds" anywhere else in the path.
        outname_csv <- sub("\\.rds$", ".csv", outname)
        fwrite(results_df, outname_csv)

        print(paste("Saved results to:", outname))
      }
    }
  }
}

In [None]:
# Shut down the parallel worker processes held in `cl`.
stopCluster(cl)

In [None]:
# Print the current time, to compare against the start time noted below.
# NOTE(review): the note mentions 6 cores, but the cluster in this file is
# created with 120 workers - confirm which run the note refers to.
Sys.time() # started 1:23pm with 6 cores