# Use GWAS summary stats for SNP->CpG->trait MWAS

In [1]:
library(CpGWAS)
library(data.table)
library(stringr)

In [2]:
# List files in the GWAS directory whose names match "EUR", then keep only
# those whose full path contains "pvar".
genome_files <- grep(
  "pvar",
  list.files(
    path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas",
    pattern = "EUR",
    full.names = TRUE
  ),
  value = TRUE
)

In [3]:
# One row per .pvar path; Chr is a placeholder that gets filled in afterwards.
genome_files <- data.table(
  path = genome_files,
  Chr = NA
)

In [4]:
# Extract the chromosome label from each path: take everything after the first
# "chr", then drop the ".pvar" extension.
chr_suffix <- str_split_fixed(genome_files$path, "chr", 2)[, 2]
# fixed = TRUE so that "." is matched literally; as a regex, ".pvar" would
# also strip any other character that happens to precede "pvar".
genome_files$Chr <- gsub(".pvar", "", chr_suffix, fixed = TRUE)

In [5]:
# Convert the parsed chromosome labels to integers so that sorting is numeric
# (1, 2, ..., 10) rather than lexical, then order the table by chromosome.
chr_number <- as.integer(genome_files$Chr)
genome_files$Chr <- chr_number
genome_files <- genome_files[order(chr_number), ]

In [6]:
# Table of matched SNP / methylation / covariate outputs; the loop below uses
# its `Chr` and `path` columns.
df <- data.table::fread("12-OUT_matched_SNP_meth_cov_outputs.csv")

In [7]:
# Full paths of the files in the GWAS directory whose names match "stat".
# Despite the name, this is a character vector of paths, not a list.
summary_stats_list <- list.files(
  path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas",
  pattern = "stat",
  full.names = TRUE
)

In [8]:
# For this to be efficient, we must load the summary stats up front instead of passing only their paths.

In [9]:
# Read every summary-stats file once, up front, and standardize its column
# names. The result is a list parallel to `summary_stats_list` (same order).
summary_stats_data <- lapply(
  summary_stats_list,
  function(stats_file) {
    # Warnings raised by fread while reading are suppressed here.
    raw_stats <- suppressWarnings(data.table::fread(stats_file))
    clean_and_standardize_colnames(raw_stats)
  }
)

In [None]:
# Run the MWAS for every combination of chromosome, SNP->CpG model file (.rds)
# and GWAS summary-stats set, using the summary stats already loaded into
# `summary_stats_data` rather than re-reading them from disk.
#
# seq_len()/seq_along() are used instead of 1:n so that an empty input (for
# example a chromosome with no rows in `df`) skips the loop body; 1:0 would
# iterate over c(1, 0) and end up calling readRDS(NA).
for (i in seq_len(nrow(genome_files))) {
    print(genome_files[i])
    # The .pgen and .psam paths are derived from the .pvar path by replacing
    # the text "pvar" wherever it occurs in the path.
    paths <- list(pvar_path = genome_files[i]$path,
                  pgen_path = gsub("pvar", "pgen", genome_files[i]$path),
                  psam_path = gsub("pvar", "psam", genome_files[i]$path))

    # Load this chromosome's SNP data once; it is reused for every model file.
    my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)
    # Rows of the file-matching table that belong to this chromosome.
    df_this_chr <- df[which(df$Chr == genome_files[i]$Chr), ]

    for (j in seq_len(nrow(df_this_chr))) {
        print(df_this_chr$path[j])
        # Paths containing "empty" are treated as having no model to process.
        if (grepl("empty", df_this_chr$path[j])) {
            message(paste0("no model for ", df_this_chr$path[j]))
            next
        }

        my_rds <- readRDS(df_this_chr$path[j])

        for (k in seq_along(summary_stats_list)) {
            print(summary_stats_list[[k]])
            summary_stats <- summary_stats_data[[k]]  # Use pre-loaded and cleaned summary stats

            # Both the path and the pre-loaded table are passed; the path is
            # read back below from the result to build the output file name.
            results <- process_MWAS_models(my_rds = my_rds, my_SNPs = my_SNPs, paths = paths,
                                           summary_stats_path = summary_stats_list[[k]],  # Use the path string
                                           rds_path = df_this_chr$path[j],
                                           summary_stats = summary_stats)

            # Output name: <model file without .rds>_<summary stats name>_results.rds
            outname <- gsub("\\.rds$",
                            paste0("_",
                                   basename(tools::file_path_sans_ext(results@summary_stats_path)),
                                   "_results.rds"),
                            results@rds_path)

            message(paste0("saving to ", outname))
            # NOTE(review): saveRDS is commented out, so `results` is not
            # written to disk even though the message above says it is.
            # Confirm whether this is intentional (e.g. a timing/dry run).
            # saveRDS(results, outname)
        }
    }
}

                                                                  path   Chr
                                                                <char> <int>
1: /expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.pvar     1
[1] "..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-8982-28981-dynamic-1corestotal-allcorepera-20240415-104419.rds"
[1] "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_bp"


saving to ..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-8982-28981-dynamic-1corestotal-allcorepera-20240415-104419_gwas_stat_bp_results.rds



[1] "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_mdd"


saving to ..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-8982-28981-dynamic-1corestotal-allcorepera-20240415-104419_gwas_stat_mdd_results.rds



Why so slow? Are we still reloading summary_stats every time?

In [None]:
# Loop over chromosome genome files (pvar/pgen/psam):
#   - make list of chromosome files
#   - levels factor
#   - select and load first set of files
#   - subset big file-matching df to those for the chromosome of interest
#   - loop over those, and for each...
## Loop over summary stat files
### Loop over RDS files containing our MethylationBase objects with SNP->CpG models