# Sanity testing stage 2 batch processing code

In [1]:
library(CpGWAS)
library(data.table)
library(stringr)
library(optparse)

# Command line options
# option_list <- list(
#   make_option(c("-g", "--genome_file_index"), type = "integer", default = 1,
#               help = "Index of genome file to process"),
#   make_option(c("-d", "--data_file"), type = "character", default = "/expanse/lustre/projects/jhu152/naglemi/mwas/CpGWAS/scripts/12-OUT_matched_SNP_meth_cov_outputs.csv",
#               help = "Path to data file")
# )

#opt <- parse_args(OptionParser(option_list = option_list))

# Stand-in for the parsed command-line options while the optparse block above
# is commented out. The two fields mirror the defaults declared in that
# commented-out option_list.
opt <- list(
  data_file = "/expanse/lustre/projects/jhu152/naglemi/mwas/CpGWAS/scripts/12-OUT_matched_SNP_meth_cov_outputs.csv",
  genome_file_index = 1
)

In [3]:
# Load genome files
# Collect the EUR .pvar files and derive each file's chromosome number from
# the text that follows the first "chr" in its path.
genome_files <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas",
                           pattern = "EUR", full.names = TRUE)
genome_files <- genome_files[grepl("pvar", genome_files)]

genome_files <- data.table(path = genome_files, Chr = NA)

genome_files$Chr <- str_split_fixed(genome_files$path, "chr", 2)[, 2]
# fixed = TRUE: ".pvar" is a literal file extension, so the dot must not be
# treated as the regex "any character" wildcard.
genome_files$Chr <- gsub(".pvar", "", genome_files$Chr, fixed = TRUE)

genome_files$Chr <- as.integer(genome_files$Chr)
genome_files <- genome_files[order(genome_files$Chr), ]

# Table describing the per-chunk model files (one row per file; columns
# include Chr and path, which are the ones used in this cell and later ones).
df <- fread(opt$data_file)

summary_stats_list <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas", pattern = "stat", full.names = TRUE)

# Pre-load all summary stats files into a list and clean/standardize column names
summary_stats_data <- lapply(summary_stats_list, function(path) {
  stats <- suppressWarnings(data.table::fread(path))
  colnames(stats) <- gsub("#CHROM", "CHR", colnames(stats))
  # NOTE(review): clean_and_standardize_colnames() is not defined in this
  # file; presumably it comes from CpGWAS -- confirm.
  clean_and_standardize_colnames(stats)
})

print("Starting genome file processing")
# Process the specified genome file
g <- opt$genome_file_index
print(paste("Processing genome file index:", g))

# Fail early on a bad index: genome_files[g] would otherwise yield NA paths
# that only surface as an obscure error inside loadSNPData().
if (length(g) != 1 || is.na(g) || g < 1 || g > nrow(genome_files)) {
  stop("genome_file_index must be a single value between 1 and ",
       nrow(genome_files), call. = FALSE)
}

# The .pgen and .psam paths are derived from the .pvar path by name.
paths <- list(
  pvar_path = genome_files[g]$path,
  pgen_path = gsub("pvar", "pgen", genome_files[g]$path),
  psam_path = gsub("pvar", "psam", genome_files[g]$path)
)

my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)
setkey(my_SNPs$pvar_dt, `#CHROM`, POS)
# Keep only the model files that belong to the chromosome being processed.
df_this_chr <- df[which(df$Chr == genome_files[g]$Chr), ]

# Restrict every pre-loaded summary-stats table to the same chromosome.
summary_stats_data <- lapply(summary_stats_data, function(stats) stats[`CHR` == genome_files[g]$Chr])

print("Loaded SNP data")
print("Files for this Chr:")
print(nrow(df_this_chr))

[1] "Starting genome file processing"
[1] "Processing genome file index: 1"
[1] "Loaded SNP data"
[1] "Files for this Chr:"
[1] 1965


Let's skip ahead to a row we are particularly interested in.

In [7]:
# Keep only the row whose path contains this exact file name.
# fixed = TRUE: the pattern is a literal file name, so its "." characters must
# not act as regex wildcards.
df_this_chr <- df_this_chr[grepl("chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-145818.rds", df_this_chr$path, fixed = TRUE), ]
# The following cells index this table with a single row number, so make sure
# the filter matched exactly one file.
stopifnot(nrow(df_this_chr) == 1L)

In [8]:
# Display the row(s) kept by the path filter in the previous cell
df_this_chr

Chr,population,region,chunk_start,chunk_end,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,cov_file,modified_methylation_data,path
<int>,<chr>,<chr>,<int>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>
1,all,caud,908982,928981,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/caud/out/chr1_all.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_caud.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_all_908982-928981.rds,..//output_EXPANSE_a2_caud/libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-145818.rds


In [2]:
# Expected result

# cg  z	p
# 73418062	-12.52445	5.487211e-36
# 73418161	-15.61757	5.526944e-55
# 73418186	-15.64082	3.837365e-55
# 73418205	-21.06016	1.845558e-98
# 73418313	-19.28061	7.814446e-83

In [10]:
                         
#for(j in 1:nrow(df_this_chr)){
# The loop over rows is commented out for this sanity test, so `j` has to be
# set by hand. The recorded output of this cell ("File number: 1") shows that
# row 1 was the one processed.
j <- 1L
print(paste0("File number: ", j))
if (grepl("empty", df_this_chr$path[j])) {
    # `next` is an error outside a loop, so stop with the same message instead.
    stop(paste0("no model for ", df_this_chr$path[j]), call. = FALSE)
}

# Check whether a results file already exists for every summary-stats file.
all_files_exist <- TRUE
outnames <- vector("character", length(summary_stats_list))

for (k in seq_along(summary_stats_list)) {
    # Output name = model path with "_<summary stats basename>_results.rds"
    # in place of the trailing ".rds".
    outnames[k] <- gsub("\\.rds$", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])
    if (!file.exists(outnames[k])) {
        all_files_exist <- FALSE
        break  # Exit the loop early if any file does not exist
    }
}

if (all_files_exist) {
    # In the batch loop this case skips to the next file. There is no loop
    # here, so report it and carry on; the all-summary-stats cell further down
    # also has its file-exists skip disabled.
    print(paste("All output files already exist for", df_this_chr$path[j], "- recomputing for this sanity test"))
}
#
my_rds <- tryCatch({
    readRDS(df_this_chr$path[j])
}, error = function(e) {
    # Report the read failure; NULL signals it to the check below
    message("ALERT!!! Error reading RDS file: ", e$message)
    return(NULL)  # Return NULL to signal failure
})

# Check if the readRDS call returned NULL (which indicates an error)
if (is.null(my_rds)) {
    # Nothing downstream can run without the model object.
    stop("Cannot continue without the RDS file: ", df_this_chr$path[j], call. = FALSE)
}

print(paste("Loaded RDS file:", df_this_chr$path[j]))

[1] "File number: 1"
[1] "Loaded RDS file: ..//output_EXPANSE_a2_caud/libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-145818.rds"


For this sanity test, we only want to run the scz summary stats.

In [11]:
# Show the summary-stats file paths so the index chosen for `k` can be checked
summary_stats_list

In [13]:
  #for (k in 1:length(summary_stats_list)) {
# The loop over summary-stats files is disabled; one file is picked by position.
# NOTE(review): index 3 presumably selects the scz file (the saved outputs
# further down are listed as bp, mdd, scz) -- confirm against
# `summary_stats_list`.
k <- 3

In [14]:
# Results path for model file `j` and summary-stats file `k`: the model path
# with "_<summary stats basename>_results.rds" in place of the trailing ".rds".
outname <- gsub("\\.rds$", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])
# `next` is only valid inside a loop and raises an error at top level, so an
# existing output is reported rather than skipped. The all-summary-stats loop
# further down has the same skip commented out.
if (file.exists(outname)) {
  message("Output already exists; saving will overwrite it: ", outname)
}
summary_stats <- summary_stats_data[[k]]

# One slot per model in the RDS object; filled in as models are processed.
MWASmodels <- vector("list", length(my_rds@models))
# Fall back to reading the file if nothing was pre-loaded for this index.
if (is.null(summary_stats)) {
  summary_stats <- suppressWarnings(fread(summary_stats_list[[k]]))
  summary_stats <- clean_and_standardize_colnames(summary_stats)
}

In [16]:
# Inspect model 1089; single-bracket indexing returns it wrapped in a
# length-1 list
my_rds@models[1089]

[[1]]
An object of class "MethylationBase"
Slot "methylationPosition":
[1] 73418313

Slot "windowSize":
[1] 10000

Slot "n_SNPs":
[1] 42

Slot "glmnetModel":
NULL

Slot "snpWeights":
chr1:73409670:T:G chr1:73411882:A:G chr1:73412206:C:T chr1:73414606:G:C 
    -0.0002346703      0.0015688527     -0.0003398210     -0.0004525323 
chr1:73416795:A:G chr1:73417197:A:G chr1:73418139:T:G chr1:73419155:C:T 
    -0.0078607045     -0.0019676466      0.0011079731     -0.0078656450 
chr1:73422680:A:G chr1:73423542:G:A chr1:73423762:T:C chr1:73424910:G:A 
    -0.0004059075     -0.0003679525     -0.0003150577     -0.0022276979 
chr1:73426069:G:A chr1:73426204:G:A chr1:73426737:G:T chr1:73426896:A:G 
    -0.0002387222     -0.0017936772     -0.0017341619     -0.0017662490 
chr1:73426930:A:C chr1:73427141:C:A 
    -0.0018663216     -0.0017452064 

Slot "intercept":
        s0 
0.03110752 

Slot "alpha":
[1] 0.5

Slot "lambda":
[1] 0.004389116

Slot "evaluation_results":
         cor          mse 
0.7818

In [18]:
# Model index used for the rest of the sanity test. Model 1089 has
# methylationPosition 73418313, the last position in the expected results.
i <- 1089

In [20]:
#for (i in seq_along(my_rds@models)) {

# Take model `i` and split each SNP weight name on ":" into four fields
# (chromosome, position, ref allele, alt allele).
this_MethylationBase <- my_rds@models[[i]]
SNP_split <- stringr::str_split_fixed(
  names(this_MethylationBase@snpWeights),
  ":",
  4
)
SNP_split[, 1] <- gsub("chr", "", SNP_split[, 1])

# Turn the character matrix into a keyed table with integer chr/position.
SNP_split_dt <- data.table::setnames(
  data.table::as.data.table(SNP_split),
  c("chr", "post", "ref", "alt")
)
SNP_split_dt[, chr := as.integer(chr)]
SNP_split_dt[, post := as.integer(post)]
data.table::setkey(SNP_split_dt, chr, post)

# Row numbers in the pvar table whose chromosome/position match a model SNP;
# SNPs without a match are dropped (nomatch = 0).
relevant_SNP_indices <- my_SNPs$pvar_dt[
  SNP_split_dt,
  on = .(`#CHROM` = chr, POS = post),
  which = TRUE,
  nomatch = 0
]
relevant_ids <- my_SNPs$pvar_dt$ID[relevant_SNP_indices]
# Look the matched IDs up in the summary stats.
# NOTE(review): a character-vector lookup like this relies on `summary_stats`
# being keyed; the key is not set in this file -- confirm.
summary_stats_sub <- summary_stats[relevant_ids, nomatch = 0]

In [21]:
# Make the summary-stats rows line up with the model's SNPs by position.
# Step 1: sort the summary stats by BP. Step 2: if they still differ, drop the
# model SNPs (and their weights) that have no summary-stats row, then recompute
# the pvar row indices for the remaining SNPs.
# NOTE(review): if the positions still differ but no SNP is unmatched (for
# example BP stored as double while `post` is integer, which identical()
# treats as different), this block falls through silently -- confirm that is
# acceptable.
if (!identical(summary_stats_sub$BP, SNP_split_dt$post)) {
    summary_stats_sub <- summary_stats_sub[order(summary_stats_sub$BP), ]
    if (!identical(summary_stats_sub$BP, SNP_split_dt$post)) {
        unmatched_positions <- !SNP_split_dt$post %in% summary_stats_sub$BP
        if (any(unmatched_positions)) {
            SNP_split_dt <- SNP_split_dt[!unmatched_positions, ]
            this_MethylationBase@snpWeights <- this_MethylationBase@snpWeights[!unmatched_positions]
            relevant_SNP_indices <- my_SNPs$pvar_dt[SNP_split_dt, on = .(`#CHROM` = chr, POS = post), which = TRUE, nomatch = 0]
            if (!identical(summary_stats_sub$BP, SNP_split_dt$post)) {
                stop("SNP order does not match even after removing unmatched positions. This should not happen. Code is broken.")
            }
        }
    }
}

# Allele check: where the model's alt allele differs from the summary stats'
# A2, swap ref/alt in the model table and negate that SNP's weight.
# `||` is the scalar, short-circuiting operator intended for `if` conditions.
# NOTE(review): any alt/A2 mismatch is handled as a strand-consistent ref/alt
# swap; nothing checks that the swapped alleles then agree with A1/A2.
if (!identical(SNP_split_dt$alt, summary_stats_sub$A2) || !identical(SNP_split_dt$ref, summary_stats_sub$A1)) {
    not_matching <- which(SNP_split_dt$alt != summary_stats_sub$A2)
    summary_stats_ref_flipped <- SNP_split_dt$ref[not_matching]
    summary_stats_alt_flipped <- SNP_split_dt$alt[not_matching]
    SNP_split_dt[not_matching, `:=`(ref = summary_stats_alt_flipped, alt = summary_stats_ref_flipped)]
    this_MethylationBase@snpWeights[not_matching] <- this_MethylationBase@snpWeights[not_matching] * -1
}

# Read the genotypes for the matched pvar rows.
G <- pgenlibr::ReadList(my_SNPs$pgen, variant_subset = relevant_SNP_indices)
#print(paste("Performing MWAS for model index:", i))

In [22]:
# Run MWAS for the selected model and auto-print the result for inspection.
# NOTE(review): the `z` argument is fed from the BETA column -- confirm this
# is intended rather than a z-score column.
mwas(z = summary_stats_sub$BETA, w = this_MethylationBase@snpWeights, G = G)

In [None]:
# Store the MWAS result for model `i`, then wrap all per-model results and
# save them for this model file / summary-stats pair.
mwas_out <- mwas(z = summary_stats_sub$BETA, w = this_MethylationBase@snpWeights, G = G)

MWASmodels[[i]] <- mwas_out
#}  # closes the commented-out `for (i in ...)` loop; a live brace is a parse error

results <- MWASresults(MWASmodels, paths$pvar_path, paths$pgen_path, paths$psam_path, summary_stats_list[[k]], df_this_chr$path[j])
saveRDS(results, outname)
print(paste("Saved results to:", outname))
#}  # closes the commented-out `for (k in ...)` loop; a live brace is a parse error
#}

## Try for all summary stats

In [27]:
# Repeat the single-model sanity test (model 1089) against every summary-stats
# file and save one results RDS per file. Existing outputs are overwritten
# because the file-exists skip is commented out.
for (k in seq_along(summary_stats_list)) {
    #k <- 3
    
    # Results path: model path with "_<summary stats basename>_results.rds"
    # in place of the trailing ".rds".
    outname <- gsub("\\.rds$", paste0("_", basename(tools::file_path_sans_ext(summary_stats_list[[k]])), "_results.rds"), df_this_chr$path[j])
    #if(file.exists(outname)) next
    summary_stats <- summary_stats_data[[k]]
    
    # One slot per model; only slot `i` is filled in below.
    MWASmodels <- vector("list", length(my_rds@models))
    # Fall back to reading the file if nothing was pre-loaded for this index.
    if (is.null(summary_stats)) {
      summary_stats <- suppressWarnings(fread(summary_stats_list[[k]]))
      summary_stats <- clean_and_standardize_colnames(summary_stats)
    }
    
    i <- 1089
    
    #for (i in seq_along(my_rds@models)) {
    
    # Re-read the model from my_rds on every pass so that weight flips applied
    # for one summary-stats file do not carry over to the next.
    this_MethylationBase <- my_rds@models[[i]]
    SNP_split <- stringr::str_split_fixed(names(this_MethylationBase@snpWeights), ":", 4)
    SNP_split[, 1] <- gsub("chr", "", SNP_split[, 1])
    SNP_split_dt <- data.table::as.data.table(SNP_split)
    data.table::setnames(SNP_split_dt, c("chr", "post", "ref", "alt"))
    SNP_split_dt[, `:=`(chr = as.integer(chr), post = as.integer(post))]
    data.table::setkey(SNP_split_dt, chr, post)
    
    # pvar rows matching the model SNPs by chromosome/position, then the
    # summary-stats rows for those variant IDs.
    relevant_SNP_indices <- my_SNPs$pvar_dt[SNP_split_dt, on = .(`#CHROM` = chr, POS = post), which = TRUE, nomatch = 0]
    relevant_ids <- my_SNPs$pvar_dt$ID[relevant_SNP_indices]
    summary_stats_sub <- summary_stats[relevant_ids, nomatch = 0]
    
    # Line the summary stats up with the model SNPs: sort by BP first, then
    # drop model SNPs (and weights) that have no summary-stats row.
    if (!identical(summary_stats_sub$BP, SNP_split_dt$post)) {
        summary_stats_sub <- summary_stats_sub[order(summary_stats_sub$BP), ]
        if (!identical(summary_stats_sub$BP, SNP_split_dt$post)) {
            unmatched_positions <- !SNP_split_dt$post %in% summary_stats_sub$BP
            if (any(unmatched_positions)) {
                SNP_split_dt <- SNP_split_dt[!unmatched_positions, ]
                this_MethylationBase@snpWeights <- this_MethylationBase@snpWeights[!unmatched_positions]
                relevant_SNP_indices <- my_SNPs$pvar_dt[SNP_split_dt, on = .(`#CHROM` = chr, POS = post), which = TRUE, nomatch = 0]
                if (!identical(summary_stats_sub$BP, SNP_split_dt$post)) {
                    stop("SNP order does not match even after removing unmatched positions. This should not happen. Code is broken.")
                }
            }
        }
    }
    
    # Where the model's alt allele differs from A2, swap ref/alt and negate
    # the weight. `||` is the scalar operator intended for `if` conditions.
    if (!identical(SNP_split_dt$alt, summary_stats_sub$A2) || !identical(SNP_split_dt$ref, summary_stats_sub$A1)) {
        not_matching <- which(SNP_split_dt$alt != summary_stats_sub$A2)
        summary_stats_ref_flipped <- SNP_split_dt$ref[not_matching]
        summary_stats_alt_flipped <- SNP_split_dt$alt[not_matching]
        SNP_split_dt[not_matching, `:=`(ref = summary_stats_alt_flipped, alt = summary_stats_ref_flipped)]
        this_MethylationBase@snpWeights[not_matching] <- this_MethylationBase@snpWeights[not_matching] * -1
    }
    
    G <- pgenlibr::ReadList(my_SNPs$pgen, variant_subset = relevant_SNP_indices)
    #print(paste("Performing MWAS for model index:", i))
    
    # mwas() is called once. A bare call inside a `for` loop is neither
    # printed nor stored, so the earlier duplicate call only repeated the work.
    mwas_out <- mwas(z = summary_stats_sub$BETA, w = this_MethylationBase@snpWeights, G = G)
    
    MWASmodels[[i]] <- mwas_out
    
    results <- MWASresults(MWASmodels, paths$pvar_path, paths$pgen_path, paths$psam_path, summary_stats_list[[k]], df_this_chr$path[j])
    #print(results)
    saveRDS(results, outname)
    print(paste("Saved results to:", outname))
    
}

[1] "Saved results to: ..//output_EXPANSE_a2_caud/libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-145818_gwas_stat_bp_results.rds"
[1] "Saved results to: ..//output_EXPANSE_a2_caud/libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-145818_gwas_stat_mdd_results.rds"
[1] "Saved results to: ..//output_EXPANSE_a2_caud/libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-145818_gwas_stat_scz_results.rds"


In [38]:
# Read back the three results files written by the loop above:
# rds1 = bp, rds2 = mdd, rds3 = scz.
rds1 <- readRDS(
  file = "..//output_EXPANSE_a2_caud/libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-145818_gwas_stat_bp_results.rds"
)
rds2 <- readRDS(
  file = "..//output_EXPANSE_a2_caud/libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-145818_gwas_stat_mdd_results.rds"
)
rds3 <- readRDS(
  file = "..//output_EXPANSE_a2_caud/libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-145818_gwas_stat_scz_results.rds"
)

In [30]:
# List the first few attribute names stored on the bp results object
head(names(attributes(rds1)))

In [36]:
# Peek at the first entries of the per-model MWAS list
head(rds1@MWASmodels)

In [41]:
# Entries 1085-1089; presumably only 1089 is non-NULL, since the loop above
# filled just that index -- confirm from the printed output
rds1@MWASmodels[1085:1089]

In [37]:
# MWAS result for model 1089 from the bp summary stats
rds1@MWASmodels[1089]

In [39]:
# Same model, mdd summary stats
rds2@MWASmodels[1089]

In [40]:
# Same model, scz summary stats
rds3@MWASmodels[1089]

In [35]:
# Print the full per-model list from the bp results
rds1@MWASmodels

In [32]:
# Print the full per-model list from the bp results (same expression as the
# previous cell)
rds1@MWASmodels