# Use GWAS summary stats for SNP->CpG->trait MWAS

In [1]:
#' MWAS function
#'
#' Performs methylation-wide association study analysis for one methylation
#' site by combining per-SNP GWAS statistics with SNP->methylation weights.
#'
#' With more than one weight the statistic is
#' `sum(z * w) / sqrt(t(w) %*% R %*% w)`, where `R` is the SNP correlation
#' (LD) matrix estimated from `G` with 0.1 added to its diagonal. With a single
#' weight `G` is not used and the statistic is `z` carrying the sign of `w`,
#' so that an allele flip encoded as a negated weight is honoured.
#'
#' @param z Numeric vector of SNP effects on the external phenotype, one per
#'   weight and in the same order as `w`.
#' @param w Numeric vector of weights for the effect of SNPs on methylation.
#' @param G SNP genotype matrix with one column per weight. Only used when
#'   `length(w) > 1`.
#' @return Named numeric vector with elements `z` (test statistic), `p`
#'   (two-sided normal p-value) and `n` (number of weights). When `w` is empty,
#'   `z` and `p` are `NA` and `n` is 0.
#' @export
mwas <- function(z, w, G) {
  n_weights <- length(w)

  # A length mismatch would otherwise be recycled or silently produce a
  # malformed result vector, so fail loudly instead.
  if (length(z) != n_weights) {
    stop("`z` and `w` must have the same length (got ", length(z),
         " and ", n_weights, ").", call. = FALSE)
  }

  # Nothing to test: keep the z/p/n layout so callers can still index by name.
  if (n_weights == 0) {
    return(c(z = NA_real_, p = NA_real_, n = 0))
  }

  if (n_weights > 1) {
    # Weighted sum of the SNP-level statistics.
    weighted_sum <- as.numeric(z %*% w)
    # Correlation matrix of the genotype columns captures the LD structure.
    ld <- cor(G)
    # Add a small value to the diagonal to avoid a singular matrix, which may
    # otherwise happen if two SNPs are in perfect LD.
    diag(ld) <- diag(ld) + 0.1
    # Variance of a weighted sum of correlated variables is t(w) R w; R treats
    # the leading vector as a row vector, so t(w) is not written out.
    se <- sqrt(as.numeric(w %*% ld %*% w))
    z_stat <- weighted_sum / se
  } else {
    # Single SNP: the magnitude is the SNP's own statistic, the direction
    # follows the sign of the weight.
    z_stat <- unname(as.numeric(z) * sign(as.numeric(w)))
  }

  p_value <- 2 * pnorm(abs(z_stat), lower.tail = FALSE)
  c(z = z_stat, p = p_value, n = n_weights)
}

#' MWASmodel class
#'
#' S4 container that keeps an MWAS result next to the model it was computed
#' from.
#'
#' @slot methylationBase Object of class `MethylationBase`.
#' @slot mwas_out Numeric vector of MWAS output.
#' @export
setClass(
  Class = "MWASmodel",
  slots = c(
    methylationBase = "MethylationBase",
    # summary_stats = "data.table",  # slot currently disabled
    mwas_out = "numeric"
  )
)

#' MWASmodel constructor
#'
#' Thin wrapper around `new("MWASmodel", ...)`.
#'
#' @param methylationBase MethylationBase object
#' @param mwas_out Numeric vector of MWAS output
#' @return MWASmodel object
#' @export
MWASmodel <- function(methylationBase, mwas_out) {
  # The summary_stats slot is disabled in the class definition, so only the
  # model and its MWAS output are stored.
  new(
    Class = "MWASmodel",
    methylationBase = methylationBase,
    mwas_out = mwas_out
  )
}

#' Process a single MWAS model
#'
#' Matches the SNPs of one methylation model to the genotype variant table and
#' to the GWAS summary statistics, aligns allele orientation, reads the
#' genotypes and runs `mwas()`.
#'
#' The names of `methylationBase@snpWeights` are split on `":"` into
#' chromosome (an optional `chr` prefix is removed), position, ref and alt.
#' Model SNPs whose position is absent from the matched summary statistics are
#' dropped together with their weights, so the returned object's `snpWeights`
#' may be shorter than the input's and may have flipped signs.
#'
#' @param methylationBase MethylationBase object
#' @param my_SNPs SNP data. The code reads `my_SNPs$pvar_dt` (a data.table with
#'   columns `#CHROM`, `POS` and `ID`) and passes `my_SNPs$pgen` to
#'   `pgenlibr::ReadList()`.
#' @param summary_stats Data table of summary statistics, keyed so that
#'   `summary_stats[ids]` looks rows up by SNP id, with columns `BP`, `A1`,
#'   `A2` and `BETA`.
#' @return MWASmodel object
#' @export
#' @importFrom stringr str_split_fixed
#' @importFrom data.table as.data.table setnames setkey
#' @importFrom data.table `%chin%`
#' @importFrom pgenlibr ReadList
process_model <- function(methylationBase, my_SNPs, summary_stats) {

  # Map a chr/pos/ref/alt character matrix to row indices of my_SNPs$pvar_dt.
  find_pvar_rows <- function(split_mat) {
    split_dt <- data.table::as.data.table(split_mat)
    data.table::setnames(split_dt, c("chr", "post", "ref", "alt"))
    split_dt[, `:=`(chr = as.integer(chr), post = as.integer(post))]
    data.table::setkey(split_dt, chr, post)
    my_SNPs$pvar_dt[split_dt,
                    on = .(`#CHROM` = chr, POS = post),
                    which = TRUE, nomatch = 0]
  }

  SNP_split <- stringr::str_split_fixed(names(methylationBase@snpWeights), ":", 4)
  SNP_split[, 1] <- gsub("chr", "", SNP_split[, 1])

  # We only want summary stats for the specific SNPs contributing to this
  # methylation site in our model.
  relevant_SNP_indices <- find_pvar_rows(SNP_split)
  relevant_ids <- my_SNPs$pvar_dt$ID[relevant_SNP_indices]
  summary_stats_sub <- summary_stats[relevant_ids, nomatch = 0]

  # Ensure summary-stat rows line up one-to-one with the model SNPs.
  if (!identical(summary_stats_sub$BP, as.integer(SNP_split[, 2]))) {
    # First attempt: the rows may simply be in a different order.
    summary_stats_sub <- summary_stats_sub[order(summary_stats_sub$BP), ]
    if (!identical(summary_stats_sub$BP, as.integer(SNP_split[, 2]))) {
      # Second attempt: drop model SNPs that have no summary-stat row.
      unmatched_positions <- !SNP_split[, 2] %in% summary_stats_sub$BP
      if (any(unmatched_positions)) {
        keep <- !unmatched_positions
        # drop = FALSE keeps a matrix even when a single SNP remains.
        SNP_split <- SNP_split[keep, , drop = FALSE]
        # Weights must shrink with the SNP table or they no longer line up
        # with z and the genotype columns.
        methylationBase@snpWeights <- methylationBase@snpWeights[keep]
        relevant_SNP_indices <- find_pvar_rows(SNP_split)
        relevant_ids <- my_SNPs$pvar_dt$ID[relevant_SNP_indices]
        summary_stats_sub <- summary_stats[relevant_ids, nomatch = 0]
      }
      # Check again after removing unmatched positions.
      if (!identical(summary_stats_sub$BP, as.integer(SNP_split[, 2]))) {
        stop("SNP order does not match even after removing unmatched positions. This should not happen. Code is broken.")
      }
    }
  }

  # Make sure the effect direction is right: where the model's alt allele
  # differs from the summary statistics' A2, swap ref/alt and negate the weight.
  alleles_agree <- identical(SNP_split[, 4], summary_stats_sub$A2) &&
    identical(SNP_split[, 3], summary_stats_sub$A1)
  if (!alleles_agree) {
    not_matching <- which(SNP_split[, 4] != summary_stats_sub$A2)
    SNP_split[not_matching, 3:4] <- SNP_split[not_matching, 4:3]
    methylationBase@snpWeights[not_matching] <-
      -methylationBase@snpWeights[not_matching]
  }

  # Subset the genotype data to the matched variants.
  G <- pgenlibr::ReadList(my_SNPs$pgen,
                          variant_subset = relevant_SNP_indices)

  mwas_out <- mwas(z = summary_stats_sub$BETA,
                   w = methylationBase@snpWeights,
                   G = G)

  MWASmodel(methylationBase, mwas_out)
}

#' MWASresults class
#'
#' S4 container for a list of per-model results together with the file paths
#' recorded alongside them.
#'
#' @slot MWASmodels List of per-model results.
#' @slot pvar_path,pgen_path,psam_path Character paths of the genotype files.
#' @slot summary_stats_path Character path of the summary statistics file.
#' @slot rds_path Character path of the RDS file.
#' @export
setClass(
  Class = "MWASresults",
  slots = c(
    MWASmodels = "list",
    pvar_path = "character",
    pgen_path = "character",
    psam_path = "character",
    summary_stats_path = "character",
    rds_path = "character"
  )
)

#' MWASresults constructor
#'
#' Thin wrapper around `new("MWASresults", ...)`; every argument is stored in
#' the slot of the same name.
#'
#' @param MWASmodels List of MWASmodel objects
#' @param pvar_path Path to pvar file
#' @param pgen_path Path to pgen file
#' @param psam_path Path to psam file
#' @param summary_stats_path Path to summary statistics file
#' @param rds_path Path to RDS file
#' @return MWASresults object
#' @export
MWASresults <- function(MWASmodels,
                        pvar_path,
                        pgen_path,
                        psam_path,
                        summary_stats_path,
                        rds_path) {
  new(
    Class = "MWASresults",
    MWASmodels = MWASmodels,
    pvar_path = pvar_path,
    pgen_path = pgen_path,
    psam_path = psam_path,
    summary_stats_path = summary_stats_path,
    rds_path = rds_path
  )
}

#' Clean and standardize column names
#'
#' Rewrites the column names of a summary statistics table to the names the
#' rest of the pipeline uses and keys the table on `SNP`.
#'
#' Replacements are substring replacements applied in this order:
#' `chr` -> `CHR`, `pos` -> `BP`, `POS` -> `BP`, `MarkerName` -> `SNP`,
#' `ID` -> `SNP`, `LogOR` -> `logOR`, and finally `logOR` -> `BETA`. If the
#' table has an `OR` column but neither `logOR` nor `BETA`, a log odds ratio
#' column is computed from `OR` and ends up as `BETA`.
#'
#' @param summary_stats Data table of summary statistics (must be a
#'   data.table; `:=` and `setkey` are used on it).
#' @return Data table with standardized column names, keyed (and therefore
#'   sorted) by `SNP`.
#' @export
#' @importFrom stringr str_split
clean_and_standardize_colnames <- function(summary_stats) {
  col_names <- colnames(summary_stats)

  # If the header was tab-delimited while the rest was space-delimited, the
  # whole header line sits in the first column name; split it back apart.
  if (grepl("\t", col_names[1], fixed = TRUE)) {
    col_names <- stringr::str_split(col_names[1], "\t")[[1]]
  }

  # Standardize column names. Order matters and is kept as listed.
  replacements <- c(
    chr = "CHR",
    pos = "BP",
    POS = "BP",
    MarkerName = "SNP",
    ID = "SNP",
    LogOR = "logOR"
  )
  for (pattern in names(replacements)) {
    col_names <- gsub(pattern, replacements[[pattern]], col_names, fixed = TRUE)
  }
  colnames(summary_stats) <- col_names

  # Derive the log odds ratio from OR only when no effect column exists yet;
  # creating it next to an existing BETA would yield two columns named BETA.
  has_effect_column <- any(c("logOR", "BETA") %in% col_names)
  if (!has_effect_column && "OR" %in% col_names) {
    summary_stats[, logOR := log(OR)]
  }

  colnames(summary_stats) <- gsub("logOR", "BETA", colnames(summary_stats),
                                  fixed = TRUE)

  if (!"SNP" %in% colnames(summary_stats)) {
    stop("No SNP column found after standardizing column names; ",
         "cannot key the summary statistics.", call. = FALSE)
  }

  # Key on SNP so that summary_stats[ids] is a fast keyed lookup.
  data.table::setkey(summary_stats, SNP)

  summary_stats
}

#' Process MWAS models
#'
#' Runs `process_model()` on every element of `my_rds@models` and bundles the
#' results with the file paths they were derived from.
#'
#' @param my_rds An object containing models (read from its `models` slot)
#' @param my_SNPs SNP data
#' @param paths List of paths to data files; `pvar_path`, `pgen_path` and
#'   `psam_path` are read from it
#' @param summary_stats_path Path to summary statistics file. Read and
#'   standardized only when `summary_stats` is `NULL`; always recorded in the
#'   result.
#' @param rds_path Path to the RDS file the models came from; recorded in the
#'   result
#' @param summary_stats Optional pre-loaded, standardized data table of summary
#'   statistics. Supplying it avoids re-reading the file on every call.
#' @return MWASresults object
#' @export
#' @importFrom progress progress_bar
#' @importFrom data.table setkey
process_MWAS_models <- function(my_rds, my_SNPs, paths, summary_stats_path, rds_path, summary_stats = NULL) {
  models <- my_rds@models

  # Fall back to loading from disk only when the caller did not pass the table.
  if (is.null(summary_stats)) {
    summary_stats <- suppressWarnings(data.table::fread(summary_stats_path))
    summary_stats <- clean_and_standardize_colnames(summary_stats)
  }

  MWASmodels <- vector("list", length(models))
  for (i in seq_along(models)) {
    MWASmodels[[i]] <- process_model(models[[i]], my_SNPs, summary_stats)
  }

  # Assigning NULL into a list element would silently shrink the list, so make
  # sure every model produced a result.
  stopifnot(length(models) == length(MWASmodels))

  MWASresults(MWASmodels,
              paths$pvar_path,
              paths$pgen_path,
              paths$psam_path,
              summary_stats_path,
              rds_path)
}

“undefined slot classes in definition of "MWASmodel": methylationBase(class "MethylationBase")”


In [2]:
library(CpGWAS)
library(data.table)
library(stringr)


Attaching package: ‘CpGWAS’


The following objects are masked _by_ ‘.GlobalEnv’:

    clean_and_standardize_colnames, mwas, MWASmodel, MWASresults,
    process_model, process_MWAS_models




In [3]:
# List every file in the GWAS directory whose name matches "EUR", then keep
# only the paths that contain "pvar".
genome_files <- grep(
  "pvar",
  list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas",
             pattern = "EUR", full.names = TRUE),
  value = TRUE
)

In [4]:
genome_files <- data.table(path = genome_files,
                           Chr = NA)

In [5]:
# Chromosome label = whatever follows the first "chr" in the file name, minus
# the ".pvar" extension. The split is done on the base name so that a "chr"
# appearing in a directory name cannot be picked up, and the extension is
# removed as an anchored, escaped pattern rather than the regex ".pvar".
genome_files$Chr <- str_split_fixed(basename(genome_files$path),
                                    "chr",
                                    2)[, 2]
genome_files$Chr <- sub("\\.pvar$", "", genome_files$Chr)

In [6]:
# Convert the chromosome labels to integers so the files sort numerically
# (1, 2, ..., 10) rather than as text, then order the table by chromosome.
genome_files$Chr <- as.integer(genome_files$Chr)
genome_files <- genome_files[order(Chr)]

In [7]:
df <- fread("12-OUT_matched_SNP_meth_cov_outputs.csv")

In [8]:
summary_stats_list <-  list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas",
                                  pattern = "stat", full.names = TRUE)

In [9]:
summary_stats_list

In [10]:
#   FOR THIS TO BE EFFICIENT WE MUST LOAD IN SUMMARY STATS FIRST INSTEAD OF PASSING PATH

In [11]:
# Pre-load all summary stats files into a list and clean/standardize column names
# Read every summary statistics file once, up front. Each table gets its
# "#CHROM" header text rewritten to "CHR" and is then passed through
# clean_and_standardize_colnames(); the list is parallel to summary_stats_list.
summary_stats_data <- lapply(summary_stats_list, function(stats_path) {
  raw_stats <- suppressWarnings(data.table::fread(stats_path))
  colnames(raw_stats) <- gsub("#CHROM", "CHR", colnames(raw_stats))
  clean_and_standardize_colnames(raw_stats)
})

In [12]:
head(summary_stats_data[[1]])

CHR,BP,SNP,A1,A2,BETA,SE,PVAL,NGT,FCAS,FCON,IMPINFO,NEFFDIV2,NCAS,NCON,DIRE
<int>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<chr>
10,98322628,10:100082385_C_A,C,A,0.004001981,0.0201,0.8422,0,0.934,0.941,0.971,43354.73,37940,103559,+++-+--++-+-++---+--++++-++--++++++--+++-+-----??-?+--+?-
10,98429811,10:100189568_C_A,C,A,0.010098835,0.02,0.6135,25,0.925,0.925,0.89,42290.42,36834,102499,-++--++++--+-++++++-?---++--+--+-+---+-+----+++??-?+?+-?+
10,98624242,10:100383999_C_G,C,G,-0.007397293,0.0122,0.5428,0,0.799,0.797,1.0,43445.24,38084,103625,--++++-+++-++-++-+--++-++-+-+---+++-+----+--++-??-?---+-+
10,99046608,10:100806365_A_C,A,C,-0.002703652,0.0152,0.8603,26,0.119,0.126,0.993,43445.24,38084,103625,++--++++-+-++---+++-+---+++-+--+---+--++---++++??+?---++-
10,99053963,10:100813720_C_T,C,T,-0.00359646,0.0152,0.8123,0,0.118,0.126,0.99,43445.24,38084,103625,--++----+-+--+++---+-+++---+-++-+++-++--+++-+--??-?+++--+
10,99497901,10:101257658_T_G,T,G,0.005096988,0.0112,0.6474,0,0.259,0.259,0.997,43445.24,38084,103625,-+--+++----+-+--++++---+---++-++---+--++--+-+++??+?--+--+


In [None]:
Sys.time()

In [None]:
library(profvis)

profvis({

# Profiling run of the MWAS pipeline with process_MWAS_models() and
# process_model() unpacked inline. The loops are deliberately truncated:
# only the first genome file (g in 1) and the first five model files
# (j in 1:5) are processed.
# Loop over the loaded objects instead of paths
#for(g in 1:nrow(genome_files)){
for(g in 1){
    
    print(genome_files[g])
    # pgen/psam paths are obtained by replacing "pvar" in the pvar path
    paths <- list(pvar_path = genome_files[g]$path,
                  pgen_path = gsub("pvar", "pgen", genome_files[g]$path),
                  psam_path = gsub("pvar", "psam", genome_files[g]$path))

    my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)
    # Key the variant table on the two columns used in the joins below
    setkey(my_SNPs$pvar_dt, `#CHROM`, POS)
    # Rows of the file-matching table that belong to this chromosome
    df_this_chr <- df[which(df$Chr == genome_files[g]$Chr), ]

    for(j in 1:5){
#    for(j in 1:nrow(df_this_chr)){
        #print(df_this_chr$path[j])
        # Paths containing "empty" mark entries without a model; skip them
        if(grepl("empty", df_this_chr$path[j])){
            message(paste0("no model for ", df_this_chr$path[j]))
            next
        }

        my_rds <- readRDS(df_this_chr$path[j])

#        for(k in 1){
        for(k in 1:length(summary_stats_list)){
            # Output file name: the model path with ".rds" replaced by
            # "_<summary stats base name>_results.rds"
            outname <- gsub("\\.rds$", 
                            paste0("_", 
                                   basename(tools::file_path_sans_ext(summary_stats_list[[k]])), 
                                   "_results.rds"), 
                            df_this_chr$path[j])
            # Skip combinations whose result file already exists. The unbraced
            # `if` only guards the message (shown for every 20th j); `next`
            # runs for every existing file.
            if(file.exists(outname)){
                if (j%%20 == 0)
               message(paste0("We already have file ", outname))
               next
            }
            print(summary_stats_list[[k]])
            summary_stats <- summary_stats_data[[k]]  # Use pre-loaded and cleaned summary stats

            # Unpacking process_MWAS_models here
            MWASmodels <- vector("list", length(my_rds@models))
            # Fallback: load from disk if the pre-loaded entry is NULL
            if(is.null(summary_stats)) {
                summary_stats <- suppressWarnings(fread(summary_stats_list[[k]]))
                summary_stats <- clean_and_standardize_colnames(summary_stats)
            }

            for (i in seq_along(my_rds@models)) {
                this_MethylationBase <- my_rds@models[[i]]
                
                # Unpacking process_model here
                # Weight names are split on ":" into chr / position / ref / alt
                SNP_split <- stringr::str_split_fixed(names(this_MethylationBase@snpWeights), ":", 4)
                SNP_split[,1] <- gsub("chr", "", SNP_split[,1])
                SNP_split_dt <- data.table::as.data.table(SNP_split)
                data.table::setnames(SNP_split_dt, c("chr", "post", "ref", "alt"))
                SNP_split_dt[, `:=`(chr = as.integer(chr), post = as.integer(post))]
                # setkey sorts SNP_split_dt by chr/post in place; the SNP_split
                # matrix and the weight vector are not reordered with it
                data.table::setkey(SNP_split_dt, chr, post)

                # Row indices of the model SNPs in the variant table, then the
                # summary-stat rows looked up by those variants' IDs
                relevant_SNP_indices <- my_SNPs$pvar_dt[SNP_split_dt, on = .(`#CHROM` = chr, POS = post), which = TRUE, nomatch = 0]
                relevant_ids <- my_SNPs$pvar_dt$ID[relevant_SNP_indices]
                summary_stats_sub <- summary_stats[relevant_ids, nomatch = 0]

                # Ensuring the order matches and handling unmatched positions
                if(!identical(summary_stats_sub$BP, SNP_split_dt$post)){
                    # Order summary_stats_sub by BP
                    summary_stats_sub <- summary_stats_sub[order(summary_stats_sub$BP), ]
                    if(!identical(summary_stats_sub$BP, SNP_split_dt$post)){
                        # Identify positions in SNP_split_dt not found in summary_stats_sub$BP
                        unmatched_positions <- !SNP_split_dt$post %in% summary_stats_sub$BP
                        if (any(unmatched_positions)) {
                            # Remove rows from SNP_split_dt where positions do not match any in summary_stats_sub$BP
                            SNP_split_dt <- SNP_split_dt[!unmatched_positions, ]
                            
                            # Remove corresponding entries from this_MethylationBase@snpWeights
                            # NOTE(review): the mask is in SNP_split_dt (sorted) order; this
                            # assumes the weights are already in that order - confirm.
                            this_MethylationBase@snpWeights <- this_MethylationBase@snpWeights[!unmatched_positions]

                            relevant_SNP_indices <- my_SNPs$pvar_dt[SNP_split_dt, on = .(`#CHROM` = chr, POS = post), which = TRUE, nomatch = 0]
                
                            # Check again after removing unmatched positions.
                            # This check only runs when unmatched positions were found;
                            # a mismatch with no unmatched positions passes through unchecked.
                            if(!identical(summary_stats_sub$BP, SNP_split_dt$post)) {
                                stop("SNP order does not match even after removing unmatched positions. This should not happen. Code is broken.")
                            }
                        }
                    }
                }


                # Allele orientation: where the model's alt allele differs from
                # the summary stats' A2, swap ref/alt and negate the weight
                if(!identical(SNP_split_dt$alt, summary_stats_sub$A2) |
                   !identical(SNP_split_dt$ref, summary_stats_sub$A1)){
                    not_matching <- which(SNP_split_dt$alt != summary_stats_sub$A2)
                    summary_stats_ref_flipped <- SNP_split_dt$ref[not_matching]
                    summary_stats_alt_flipped <- SNP_split_dt$alt[not_matching]
                    SNP_split_dt[not_matching, `:=`(ref = summary_stats_alt_flipped, alt = summary_stats_ref_flipped)]
                    this_MethylationBase@snpWeights[not_matching] <-
                        this_MethylationBase@snpWeights[not_matching] * -1
                }


                # Read genotypes for the matched variants and run the test
                G <- pgenlibr::ReadList(my_SNPs$pgen, variant_subset = relevant_SNP_indices)
                mwas_out <- mwas(z = summary_stats_sub$BETA,
                                 w = this_MethylationBase@snpWeights,
                                 G = G)

                # Stores the raw vector returned by mwas(), not an MWASmodel object
                MWASmodels[[i]] <- mwas_out
                
            }

            results <- MWASresults(MWASmodels, paths$pvar_path, paths$pgen_path, paths$psam_path, summary_stats_list[[k]], df_this_chr$path[j])
            
            #message(paste0("saving to ", outname))
            #message(Sys.time())
            saveRDS(results, outname)
        }
    }
}

    })

In [None]:
SNP_split

In [None]:
SNP_split_dt

In [None]:
this_MethylationBase@snpWeights

In [None]:
length(this_MethylationBase@snpWeights)

In [None]:
summary_stats_sub

In [None]:
w

In [None]:
i

In [None]:
j

In [None]:
k

In [None]:
dim(G)

In [None]:
this_MethylationBase@snpWeights

In [None]:
length(this_MethylationBase@snpWeights)

In [None]:
summary_stats_sub

In [None]:
SNP_split

In [None]:
j

In [None]:
k

In [None]:
i

In [None]:
length(as.integer(SNP_split[, 2]))

We need to address the edge case where we have the SNP but the summary stats don't.

In [None]:
as.integer(SNP_split[, 2])

In [None]:
summary_stats_sub

In [None]:
# Full run with process_MWAS_models() unpacked inline: for every chromosome's
# genotype files, every model file of that chromosome and every pre-loaded
# summary statistics table, run process_model() on each model and save the
# bundled results next to the model file.
#
# The chromosome index is `g` so that it is not overwritten by the inner model
# loop, which uses `i`. seq_len()/seq_along() make empty inputs iterate zero
# times instead of running with index 1 and 0.
for(g in seq_len(nrow(genome_files))){
    print(genome_files[g])
    # pgen/psam paths are obtained by replacing "pvar" in the pvar path
    paths <- list(pvar_path = genome_files[g]$path,
                  pgen_path = gsub("pvar", "pgen", genome_files[g]$path),
                  psam_path = gsub("pvar", "psam", genome_files[g]$path))

    my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)
    # Rows of the file-matching table that belong to this chromosome
    df_this_chr <- df[which(df$Chr == genome_files[g]$Chr), ]

    for(j in seq_len(nrow(df_this_chr))){
        print(df_this_chr$path[j])
        # Paths containing "empty" mark entries without a model; skip them
        if(grepl("empty", df_this_chr$path[j])){
            message(paste0("no model for ", df_this_chr$path[j]))
            next
        }

        my_rds <- readRDS(df_this_chr$path[j])

        for(k in seq_along(summary_stats_list)){
            print(summary_stats_list[[k]])
            summary_stats <- summary_stats_data[[k]]  # Use pre-loaded and cleaned summary stats

            # Unpacking process_MWAS_models here
            MWASmodels <- vector("list", length(my_rds@models))
            # Fallback: load from disk if the pre-loaded entry is NULL
            if(is.null(summary_stats)) {
                summary_stats <- suppressWarnings(fread(summary_stats_list[[k]]))
                summary_stats <- clean_and_standardize_colnames(summary_stats)
            }

            for (i in seq_along(my_rds@models)) {
                this_MethylationBase <- my_rds@models[[i]]
                MWASmodels[[i]] <- process_model(this_MethylationBase, my_SNPs, summary_stats)
            }

            # Ensure the lengths of my_rds@models and MWASmodels are the same
            # (assigning NULL into a list element would shrink the list)
            stopifnot(length(my_rds@models) == length(MWASmodels))

            results <- MWASresults(MWASmodels, paths$pvar_path, paths$pgen_path, paths$psam_path, summary_stats_list[[k]], df_this_chr$path[j])

            # Output file name: the model path with ".rds" replaced by
            # "_<summary stats base name>_results.rds"
            outname <- gsub("\\.rds$",
                            paste0("_",
                                   basename(tools::file_path_sans_ext(summary_stats_list[[k]])),
                                   "_results.rds"),
                            df_this_chr$path[j])

            message(paste0("saving to ", outname))
            message(Sys.time())
            saveRDS(results, outname)
        }
    }
}


In [None]:
# Dry run of the full pipeline through process_MWAS_models(): for every
# chromosome's genotype files, every model file of that chromosome and every
# pre-loaded summary statistics table, compute the results and report the
# output file name. Saving is currently disabled (saveRDS is commented out).
#
# seq_len()/seq_along() make empty inputs iterate zero times; with 1:nrow() a
# chromosome without model rows would run with index 1 and fail on an NA path.
for(i in seq_len(nrow(genome_files))){
    print(genome_files[i])
    # pgen/psam paths are obtained by replacing "pvar" in the pvar path
    paths <- list(pvar_path = genome_files[i]$path,
                  pgen_path = gsub("pvar", "pgen", genome_files[i]$path),
                  psam_path = gsub("pvar", "psam", genome_files[i]$path))

    my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)
    # Rows of the file-matching table that belong to this chromosome
    df_this_chr <- df[which(df$Chr == genome_files[i]$Chr), ]

    for(j in seq_len(nrow(df_this_chr))){
        print(df_this_chr$path[j])
        # Paths containing "empty" mark entries without a model; skip them
        if(grepl("empty", df_this_chr$path[j])){
            message(paste0("no model for ", df_this_chr$path[j]))
            next
        }

        my_rds <- readRDS(df_this_chr$path[j])

        for(k in seq_along(summary_stats_list)){
            print(summary_stats_list[[k]])
            summary_stats <- summary_stats_data[[k]]  # Use pre-loaded and cleaned summary stats

            # Passing the pre-loaded table means the path is only recorded,
            # not re-read, inside process_MWAS_models()
            results <- process_MWAS_models(my_rds = my_rds, my_SNPs = my_SNPs, paths = paths,
                                           summary_stats_path = summary_stats_list[[k]],  # Use the path string
                                           rds_path = df_this_chr$path[j],
                                           summary_stats = summary_stats)

            # Output file name: the model path with ".rds" replaced by
            # "_<summary stats base name>_results.rds"
            outname <- gsub("\\.rds$",
                            paste0("_",
                                   basename(tools::file_path_sans_ext(results@summary_stats_path)),
                                   "_results.rds"),
                            results@rds_path)

            message(paste0("saving to ", outname))
            message(Sys.time())
            # saveRDS(results, outname)
        }
    }
}

Why so slow? Are we still reloading summary_stats every time?

In [None]:
# Planned loop structure:
# Loop over chromosome genome files (pvar/pgen/psam)
#   - make list of chromosome files
#   - levels factor
#   - select and load first set of files
#   - subset big file-matching df to those for the chromosome of interest
#   - loop over those, and for each...
## Loop over summary stat files
### Loop over RDS files containing our MethylationBase objects with SNP->CpG models