# Produce tables of stage 2 MWAS results

In [1]:
library(data.table)
library(foreach)
library(doParallel)

Loading required package: iterators

Loading required package: parallel



In [2]:
getwd()

In [3]:
df <- fread("12-OUT_matched_SNP_meth_cov_outputs.csv")

In [4]:
print(nrow(df))

[1] 16098


In [None]:
library(data.table)
library(ggplot2)

# Initialize logging
log_file <- file("processing_log.txt", open = "wt")
sink(log_file, type = "message")
sink(log_file, type = "output", append = TRUE)

traits <- c("bp", "mdd", "scz")
df$stage2_paths <- gsub(".rds", "_gwas_stat_", df$path)
df$final_paths <- vector("list", length(df$stage2_paths))

for (trait in traits) {
  message("Processing trait: ", trait)
  df$final_paths <- paste0(df$stage2_paths, trait, "_results.rds")
  output_file <- paste0("16a6-OUT_stage2_MWAS_", trait, ".csv")
  header_written <- FALSE

  for (i in seq_along(df$final_paths)) {
    if (grepl("empty", df$final_paths[i])) next
    
    message("Processing file ", i, " of ", length(df$final_paths))
    stage2_in <- readRDS(df$final_paths[i])
    stage1_in <- readRDS(df$path[i])
    
    if (length(stage1_in@models) != length(stage2_in@MWASmodels)) {
      stop("Files don't match")
    }

    data_list <- vector("list", length(stage1_in@models))
    for (j in seq_along(stage1_in@models)) {
      model1 <- stage1_in@models[[j]]
      model2 <- stage2_in@MWASmodels[[j]]

      this_z <- model2["z"]
      this_p <- model2["p"]
      this_n <- model2["n"]
      this_pos <- model1@methylationPosition
      this_stats <- stage2_in@summary_stats_path
      this_scaff <- stage1_in@scaffoldIdentifier

      data_list[[j]] <- data.table(
        z = this_z,
        p = this_p,
        n = this_n,
        pos = this_pos,
        stats = this_stats,
        scaff = this_scaff
      )
    }

    combined_data <- rbindlist(data_list, use.names = TRUE, fill = TRUE)

    # Write data incrementally
    if (!header_written) {
      fwrite(combined_data, output_file)
      header_written <- TRUE
    } else {
      fwrite(combined_data, output_file, append = TRUE)
    }
  }
}

# Close the log file
sink(type = "message")
sink(type = "output")
close(log_file)


Processing trait: bp

Processing file 1 of 16098

Processing file 2 of 16098

Processing file 3 of 16098

Processing file 4 of 16098

Processing file 5 of 16098

Processing file 6 of 16098

Processing file 7 of 16098

Processing file 8 of 16098

Processing file 9 of 16098

Processing file 10 of 16098

Processing file 11 of 16098

Processing file 12 of 16098

Processing file 13 of 16098

Processing file 14 of 16098

Processing file 15 of 16098

Processing file 16 of 16098

Processing file 17 of 16098

Processing file 18 of 16098

Processing file 19 of 16098

Processing file 20 of 16098

Processing file 21 of 16098

Processing file 22 of 16098

Processing file 23 of 16098

Processing file 24 of 16098

Processing file 25 of 16098

Processing file 26 of 16098

Processing file 27 of 16098

Processing file 28 of 16098

Processing file 29 of 16098

Processing file 30 of 16098

Processing file 31 of 16098

Processing file 32 of 16098

Processing file 33 of 16098

Processing file 34 of 16098

P

## Trust but verify

In [5]:
getwd()

In [1]:
# Function to check for errors in the log file
check_log_for_errors <- function(log_path) {
  log_contents <- readLines(log_path)
  errors <- grep("error|stop|interrupted", tolower(log_contents), value = TRUE)
  if (length(errors) > 0) {
    return(list(success = FALSE, messages = errors))
  }
  return(list(success = TRUE))
}

# Usage
log_result <- check_log_for_errors("processing_log.txt")
if (log_result$success) {
  cat("No errors found in log.\n")
} else {
  cat("Errors found in log:\n", paste(log_result$messages, collapse = "\n"))
}


No errors found in log.


In [1]:
library(data.table)

In [4]:
# SLOW Function to validate output files
validate_output_files <- function(trait_names, output_path_template) {
  for (trait in trait_names) {
    file_path <- sprintf(output_path_template, trait)
    if (!file.exists(file_path)) {
      cat(sprintf("Output file for %s does not exist.\n", trait))
      next
    }
    dt <- fread(file_path)
    if (nrow(dt) == 0) {
      cat(sprintf("Output file for %s is empty.\n", trait))
    } else {
      cat(sprintf("Output file for %s has %d rows.\n", trait, nrow(dt)))
    }
  }
}

# Usage
#validate_output_files(c("bp", "mdd", "scz"), "16a5-OUT_stage2_MWAS_%s.csv")


In [2]:
# Function to check if output files exist and are not empty
check_output_files_existence_and_size <- function(trait_names, output_path_template) {
  results <- list()
  for (trait in trait_names) {
    file_path <- sprintf(output_path_template, trait)
    if (!file.exists(file_path)) {
      results[[trait]] <- sprintf("Output file for %s does not exist.", trait)
    } else {
      file_size <- file.info(file_path)$size
      if (file_size > 0) {
        results[[trait]] <- sprintf("Output file for %s is valid with size %d bytes.", trait, file_size)
      } else {
        results[[trait]] <- sprintf("Output file for %s exists but is empty.", trait)
      }
    }
  }
  return(results)
}

# Usage
file_check_results <- check_output_files_existence_and_size(c("bp", "mdd", "scz"), "16a5-OUT_stage2_MWAS_%s.csv")
for (result in file_check_results) {
  cat(result, "\n")
}


ERROR: Error in sprintf("Output file for %s is valid with size %d bytes.", trait, : invalid format '%d'; use format %f, %e, %g or %a for numeric objects


In [3]:
# Function to check if output files exist and are not empty
check_output_files_existence_and_size <- function(trait_names, output_path_template) {
  results <- list()
  for (trait in trait_names) {
    file_path <- sprintf(output_path_template, trait)
    if (!file.exists(file_path)) {
      results[[trait]] <- sprintf("Output file for %s does not exist.", trait)
    } else {
      file_size <- as.integer(file.info(file_path)$size)  # Ensure integer format
      if (file_size > 0) {
        results[[trait]] <- sprintf("Output file for %s is valid with size %d bytes.", trait, file_size)
      } else {
        results[[trait]] <- sprintf("Output file for %s exists but is empty.", trait)
      }
    }
  }
  return(results)
}

# Usage
file_check_results <- check_output_files_existence_and_size(c("bp", "mdd", "scz"), "16a5-OUT_stage2_MWAS_%s.csv")
for (result in file_check_results) {
  cat(result, "\n")
}


“NAs introduced by coercion to integer range”


ERROR: Error in if (file_size > 0) {: missing value where TRUE/FALSE needed


In [4]:
# Function to check if output files exist and are not empty
check_output_files_existence_and_size <- function(trait_names, output_path_template) {
  results <- list()
  for (trait in trait_names) {
    file_path <- sprintf(output_path_template, trait)
    if (!file.exists(file_path)) {
      results[[trait]] <- sprintf("Output file for %s does not exist.", trait)
    } else {
      file_size <- file.info(file_path)$size  # Get the file size without casting
      if (!is.na(file_size) && file_size > 0) {
        results[[trait]] <- sprintf("Output file for %s is valid with size %f bytes.", trait, file_size)
      } else if (is.na(file_size)) {
        results[[trait]] <- sprintf("Error retrieving file size for %s.", trait)
      } else {
        results[[trait]] <- sprintf("Output file for %s exists but is empty.", trait)
      }
    }
  }
  return(results)
}

# Usage
file_check_results <- check_output_files_existence_and_size(c("bp", "mdd", "scz"), "16a5-OUT_stage2_MWAS_%s.csv")
for (result in file_check_results) {
  cat(result, "\n")
}


Output file for bp is valid with size 16912150884.000000 bytes. 
Output file for mdd does not exist. 
Output file for scz does not exist. 


## Second attempt

In [6]:
library(data.table)
library(ggplot2)

# Initialize logging
log_file <- file("processing_log.txt", open = "wt")
sink(log_file, type = "message")
sink(log_file, type = "output", append = TRUE)

tryCatch({
    traits <- c("bp", "mdd", "scz")
    df$stage2_paths <- gsub(".rds", "_gwas_stat_", df$path)
    df$final_paths <- vector("list", length(df$stage2_paths))

    for (trait in traits) {
        message("Processing trait: ", trait)
        df$final_paths <- paste0(df$stage2_paths, trait, "_results.rds")
        output_file <- paste0("16a6-OUT_stage2_MWAS_", trait, ".csv")
        header_written <- FALSE

        for (i in seq_along(df$final_paths)) {
            if (grepl("empty", df$final_paths[i])) next
            
            message("Processing file ", i, " of ", length(df$final_paths))
            stage2_in <- readRDS(df$final_paths[i])
            stage1_in <- readRDS(df$path[i])
            
            if (length(stage1_in@models) != length(stage2_in@MWASmodels)) {
                stop("Files don't match")
            }

            data_list <- vector("list", length(stage1_in@models))
            for (j in seq_along(stage1_in@models)) {
                model1 <- stage1_in@models[[j]]
                model2 <- stage2_in@MWASmodels[[j]]

                data_list[[j]] <- data.table(
                    z = model2["z"],
                    p = model2["p"],
                    n = model2["n"],
                    pos = model1@methylationPosition,
                    stats = stage2_in@summary_stats_path,
                    scaff = stage1_in@scaffoldIdentifier
                )
            }

            combined_data <- rbindlist(data_list, use.names = TRUE, fill = TRUE)

            # Write data incrementally
            if (!header_written) {
                fwrite(combined_data, output_file)
                header_written <- TRUE
            } else {
                fwrite(combined_data, output_file, append = TRUE)
            }
        }
    }
}, error = function(e) {
    message("An error occurred: ", e$message)
}, finally = {
    sink(NULL)  # Turn off logging
    close(log_file)
    message("Logging ended.")
})



An error occurred: object of type 'closure' is not subsettable



ERROR: Error in close.connection(log_file): cannot close 'message' sink connection
