# Produce tables of stage 2 MWAS results

In [1]:
library(data.table)
library(foreach)
library(doParallel)

Loading required package: iterators

Loading required package: parallel



In [2]:
getwd()

In [3]:
df <- fread("12-OUT_matched_SNP_meth_cov_outputs.csv")

In [4]:
print(nrow(df))

[1] 16098


In [5]:
library(data.table)
library(ggplot2)

In [6]:
# Initialize logging
log_file <- file("processing_log.txt", open = "wt")
sink(log_file, type = "message")
sink(log_file, type = "output", append = TRUE)

In [7]:
traits <- c("bp", "mdd", "scz")
df$stage2_paths <- gsub(".rds", "_gwas_stat_", df$path)
df$final_paths <- vector("list", length(df$stage2_paths))

In [8]:
for (trait in traits) {
  message("Processing trait: ", trait)
  df$final_paths <- paste0(df$stage2_paths, trait, "_results.rds")
}

Processing trait: bp

Processing trait: mdd

Processing trait: scz



In [None]:
for (trait in traits) {
  message("Processing trait: ", trait)
  df$final_paths <- paste0(df$stage2_paths, trait, "_results.rds")
  output_file <- paste0("16a6-OUT_stage2_MWAS_", trait, ".csv")
  header_written <- FALSE

  for (i in seq_along(df$final_paths)) {
    if (grepl("empty", df$final_paths[i])) next
    
    # Only print a message for every 20th file
    if (i %% 20 == 0) {
      message("Processing file ", i, " of ", length(df$final_paths))
    }

    stage2_in <- readRDS(df$final_paths[i])
    stage1_in <- readRDS(df$path[i])
    
    if (length(stage1_in@models) != length(stage2_in@MWASmodels)) {
      stop("Files don't match")
    }

    data_list <- vector("list", length(stage1_in@models))
    for (j in seq_along(stage1_in@models)) {
      model1 <- stage1_in@models[[j]]
      model2 <- stage2_in@MWASmodels[[j]]

      data_list[[j]] <- data.table(
        z = model2["z"],
        p = model2["p"],
        n = model2["n"],
        pos = model1@methylationPosition,
        stats = stage2_in@summary_stats_path,
        scaff = stage1_in@scaffoldIdentifier
      )
    }

    combined_data <- rbindlist(data_list, use.names = TRUE, fill = TRUE)

    # Write data incrementally
    if (!header_written) {
      fwrite(combined_data, output_file)
      header_written <- TRUE
    } else {
      fwrite(combined_data, output_file, append = TRUE)
    }
  }
}

# Close the log file
sink(type = "message")
sink(type = "output")
close(log_file)

## Trust but verify

In [9]:
which(df$final_paths == "..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1520001-1540000-dynamic-1corestotal-allcorepera-20240505-001208_gwas_stat_bp_results.rds")

## Investigate the files that didn't finish and repeat if needed

In [10]:
df$final_paths[4294]

In [11]:
file.exists(df$final_paths[4294])

In [12]:
file.exists(df$final_paths[4293])

In [13]:
file.exists(df$final_paths[4299])

In [None]:
df$exists <- file.exists(df$final_paths)

In [None]:
levels(factor(df$exists))

In [None]:
table(df$exists)

In [5]:
getwd()

## Checking logs etc

In [1]:
# Function to check for errors in the log file
check_log_for_errors <- function(log_path) {
  log_contents <- readLines(log_path)
  errors <- grep("error|stop|interrupted", tolower(log_contents), value = TRUE)
  if (length(errors) > 0) {
    return(list(success = FALSE, messages = errors))
  }
  return(list(success = TRUE))
}

# Usage
log_result <- check_log_for_errors("processing_log.txt")
if (log_result$success) {
  cat("No errors found in log.\n")
} else {
  cat("Errors found in log:\n", paste(log_result$messages, collapse = "\n"))
}


No errors found in log.


In [1]:
library(data.table)

In [4]:
# SLOW Function to validate output files
validate_output_files <- function(trait_names, output_path_template) {
  for (trait in trait_names) {
    file_path <- sprintf(output_path_template, trait)
    if (!file.exists(file_path)) {
      cat(sprintf("Output file for %s does not exist.\n", trait))
      next
    }
    dt <- fread(file_path)
    if (nrow(dt) == 0) {
      cat(sprintf("Output file for %s is empty.\n", trait))
    } else {
      cat(sprintf("Output file for %s has %d rows.\n", trait, nrow(dt)))
    }
  }
}

# Usage
#validate_output_files(c("bp", "mdd", "scz"), "16a5-OUT_stage2_MWAS_%s.csv")


In [2]:
# Function to check if output files exist and are not empty
check_output_files_existence_and_size <- function(trait_names, output_path_template) {
  results <- list()
  for (trait in trait_names) {
    file_path <- sprintf(output_path_template, trait)
    if (!file.exists(file_path)) {
      results[[trait]] <- sprintf("Output file for %s does not exist.", trait)
    } else {
      file_size <- file.info(file_path)$size
      if (file_size > 0) {
        results[[trait]] <- sprintf("Output file for %s is valid with size %d bytes.", trait, file_size)
      } else {
        results[[trait]] <- sprintf("Output file for %s exists but is empty.", trait)
      }
    }
  }
  return(results)
}

# Usage
file_check_results <- check_output_files_existence_and_size(c("bp", "mdd", "scz"), "16a5-OUT_stage2_MWAS_%s.csv")
for (result in file_check_results) {
  cat(result, "\n")
}


ERROR: Error in sprintf("Output file for %s is valid with size %d bytes.", trait, : invalid format '%d'; use format %f, %e, %g or %a for numeric objects


In [3]:
# Function to check if output files exist and are not empty
check_output_files_existence_and_size <- function(trait_names, output_path_template) {
  results <- list()
  for (trait in trait_names) {
    file_path <- sprintf(output_path_template, trait)
    if (!file.exists(file_path)) {
      results[[trait]] <- sprintf("Output file for %s does not exist.", trait)
    } else {
      file_size <- as.integer(file.info(file_path)$size)  # Ensure integer format
      if (file_size > 0) {
        results[[trait]] <- sprintf("Output file for %s is valid with size %d bytes.", trait, file_size)
      } else {
        results[[trait]] <- sprintf("Output file for %s exists but is empty.", trait)
      }
    }
  }
  return(results)
}

# Usage
file_check_results <- check_output_files_existence_and_size(c("bp", "mdd", "scz"), "16a5-OUT_stage2_MWAS_%s.csv")
for (result in file_check_results) {
  cat(result, "\n")
}


“NAs introduced by coercion to integer range”


ERROR: Error in if (file_size > 0) {: missing value where TRUE/FALSE needed


In [4]:
# Function to check if output files exist and are not empty
check_output_files_existence_and_size <- function(trait_names, output_path_template) {
  results <- list()
  for (trait in trait_names) {
    file_path <- sprintf(output_path_template, trait)
    if (!file.exists(file_path)) {
      results[[trait]] <- sprintf("Output file for %s does not exist.", trait)
    } else {
      file_size <- file.info(file_path)$size  # Get the file size without casting
      if (!is.na(file_size) && file_size > 0) {
        results[[trait]] <- sprintf("Output file for %s is valid with size %f bytes.", trait, file_size)
      } else if (is.na(file_size)) {
        results[[trait]] <- sprintf("Error retrieving file size for %s.", trait)
      } else {
        results[[trait]] <- sprintf("Output file for %s exists but is empty.", trait)
      }
    }
  }
  return(results)
}

# Usage
file_check_results <- check_output_files_existence_and_size(c("bp", "mdd", "scz"), "16a5-OUT_stage2_MWAS_%s.csv")
for (result in file_check_results) {
  cat(result, "\n")
}


Output file for bp is valid with size 16912150884.000000 bytes. 
Output file for mdd does not exist. 
Output file for scz does not exist. 


## Second attempt

In [6]:
library(data.table)
library(ggplot2)

# Initialize logging
log_file <- file("processing_log.txt", open = "wt")
sink(log_file, type = "message")
sink(log_file, type = "output", append = TRUE)

tryCatch({
    traits <- c("bp", "mdd", "scz")
    df$stage2_paths <- gsub(".rds", "_gwas_stat_", df$path)
    df$final_paths <- vector("list", length(df$stage2_paths))

    for (trait in traits) {
        message("Processing trait: ", trait)
        df$final_paths <- paste0(df$stage2_paths, trait, "_results.rds")
        output_file <- paste0("16a6-OUT_stage2_MWAS_", trait, ".csv")
        header_written <- FALSE

        for (i in seq_along(df$final_paths)) {
            if (grepl("empty", df$final_paths[i])) next
            
            message("Processing file ", i, " of ", length(df$final_paths))
            stage2_in <- readRDS(df$final_paths[i])
            stage1_in <- readRDS(df$path[i])
            
            if (length(stage1_in@models) != length(stage2_in@MWASmodels)) {
                stop("Files don't match")
            }

            data_list <- vector("list", length(stage1_in@models))
            for (j in seq_along(stage1_in@models)) {
                model1 <- stage1_in@models[[j]]
                model2 <- stage2_in@MWASmodels[[j]]

                data_list[[j]] <- data.table(
                    z = model2["z"],
                    p = model2["p"],
                    n = model2["n"],
                    pos = model1@methylationPosition,
                    stats = stage2_in@summary_stats_path,
                    scaff = stage1_in@scaffoldIdentifier
                )
            }

            combined_data <- rbindlist(data_list, use.names = TRUE, fill = TRUE)

            # Write data incrementally
            if (!header_written) {
                fwrite(combined_data, output_file)
                header_written <- TRUE
            } else {
                fwrite(combined_data, output_file, append = TRUE)
            }
        }
    }
}, error = function(e) {
    message("An error occurred: ", e$message)
}, finally = {
    sink(NULL)  # Turn off logging
    close(log_file)
    message("Logging ended.")
})



An error occurred: object of type 'closure' is not subsettable



ERROR: Error in close.connection(log_file): cannot close 'message' sink connection
