# Produce tables of stage 2 MWAS results

In [52]:
library(data.table)
library(foreach)
library(doParallel)

In [53]:
getwd()

In [54]:
df <- fread("12-OUT_matched_SNP_meth_cov_outputs.csv")

In [55]:
print(nrow(df))

[1] 16098


In [56]:
library(data.table)
library(ggplot2)

In [57]:
# Initialize logging
log_file <- file("processing_log.txt", open = "wt")
sink(log_file, type = "message")
sink(log_file, type = "output", append = TRUE)

In [58]:
traits <- c("bp", "mdd", "scz")
df$stage2_paths <- gsub(".rds", "_gwas_stat_", df$path)
df$final_paths <- vector("list", length(df$stage2_paths))

In [59]:
for (trait in traits) {
  message("Processing trait: ", trait)
  df$final_paths <- paste0(df$stage2_paths, trait, "_results.rds")
}

Processing trait: bp

Processing trait: mdd

Processing trait: scz



In [None]:
for (trait in traits) {
  message("Processing trait: ", trait)
  df$final_paths <- paste0(df$stage2_paths, trait, "_results.rds")
  output_file <- paste0("16a6-OUT_stage2_MWAS_", trait, ".csv")
  header_written <- FALSE

  for (i in seq_along(df$final_paths)) {
    if (grepl("empty", df$final_paths[i])) next
    
    # Only print a message for every 20th file
    if (i %% 20 == 0) {
      message("Processing file ", i, " of ", length(df$final_paths))
    }

    stage2_in <- readRDS(df$final_paths[i])
    stage1_in <- readRDS(df$path[i])
    
    if (length(stage1_in@models) != length(stage2_in@MWASmodels)) {
      stop("Files don't match")
    }

    data_list <- vector("list", length(stage1_in@models))
    for (j in seq_along(stage1_in@models)) {
      model1 <- stage1_in@models[[j]]
      model2 <- stage2_in@MWASmodels[[j]]

      data_list[[j]] <- data.table(
        z = model2["z"],
        p = model2["p"],
        n = model2["n"],
        pos = model1@methylationPosition,
        stats = stage2_in@summary_stats_path,
        scaff = stage1_in@scaffoldIdentifier
      )
    }

    combined_data <- rbindlist(data_list, use.names = TRUE, fill = TRUE)

    # Write data incrementally
    if (!header_written) {
      fwrite(combined_data, output_file)
      header_written <- TRUE
    } else {
      fwrite(combined_data, output_file, append = TRUE)
    }
  }
}

# Close the log file
sink(type = "message")
sink(type = "output")
close(log_file)

## Trust but verify

In [9]:
which(df$final_paths == "..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1520001-1540000-dynamic-1corestotal-allcorepera-20240505-001208_gwas_stat_bp_results.rds")

## Investigate the files that didn't finish and repeat if needed

In [60]:
df$final_paths[4294]

In [61]:
file.exists(df$final_paths[4294])

In [62]:
file.exists(df$final_paths[4293])

In [63]:
file.exists(df$final_paths[4299])

In [64]:
df$exists <- file.exists(df$final_paths)

In [65]:
levels(factor(df$exists))

In [66]:
table(df$exists)


FALSE  TRUE 
   54 16044 

In [67]:
df[!df$exists, ]

Chr,population,region,chunk_start,chunk_end,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,cov_file,modified_methylation_data,path,stage2_paths,final_paths,exists
<int>,<chr>,<chr>,<int>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>
1,AA,caud,1248982,1268981,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/caud/out/chr1_AA.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_caud.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA_1248982-1268981.rds,..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-20240501-110927-empty.rds,..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-20240501-110927-empty_gwas_stat_,..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-20240501-110927-empty_gwas_stat_scz_results.rds,False
1,AA,caud,1248982,1268981,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/caud/out/chr1_AA.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_caud.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA_1248982-1268981.rds,..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-caud-20240510-132314-empty.rds,..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-caud-20240510-132314-empty_gwas_stat_,..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-caud-20240510-132314-empty_gwas_stat_scz_results.rds,False
1,AA,caud,1268982,1288981,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/caud/out/chr1_AA.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_caud.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA_1268982-1288981.rds,..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1268982-1288981-dynamic-1corestotal-allcorepera-20240501-111229-empty.rds,..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1268982-1288981-dynamic-1corestotal-allcorepera-20240501-111229-empty_gwas_stat_,..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1268982-1288981-dynamic-1corestotal-allcorepera-20240501-111229-empty_gwas_stat_scz_results.rds,False
1,AA,caud,1268982,1288981,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/caud/out/chr1_AA.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_caud.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA_1268982-1288981.rds,..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1268982-1288981-dynamic-1corestotal-allcorepera-caud-20240510-132415-empty.rds,..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1268982-1288981-dynamic-1corestotal-allcorepera-caud-20240510-132415-empty_gwas_stat_,..//output_EXPANSE_a2_caud/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1268982-1288981-dynamic-1corestotal-allcorepera-caud-20240510-132415-empty_gwas_stat_scz_results.rds,False
1,AA,dlpfc,1248982,1268981,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr1_AA.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_dlpfc.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/dlpfc/out/chr1_AA_1248982-1268981.rds,..//output_EXPANSE_a2_dlpfc/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-20240501-114006-empty.rds,..//output_EXPANSE_a2_dlpfc/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-20240501-114006-empty_gwas_stat_,..//output_EXPANSE_a2_dlpfc/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-20240501-114006-empty_gwas_stat_scz_results.rds,False
1,AA,dlpfc,1248982,1268981,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr1_AA.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_dlpfc.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/dlpfc/out/chr1_AA_1248982-1268981.rds,..//output_EXPANSE_a2_dlpfc/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-dlpfc-20240510-190109-empty.rds,..//output_EXPANSE_a2_dlpfc/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-dlpfc-20240510-190109-empty_gwas_stat_,..//output_EXPANSE_a2_dlpfc/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-dlpfc-20240510-190109-empty_gwas_stat_scz_results.rds,False
1,AA,dlpfc,1268982,1288981,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr1_AA.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_dlpfc.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/dlpfc/out/chr1_AA_1268982-1288981.rds,..//output_EXPANSE_a2_dlpfc/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1268982-1288981-dynamic-1corestotal-allcorepera-20240501-114308-empty.rds,..//output_EXPANSE_a2_dlpfc/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1268982-1288981-dynamic-1corestotal-allcorepera-20240501-114308-empty_gwas_stat_,..//output_EXPANSE_a2_dlpfc/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1268982-1288981-dynamic-1corestotal-allcorepera-20240501-114308-empty_gwas_stat_scz_results.rds,False
1,AA,dlpfc,1268982,1288981,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr1_AA.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_dlpfc.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/dlpfc/out/chr1_AA_1268982-1288981.rds,..//output_EXPANSE_a2_dlpfc/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1268982-1288981-dynamic-1corestotal-allcorepera-dlpfc-20240510-190211-empty.rds,..//output_EXPANSE_a2_dlpfc/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1268982-1288981-dynamic-1corestotal-allcorepera-dlpfc-20240510-190211-empty_gwas_stat_,..//output_EXPANSE_a2_dlpfc/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1268982-1288981-dynamic-1corestotal-allcorepera-dlpfc-20240510-190211-empty_gwas_stat_scz_results.rds,False
1,AA,hippo,1248982,1268981,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr1_AA.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr1_AA_1248982-1268981.rds,..//output_EXPANSE_a2_hippo/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-20240501-123446-empty.rds,..//output_EXPANSE_a2_hippo/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-20240501-123446-empty_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-20240501-123446-empty_gwas_stat_scz_results.rds,False
1,AA,hippo,1248982,1268981,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr1_AA.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr1_AA_1248982-1268981.rds,..//output_EXPANSE_a2_hippo/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-hippo-20240511-003819-empty.rds,..//output_EXPANSE_a2_hippo/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-hippo-20240511-003819-empty_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr1-chr1_AA-libd_chr1-chr1_AA-1248982-1268981-dynamic-1corestotal-allcorepera-hippo-20240511-003819-empty_gwas_stat_scz_results.rds,False


Ok, some don't exist because we had empty rds due to no SNPs... okay, but what about the other 76?

In [68]:
df_not_empty_rds_due_to_no_snps_in_window <- df[!grepl("empty", df$path), ]

In [69]:
library(data.table)

# Sort df alphabetically by final_paths
df_not_empty_rds_due_to_no_snps_in_window <- df[!grepl("empty", df$path), ]
setorder(df_not_empty_rds_due_to_no_snps_in_window, final_paths)

In [70]:
# Remove duplicates, keeping the last occurrence
df_not_empty_rds_due_to_no_snps_in_window <- df_not_empty_rds_due_to_no_snps_in_window[
  , .SD[.N], by = modified_methylation_data
]

Even if we remove the ones interrupted...

In [71]:
table(df_not_empty_rds_due_to_no_snps_in_window$exists)


 TRUE 
11421 

In [72]:
df_not_empty_rds_due_to_no_snps_in_window[!which(df_not_empty_rds_due_to_no_snps_in_window$exists), ]

modified_methylation_data,Chr,population,region,chunk_start,chunk_end,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,cov_file,path,stage2_paths,final_paths,exists
<chr>,<int>,<chr>,<chr>,<int>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<lgl>


In [73]:
length(levels(factor(df_not_empty_rds_due_to_no_snps_in_window$modified_methylation_data)))

In [74]:
length(levels(factor(df_not_empty_rds_due_to_no_snps_in_window$modified_methylation_data[which(
    df_not_empty_rds_due_to_no_snps_in_window$exists == TRUE
)])))

What if we don't remove earlier observations

In [19]:
# Sort df alphabetically by final_paths
df_not_empty_rds_due_to_no_snps_in_window <- df[!grepl("empty", df$path), ]
setorder(df_not_empty_rds_due_to_no_snps_in_window, final_paths)

In [20]:
length(levels(factor(df_not_empty_rds_due_to_no_snps_in_window$modified_methylation_data[which(
    df_not_empty_rds_due_to_no_snps_in_window$exists == TRUE
)])))

In [21]:
df[(grepl("1520001", df$modified_methylation_data)), ]

Chr,population,region,chunk_start,chunk_end,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,cov_file,modified_methylation_data,path,stage2_paths,final_paths,exists
<int>,<chr>,<chr>,<int>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>
2,AA,caud,1520001,1540000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/caud/out/chr2_AA.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_caud.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr2_AA_1520001-1540000.rds,..//output_EXPANSE_a2_caud/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240423-151115.rds,..//output_EXPANSE_a2_caud/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240423-151115_gwas_stat_,..//output_EXPANSE_a2_caud/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240423-151115_gwas_stat_scz_results.rds,True
2,AA,caud,1520001,1540000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/caud/out/chr2_AA.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_caud.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr2_AA_1520001-1540000.rds,..//output_EXPANSE_a2_caud/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-caud-20240511-062949.rds,..//output_EXPANSE_a2_caud/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-caud-20240511-062949_gwas_stat_,..//output_EXPANSE_a2_caud/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-caud-20240511-062949_gwas_stat_scz_results.rds,True
2,AA,dlpfc,1520001,1540000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr2_AA.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_dlpfc.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/dlpfc/out/chr2_AA_1520001-1540000.rds,..//output_EXPANSE_a2_dlpfc/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240501-144218.rds,..//output_EXPANSE_a2_dlpfc/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240501-144218_gwas_stat_,..//output_EXPANSE_a2_dlpfc/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240501-144218_gwas_stat_scz_results.rds,True
2,AA,dlpfc,1520001,1540000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr2_AA.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_dlpfc.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/dlpfc/out/chr2_AA_1520001-1540000.rds,..//output_EXPANSE_a2_dlpfc/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240513-213318.rds,..//output_EXPANSE_a2_dlpfc/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240513-213318_gwas_stat_,..//output_EXPANSE_a2_dlpfc/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240513-213318_gwas_stat_scz_results.rds,True
2,AA,dlpfc,1520001,1540000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr2_AA.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_dlpfc.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/dlpfc/out/chr2_AA_1520001-1540000.rds,..//output_EXPANSE_a2_dlpfc/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-dlpfc-20240511-113916.rds,..//output_EXPANSE_a2_dlpfc/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-dlpfc-20240511-113916_gwas_stat_,..//output_EXPANSE_a2_dlpfc/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-dlpfc-20240511-113916_gwas_stat_scz_results.rds,True
2,AA,hippo,1520001,1540000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_AA.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr2_AA_1520001-1540000.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240504-223556.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240504-223556_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240504-223556_gwas_stat_scz_results.rds,True
2,AA,hippo,1520001,1540000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_AA.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr2_AA_1520001-1540000.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240514-024346.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240514-024346_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-20240514-024346_gwas_stat_scz_results.rds,True
2,AA,hippo,1520001,1540000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_AA.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr2_AA_1520001-1540000.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-hippo-20240511-164818.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-hippo-20240511-164818_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_AA-libd_chr2-chr2_AA-1520001-1540000-dynamic-1corestotal-allcorepera-hippo-20240511-164818_gwas_stat_scz_results.rds,True
2,EA,caud,1520001,1540000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/caud/out/chr2_EA.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/EA_caud.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr2_EA_1520001-1540000.rds,..//output_EXPANSE_a2_caud/libd_chr2-chr2_EA-libd_chr2-chr2_EA-1520001-1540000-dynamic-1corestotal-allcorepera-20240424-044347.rds,..//output_EXPANSE_a2_caud/libd_chr2-chr2_EA-libd_chr2-chr2_EA-1520001-1540000-dynamic-1corestotal-allcorepera-20240424-044347_gwas_stat_,..//output_EXPANSE_a2_caud/libd_chr2-chr2_EA-libd_chr2-chr2_EA-1520001-1540000-dynamic-1corestotal-allcorepera-20240424-044347_gwas_stat_scz_results.rds,True
2,EA,caud,1520001,1540000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/caud/out/chr2_EA.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/EA_caud.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr2_EA_1520001-1540000.rds,..//output_EXPANSE_a2_caud/libd_chr2-chr2_EA-libd_chr2-chr2_EA-1520001-1540000-dynamic-1corestotal-allcorepera-caud-20240511-095613.rds,..//output_EXPANSE_a2_caud/libd_chr2-chr2_EA-libd_chr2-chr2_EA-1520001-1540000-dynamic-1corestotal-allcorepera-caud-20240511-095613_gwas_stat_,..//output_EXPANSE_a2_caud/libd_chr2-chr2_EA-libd_chr2-chr2_EA-1520001-1540000-dynamic-1corestotal-allcorepera-caud-20240511-095613_gwas_stat_scz_results.rds,True


In [22]:
df_not_empty_rds_due_to_no_snps_in_window[!df_not_empty_rds_due_to_no_snps_in_window$exists, ]

Chr,population,region,chunk_start,chunk_end,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,cov_file,modified_methylation_data,path,stage2_paths,final_paths,exists
<int>,<chr>,<chr>,<int>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>
2,all,hippo,1940001,1960000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_all.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr2_all_1940001-1960000.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1940001-1960000-dynamic-1corestotal-allcorepera-20240514-045111.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1940001-1960000-dynamic-1corestotal-allcorepera-20240514-045111_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1940001-1960000-dynamic-1corestotal-allcorepera-20240514-045111_gwas_stat_scz_results.rds,False
2,all,hippo,1940001,1960000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_all.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr2_all_1940001-1960000.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1940001-1960000-dynamic-1corestotal-allcorepera-hippo-20240511-185237.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1940001-1960000-dynamic-1corestotal-allcorepera-hippo-20240511-185237_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1940001-1960000-dynamic-1corestotal-allcorepera-hippo-20240511-185237_gwas_stat_scz_results.rds,False
2,all,hippo,1960001,1980000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_all.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr2_all_1960001-1980000.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1960001-1980000-dynamic-1corestotal-allcorepera-20240505-003448.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1960001-1980000-dynamic-1corestotal-allcorepera-20240505-003448_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1960001-1980000-dynamic-1corestotal-allcorepera-20240505-003448_gwas_stat_scz_results.rds,False
2,all,hippo,1960001,1980000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_all.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr2_all_1960001-1980000.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1960001-1980000-dynamic-1corestotal-allcorepera-20240514-045212.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1960001-1980000-dynamic-1corestotal-allcorepera-20240514-045212_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1960001-1980000-dynamic-1corestotal-allcorepera-20240514-045212_gwas_stat_scz_results.rds,False
2,all,hippo,1960001,1980000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_all.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr2_all_1960001-1980000.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1960001-1980000-dynamic-1corestotal-allcorepera-hippo-20240511-185338.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1960001-1980000-dynamic-1corestotal-allcorepera-hippo-20240511-185338_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1960001-1980000-dynamic-1corestotal-allcorepera-hippo-20240511-185338_gwas_stat_scz_results.rds,False
2,all,hippo,1980001,2000000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_all.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr2_all_1980001-2e+06.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1980001-2000000-dynamic-1corestotal-allcorepera-20240505-003549.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1980001-2000000-dynamic-1corestotal-allcorepera-20240505-003549_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1980001-2000000-dynamic-1corestotal-allcorepera-20240505-003549_gwas_stat_scz_results.rds,False
2,all,hippo,1980001,2000000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_all.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr2_all_1980001-2e+06.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1980001-2000000-dynamic-1corestotal-allcorepera-20240514-045315.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1980001-2000000-dynamic-1corestotal-allcorepera-20240514-045315_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1980001-2000000-dynamic-1corestotal-allcorepera-20240514-045315_gwas_stat_scz_results.rds,False
2,all,hippo,1980001,2000000,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_all.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr2_all_1980001-2e+06.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1980001-2000000-dynamic-1corestotal-allcorepera-hippo-20240511-185439.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1980001-2000000-dynamic-1corestotal-allcorepera-hippo-20240511-185439_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-1980001-2000000-dynamic-1corestotal-allcorepera-hippo-20240511-185439_gwas_stat_scz_results.rds,False
2,all,hippo,2000001,2019984,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_all.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr2_all_2000001-2019984.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-2000001-2019984-dynamic-1corestotal-allcorepera-20240505-003652.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-2000001-2019984-dynamic-1corestotal-allcorepera-20240505-003652_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-2000001-2019984-dynamic-1corestotal-allcorepera-20240505-003652_gwas_stat_scz_results.rds,False
2,all,hippo,2000001,2019984,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_all.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_hippo.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/hippo/out/chr2_all_2000001-2019984.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-2000001-2019984-dynamic-1corestotal-allcorepera-20240514-045416.rds,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-2000001-2019984-dynamic-1corestotal-allcorepera-20240514-045416_gwas_stat_,..//output_EXPANSE_a2_hippo/libd_chr2-chr2_all-libd_chr2-chr2_all-2000001-2019984-dynamic-1corestotal-allcorepera-20240514-045416_gwas_stat_scz_results.rds,False


In [17]:
getwd()

## Checking logs etc

In [1]:
# Function to check for errors in the log file
check_log_for_errors <- function(log_path) {
  log_contents <- readLines(log_path)
  errors <- grep("error|stop|interrupted", tolower(log_contents), value = TRUE)
  if (length(errors) > 0) {
    return(list(success = FALSE, messages = errors))
  }
  return(list(success = TRUE))
}

# Usage
log_result <- check_log_for_errors("processing_log.txt")
if (log_result$success) {
  cat("No errors found in log.\n")
} else {
  cat("Errors found in log:\n", paste(log_result$messages, collapse = "\n"))
}


No errors found in log.


In [1]:
library(data.table)

In [4]:
# SLOW Function to validate output files
validate_output_files <- function(trait_names, output_path_template) {
  for (trait in trait_names) {
    file_path <- sprintf(output_path_template, trait)
    if (!file.exists(file_path)) {
      cat(sprintf("Output file for %s does not exist.\n", trait))
      next
    }
    dt <- fread(file_path)
    if (nrow(dt) == 0) {
      cat(sprintf("Output file for %s is empty.\n", trait))
    } else {
      cat(sprintf("Output file for %s has %d rows.\n", trait, nrow(dt)))
    }
  }
}

# Usage
#validate_output_files(c("bp", "mdd", "scz"), "16a5-OUT_stage2_MWAS_%s.csv")


In [2]:
# Function to check if output files exist and are not empty
check_output_files_existence_and_size <- function(trait_names, output_path_template) {
  results <- list()
  for (trait in trait_names) {
    file_path <- sprintf(output_path_template, trait)
    if (!file.exists(file_path)) {
      results[[trait]] <- sprintf("Output file for %s does not exist.", trait)
    } else {
      file_size <- file.info(file_path)$size
      if (file_size > 0) {
        results[[trait]] <- sprintf("Output file for %s is valid with size %d bytes.", trait, file_size)
      } else {
        results[[trait]] <- sprintf("Output file for %s exists but is empty.", trait)
      }
    }
  }
  return(results)
}

# Usage
file_check_results <- check_output_files_existence_and_size(c("bp", "mdd", "scz"), "16a5-OUT_stage2_MWAS_%s.csv")
for (result in file_check_results) {
  cat(result, "\n")
}


ERROR: Error in sprintf("Output file for %s is valid with size %d bytes.", trait, : invalid format '%d'; use format %f, %e, %g or %a for numeric objects


In [3]:
# Function to check if output files exist and are not empty
check_output_files_existence_and_size <- function(trait_names, output_path_template) {
  results <- list()
  for (trait in trait_names) {
    file_path <- sprintf(output_path_template, trait)
    if (!file.exists(file_path)) {
      results[[trait]] <- sprintf("Output file for %s does not exist.", trait)
    } else {
      file_size <- as.integer(file.info(file_path)$size)  # Ensure integer format
      if (file_size > 0) {
        results[[trait]] <- sprintf("Output file for %s is valid with size %d bytes.", trait, file_size)
      } else {
        results[[trait]] <- sprintf("Output file for %s exists but is empty.", trait)
      }
    }
  }
  return(results)
}

# Usage
file_check_results <- check_output_files_existence_and_size(c("bp", "mdd", "scz"), "16a5-OUT_stage2_MWAS_%s.csv")
for (result in file_check_results) {
  cat(result, "\n")
}


“NAs introduced by coercion to integer range”


ERROR: Error in if (file_size > 0) {: missing value where TRUE/FALSE needed


In [4]:
# Function to check if output files exist and are not empty
check_output_files_existence_and_size <- function(trait_names, output_path_template) {
  results <- list()
  for (trait in trait_names) {
    file_path <- sprintf(output_path_template, trait)
    if (!file.exists(file_path)) {
      results[[trait]] <- sprintf("Output file for %s does not exist.", trait)
    } else {
      file_size <- file.info(file_path)$size  # Get the file size without casting
      if (!is.na(file_size) && file_size > 0) {
        results[[trait]] <- sprintf("Output file for %s is valid with size %f bytes.", trait, file_size)
      } else if (is.na(file_size)) {
        results[[trait]] <- sprintf("Error retrieving file size for %s.", trait)
      } else {
        results[[trait]] <- sprintf("Output file for %s exists but is empty.", trait)
      }
    }
  }
  return(results)
}

# Usage
file_check_results <- check_output_files_existence_and_size(c("bp", "mdd", "scz"), "16a5-OUT_stage2_MWAS_%s.csv")
for (result in file_check_results) {
  cat(result, "\n")
}


Output file for bp is valid with size 16912150884.000000 bytes. 
Output file for mdd does not exist. 
Output file for scz does not exist. 


## Second attempt

In [6]:
library(data.table)
library(ggplot2)

# Initialize logging
log_file <- file("processing_log.txt", open = "wt")
sink(log_file, type = "message")
sink(log_file, type = "output", append = TRUE)

tryCatch({
    traits <- c("bp", "mdd", "scz")
    df$stage2_paths <- gsub(".rds", "_gwas_stat_", df$path)
    df$final_paths <- vector("list", length(df$stage2_paths))

    for (trait in traits) {
        message("Processing trait: ", trait)
        df$final_paths <- paste0(df$stage2_paths, trait, "_results.rds")
        output_file <- paste0("16a6-OUT_stage2_MWAS_", trait, ".csv")
        header_written <- FALSE

        for (i in seq_along(df$final_paths)) {
            if (grepl("empty", df$final_paths[i])) next
            
            message("Processing file ", i, " of ", length(df$final_paths))
            stage2_in <- readRDS(df$final_paths[i])
            stage1_in <- readRDS(df$path[i])
            
            if (length(stage1_in@models) != length(stage2_in@MWASmodels)) {
                stop("Files don't match")
            }

            data_list <- vector("list", length(stage1_in@models))
            for (j in seq_along(stage1_in@models)) {
                model1 <- stage1_in@models[[j]]
                model2 <- stage2_in@MWASmodels[[j]]

                data_list[[j]] <- data.table(
                    z = model2["z"],
                    p = model2["p"],
                    n = model2["n"],
                    pos = model1@methylationPosition,
                    stats = stage2_in@summary_stats_path,
                    scaff = stage1_in@scaffoldIdentifier
                )
            }

            combined_data <- rbindlist(data_list, use.names = TRUE, fill = TRUE)

            # Write data incrementally
            if (!header_written) {
                fwrite(combined_data, output_file)
                header_written <- TRUE
            } else {
                fwrite(combined_data, output_file, append = TRUE)
            }
        }
    }
}, error = function(e) {
    message("An error occurred: ", e$message)
}, finally = {
    sink(NULL)  # Turn off logging
    close(log_file)
    message("Logging ended.")
})



An error occurred: object of type 'closure' is not subsettable



ERROR: Error in close.connection(log_file): cannot close 'message' sink connection
