# Match up SNP, methylation and covariate files to be analyzed together

As part of this, we will determine which methylation sites have coverage in the SNP data and should be analyzed.

## Match SNP and methylation files

In [1]:
library(stringr)

In [2]:
library(data.table)

In [3]:
SNP_files <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/",
                        #"/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas",
                        pattern = "pgen", full.names = TRUE)
SNP_files <- SNP_files[ordered(SNP_files)]

SNP_files_libd <- data.frame(SNPs_libd = SNP_files[grepl("libd", SNP_files)])
SNP_files_ref_EUR <- data.frame(SNPs_ref_EUR = SNP_files[grepl("ref_EUR", SNP_files)])

SNP_files_libd$Chr <- as.numeric(stringr::str_extract(SNP_files_libd$SNPs_libd, "(?<=chr)\\d+"))
SNP_files_ref_EUR$Chr <- as.numeric(stringr::str_extract(SNP_files_ref_EUR$SNPs_ref_EUR, "(?<=chr)\\d+"))

SNP_files <- merge(SNP_files_libd, SNP_files_ref_EUR)

In [4]:
meth_files <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/pheno", recursive = TRUE, pattern = "rda", full.names = TRUE)

We will bring in original dlpfc and hippo methylation files, and the caudate files that were reprocessed to store everything in given files (where previously those ones had h5 pointers)

In [5]:
#meth_caud <- data.frame(meth_caud = meth_files[grepl("caud", meth_files)])
meth_dlpfc <- data.frame(meth_dlpfc = meth_files[grepl("dlpfc", meth_files)])
meth_hippo <- data.frame(meth_hippo = meth_files[grepl("hippo", meth_files)])

In [6]:
meth_caud_files <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud",
                              recursive = TRUE, pattern = "rda", full.names = TRUE)
meth_caud_files <- meth_caud_files[!grepl("pointers", meth_caud_files)]
meth_caud <- data.frame(meth_caud = meth_caud_files[grepl("caud", meth_caud_files)])

In [7]:
dim(meth_caud)

In [8]:
head(meth_dlpfc)

Unnamed: 0_level_0,meth_dlpfc
Unnamed: 0_level_1,<chr>
1,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr1_AA.rda
2,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr1_all.rda
3,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr1_EA.rda
4,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr10_AA.rda
5,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr10_all.rda
6,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr10_EA.rda


In [9]:
head(meth_caud)

Unnamed: 0_level_0,meth_caud
Unnamed: 0_level_1,<chr>
1,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_AA.rda
2,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_all.rda
3,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_EA.rda
4,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr10_AA.rda
5,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr10_all.rda
6,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr10_EA.rda


In [10]:
meth_caud$Chr <- as.numeric(str_extract(meth_caud$meth_caud, "(?<=chr)\\d+(?=_)"))
meth_dlpfc$Chr <- as.numeric(str_extract(meth_dlpfc$meth_dlpfc, "(?<=chr)\\d+(?=_)"))
meth_hippo$Chr <- as.numeric(str_extract(meth_hippo$meth_hippo, "(?<=chr)\\d+(?=_)"))

In [11]:
reshape_and_rename_corrected <- function(data, region) {
  # Extracting the type (AA, EA, all) based on the filename pattern
  data$Type <- gsub(paste0(".*", region, "/out/chr\\d+_(AA|EA|all)\\.rda"), "\\1", data[[1]])
  
  # Creating a wide dataframe with separate columns for AA, EA, and all
  wide_data <- reshape(data, idvar = "Chr", timevar = "Type", direction = "wide")
  
  # Correcting column names to include only the region and type without redundant parts
  colnames(wide_data)[-1] <- sapply(colnames(wide_data)[-1], function(x) {
    paste(region, gsub(".*\\.(AA|EA|all)$", "\\1", x), sep = "_")
  })
  
  return(wide_data)
}

# Applying the corrected function to each dataset
meth_caud_wide <- reshape_and_rename_corrected(meth_caud, "caud")
meth_dlpfc_wide <- reshape_and_rename_corrected(meth_dlpfc, "dlpfc")
meth_hippo_wide <- reshape_and_rename_corrected(meth_hippo, "hippo")

merged_data <- Reduce(function(x, y) merge(x, y, by = "Chr", all = TRUE), list(meth_caud_wide, meth_dlpfc_wide, meth_hippo_wide))


In [12]:
head(merged_data)

Unnamed: 0_level_0,Chr,caud_AA,caud_all,caud_EA,dlpfc_AA,dlpfc_all,dlpfc_EA,hippo_AA,hippo_all,hippo_EA
Unnamed: 0_level_1,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr1_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr1_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr1_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr1_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr1_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr1_EA.rda
2,2,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr2_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr2_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr2_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr2_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr2_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr2_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr2_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr2_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr2_EA.rda
3,3,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr3_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr3_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr3_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr3_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr3_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr3_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr3_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr3_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr3_EA.rda
4,4,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr4_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr4_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr4_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr4_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr4_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr4_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr4_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr4_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr4_EA.rda
5,5,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr5_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr5_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr5_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr5_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr5_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr5_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr5_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr5_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr5_EA.rda
6,6,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr6_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr6_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr6_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr6_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr6_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr6_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr6_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr6_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr6_EA.rda


In [13]:
head(SNP_files)

Unnamed: 0_level_0,Chr,SNPs_libd,SNPs_ref_EUR
Unnamed: 0_level_1,<dbl>,<chr>,<chr>
1,1,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr1.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//ref_EUR_chr1.pgen
2,2,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr2.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//ref_EUR_chr2.pgen
3,3,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr3.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//ref_EUR_chr3.pgen
4,4,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr4.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//ref_EUR_chr4.pgen
5,5,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr5.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//ref_EUR_chr5.pgen
6,6,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr6.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//ref_EUR_chr6.pgen


In [14]:
merged <- merge(SNP_files, merged_data)

In [15]:
data.table::fwrite(merged, "09-OUT_matched_SNP_meth_files.csv")

## Determine first, last position for each SNP, methylation file pair

In [16]:
merged$SNPs_ref_EUR <- NULL

In [17]:
head(merged)

Unnamed: 0_level_0,Chr,SNPs_libd,caud_AA,caud_all,caud_EA,dlpfc_AA,dlpfc_all,dlpfc_EA,hippo_AA,hippo_all,hippo_EA
Unnamed: 0_level_1,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr1.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr1_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr1_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr1_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr1_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr1_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr1_EA.rda
2,2,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr2.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr2_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr2_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr2_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr2_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr2_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr2_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr2_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr2_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr2_EA.rda
3,3,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr3.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr3_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr3_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr3_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr3_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr3_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr3_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr3_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr3_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr3_EA.rda
4,4,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr4.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr4_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr4_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr4_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr4_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr4_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr4_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr4_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr4_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr4_EA.rda
5,5,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr5.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr5_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr5_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr5_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr5_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr5_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr5_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr5_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr5_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr5_EA.rda
6,6,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr6.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr6_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr6_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr6_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr6_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr6_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/dlpfc/out/chr6_EA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr6_AA.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr6_all.rda,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/hippo/out/chr6_EA.rda


In [18]:
# Load data.table
library(data.table)

# Assuming 'merged' is your data.table with the wide format
# Set it as a data.table if it's not already
setDT(merged)

# Melt the data.table from wide to long format
# id.vars = "Chr" to keep the chromosome column as identifier
# measure.vars patterns to match SNP data and methylation data columns
long_format <- melt(
  merged,
  id.vars = "Chr",
  measure.vars = patterns("^SNPs_libd", "caud_.*|dlpfc_.*|hippo_.*"),
  value.name = c("SNP_data", "methylation_data"),
  variable.name = "type"
)

# Adjust 'type' to distinguish between SNP and methylation data if needed
long_format[, type := ifelse(grepl("SNPs_libd", type), "pgen", "rda")]

# View the head of the long format table
head(long_format)

Chr,type,SNP_data,methylation_data
<dbl>,<chr>,<chr>,<chr>
1,rda,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr1.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_AA.rda
2,rda,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr2.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr2_AA.rda
3,rda,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr3.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr3_AA.rda
4,rda,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr4.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr4_AA.rda
5,rda,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr5.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr5_AA.rda
6,rda,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr6.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr6_AA.rda


In [19]:
# Load the data.table library
library(data.table)

# Assuming 'merged' is already a data.table
# If it's not, convert it using setDT(merged)

# Select SNP column and reshape methylation columns into long format
# Extract column names for methylation data
methylation_columns <- grep("caud_|dlpfc_|hippo_", names(merged), value = TRUE)

# Create a long format table
long_format <- melt(merged, id.vars = "Chr", measure.vars = methylation_columns, variable.name = "Methylation_Type", value.name = "methylation_data")

# Add SNP data to the long format data.table
long_format[, SNP_data := merged[.SD, on = "Chr", SNPs_libd]]

# Remove Methylation_Type column and arrange columns as specified
long_format[, `Methylation_Type` := NULL]
setcolorder(long_format, c("Chr", "SNP_data", "methylation_data"))

# Result
head(long_format)


Chr,SNP_data,methylation_data
<dbl>,<chr>,<chr>
1,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr1.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_AA.rda
2,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr2.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr2_AA.rda
3,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr3.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr3_AA.rda
4,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr4.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr4_AA.rda
5,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr5.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr5_AA.rda
6,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr6.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr6_AA.rda


In [20]:
library(bsseq)

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:data.table’:

    first, second


The following object is masked from ‘package:utils’:

    findMatches


The following objects are masked from ‘package:base’:

    expand.grid, I, unname

### Test on first row

In [21]:
dim(long_format)
head(long_format)
Sys.time()

Chr,SNP_data,methylation_data
<dbl>,<chr>,<chr>
1,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr1.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_AA.rda
2,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr2.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr2_AA.rda
3,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr3.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr3_AA.rda
4,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr4.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr4_AA.rda
5,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr5.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr5_AA.rda
6,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr6.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr6_AA.rda


[1] "2024-04-08 13:19:23 PDT"

In [22]:
i <- 1

In [32]:
pvar <- fread(gsub("pgen", "pvar", long_format$SNP_data[i]))[, 1:3]
load(long_format$methylation_data[i])
first_SNP_position <- min(pvar$POS)
last_SNP_position <- max(pvar$POS)
first_meth_position <- min(start(BSobj2))
last_meth_position <- max(start(BSobj2))
window_size <- 10000
# Find first methylation index with SNP coverage
meth_indices_w_SNP_coverage <- which(start(BSobj2) >= (first_SNP_position - window_size) &
                                   start(BSobj2) <= (last_SNP_position + window_size))
first_meth_index_with_SNP_coverage <- min(meth_indices_w_SNP_coverage)
# Correcting the approach to find the last methylation index with SNP coverage
last_meth_index_with_SNP_coverage <- max(meth_indices_w_SNP_coverage)

first_meth_value_with_SNP_coverage <- start(BSobj2)[first_meth_index_with_SNP_coverage]
last_meth_value_with_SNP_coverage <- start(BSobj2)[last_meth_index_with_SNP_coverage]

In [33]:
first_meth_index_with_SNP_coverage

In [34]:
first_SNP_position

In [35]:
max(pvar$POS)

In [36]:
if(!identical(start(BSobj2), sort(start(BSobj2)))) {
  stop("start positions in BSobj2 are not ordered")
}

BSobj2 <- NULL

In [37]:
Sys.time()

[1] "2024-04-08 13:22:10 PDT"

### Sanity checks

In [38]:
first_meth_value_with_SNP_coverage

In [39]:
last_meth_value_with_SNP_coverage

In [40]:
first_meth_index_with_SNP_coverage

In [41]:
last_meth_index_with_SNP_coverage

### Deploy

In [None]:
i

In [None]:
BSobj2

In [None]:
long_format$first_snp_position <-
long_format$last_snp_position <-
long_format$last_meth_index <-
long_format$first_meth_position <- 
long_format$last_meth_position <-
long_format$first_meth_index_with_SNP_coverage <-
long_format$last_meth_index_with_SNP_coverage <- 
long_format$first_meth_value_with_SNP_coverage <-
long_format$last_meth_value_with_SNP_coverage <- 
NA

start_time <- Sys.time()

for(i in 1:nrow(long_format)){
    print(i)
    iter_start_time <- Sys.time()
    
    pvar <- fread(gsub("pgen", "pvar", long_format$SNP_data[i]))[, 1:3]
    load(long_format$methylation_data[i])

    if(!identical(start(BSobj2), sort(start(BSobj2)))) {
      stop("start positions in BSobj2 are not ordered")
    }
    
    first_SNP_position <- min(pvar$POS)
    last_SNP_position <- max(pvar$POS)
    first_meth_position <- min(start(BSobj2))
    last_meth_position <- max(start(BSobj2))
    window_size <- 10000
    # Find first methylation index with SNP coverage
    meth_indices_w_SNP_coverage <- which(start(BSobj2) >= (first_SNP_position - window_size) &
                                         start(BSobj2) <= (last_SNP_position + window_size))
    first_meth_index_with_SNP_coverage <- min(meth_indices_w_SNP_coverage)
    # Correcting the approach to find the last methylation index with SNP coverage
    last_meth_index_with_SNP_coverage <- max(meth_indices_w_SNP_coverage)

    first_meth_value_with_SNP_coverage <- start(BSobj2)[first_meth_index_with_SNP_coverage]
    last_meth_value_with_SNP_coverage <- start(BSobj2)[last_meth_index_with_SNP_coverage]

    long_format$first_snp_position[i] <- first_SNP_position
    long_format$last_snp_position[i] <- last_SNP_position
    long_format$last_meth_index[i] <- length(start(BSobj2))
    BSobj2 <- NULL
    long_format$first_meth_position[i] <- first_meth_position
    long_format$last_meth_position[i] <- last_meth_position
    
    long_format$first_meth_index_with_SNP_coverage[i] <- first_meth_index_with_SNP_coverage
    long_format$last_meth_index_with_SNP_coverage[i] <- last_meth_index_with_SNP_coverage
    long_format$first_meth_value_with_SNP_coverage[i] <- first_meth_value_with_SNP_coverage
    long_format$last_meth_value_with_SNP_coverage[i] <- last_meth_value_with_SNP_coverage
    
    iter_end_time <- Sys.time()
    elapsed_time_per_iter <- iter_end_time - iter_start_time
    total_elapsed_time <- Sys.time() - start_time
    average_time_per_iter <- total_elapsed_time / i
    estimated_total_time <- average_time_per_iter * nrow(long_format)
    estimated_time_remaining <- estimated_total_time - total_elapsed_time

    print(sprintf("Ran %d of %d so far. Estimated time remaining: %s minutes\n", 
                i, nrow(long_format), round(as.numeric(estimated_time_remaining) / 60, 2)))

}

In [None]:
fwrite(long_format, "09-OUT_matched_files_and_indices_to_test_a3.csv")

In [3]:
long_format_a2 <- fread("09-OUT_matched_files_and_indices_to_test_a2.csv")

In [4]:
long_format <- fread("09-OUT_matched_files_and_indices_to_test_a3.csv")

In [5]:
head(long_format_a2)

Chr,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage
<int>,<chr>,<chr>,<int>,<int>,<int>,<int>
1,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr1.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_AA.rda,248918358,1069461,2202702,8982
2,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr2.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr2_AA.rda,241863783,10001,2019984,1
3,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr3.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr3_AA.rda,198099789,11602,1538467,1
4,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr4.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr4_AA.rda,189877411,69399,1387731,1
5,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr5.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr5_AA.rda,181172584,44104,1409038,1
6,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr6.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr6_AA.rda,170619093,192453,1412543,1138


In [6]:
head(long_format)

Chr,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,last_meth_position,first_meth_position,last_meth_index,last_snp_position,first_snp_position
<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr1.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_AA.rda,248918358,1069461,2202702,8982,248932459,792731,2202819,248908368,1079456
2,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr2.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr2_AA.rda,241863783,10001,2019984,1,242110081,10001,2025108,241853790,10797
3,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr3.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr3_AA.rda,198099789,11602,1538467,1,198099789,11602,1538467,198114462,18519
4,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr4.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr4_AA.rda,189877411,69399,1387731,1,190048422,69399,1388072,189974231,68894
5,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr5.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr5_AA.rda,181172584,44104,1409038,1,181172584,44104,1409038,181286423,25361
6,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr6.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr6_AA.rda,170619093,192453,1412543,1138,170745956,60470,1413875,170609176,202452


Started around 2pm

## Prepare covariate files

### Get paths

In [None]:
# genotype_pc <- list.files("/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs",
#                           recursive = TRUE,
#                           full.names = TRUE,
#                           pattern = "pca.eigenvec")

# genotype_pc <- data.frame(genotype_pc_path = genotype_pc, brain_region = c("caud", "dlpfc", "hippo"))

# genotype_pc <- rbind(genotype_pc, genotype_pc, genotype_pc)

# genotype_pc$subpopulation <- c("EA", "EA", "EA", "AA", "AA", "AA", "all", "all", "all")

# genotype_pc

# meth_pc <- list.files("/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/",
#                       recursive = TRUE,
#                       full.names = TRUE,
#                       pattern = "pc_")

# meth_pc <- data.table(meth_pc_path = meth_pc)

# meth_pc[, subpopulation := sub(".*pc_(.*)\\.csv", "\\1", meth_pc_path)]
# meth_pc[, brain_region := sub(".*/pheno//(.*)/out/.*", "\\1", meth_pc_path)]

# meth_pc

# cov3 <- "/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC"

### Format, merge with data frame

In [None]:
# long_format[, subpopulation := sub(".*_(.*)\\.rda", "\\1", methylation_data)]
# long_format[, brain_region := sub(".*/pheno/(.*)/out/.*", "\\1", methylation_data)]

# merged <- merge(long_format, genotype_pc)

# merged <- merge(merged, meth_pc)

# merged$pheno_pc <- cov3

# head(merged)

### Unite covariate files for each row into a single file

In [None]:
# outdir <- "/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/"
# #dir.create(outdir)

#### test for first row

In [None]:
# colnames(merged)

# dim(merged)



# merged_pc_only <- unique(merged[, c(1, 2, 10:12)])

# merged_pc_only

# i <- 1

# geno_pc <- fread(merged$genotype_pc_path[i])
# meth_pc <- fread(merged$meth_pc_path[i])
# pheno_pc <- fread(merged$pheno_pc[i])

# meth_pc <- meth_pc[, 1:11]

# geno_pc <- geno_pc[, 1:4]

# pheno_pc <- pheno_pc[, c(2, 6, 7, 8)]

# colnames(meth_pc)[1] <- colnames(pheno_pc)[1] <- colnames(geno_pc)[1] <- "ID"

# colnames(meth_pc)[2:11] <- paste0("methPC", 1:10)

# colnames(geno_pc)[2:4] <- paste0("genoPC", 1:3)

# geno_pc$ID <- gsub("Br0", "Br", geno_pc$ID)
# pheno_pc$ID <- gsub("Br0", "Br", pheno_pc$ID)
# meth_pc$ID <- gsub("Br0", "Br", meth_pc$ID)

# #dim(meth_pc)
# #dim(geno_pc)
# #dim(pheno_pc)

# unified_pc <- merge(geno_pc, pheno_pc)
# #dim(unified_pc)
# unified_pc <- merge(unified_pc, meth_pc)
# #dim(unified_pc)

# if(nrow(unified_pc) != nrow(meth_pc)){
#     stop(paste0("We have a mismatch for row ", i))
#     }

# outname <- paste0(outdir, merged$subpopulation[i], "_", merged$brain_region[i], ".csv")
# fwrite(merged_pc_only, outname)

#### Run for all

In [None]:
# for(i in 1:nrow(merged_pc_only)){
#     geno_pc <- fread(merged_pc_only$genotype_pc_path[i])
#     meth_pc <- fread(merged_pc_only$meth_pc_path[i])
#     pheno_pc <- fread(merged_pc_only$pheno_pc[i])

#     meth_pc <- meth_pc[, 1:11]

#     geno_pc <- geno_pc[, 1:4]

#     pheno_pc <- pheno_pc[, c(2, 6, 7, 8)]

#     colnames(meth_pc)[1] <- colnames(pheno_pc)[1] <- colnames(geno_pc)[1] <- "ID"

#     colnames(meth_pc)[2:11] <- paste0("methPC", 1:10)

#     colnames(geno_pc)[2:4] <- paste0("genoPC", 1:3)

#     geno_pc$ID <- gsub("Br0", "Br", geno_pc$ID)
#     pheno_pc$ID <- gsub("Br0", "Br", pheno_pc$ID)
#     pheno_pc$Dx <- gsub("chizo", "CZ", pheno_pc$Dx)
#     meth_pc$ID <- gsub("Br0", "Br", meth_pc$ID)

#     #dim(meth_pc)
#     #dim(geno_pc)
#     #dim(pheno_pc)

#     unified_pc <- merge(geno_pc, meth_pc)
#     #dim(unified_pc)
#     unified_pc <- merge(unified_pc, pheno_pc)
#     #dim(unified_pc)

#     if(nrow(unified_pc) != nrow(meth_pc)){
#         print(setdiff(meth_pc$ID, unified_pc$ID))
#         warning(paste0("We have a mismatch for row ", i))
#         print(merged_pc_only[i, ])
#         }
#     outname <- paste0(outdir, merged_pc_only$subpopulation[i], "_", merged_pc_only$brain_region[i], ".csv")
#     fwrite(unified_pc, outname)
    
#     }


# outname

# merged$full_covariate_path <-
# paste0(outdir, merged$subpopulation, "_", merged$brain_region, ".csv")

# fread("/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_caud.csv")

## Match covariate files to corresponding datasets

In [7]:
list.files("../../full_covariates/")

In [8]:
head(long_format)

Chr,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage
<int>,<chr>,<chr>,<int>,<int>,<int>,<int>
1,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr1.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_AA.rda,248918358,1069461,2202702,8982
2,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr2.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr2_AA.rda,241863783,10001,2019984,1
3,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr3.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr3_AA.rda,198099789,11602,1538467,1
4,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr4.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr4_AA.rda,189877411,69399,1387731,1
5,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr5.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr5_AA.rda,181172584,44104,1409038,1
6,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr6.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr6_AA.rda,170619093,192453,1412543,1138


In [10]:
long_format$population <- str_split_fixed(long_format$methylation_data,
                                          pattern = "_",
                                          2)[, 2]

In [12]:
long_format$population <- gsub("\\.rda", "", long_format$population)

In [13]:
long_format$population

In [14]:
long_format$region <- str_split_fixed(long_format$methylation_data,
                                      pattern = "pheno/",
                                      2)[, 2]
long_format$region <- str_split_fixed(long_format$region,
                                      pattern = "/out",
                                      2)[, 1]

In [15]:
head(long_format$region)

In [17]:
list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates")

In [19]:
long_format$cov_file <- paste0("/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates/",
                               long_format$population,
                               "_",
                               long_format$region,
                               ".csv")

In [20]:
file.exists(long_format$cov_file[1])

In [21]:
fwrite(long_format, "09-OUT_matched_SNP_meth_cov.csv")

## Inspect files

In [22]:
files <- unique(long_format$cov_file)

In [25]:
files

In [24]:
for(file in files){
    print(file)
    file_in <- fread(file)
    print(file_in)
    cat("\n\n")
}

[1] "/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates/AA_caud.csv"
         ID   genoPC1     genoPC2    genoPC3    methPC1    methPC2     methPC3
     <char>     <num>       <num>      <num>      <num>      <num>       <num>
  1: Br1003 0.0591231 -0.03980770  0.0205308 -157.04647   22.43150   -3.613153
  2: Br1004 0.0507512  0.01670030 -0.0235019 -207.97793   14.56387  -48.109417
  3: Br1007 0.0683202  0.00502359 -0.0595899  119.29338 -184.28704 -127.987516
  4: Br1017 0.0685061 -0.00844357 -0.0109324  148.35140  -89.71558  -97.742602
  5: Br1021 0.0596699  0.00367841  0.0479552  453.47100 -132.38714   35.349695
 ---                                                                          
160:  Br948 0.0639954  0.00695405 -0.0100903  413.58583 -263.70724    4.226894
161:  Br949 0.0712634 -0.02716360  0.0572070 -200.93232  265.80379   56.822449
162:  Br963 0.0120633  0.01905100 -0.0894283  -27.92241 -144.83283  -66.482063
163:  Br991 0.0436291  0.01022270 -0.0078481  -42.06