# Match up SNP, methylation and covariate files to be analyzed together

As part of this, we will determine which methylation sites have coverage in the SNP data and should be analyzed.

## Match SNP and methylation files

In [1]:
library(stringr)

In [2]:
SNP_files <- list.files(#"/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/",
                        "/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas",
                        pattern = "pgen", full.names = TRUE)
SNP_files <- SNP_files[ordered(SNP_files)]

SNP_files_libd <- data.frame(SNPs_libd = SNP_files[grepl("libd", SNP_files)])
SNP_files_ref_EUR <- data.frame(SNPs_ref_EUR = SNP_files[grepl("ref_EUR", SNP_files)])

SNP_files_libd$Chr <- as.numeric(stringr::str_extract(SNP_files_libd$SNPs_libd, "(?<=chr)\\d+"))
SNP_files_ref_EUR$Chr <- as.numeric(stringr::str_extract(SNP_files_ref_EUR$SNPs_ref_EUR, "(?<=chr)\\d+"))

SNP_files <- merge(SNP_files_libd, SNP_files_ref_EUR)

In [3]:
meth_files <- list.files("/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno", recursive = TRUE, pattern = "rda", full.names = TRUE)

We will bring in original dlpfc and hippo methylation files, and the caudate files that were reprocessed to store everything in given files (where previously those ones had h5 pointers)

In [4]:
#meth_caud <- data.frame(meth_caud = meth_files[grepl("caud", meth_files)])
meth_dlpfc <- data.frame(meth_dlpfc = meth_files[grepl("dlpfc", meth_files)])
meth_hippo <- data.frame(meth_hippo = meth_files[grepl("hippo", meth_files)])

In [5]:
meth_caud_files <- list.files("/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud",
                              recursive = TRUE, pattern = "rda", full.names = TRUE)
meth_caud <- data.frame(meth_caud = meth_caud_files[grepl("caud", meth_caud_files)])

In [6]:
head(meth_dlpfc)

Unnamed: 0_level_0,meth_dlpfc
Unnamed: 0_level_1,<chr>
1,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr1_AA.rda
2,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr1_all.rda
3,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr1_EA.rda
4,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr10_AA.rda
5,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr10_all.rda
6,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr10_EA.rda


In [7]:
head(meth_caud)

Unnamed: 0_level_0,meth_caud
Unnamed: 0_level_1,<chr>
1,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA.rda
2,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_all.rda
3,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_EA.rda
4,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr10_AA.rda
5,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr10_all.rda
6,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr10_EA.rda


In [8]:
meth_caud$Chr <- as.numeric(str_extract(meth_caud$meth_caud, "(?<=chr)\\d+(?=_)"))
meth_dlpfc$Chr <- as.numeric(str_extract(meth_dlpfc$meth_dlpfc, "(?<=chr)\\d+(?=_)"))
meth_hippo$Chr <- as.numeric(str_extract(meth_hippo$meth_hippo, "(?<=chr)\\d+(?=_)"))

In [9]:
reshape_and_rename_corrected <- function(data, region) {
  # Extracting the type (AA, EA, all) based on the filename pattern
  data$Type <- gsub(paste0(".*", region, "/out/chr\\d+_(AA|EA|all)\\.rda"), "\\1", data[[1]])
  
  # Creating a wide dataframe with separate columns for AA, EA, and all
  wide_data <- reshape(data, idvar = "Chr", timevar = "Type", direction = "wide")
  
  # Correcting column names to include only the region and type without redundant parts
  colnames(wide_data)[-1] <- sapply(colnames(wide_data)[-1], function(x) {
    paste(region, gsub(".*\\.(AA|EA|all)$", "\\1", x), sep = "_")
  })
  
  return(wide_data)
}

# Applying the corrected function to each dataset
meth_caud_wide <- reshape_and_rename_corrected(meth_caud, "caud")
meth_dlpfc_wide <- reshape_and_rename_corrected(meth_dlpfc, "dlpfc")
meth_hippo_wide <- reshape_and_rename_corrected(meth_hippo, "hippo")

merged_data <- Reduce(function(x, y) merge(x, y, by = "Chr", all = TRUE), list(meth_caud_wide, meth_dlpfc_wide, meth_hippo_wide))


In [10]:
head(merged_data)

Unnamed: 0_level_0,Chr,caud_AA,caud_all,caud_EA,dlpfc_AA,dlpfc_all,dlpfc_EA,hippo_AA,hippo_all,hippo_EA
Unnamed: 0_level_1,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_all.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr1_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr1_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr1_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr1_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr1_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr1_EA.rda
2,2,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr2_AA.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr2_all.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr2_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr2_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr2_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr2_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_EA.rda
3,3,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr3_AA.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr3_all.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr3_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr3_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr3_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr3_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr3_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr3_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr3_EA.rda
4,4,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr4_AA.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr4_all.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr4_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr4_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr4_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr4_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr4_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr4_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr4_EA.rda
5,5,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr5_AA.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr5_all.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr5_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr5_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr5_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr5_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr5_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr5_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr5_EA.rda
6,6,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr6_AA.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr6_all.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr6_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr6_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr6_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr6_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr6_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr6_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr6_EA.rda


In [11]:
head(SNP_files)

Unnamed: 0_level_0,Chr,SNPs_libd,SNPs_ref_EUR
Unnamed: 0_level_1,<dbl>,<chr>,<chr>
1,1,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/ref_EUR_chr1.pgen
2,2,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr2.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/ref_EUR_chr2.pgen
3,3,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr3.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/ref_EUR_chr3.pgen
4,4,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr4.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/ref_EUR_chr4.pgen
5,5,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr5.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/ref_EUR_chr5.pgen
6,6,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr6.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/ref_EUR_chr6.pgen


In [12]:
merged <- merge(SNP_files, merged_data)

In [13]:
data.table::fwrite(merged, "09-OUT_matched_SNP_meth_files.csv")

## Determine first, last position for each SNP, methylation file pair

In [14]:
merged$SNPs_ref_EUR <- NULL

In [15]:
head(merged)

Unnamed: 0_level_0,Chr,SNPs_libd,caud_AA,caud_all,caud_EA,dlpfc_AA,dlpfc_all,dlpfc_EA,hippo_AA,hippo_all,hippo_EA
Unnamed: 0_level_1,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,1,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr1.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_all.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr1_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr1_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr1_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr1_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr1_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr1_EA.rda
2,2,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr2.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr2_AA.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr2_all.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr2_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr2_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr2_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr2_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr2_EA.rda
3,3,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr3.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr3_AA.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr3_all.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr3_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr3_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr3_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr3_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr3_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr3_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr3_EA.rda
4,4,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr4.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr4_AA.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr4_all.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr4_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr4_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr4_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr4_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr4_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr4_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr4_EA.rda
5,5,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr5.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr5_AA.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr5_all.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr5_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr5_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr5_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr5_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr5_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr5_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr5_EA.rda
6,6,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr6.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr6_AA.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr6_all.rda,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr6_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr6_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr6_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/dlpfc/out/chr6_EA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr6_AA.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr6_all.rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/hippo/out/chr6_EA.rda


In [16]:
# Load data.table
library(data.table)

# Assuming 'merged' is your data.table with the wide format
# Set it as a data.table if it's not already
setDT(merged)

# Melt the data.table from wide to long format
# id.vars = "Chr" to keep the chromosome column as identifier
# measure.vars patterns to match SNP data and methylation data columns
long_format <- melt(
  merged,
  id.vars = "Chr",
  measure.vars = patterns("^SNPs_libd", "caud_.*|dlpfc_.*|hippo_.*"),
  value.name = c("SNP_data", "methylation_data"),
  variable.name = "type"
)

# Adjust 'type' to distinguish between SNP and methylation data if needed
long_format[, type := ifelse(grepl("SNPs_libd", type), "pgen", "rda")]

# View the head of the long format table
head(long_format)

Chr,type,SNP_data,methylation_data
<dbl>,<chr>,<chr>,<chr>
1,rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr1.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA.rda
2,rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr2.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr2_AA.rda
3,rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr3.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr3_AA.rda
4,rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr4.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr4_AA.rda
5,rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr5.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr5_AA.rda
6,rda,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr6.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr6_AA.rda


In [17]:
# Load the data.table library
library(data.table)

# Assuming 'merged' is already a data.table
# If it's not, convert it using setDT(merged)

# Select SNP column and reshape methylation columns into long format
# Extract column names for methylation data
methylation_columns <- grep("caud_|dlpfc_|hippo_", names(merged), value = TRUE)

# Create a long format table
long_format <- melt(merged, id.vars = "Chr", measure.vars = methylation_columns, variable.name = "Methylation_Type", value.name = "methylation_data")

# Add SNP data to the long format data.table
long_format[, SNP_data := merged[.SD, on = "Chr", SNPs_libd]]

# Remove Methylation_Type column and arrange columns as specified
long_format[, `Methylation_Type` := NULL]
setcolorder(long_format, c("Chr", "SNP_data", "methylation_data"))

# Result
head(long_format)


Chr,SNP_data,methylation_data
<dbl>,<chr>,<chr>
1,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr1.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA.rda
2,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr2.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr2_AA.rda
3,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr3.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr3_AA.rda
4,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr4.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr4_AA.rda
5,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr5.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr5_AA.rda
6,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr6.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr6_AA.rda


In [18]:
library(bsseq)

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:data.table’:

    first, second


The following object is masked from ‘package:utils’:

    findMatches


The following objects are masked from ‘package:base’:

    expand.grid, I, unname

### Test on first row

In [19]:
dim(long_format)
head(long_format)
Sys.time()

Chr,SNP_data,methylation_data
<dbl>,<chr>,<chr>
1,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr1.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA.rda
2,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr2.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr2_AA.rda
3,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr3.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr3_AA.rda
4,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr4.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr4_AA.rda
5,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr5.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr5_AA.rda
6,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr6.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr6_AA.rda


[1] "2024-03-06 14:15:54 EST"

In [20]:
i <- 1

In [21]:
pvar <- fread(gsub("pgen", "pvar", long_format$SNP_data[i]))[, 1:3]
load(long_format$methylation_data[i])
first_SNP_position <- min(pvar$POS)
last_SNP_position <- max(pvar$POS)
first_meth_position <- min(start(BSobj2))
last_meth_position <- max(start(BSobj2))
window_size <- 10000
# Find first methylation index with SNP coverage
first_meth_index_with_SNP_coverage <- which(start(BSobj2) >= (first_SNP_position - window_size) & 
                                             start(BSobj2) <= (last_SNP_position + window_size))[1]
# Correcting the approach to find the last methylation index with SNP coverage
last_meth_index_with_SNP_coverage <- which(start(BSobj2) >= (first_SNP_position - window_size) & 
                                            start(BSobj2) <= (last_SNP_position + window_size))
if(length(last_meth_index_with_SNP_coverage) > 0) {
  last_meth_index_with_SNP_coverage <- last_meth_index_with_SNP_coverage[length(last_meth_index_with_SNP_coverage)]
} else {
  last_meth_index_with_SNP_coverage <- NA # In case no indices are found
}

# Values for the first and last methylation sites with SNP coverage
if(!is.na(first_meth_index_with_SNP_coverage)) {
  first_meth_value_with_SNP_coverage <- start(BSobj2)[first_meth_index_with_SNP_coverage]
} else {
  first_meth_value_with_SNP_coverage <- NA # In case no first index is found
}

if(!is.na(last_meth_index_with_SNP_coverage)) {
  last_meth_value_with_SNP_coverage <- start(BSobj2)[last_meth_index_with_SNP_coverage]
} else {
  last_meth_value_with_SNP_coverage <- NA # In case no last index is found
}

if(!identical(start(BSobj2), sort(start(BSobj2)))) {
  stop("start positions in BSobj2 are not ordered")
}

BSobj2 <- NULL


In [22]:
Sys.time()

[1] "2024-03-06 14:16:55 EST"

### Sanity checks

In [23]:
first_meth_value_with_SNP_coverage

In [24]:
last_meth_value_with_SNP_coverage

In [25]:
first_meth_index_with_SNP_coverage

In [26]:
last_meth_index_with_SNP_coverage

### Deploy

In [27]:
long_format$first_meth_index_with_SNP_coverage <-
long_format$last_meth_index_with_SNP_coverage <- 
long_format$first_meth_value_with_SNP_coverage <-
long_format$last_meth_value_with_SNP_coverage <- 
NA

start_time <- Sys.time()

for(i in 1:nrow(long_format)){
    print(i)
    iter_start_time <- Sys.time()
    
    pvar <- fread(gsub("pgen", "pvar", long_format$SNP_data[i]))[, 1:3]
    load(long_format$methylation_data[i])

    if(!identical(start(BSobj2), sort(start(BSobj2)))) {
      stop("start positions in BSobj2 are not ordered")
    }
    
    first_SNP_position <- min(pvar$POS)
    last_SNP_position <- max(pvar$POS)
    first_meth_position <- min(start(BSobj2))
    last_meth_position <- max(start(BSobj2))
    window_size <- 10000
    # Find first methylation index with SNP coverage
    first_meth_index_with_SNP_coverage <- which(start(BSobj2) >= (first_SNP_position - window_size) & 
                                                 start(BSobj2) <= (last_SNP_position + window_size))[1]
    # Correcting the approach to find the last methylation index with SNP coverage
    last_meth_index_with_SNP_coverage <- which(start(BSobj2) >= (first_SNP_position - window_size) & 
                                                start(BSobj2) <= (last_SNP_position + window_size))
    if(length(last_meth_index_with_SNP_coverage) > 0) {
      last_meth_index_with_SNP_coverage <- last_meth_index_with_SNP_coverage[length(last_meth_index_with_SNP_coverage)]
    } else {
      last_meth_index_with_SNP_coverage <- NA # In case no indices are found
    }
    
    # Values for the first and last methylation sites with SNP coverage
    if(!is.na(first_meth_index_with_SNP_coverage)) {
      first_meth_value_with_SNP_coverage <- start(BSobj2)[first_meth_index_with_SNP_coverage]
    } else {
      first_meth_value_with_SNP_coverage <- NA # In case no first index is found
    }
    
    if(!is.na(last_meth_index_with_SNP_coverage)) {
      last_meth_value_with_SNP_coverage <- start(BSobj2)[last_meth_index_with_SNP_coverage]
    } else {
      last_meth_value_with_SNP_coverage <- NA # In case no last index is found
    }
    

    
    BSobj2 <- NULL
    
    long_format$first_meth_index_with_SNP_coverage[i] <- first_meth_index_with_SNP_coverage
    long_format$last_meth_index_with_SNP_coverage[i] <- last_meth_index_with_SNP_coverage
    long_format$first_meth_value_with_SNP_coverage[i] <- first_meth_value_with_SNP_coverage
    long_format$last_meth_value_with_SNP_coverage[i] <- last_meth_value_with_SNP_coverage
    
    iter_end_time <- Sys.time()
    elapsed_time_per_iter <- iter_end_time - iter_start_time
    total_elapsed_time <- Sys.time() - start_time
    average_time_per_iter <- total_elapsed_time / i
    estimated_total_time <- average_time_per_iter * nrow(long_format)
    estimated_time_remaining <- estimated_total_time - total_elapsed_time

    print(sprintf("Ran %d of %d so far. Estimated time remaining: %s minutes\n", 
                i, nrow(long_format), round(as.numeric(estimated_time_remaining) / 60, 2)))

}

[1] 1
[1] "Ran 1 of 198 so far. Estimated time remaining: 181.79 minutes\n"
[1] 2
[1] "Ran 2 of 198 so far. Estimated time remaining: 2.97 minutes\n"
[1] 3
[1] "Ran 3 of 198 so far. Estimated time remaining: 2.71 minutes\n"
[1] 4
[1] "Ran 4 of 198 so far. Estimated time remaining: 2.6 minutes\n"
[1] 5
[1] "Ran 5 of 198 so far. Estimated time remaining: 2.55 minutes\n"
[1] 6
[1] "Ran 6 of 198 so far. Estimated time remaining: 2.53 minutes\n"
[1] 7
[1] "Ran 7 of 198 so far. Estimated time remaining: 2.51 minutes\n"
[1] 8
[1] "Ran 8 of 198 so far. Estimated time remaining: 2.44 minutes\n"
[1] 9
[1] "Ran 9 of 198 so far. Estimated time remaining: 2.35 minutes\n"
[1] 10
[1] "Ran 10 of 198 so far. Estimated time remaining: 2.32 minutes\n"
[1] 11
[1] "Ran 11 of 198 so far. Estimated time remaining: 2.29 minutes\n"
[1] 12
[1] "Ran 12 of 198 so far. Estimated time remaining: 2.26 minutes\n"
[1] 13
[1] "Ran 13 of 198 so far. Estimated time remaining: 2.18 minutes\n"
[1] 14
[1] "Ran 14 of 198 so 

In [29]:
fwrite(long_format, "09-OUT_matched_files_and_indices_to_test.csv")

Started around 2pm

## Bring in all covariate files

### Get paths

In [30]:
genotype_pc <- list.files("/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs",
                          recursive = TRUE,
                          full.names = TRUE,
                          pattern = "pca.eigenvec")

In [35]:
genotype_pc <- data.frame(genotype_pc_path = genotype_pc, brain_region = c("caud", "dlpfc", "hippo"))

In [48]:
genotype_pc <- rbind(genotype_pc, genotype_pc, genotype_pc)

In [50]:
genotype_pc$subpopulation <- c("EA", "EA", "EA", "AA", "AA", "AA", "all", "all", "all")

In [51]:
genotype_pc

genotype_pc_path,brain_region,subpopulation
<chr>,<chr>,<chr>
/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/caudate/pca.eigenvec,caud,EA
/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/dlpfc/pca.eigenvec,dlpfc,EA
/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/hippo/pca.eigenvec,hippo,EA
/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/caudate/pca.eigenvec,caud,AA
/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/dlpfc/pca.eigenvec,dlpfc,AA
/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/hippo/pca.eigenvec,hippo,AA
/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/caudate/pca.eigenvec,caud,all
/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/dlpfc/pca.eigenvec,dlpfc,all
/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/hippo/pca.eigenvec,hippo,all


In [52]:
meth_pc <- list.files("/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/",
                      recursive = TRUE,
                      full.names = TRUE,
                      pattern = "pc_")

In [53]:
meth_pc <- data.table(meth_pc_path = meth_pc)

In [54]:
meth_pc[, subpopulation := sub(".*pc_(.*)\\.csv", "\\1", meth_pc_path)]
meth_pc[, brain_region := sub(".*/pheno//(.*)/out/.*", "\\1", meth_pc_path)]

In [55]:
meth_pc

meth_pc_path,subpopulation,brain_region
<chr>,<chr>,<chr>
/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//caud/out/pc_AA.csv,AA,caud
/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//caud/out/pc_all.csv,all,caud
/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//caud/out/pc_EA.csv,EA,caud
/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//dlpfc/out/pc_AA.csv,AA,dlpfc
/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//dlpfc/out/pc_all.csv,all,dlpfc
/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//dlpfc/out/pc_EA.csv,EA,dlpfc
/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//hippo/out/pc_AA.csv,AA,hippo
/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//hippo/out/pc_all.csv,all,hippo
/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//hippo/out/pc_EA.csv,EA,hippo


In [56]:
cov3 <- "/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC"

### Format, merge with data frame

In [57]:
long_format[, subpopulation := sub(".*_(.*)\\.rda", "\\1", methylation_data)]
long_format[, brain_region := sub(".*/pheno/(.*)/out/.*", "\\1", methylation_data)]

In [58]:
merged <- merge(long_format, genotype_pc)

In [59]:
merged <- merge(merged, meth_pc)

In [60]:
merged$pheno_pc <- cov3

In [61]:
head(merged)

subpopulation,brain_region,Chr,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,genotype_pc_path,meth_pc_path,pheno_pc
<chr>,<chr>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>
AA,caud,1,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr1.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_AA.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/caudate/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//caud/out/pc_AA.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
AA,caud,2,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr2.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr2_AA.rda,241863783,10001,2019984,1,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/caudate/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//caud/out/pc_AA.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
AA,caud,3,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr3.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr3_AA.rda,198099789,11602,1538467,1,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/caudate/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//caud/out/pc_AA.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
AA,caud,4,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr4.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr4_AA.rda,189877411,69399,1387731,1,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/caudate/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//caud/out/pc_AA.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
AA,caud,5,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr5.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr5_AA.rda,181172584,44104,1409038,1,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/caudate/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//caud/out/pc_AA.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
AA,caud,6,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas/libd_chr6.pgen,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr6_AA.rda,170619093,192453,1412543,1138,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/caudate/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//caud/out/pc_AA.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC


## Unite covariate files for each row into a single file

In [111]:
outdir <- "/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/"
#dir.create(outdir)

### test for first row

In [102]:
colnames(merged)

In [103]:
dim(merged)

In [107]:
merged_pc_only <- unique(merged[, c(1, 2, 10:12)])

In [108]:
merged_pc_only

subpopulation,brain_region,genotype_pc_path,meth_pc_path,pheno_pc
<chr>,<chr>,<chr>,<chr>,<chr>
AA,caud,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/caudate/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//caud/out/pc_AA.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
AA,dlpfc,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/dlpfc/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//dlpfc/out/pc_AA.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
AA,hippo,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/hippo/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//hippo/out/pc_AA.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
EA,caud,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/caudate/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//caud/out/pc_EA.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
EA,dlpfc,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/dlpfc/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//dlpfc/out/pc_EA.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
EA,hippo,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/hippo/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//hippo/out/pc_EA.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
all,caud,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/caudate/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//caud/out/pc_all.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
all,dlpfc,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/dlpfc/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//dlpfc/out/pc_all.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
all,hippo,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/hippo/pca.eigenvec,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//hippo/out/pc_all.csv,/dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC


In [63]:
i <- 1

In [99]:
geno_pc <- fread(merged$genotype_pc_path[i])
meth_pc <- fread(merged$meth_pc_path[i])
pheno_pc <- fread(merged$pheno_pc[i])

meth_pc <- meth_pc[, 1:11]

geno_pc <- geno_pc[, 1:4]

pheno_pc <- pheno_pc[, c(2, 6, 7, 8)]

colnames(meth_pc)[1] <- colnames(pheno_pc)[1] <- colnames(geno_pc)[1] <- "ID"

colnames(meth_pc)[2:11] <- paste0("methPC", 1:10)

colnames(geno_pc)[2:4] <- paste0("genoPC", 1:3)

geno_pc$ID <- gsub("Br0", "Br", geno_pc$ID)
pheno_pc$ID <- gsub("Br0", "Br", pheno_pc$ID)
meth_pc$ID <- gsub("Br0", "Br", meth_pc$ID)

#dim(meth_pc)
#dim(geno_pc)
#dim(pheno_pc)

unified_pc <- merge(geno_pc, pheno_pc)
#dim(unified_pc)
unified_pc <- merge(unified_pc, meth_pc)
#dim(unified_pc)

if(nrow(unified_pc) != nrow(meth_pc)){
    stop(paste0("We have a mismatch for row ", i))
    }

In [112]:
outname <- paste0(outdir, merged$subpopulation[i], "_", merged$brain_region[i], ".csv")
fwrite(merged_pc_only, outname)

In [128]:
for(i in 1:nrow(merged_pc_only)){
    geno_pc <- fread(merged_pc_only$genotype_pc_path[i])
    meth_pc <- fread(merged_pc_only$meth_pc_path[i])
    pheno_pc <- fread(merged_pc_only$pheno_pc[i])

    meth_pc <- meth_pc[, 1:11]

    geno_pc <- geno_pc[, 1:4]

    pheno_pc <- pheno_pc[, c(2, 6, 7, 8)]

    colnames(meth_pc)[1] <- colnames(pheno_pc)[1] <- colnames(geno_pc)[1] <- "ID"

    colnames(meth_pc)[2:11] <- paste0("methPC", 1:10)

    colnames(geno_pc)[2:4] <- paste0("genoPC", 1:3)

    geno_pc$ID <- gsub("Br0", "Br", geno_pc$ID)
    pheno_pc$ID <- gsub("Br0", "Br", pheno_pc$ID)
    pheno_pc$Dx <- gsub("chizo", "CZ", pheno_pc$Dx)
    meth_pc$ID <- gsub("Br0", "Br", meth_pc$ID)

    #dim(meth_pc)
    #dim(geno_pc)
    #dim(pheno_pc)

    unified_pc <- merge(geno_pc, meth_pc)
    #dim(unified_pc)
    unified_pc <- merge(unified_pc, pheno_pc)
    #dim(unified_pc)

    if(nrow(unified_pc) != nrow(meth_pc)){
        print(setdiff(meth_pc$ID, unified_pc$ID))
        warning(paste0("We have a mismatch for row ", i))
        print(merged_pc_only[i, ])
        }
    outname <- paste0(outdir, merged_pc_only$subpopulation[i], "_", merged_pc_only$brain_region[i], ".csv")
    fwrite(unified_pc, outname)
    
    }


[1] "Br2267"


“We have a mismatch for row 2”


   subpopulation brain_region
1:            AA        dlpfc
                                                                                                                       genotype_pc_path
1: /dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/dlpfc/pca.eigenvec
                                                              meth_pc_path
1: /dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//dlpfc/out/pc_AA.csv
                                                                               pheno_pc
1: /dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
[1] "Br2267"


“We have a mismatch for row 3”


   subpopulation brain_region
1:            AA        hippo
                                                                                                                       genotype_pc_path
1: /dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/hippo/pca.eigenvec
                                                              meth_pc_path
1: /dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//hippo/out/pc_AA.csv
                                                                               pheno_pc
1: /dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
[1] "Br2267"


“We have a mismatch for row 8”


   subpopulation brain_region
1:           all        dlpfc
                                                                                                                       genotype_pc_path
1: /dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/dlpfc/pca.eigenvec
                                                               meth_pc_path
1: /dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//dlpfc/out/pc_all.csv
                                                                               pheno_pc
1: /dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC
[1] "Br2267"


“We have a mismatch for row 9”


   subpopulation brain_region
1:           all        hippo
                                                                                                                       genotype_pc_path
1: /dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/topmed/merge_H650_1M_2.5M_5M/EA_AA/all/pca_wgbs/hippo/pca.eigenvec
                                                               meth_pc_path
1: /dcs04/lieber/statsgen/shizhong/michael/mwas/pheno//hippo/out/pc_all.csv
                                                                               pheno_pc
1: /dcs04/lieber/statsgen/shizhong/database/libd/genotype/postmortem/phenotype/pheno_PC


In [129]:
outname

In [130]:
merged$full_covariate_path <-
paste0(outdir, merged$subpopulation, "_", merged$brain_region, ".csv")

In [131]:
fread("/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_caud.csv")

ID,genoPC1,genoPC2,genoPC3,methPC1,methPC2,methPC3,methPC4,methPC5,methPC6,methPC7,methPC8,methPC9,methPC10,Dx,Age,Sex
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>
Br1003,0.0591231,-0.039807700,0.02053080,-157.04647,22.431504,-3.613153,28.587802,-165.830789,15.4914042,27.347098,68.1463463,-83.144721,74.405523,Control,52.57,F
Br1004,0.0507512,0.016700300,-0.02350190,-207.97793,14.563873,-48.109417,-48.368597,65.479538,-174.4362052,-40.363495,-103.8133780,11.087242,-84.670514,Control,21.01,M
Br1007,0.0683202,0.005023590,-0.05958990,119.29338,-184.287037,-127.987516,8.047659,100.518214,112.4048122,-44.079714,-13.3318373,109.550591,55.891996,Control,57.10,M
Br1017,0.0685061,-0.008443570,-0.01093240,148.35140,-89.715577,-97.742602,7.072870,-78.404717,77.0296173,-33.334885,-34.9482041,73.886328,65.622516,Control,48.42,F
Br1021,0.0596699,0.003678410,0.04795520,453.47100,-132.387141,35.349695,80.759933,123.878599,-39.4338123,-122.191818,64.6271242,-62.891067,-25.939892,SCZ,63.18,M
Br1030,0.0166234,0.037136000,-0.11683800,-212.50562,-3.017152,-46.171785,-91.779986,-28.953074,-0.2483534,63.463302,10.9460093,62.956351,4.060549,SCZ,49.65,F
Br1039,0.0372755,0.000920873,0.04261920,-103.04318,-97.394092,6.394317,9.246365,-153.099368,-57.4199032,-21.466761,61.2001242,4.913690,18.458765,Control,51.45,F
Br1040,0.0418527,-0.041008900,-0.00594927,-651.74144,243.329728,-234.910289,-21.365701,137.855050,-107.1335278,-9.355704,-32.7974052,51.679802,-82.487019,Control,20.93,M
Br1050,0.0667423,-0.029448500,-0.06359590,-346.35535,148.209663,-322.522824,-64.676761,121.759943,-70.2543686,2.514280,30.8563428,-19.582689,-27.813140,SCZ,41.25,M
Br1053,0.0545700,0.010893500,-0.00503420,14.25498,-218.737504,-116.037699,-46.148580,-61.733453,-40.1747788,-36.466078,-77.0255876,22.668648,28.637638,Control,40.18,F
