# Match up SNP, methylation and covariate files to be analyzed together

As part of this, we will determine which methylation sites have coverage in the SNP data and should be analyzed.

## Match SNP and methylation files

In [1]:
library(stringr)

In [2]:
library(data.table)

In [None]:
# Collect the .pgen genotype files and pair the "libd" and "ref_EUR" files by
# chromosome number.
SNP_files <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/",
                        #"/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas",
                        pattern = "pgen", full.names = TRUE)
# sort() instead of SNP_files[ordered(SNP_files)]: indexing by an ordered
# factor uses its integer codes (the rank of each element), which only yields
# a sorted vector when the input is already sorted.
SNP_files <- sort(SNP_files)

# Split by cohort tag found in the path.
SNP_files_libd <- data.frame(SNPs_libd = SNP_files[grepl("libd", SNP_files)])
SNP_files_ref_EUR <- data.frame(SNPs_ref_EUR = SNP_files[grepl("ref_EUR", SNP_files)])

# Chromosome number = the digits immediately following "chr" in the path.
SNP_files_libd$Chr <- as.numeric(stringr::str_extract(SNP_files_libd$SNPs_libd, "(?<=chr)\\d+"))
SNP_files_ref_EUR$Chr <- as.numeric(stringr::str_extract(SNP_files_ref_EUR$SNPs_ref_EUR, "(?<=chr)\\d+"))

# One row per chromosome present in both sets; "Chr" is the only shared column,
# so naming it explicitly documents the join key without changing the result.
SNP_files <- merge(SNP_files_libd, SNP_files_ref_EUR, by = "Chr")

In [None]:
# Every file under the pheno directory (searched recursively) whose name
# matches "rda", returned with full paths.
meth_files <- list.files(
  path = "/expanse/lustre/projects/jhu152/naglemi/mwas/pheno",
  pattern = "rda",
  full.names = TRUE,
  recursive = TRUE
)

We will use the original dlpfc and hippo methylation files, together with the caudate files that were reprocessed so that each file stores its data directly (previously the caudate files contained only h5 pointers).

In [None]:
# Caudate paths are collected separately (see the list.files() call on the
# pheno/caud directory), so only dlpfc and hippo are split out of meth_files.
meth_dlpfc <- data.frame(meth_dlpfc = grep("dlpfc", meth_files, value = TRUE))
meth_hippo <- data.frame(meth_hippo = grep("hippo", meth_files, value = TRUE))

In [None]:
# Caudate methylation files: list everything matching "rda" under pheno/caud,
# discard any path containing "pointers", and keep the paths containing "caud".
meth_caud_files <- list.files(
  path = "/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud",
  pattern = "rda",
  full.names = TRUE,
  recursive = TRUE
)
meth_caud_files <- grep("pointers", meth_caud_files, value = TRUE, invert = TRUE)
meth_caud <- data.frame(meth_caud = grep("caud", meth_caud_files, value = TRUE))

In [None]:
dim(meth_caud)

In [None]:
head(meth_dlpfc)

In [None]:
head(meth_caud)

In [None]:
# Chromosome number = the digits sitting between "chr" and the next "_" in
# each methylation file path (NA when the path has no such token).
chr_number_regex <- "(?<=chr)\\d+(?=_)"
meth_caud$Chr <- as.numeric(str_extract(meth_caud$meth_caud, chr_number_regex))
meth_dlpfc$Chr <- as.numeric(str_extract(meth_dlpfc$meth_dlpfc, chr_number_regex))
meth_hippo$Chr <- as.numeric(str_extract(meth_hippo$meth_hippo, chr_number_regex))

In [None]:
# Turn a long table of methylation file paths into one row per chromosome.
#
# Arguments:
#   data   - data.frame whose FIRST column holds file paths of the form
#            ".../<region>/out/chr<N>_<AA|EA|all>.rda" and which has a "Chr"
#            column giving the chromosome of each path.
#   region - brain-region directory name used in the paths (e.g. "caud").
#
# Returns a data.frame with a "Chr" column followed by one path column per
# subpopulation found, named "<region>_AA", "<region>_EA", "<region>_all".
#
# Paths that do not follow the expected layout are dropped with a warning.
# Previously such a path was copied verbatim into `Type` (gsub() returns its
# input unchanged when nothing matches) and produced a junk wide column.
reshape_and_rename_corrected <- function(data, region) {
  paths <- data[[1]]

  # Anchored at the end so that e.g. "chr1_AA.rda.bak" is not accepted.
  layout <- paste0(".*", region, "/out/chr\\d+_(AA|EA|all)\\.rda$")
  matches_layout <- grepl(layout, paths)

  if (!all(matches_layout)) {
    warning(
      sprintf(
        "Dropping %d path(s) that do not match '%s/out/chr<N>_<AA|EA|all>.rda': %s",
        sum(!matches_layout), region,
        paste(paths[!matches_layout], collapse = ", ")
      ),
      call. = FALSE
    )
    data <- data[matches_layout, , drop = FALSE]
    paths <- paths[matches_layout]
  }

  # Subpopulation label (AA, EA or all) taken from the file name.
  data$Type <- gsub(layout, "\\1", paths)

  # One row per chromosome, one path column per subpopulation.
  wide_data <- reshape(data, idvar = "Chr", timevar = "Type", direction = "wide")

  # reshape() names the new columns "<original column>.<Type>"; keep only the
  # subpopulation suffix and prefix it with the region.
  colnames(wide_data)[-1] <- vapply(
    colnames(wide_data)[-1],
    function(x) paste(region, gsub(".*\\.(AA|EA|all)$", "\\1", x), sep = "_"),
    character(1),
    USE.NAMES = FALSE
  )

  wide_data
}

# Build one wide table (a row per chromosome) for each brain region.
meth_caud_wide <- reshape_and_rename_corrected(data = meth_caud, region = "caud")
meth_dlpfc_wide <- reshape_and_rename_corrected(data = meth_dlpfc, region = "dlpfc")
meth_hippo_wide <- reshape_and_rename_corrected(data = meth_hippo, region = "hippo")

# Outer-join the three tables on chromosome; all = TRUE keeps a chromosome
# even when some region has no file for it (those cells become NA).
merged_data <- merge(
  merge(meth_caud_wide, meth_dlpfc_wide, by = "Chr", all = TRUE),
  meth_hippo_wide,
  by = "Chr", all = TRUE
)


In [None]:
head(merged_data)

In [None]:
head(SNP_files)

In [None]:
# Inner join of the SNP file table and the methylation file table on their
# only shared column, the chromosome number.
merged <- merge(x = SNP_files, y = merged_data, by = "Chr")

In [None]:
data.table::fwrite(merged, "09-OUT_matched_SNP_meth_files.csv")

## Determine first, last position for each SNP, methylation file pair

In [None]:
merged$SNPs_ref_EUR <- NULL

In [None]:
head(merged)

In [None]:
# First attempt at building the long table of SNP/methylation file pairs.
# NOTE(review): the next cell rebuilds `long_format` from scratch, so the table
# produced here is overwritten; only the setDT(merged) call below has a lasting
# effect (the next cell uses data.table syntax on `merged`).
library(data.table)

# Convert `merged` to a data.table by reference (a no-op if it already is one).
setDT(merged)

# Melt from wide to long, keeping "Chr" as the identifier. Two measure groups
# are requested: the SNP column and the nine methylation columns.
# NOTE(review): the two groups have different numbers of columns (1 vs 9), so
# the SNP_data column is presumably padded with NA after the first level --
# confirm before relying on this cell.
long_format <- melt(
  merged,
  id.vars = "Chr",
  measure.vars = patterns("^SNPs_libd", "caud_.*|dlpfc_.*|hippo_.*"),
  value.name = c("SNP_data", "methylation_data"),
  variable.name = "type"
)

# Relabel `type` as "pgen" when it contains "SNPs_libd", otherwise "rda".
# NOTE(review): with a multi-group melt `type` may hold group indices rather
# than column names, in which case every row becomes "rda" -- verify.
long_format[, type := ifelse(grepl("SNPs_libd", type), "pgen", "rda")]

# Inspect the first rows.
head(long_format)

In [None]:
library(data.table)

# Build one row per (chromosome, methylation file) with the matching SNP file.
# `merged` must already be a data.table here; run setDT(merged) first if not.

# Names of the wide methylation path columns (caud_*, dlpfc_*, hippo_*).
methylation_columns <- grep("caud_|dlpfc_|hippo_", names(merged), value = TRUE)

# Stack those columns: each row now carries one methylation file path.
long_format <- melt(
  merged,
  id.vars = "Chr",
  measure.vars = methylation_columns,
  variable.name = "Methylation_Type",
  value.name = "methylation_data"
)

# Look up each row's SNP file by joining back to `merged` on chromosome.
long_format[, SNP_data := merged[.SD, on = "Chr", SNPs_libd]]

# The source column name is no longer needed; put the columns in final order.
long_format[, `Methylation_Type` := NULL]
setcolorder(long_format, c("Chr", "SNP_data", "methylation_data"))

head(long_format)


In [None]:
library(bsseq)

### Test on first row

In [None]:
dim(long_format)
head(long_format)
Sys.time()

In [None]:
i <- 1

In [None]:
# Dry run of the per-pair computation on row `i` of long_format.

# The variant table sits next to the .pgen file, with "pgen" replaced by "pvar"
# in the path; only its first three columns are kept.
pvar_path <- gsub("pgen", "pvar", long_format$SNP_data[i])
pvar <- fread(pvar_path)[, 1:3]

# load() is expected to create `BSobj2` (the contents of the .rda file are not
# visible from this notebook).
load(long_format$methylation_data[i])

window_size <- 10000

# Extremes of the SNP positions and of the methylation start positions.
first_SNP_position <- min(pvar$POS)
last_SNP_position <- max(pvar$POS)
first_meth_position <- min(start(BSobj2))
last_meth_position <- max(start(BSobj2))

# Indices of methylation sites whose start lies inside the SNP range widened
# by window_size on both sides.
at_or_after_lower_bound <- start(BSobj2) >= (first_SNP_position - window_size)
at_or_before_upper_bound <- start(BSobj2) <= (last_SNP_position + window_size)
meth_indices_w_SNP_coverage <- which(at_or_after_lower_bound & at_or_before_upper_bound)

first_meth_index_with_SNP_coverage <- min(meth_indices_w_SNP_coverage)
last_meth_index_with_SNP_coverage <- max(meth_indices_w_SNP_coverage)

# Start positions found at those two indices.
first_meth_value_with_SNP_coverage <- start(BSobj2)[first_meth_index_with_SNP_coverage]
last_meth_value_with_SNP_coverage <- start(BSobj2)[last_meth_index_with_SNP_coverage]

In [None]:
first_meth_index_with_SNP_coverage

In [None]:
first_SNP_position

In [None]:
max(pvar$POS)

In [None]:
# Abort unless the methylation start positions are already in ascending order
# (presumably because the first/last indices are later used as a contiguous
# range -- confirm against the downstream consumer).
meth_start_positions <- start(BSobj2)
if (!identical(meth_start_positions, sort(meth_start_positions))) {
  stop("start positions in BSobj2 are not ordered")
}

# Drop the reference to the loaded object.
BSobj2 <- NULL

In [None]:
Sys.time()

### Sanity checks

In [None]:
first_meth_value_with_SNP_coverage

In [None]:
last_meth_value_with_SNP_coverage

In [None]:
first_meth_index_with_SNP_coverage

In [None]:
last_meth_index_with_SNP_coverage

### Deploy

In [None]:
i

In [None]:
BSobj2

In [None]:
# For every SNP/methylation file pair, record the range of SNP positions, the
# range of methylation start positions, and the first/last methylation site
# (index and position) lying inside the SNP range widened by `window_size`.

# New result columns, created in this order so the table (and any CSV written
# from it) keeps the column layout the original right-to-left chained
# assignment produced.
result_columns <- c(
  "last_meth_value_with_SNP_coverage",
  "first_meth_value_with_SNP_coverage",
  "last_meth_index_with_SNP_coverage",
  "first_meth_index_with_SNP_coverage",
  "last_meth_position",
  "first_meth_position",
  "last_meth_index",
  "last_snp_position",
  "first_snp_position"
)
long_format[, (result_columns) := NA]

window_size <- 10000
n_pairs <- nrow(long_format)
start_time <- Sys.time()

# seq_len() so that an empty table runs zero iterations (1:0 would run two).
for (i in seq_len(n_pairs)) {
    print(i)

    # Variant table: same path as the .pgen file with "pgen" replaced by
    # "pvar"; only the first three columns are kept.
    pvar <- fread(gsub("pgen", "pvar", long_format$SNP_data[i]))[, 1:3]

    # load() is expected to create `BSobj2`. Because BSobj2 is reset to NULL
    # below, a file that does not define it makes start() fail instead of
    # silently reusing the previous iteration's object.
    load(long_format$methylation_data[i])

    # Extract the start positions once and release the large object early.
    meth_starts <- start(BSobj2)
    BSobj2 <- NULL

    if (!identical(meth_starts, sort(meth_starts))) {
      stop("start positions in BSobj2 are not ordered")
    }

    first_SNP_position <- min(pvar$POS)
    last_SNP_position <- max(pvar$POS)
    first_meth_position <- min(meth_starts)
    last_meth_position <- max(meth_starts)

    # Methylation sites whose start lies within window_size of the SNP range.
    meth_indices_w_SNP_coverage <- which(meth_starts >= (first_SNP_position - window_size) &
                                         meth_starts <= (last_SNP_position + window_size))

    if (length(meth_indices_w_SNP_coverage) == 0L) {
      # min()/max() of an empty set would give Inf/-Inf; store NA instead.
      warning(sprintf("No methylation sites with SNP coverage for row %d (%s)",
                      i, long_format$methylation_data[i]),
              call. = FALSE)
      first_meth_index_with_SNP_coverage <- NA_integer_
      last_meth_index_with_SNP_coverage <- NA_integer_
    } else {
      first_meth_index_with_SNP_coverage <- min(meth_indices_w_SNP_coverage)
      last_meth_index_with_SNP_coverage <- max(meth_indices_w_SNP_coverage)
    }

    # Indexing with NA yields NA, so this also covers the no-coverage case.
    first_meth_value_with_SNP_coverage <- meth_starts[first_meth_index_with_SNP_coverage]
    last_meth_value_with_SNP_coverage <- meth_starts[last_meth_index_with_SNP_coverage]

    long_format$first_snp_position[i] <- first_SNP_position
    long_format$last_snp_position[i] <- last_SNP_position
    long_format$last_meth_index[i] <- length(meth_starts)
    long_format$first_meth_position[i] <- first_meth_position
    long_format$last_meth_position[i] <- last_meth_position

    long_format$first_meth_index_with_SNP_coverage[i] <- first_meth_index_with_SNP_coverage
    long_format$last_meth_index_with_SNP_coverage[i] <- last_meth_index_with_SNP_coverage
    long_format$first_meth_value_with_SNP_coverage[i] <- first_meth_value_with_SNP_coverage
    long_format$last_meth_value_with_SNP_coverage[i] <- last_meth_value_with_SNP_coverage

    # Progress report. The elapsed time is taken explicitly in seconds:
    # `Sys.time() - start_time` picks its own units (secs, mins, hours), so
    # dividing its bare numeric value by 60 gave a wrong estimate once the
    # run passed one minute.
    elapsed_secs <- as.numeric(difftime(Sys.time(), start_time, units = "secs"))
    remaining_secs <- (elapsed_secs / i) * (n_pairs - i)

    cat(sprintf("Ran %d of %d so far. Estimated time remaining: %.2f minutes\n",
                i, n_pairs, remaining_secs / 60))

}

In [None]:
# Save the file pairs together with the position/index columns computed above.
fwrite(long_format, "09-OUT_matched_files_and_indices_to_test_a3.csv")

In [8]:
# Read the "_a2" version of the index table, which the covariate section
# below is built on.
# NOTE(review): nothing visible in this notebook writes the "_a2" file (the
# loop output is saved as "_a3") -- confirm which version is intended.
long_format_a2 <- fread("09-OUT_matched_files_and_indices_to_test_a2.csv")

In [None]:
# Reload the "_a3" index table written above into `long_format`.
long_format <- fread("09-OUT_matched_files_and_indices_to_test_a3.csv")

In [9]:
head(long_format_a2)

Chr,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage
<int>,<chr>,<chr>,<int>,<int>,<int>,<int>
1,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr1.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_AA.rda,248918358,1069461,2202702,8982
2,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr2.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr2_AA.rda,241863783,10001,2019984,1
3,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr3.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr3_AA.rda,198099789,11602,1538467,1
4,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr4.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr4_AA.rda,189877411,69399,1387731,1
5,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr5.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr5_AA.rda,181172584,44104,1409038,1
6,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr6.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr6_AA.rda,170619093,192453,1412543,1138


In [None]:
head(long_format)

Started around 2pm

## Prepare covariate files

### Get paths

In [10]:
getwd()

In [11]:
# Genotype PCA eigenvector files, one per brain-region directory.
genotype_pc <- list.files("./09-IN_raw_cov/pca_wgbs",
                          recursive = TRUE,
                          full.names = TRUE,
                          pattern = "pca.eigenvec")

if (length(genotype_pc) == 0L) {
  stop("No pca.eigenvec files found under ./09-IN_raw_cov/pca_wgbs")
}

# Take the region label from each file's parent directory instead of assuming
# that list.files() returns exactly caudate, dlpfc, hippo in that order. The
# caudate directory is spelled out in full, so map it to the short label used
# everywhere else.
genotype_pc_region <- sub("^caudate$", "caud", basename(dirname(genotype_pc)))

# The same genotype PCA file serves every subpopulation of a region, so repeat
# the region rows once per subpopulation (EA block, then AA, then all).
subpopulations <- c("EA", "AA", "all")
genotype_pc <- data.frame(
  genotype_pc_path = rep(genotype_pc, times = length(subpopulations)),
  brain_region = rep(genotype_pc_region, times = length(subpopulations)),
  subpopulation = rep(subpopulations, each = length(genotype_pc))
)

genotype_pc

# Methylation PC files: any file under pheno/ whose name matches "pc_".
meth_pc <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/",
                      recursive = TRUE,
                      full.names = TRUE,
                      pattern = "pc_")

meth_pc <- data.table(meth_pc_path = meth_pc)

# Subpopulation = the text between "pc_" and ".csv" in the file name.
meth_pc[, subpopulation := sub(".*pc_(.*)\\.csv", "\\1", meth_pc_path)]
# Brain region = the directory between "pheno/" and "/out/". "/+" accepts one
# or more slashes, so this no longer depends on the doubled slash produced by
# the trailing "/" in the directory passed to list.files().
meth_pc[, brain_region := sub(".*/pheno/+([^/]+)/out/.*", "\\1", meth_pc_path)]

meth_pc

# Path to the phenotype/covariate table shared by every region and
# subpopulation.
cov3 <- "./09-IN_raw_cov/pheno_PC"

genotype_pc_path,brain_region,subpopulation
<chr>,<chr>,<chr>
./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,caud,EA
./09-IN_raw_cov/pca_wgbs/dlpfc/pca.eigenvec,dlpfc,EA
./09-IN_raw_cov/pca_wgbs/hippo/pca.eigenvec,hippo,EA
./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,caud,AA
./09-IN_raw_cov/pca_wgbs/dlpfc/pca.eigenvec,dlpfc,AA
./09-IN_raw_cov/pca_wgbs/hippo/pca.eigenvec,hippo,AA
./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,caud,all
./09-IN_raw_cov/pca_wgbs/dlpfc/pca.eigenvec,dlpfc,all
./09-IN_raw_cov/pca_wgbs/hippo/pca.eigenvec,hippo,all


meth_pc_path,subpopulation,brain_region
<chr>,<chr>,<chr>
/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//caud/out/pc_AA.csv,AA,caud
/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//caud/out/pc_all.csv,all,caud
/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//caud/out/pc_EA.csv,EA,caud
/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//dlpfc/out/pc_AA.csv,AA,dlpfc
/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//dlpfc/out/pc_all.csv,all,dlpfc
/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//dlpfc/out/pc_EA.csv,EA,dlpfc
/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//hippo/out/pc_AA.csv,AA,hippo
/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//hippo/out/pc_all.csv,all,hippo
/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//hippo/out/pc_EA.csv,EA,hippo


### Format, merge with data frame

In [6]:
genotype_pc

genotype_pc_path,brain_region,subpopulation
<chr>,<chr>,<chr>
./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,caud,EA
./09-IN_raw_cov/pca_wgbs/dlpfc/pca.eigenvec,dlpfc,EA
./09-IN_raw_cov/pca_wgbs/hippo/pca.eigenvec,hippo,EA
./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,caud,AA
./09-IN_raw_cov/pca_wgbs/dlpfc/pca.eigenvec,dlpfc,AA
./09-IN_raw_cov/pca_wgbs/hippo/pca.eigenvec,hippo,AA
./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,caud,all
./09-IN_raw_cov/pca_wgbs/dlpfc/pca.eigenvec,dlpfc,all
./09-IN_raw_cov/pca_wgbs/hippo/pca.eigenvec,hippo,all


In [12]:
# Label each file pair with the subpopulation and brain region encoded in its
# methylation file path.
long_format_a2[, subpopulation := sub(".*_(.*)\\.rda", "\\1", methylation_data)]
long_format_a2[, brain_region := sub(".*/pheno/(.*)/out/.*", "\\1", methylation_data)]

# Attach the genotype-PC and methylation-PC paths; both joins use the pair
# (subpopulation, brain_region), which is the only shared set of columns.
covariate_keys <- c("subpopulation", "brain_region")
merged <- merge(long_format_a2, genotype_pc, by = covariate_keys)
merged <- merge(merged, meth_pc, by = covariate_keys)

# Every row uses the same phenotype table.
merged$pheno_pc <- cov3

head(merged)

subpopulation,brain_region,Chr,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,genotype_pc_path,meth_pc_path,pheno_pc
<chr>,<chr>,<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>
AA,caud,1,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr1.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_AA.rda,248918358,1069461,2202702,8982,./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//caud/out/pc_AA.csv,./09-IN_raw_cov/pheno_PC
AA,caud,2,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr2.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr2_AA.rda,241863783,10001,2019984,1,./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//caud/out/pc_AA.csv,./09-IN_raw_cov/pheno_PC
AA,caud,3,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr3.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr3_AA.rda,198099789,11602,1538467,1,./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//caud/out/pc_AA.csv,./09-IN_raw_cov/pheno_PC
AA,caud,4,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr4.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr4_AA.rda,189877411,69399,1387731,1,./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//caud/out/pc_AA.csv,./09-IN_raw_cov/pheno_PC
AA,caud,5,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr5.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr5_AA.rda,181172584,44104,1409038,1,./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//caud/out/pc_AA.csv,./09-IN_raw_cov/pheno_PC
AA,caud,6,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr6.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr6_AA.rda,170619093,192453,1412543,1138,./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//caud/out/pc_AA.csv,./09-IN_raw_cov/pheno_PC


### Unite covariate files for each row into a single file

In [14]:
# Directory that receives one unified covariate file per subpopulation/region.
outdir <- "/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/"
# recursive = TRUE also creates missing parent directories; showWarnings =
# FALSE avoids the "already exists" warning when the notebook is re-run.
dir.create(outdir, recursive = TRUE, showWarnings = FALSE)

“'/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2' already exists”


#### test for first row

In [15]:
list.files("09-IN_raw_cov/")

In [16]:
getwd()

In [18]:
getwd()

In [21]:
dim(merged)

In [22]:
colnames(merged)

dim(merged)

# One row per (subpopulation, brain_region) with the three covariate paths.
# Columns are selected by name rather than by position (previously
# c(1, 2, 10:12)), so this keeps working when the index table carries a
# different number of columns.
pc_path_columns <- c("subpopulation", "brain_region",
                     "genotype_pc_path", "meth_pc_path", "pheno_pc")
merged_pc_only <- unique(merged[, pc_path_columns, with = FALSE])

merged_pc_only

# Load the three covariate tables for the first row as a test case.
# Note that this reassigns `meth_pc` from the path table to the PC table.
i <- 1

geno_pc <- fread(merged$genotype_pc_path[i])
meth_pc <- fread(merged$meth_pc_path[i])
pheno_pc <- fread(merged$pheno_pc[i])

subpopulation,brain_region,genotype_pc_path,meth_pc_path,pheno_pc
<chr>,<chr>,<chr>,<chr>,<chr>
AA,caud,./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//caud/out/pc_AA.csv,./09-IN_raw_cov/pheno_PC
AA,dlpfc,./09-IN_raw_cov/pca_wgbs/dlpfc/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//dlpfc/out/pc_AA.csv,./09-IN_raw_cov/pheno_PC
AA,hippo,./09-IN_raw_cov/pca_wgbs/hippo/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//hippo/out/pc_AA.csv,./09-IN_raw_cov/pheno_PC
EA,caud,./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//caud/out/pc_EA.csv,./09-IN_raw_cov/pheno_PC
EA,dlpfc,./09-IN_raw_cov/pca_wgbs/dlpfc/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//dlpfc/out/pc_EA.csv,./09-IN_raw_cov/pheno_PC
EA,hippo,./09-IN_raw_cov/pca_wgbs/hippo/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//hippo/out/pc_EA.csv,./09-IN_raw_cov/pheno_PC
all,caud,./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//caud/out/pc_all.csv,./09-IN_raw_cov/pheno_PC
all,dlpfc,./09-IN_raw_cov/pca_wgbs/dlpfc/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//dlpfc/out/pc_all.csv,./09-IN_raw_cov/pheno_PC
all,hippo,./09-IN_raw_cov/pca_wgbs/hippo/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//hippo/out/pc_all.csv,./09-IN_raw_cov/pheno_PC


In [23]:
head(meth_pc)

V1,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,⋯,PC155,PC156,PC157,PC158,PC159,PC160,PC161,PC162,PC163,PC164
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Br0836,-241.857694,248.31645,140.320737,-160.91751,156.428331,-110.79616,-60.9013,183.7617,-408.14089,⋯,-13.976762,3.908684,-10.092411,21.23771,-12.23128,7.476315,-2.035277,2.100343,-8.8610601,-6.383895e-12
Br0845,-41.411267,-64.86172,-117.108968,-83.72404,137.504573,-25.41571,23.27153,-63.73627,18.57172,⋯,55.960928,-42.75178,173.362824,-51.27975,66.12544,-111.439135,5.308468,-115.175447,-8.6996676,2.316774e-12
Br0848,39.343708,56.8834,7.310866,79.07612,10.244427,-195.76174,-185.21881,-57.84846,17.46396,⋯,-5.955369,-3.130994,-9.686513,55.25895,28.70295,14.777533,35.424069,-30.724631,0.2187726,1.550198e-12
Br0863,6.675588,-230.09471,108.663256,-107.04824,-23.159694,-40.0834,160.9488,-95.16256,-151.53536,⋯,-18.880785,-22.832672,15.538881,71.20401,37.46977,41.57543,58.967476,-13.443419,30.9792224,4.361282e-12
Br0914,34.804852,281.6715,3.382999,72.75223,2.032968,-45.25626,-19.53384,-142.00728,49.56451,⋯,-121.310756,-30.53599,1.959457,-83.31269,46.63677,-258.672378,7.448061,-60.832173,-5.9882406,-1.012897e-12
Br0948,413.585833,-263.70724,4.226894,89.5519,99.992375,49.8529,-83.63773,-21.61775,60.49754,⋯,-88.811028,-187.394044,108.157978,178.40993,134.28954,-67.72973,71.724768,47.731344,118.6766392,3.187018e-12


In [24]:
head(geno_pc)

#IID,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,⋯,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Br836,0.0602846,0.0265233,0.0749827,-0.000100959,0.0463538,-0.0530571,0.0567143,0.00108723,0.0749053,⋯,-0.0417743,0.0415679,0.0690779,0.00374099,-0.00756449,0.0748824,-0.0630698,-0.0688103,-0.0182203,0.0802361
Br845,0.0537185,-0.00767513,0.0878363,0.0438203,0.00781983,-0.0229062,0.0552191,-0.0181545,-0.155562,⋯,-0.0100759,-0.0497003,-0.0356334,-0.0481457,0.0723928,-0.0138146,0.000313164,-0.0421277,0.0265632,0.00989767
Br848,0.0681916,0.00476771,-0.00207638,-0.0347873,0.0378961,-0.0226657,0.00524405,-0.0454608,-0.00213017,⋯,0.0493435,-0.0268311,0.0381219,0.00696588,0.0282344,-0.0613519,0.0569654,0.057238,-0.0181342,0.00497252
Br863,0.0720192,-0.0158038,-0.010501,0.0130544,-0.0353109,0.0746978,0.0258611,0.0396825,0.0507868,⋯,0.0613879,-0.00378396,-0.045404,0.0963874,0.0052521,-0.0217901,-0.0671394,-0.0151709,-0.010421,-0.0878489
Br914,0.055049,0.000796254,-0.103114,-0.0316576,-0.00878951,0.127911,-0.0588257,-0.0575311,0.0934352,⋯,0.0229736,0.0620659,-0.0203713,0.00876626,0.0221627,0.0499541,-0.0514222,0.0536677,0.0694783,-0.0431241
Br948,0.0639954,0.00695405,-0.0100903,-0.0367064,0.0495357,0.0888598,0.0576344,0.0618725,0.0223113,⋯,0.0474799,-0.0277997,0.00218972,-0.00436651,-0.0326359,-0.0502489,0.0792942,0.0182645,0.0551862,-0.0142606


In [25]:
head(pheno_pc)

ID,BrNum,Batch,SNPnum,MissRate,Dx,Age,Sex,Race,PCArace,⋯,PC11,PC12,PC13,PC14,PC15,PC16,PC17,PC18,PC19,PC20
<chr>,<chr>,<chr>,<int>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
5421787087_R01C01,Br1602,1M,992741,0.0001158,Schizo,83.14,F,CAUC,CAUC,⋯,-0.003,-0.0012,0.0041,-0.0069,0.0012,-0.0028,0.0025,-0.0005,-0.0004,-0.0024
5421787087_R01C02,Br1203,1M,992741,0.000138,Schizo,24.33,M,CAUC,CAUC,⋯,-0.0055,0.0009,0.0004,-0.0013,-0.002,-0.0018,0.0013,-0.0039,0.0035,0.0022
4572348457_R01C02,Br1573,1M,992741,0.0005671,Schizo,57.58,M,AA,AA,⋯,0.0021,-0.0031,-0.0068,-0.0055,-0.0041,0.002,-0.0049,-0.0031,0.0044,0.0058
4572348844_R01C01,Br1214,1M,992741,0.007482,Control,61.13,M,CAUC,CAUC,⋯,-0.0045,-0.0024,0.003,0.0038,0.0036,-0.001,0.0016,-0.0031,0.008,0.0023
4572348844_R01C02,Br1276,1M,992741,0.007759,Control,24.25,M,AA,AA,⋯,0.0012,-0.0053,0.0005,0.0008,0.0003,0.0098,0.0071,0.0013,-0.0012,0.009
5532971095_R01C01,Br2147,1M,992741,0.0002216,Control,51.64823,M,HISP,,⋯,0.0016,-0.0018,-0.0091,0.0047,-0.0012,-0.0032,-0.005,-0.001,-0.0021,0.0058


In [26]:
# Keep the sample-ID column plus the first 10 methylation PCs.
meth_pc <- meth_pc[, 1:11, with = FALSE]

# Keep the sample-ID column plus the first 3 genotype PCs.
geno_pc <- geno_pc[, 1:4, with = FALSE]

# Columns 2, 6, 7 and 8 of the phenotype table, which the printed header shows
# as BrNum, Dx, Age and Sex.
pheno_pc <- pheno_pc[, c(2, 6, 7, 8), with = FALSE]

In [27]:
# Give the sample-ID column the same name in all three tables.
colnames(geno_pc)[1] <- "ID"
colnames(pheno_pc)[1] <- "ID"
colnames(meth_pc)[1] <- "ID"

colnames(meth_pc)[2:11] <- paste0("methPC", 1:10)

colnames(geno_pc)[2:4] <- paste0("genoPC", 1:3)

# Harmonise IDs across tables: "Br0836" in one file is "Br836" in another.
geno_pc$ID <- gsub("Br0", "Br", geno_pc$ID)
pheno_pc$ID <- gsub("Br0", "Br", pheno_pc$ID)
meth_pc$ID <- gsub("Br0", "Br", meth_pc$ID)

#dim(meth_pc)
#dim(geno_pc)
#dim(pheno_pc)

# Inner joins on the sample ID (the only shared column).
unified_pc <- merge(geno_pc, pheno_pc, by = "ID")
#dim(unified_pc)
unified_pc <- merge(unified_pc, meth_pc, by = "ID")
#dim(unified_pc)

# Every sample with methylation PCs should survive both joins.
if (nrow(unified_pc) != nrow(meth_pc)) {
    stop(paste0("We have a mismatch for row ", i))
}

outname <- paste0(outdir, merged$subpopulation[i], "_", merged$brain_region[i], ".csv")
# Write the unified covariates. Previously this wrote merged_pc_only (the
# table of file paths) into the covariate file.
fwrite(unified_pc, outname)

In [28]:
head(merged_pc_only)

subpopulation,brain_region,genotype_pc_path,meth_pc_path,pheno_pc
<chr>,<chr>,<chr>,<chr>,<chr>
AA,caud,./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//caud/out/pc_AA.csv,./09-IN_raw_cov/pheno_PC
AA,dlpfc,./09-IN_raw_cov/pca_wgbs/dlpfc/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//dlpfc/out/pc_AA.csv,./09-IN_raw_cov/pheno_PC
AA,hippo,./09-IN_raw_cov/pca_wgbs/hippo/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//hippo/out/pc_AA.csv,./09-IN_raw_cov/pheno_PC
EA,caud,./09-IN_raw_cov/pca_wgbs/caudate/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//caud/out/pc_EA.csv,./09-IN_raw_cov/pheno_PC
EA,dlpfc,./09-IN_raw_cov/pca_wgbs/dlpfc/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//dlpfc/out/pc_EA.csv,./09-IN_raw_cov/pheno_PC
EA,hippo,./09-IN_raw_cov/pca_wgbs/hippo/pca.eigenvec,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno//hippo/out/pc_EA.csv,./09-IN_raw_cov/pheno_PC


In [29]:
head(unified_pc)

ID,genoPC1,genoPC2,genoPC3,Dx,Age,Sex,methPC1,methPC2,methPC3,methPC4,methPC5,methPC6,methPC7,methPC8,methPC9,methPC10
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Br1003,0.0591231,-0.0398077,0.0205308,Control,52.57,F,-157.0465,22.431504,-3.613153,28.587802,-165.83079,15.4914042,27.3471,68.14635,-83.14472,74.405523
Br1004,0.0507512,0.0167003,-0.0235019,Control,21.01,M,-207.9779,14.563873,-48.109417,-48.368597,65.47954,-174.4362052,-40.36349,-103.81338,11.08724,-84.670514
Br1007,0.0683202,0.00502359,-0.0595899,Control,57.1,M,119.2934,-184.287037,-127.987516,8.047659,100.51821,112.4048122,-44.07971,-13.33184,109.55059,55.891996
Br1017,0.0685061,-0.00844357,-0.0109324,Control,48.42,F,148.3514,-89.715577,-97.742602,7.07287,-78.40472,77.0296173,-33.33488,-34.9482,73.88633,65.622516
Br1021,0.0596699,0.00367841,0.0479552,Schizo,63.18,M,453.471,-132.387141,35.349695,80.759933,123.8786,-39.4338123,-122.19182,64.62712,-62.89107,-25.939892
Br1030,0.0166234,0.037136,-0.116838,Schizo,49.65,F,-212.5056,-3.017152,-46.171785,-91.779986,-28.95307,-0.2483534,63.4633,10.94601,62.95635,4.060549


#### Run for all

In [30]:
dim(meth_pc)

In [31]:
dim(unified_pc)

In [32]:
unified_pc

ID,genoPC1,genoPC2,genoPC3,Dx,Age,Sex,methPC1,methPC2,methPC3,methPC4,methPC5,methPC6,methPC7,methPC8,methPC9,methPC10
<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Br1003,0.0591231,-0.039807700,0.02053080,Control,52.57,F,-157.04647,22.431504,-3.613153,28.587802,-165.830789,15.4914042,27.347098,68.1463463,-83.144721,74.405523
Br1004,0.0507512,0.016700300,-0.02350190,Control,21.01,M,-207.97793,14.563873,-48.109417,-48.368597,65.479538,-174.4362052,-40.363495,-103.8133780,11.087242,-84.670514
Br1007,0.0683202,0.005023590,-0.05958990,Control,57.10,M,119.29338,-184.287037,-127.987516,8.047659,100.518214,112.4048122,-44.079714,-13.3318373,109.550591,55.891996
Br1017,0.0685061,-0.008443570,-0.01093240,Control,48.42,F,148.35140,-89.715577,-97.742602,7.072870,-78.404717,77.0296173,-33.334885,-34.9482041,73.886328,65.622516
Br1021,0.0596699,0.003678410,0.04795520,Schizo,63.18,M,453.47100,-132.387141,35.349695,80.759933,123.878599,-39.4338123,-122.191818,64.6271242,-62.891067,-25.939892
Br1030,0.0166234,0.037136000,-0.11683800,Schizo,49.65,F,-212.50562,-3.017152,-46.171785,-91.779986,-28.953074,-0.2483534,63.463302,10.9460093,62.956351,4.060549
Br1039,0.0372755,0.000920873,0.04261920,Control,51.45,F,-103.04318,-97.394092,6.394317,9.246365,-153.099368,-57.4199032,-21.466761,61.2001242,4.913690,18.458765
Br1040,0.0418527,-0.041008900,-0.00594927,Control,20.93,M,-651.74144,243.329728,-234.910289,-21.365701,137.855050,-107.1335278,-9.355704,-32.7974052,51.679802,-82.487019
Br1050,0.0667423,-0.029448500,-0.06359590,Schizo,41.25,M,-346.35535,148.209663,-322.522824,-64.676761,121.759943,-70.2543686,2.514280,30.8563428,-19.582689,-27.813140
Br1053,0.0545700,0.010893500,-0.00503420,Control,40.18,F,14.25498,-218.737504,-116.037699,-46.148580,-61.733453,-40.1747788,-36.466078,-77.0255876,22.668648,28.637638


In [33]:
# For every (subpopulation, brain_region) combination, merge genotype PCs,
# methylation PCs and phenotype covariates by sample ID and write four files:
#   <subpop>_<region>.csv                full set
#   <subpop>_<region>-no-meth.csv        without methylation PCs
#   <subpop>_<region>-no-meth-no-dx.csv  without methylation PCs and Dx
#   <subpop>_<region>-no-dx.csv          without Dx
# seq_len() so that an empty table runs zero iterations.
for (i in seq_len(nrow(merged_pc_only))) {
    geno_pc <- fread(merged_pc_only$genotype_pc_path[i])
    meth_pc <- fread(merged_pc_only$meth_pc_path[i])
    pheno_pc <- fread(merged_pc_only$pheno_pc[i])

    # Sample ID + first 10 methylation PCs.
    meth_pc <- meth_pc[, 1:11]

    # Sample ID + first 3 genotype PCs.
    geno_pc <- geno_pc[, 1:4]

    # Columns 2, 6, 7, 8; `pheno_pc$Dx` below relies on Dx being among them.
    pheno_pc <- pheno_pc[, c(2, 6, 7, 8)]

    # Give the sample-ID column the same name in all three tables.
    colnames(geno_pc)[1] <- "ID"
    colnames(pheno_pc)[1] <- "ID"
    colnames(meth_pc)[1] <- "ID"

    colnames(meth_pc)[2:11] <- paste0("methPC", 1:10)

    colnames(geno_pc)[2:4] <- paste0("genoPC", 1:3)

    # Harmonise IDs across tables: "Br0836" in one file is "Br836" in another.
    geno_pc$ID <- gsub("Br0", "Br", geno_pc$ID)
    pheno_pc$ID <- gsub("Br0", "Br", pheno_pc$ID)
    # Recode the diagnosis label by replacing "chizo" with "CZ".
    pheno_pc$Dx <- gsub("chizo", "CZ", pheno_pc$Dx)
    meth_pc$ID <- gsub("Br0", "Br", meth_pc$ID)

    #dim(meth_pc)
    #dim(geno_pc)
    #dim(pheno_pc)

    # Inner joins on the sample ID (the only shared column).
    unified_pc <- merge(geno_pc, meth_pc, by = "ID")
    #dim(unified_pc)
    unified_pc <- merge(unified_pc, pheno_pc, by = "ID")
    #dim(unified_pc)
    unified_pc_no_meth <- merge(geno_pc, pheno_pc, by = "ID")

    # Drop Dx by name rather than by position (previously columns 5 and 15),
    # so a change in the number of PCs kept cannot remove the wrong column.
    unified_pc_no_meth_no_dx <-
        unified_pc_no_meth[, setdiff(names(unified_pc_no_meth), "Dx"), with = FALSE]
    unified_pc_no_dx <-
        unified_pc[, setdiff(names(unified_pc), "Dx"), with = FALSE]

    # Report (without stopping) samples that have methylation PCs but were
    # lost in the joins.
    if (nrow(unified_pc) != nrow(meth_pc)) {
        print(setdiff(meth_pc$ID, unified_pc$ID))
        warning(paste0("We have a mismatch for row ", i))
        print(merged_pc_only[i, ])
    }

    # Shared file-name stem for the four outputs of this combination.
    out_prefix <- paste0(outdir, merged_pc_only$subpopulation[i], "_",
                         merged_pc_only$brain_region[i])

    outname <- paste0(out_prefix, ".csv")
    fwrite(unified_pc, outname)

    outname2 <- paste0(out_prefix, "-no-meth.csv")
    fwrite(unified_pc_no_meth, outname2)

    outname3 <- paste0(out_prefix, "-no-meth-no-dx.csv")
    fwrite(unified_pc_no_meth_no_dx, outname3)

    outname4 <- paste0(out_prefix, "-no-dx.csv")
    fwrite(unified_pc_no_dx, outname4)

}


outname

# Path of the unified covariate file for each row, using the same
# "<outdir><subpopulation>_<brain_region>.csv" naming as the files written by
# the loop above.
merged$full_covariate_path <- sprintf(
  "%s%s_%s.csv", outdir, merged$subpopulation, merged$brain_region
)

#fread("/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/AA_caud.csv")

[1] "Br2267"


“We have a mismatch for row 2”


Key: <subpopulation, brain_region>
   subpopulation brain_region                            genotype_pc_path
          <char>       <char>                                      <char>
1:            AA        dlpfc ./09-IN_raw_cov/pca_wgbs/dlpfc/pca.eigenvec
                                                              meth_pc_path
                                                                    <char>
1: /expanse/lustre/projects/jhu152/naglemi/mwas/pheno//dlpfc/out/pc_AA.csv
                   pheno_pc
                     <char>
1: ./09-IN_raw_cov/pheno_PC
[1] "Br2267"


“We have a mismatch for row 3”


Key: <subpopulation, brain_region>
   subpopulation brain_region                            genotype_pc_path
          <char>       <char>                                      <char>
1:            AA        hippo ./09-IN_raw_cov/pca_wgbs/hippo/pca.eigenvec
                                                              meth_pc_path
                                                                    <char>
1: /expanse/lustre/projects/jhu152/naglemi/mwas/pheno//hippo/out/pc_AA.csv
                   pheno_pc
                     <char>
1: ./09-IN_raw_cov/pheno_PC
[1] "Br2267"


“We have a mismatch for row 8”


Key: <subpopulation, brain_region>
   subpopulation brain_region                            genotype_pc_path
          <char>       <char>                                      <char>
1:           all        dlpfc ./09-IN_raw_cov/pca_wgbs/dlpfc/pca.eigenvec
                                                               meth_pc_path
                                                                     <char>
1: /expanse/lustre/projects/jhu152/naglemi/mwas/pheno//dlpfc/out/pc_all.csv
                   pheno_pc
                     <char>
1: ./09-IN_raw_cov/pheno_PC
[1] "Br2267"


“We have a mismatch for row 9”


Key: <subpopulation, brain_region>
   subpopulation brain_region                            genotype_pc_path
          <char>       <char>                                      <char>
1:           all        hippo ./09-IN_raw_cov/pca_wgbs/hippo/pca.eigenvec
                                                               meth_pc_path
                                                                     <char>
1: /expanse/lustre/projects/jhu152/naglemi/mwas/pheno//hippo/out/pc_all.csv
                   pheno_pc
                     <char>
1: ./09-IN_raw_cov/pheno_PC


In [34]:
# Display the output directory prefix used when building the covariate CSV paths
outdir

## Match covariate files to corresponding datasets

In [35]:
# List the contents of the full_covariates_a2 directory (relative path)
list.files("../../full_covariates_a2/")

In [38]:
# Use `long_format_a2` under the name `long_format` in the cells that follow
long_format <- long_format_a2

In [39]:
# Population label = everything after the first underscore of the methylation
# file's base name (any file extension is still attached at this point).
# basename() is applied first so that an underscore in a directory name cannot
# be mistaken for the separator; as.character() guards against a factor column.
long_format$population <- str_split_fixed(basename(as.character(long_format$methylation_data)),
                                          pattern = "_",
                                          2)[, 2]

In [40]:
# Remove every literal ".rda" from the population labels
long_format$population <- gsub(".rda", "", long_format$population, fixed = TRUE)

In [41]:
# Display the derived population labels
long_format$population

In [42]:
# Region = the part of the methylation path between the first "pheno/" and
# the next "/out": take what follows "pheno/", then keep what precedes "/out".
long_format$region <- str_split_fixed(
    str_split_fixed(long_format$methylation_data, pattern = "pheno/", 2)[, 2],
    pattern = "/out",
    2
)[, 1]

In [43]:
# Display the first few derived region values
head(long_format$region)

In [44]:
# List the covariate files present in the full_covariates_a2 directory
list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/")

In [45]:
# Full covariate file for each row: <population>_<region>.csv
long_format$cov_file <- paste0(
    "/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/",
    long_format$population, "_", long_format$region,
    ".csv"
)

In [46]:
# Covariate file without methylation PCs: <population>_<region>-no-meth.csv
long_format$cov_file2 <- paste0(
    "/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/",
    long_format$population, "_", long_format$region,
    "-no-meth.csv"
)

In [47]:
# The "-no-meth-no-dx" covariate file: <population>_<region>-no-meth-no-dx.csv
long_format$cov_file3 <- paste0(
    "/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/",
    long_format$population, "_", long_format$region,
    "-no-meth-no-dx.csv"
)

In [48]:
# The "-no-dx" covariate file: <population>_<region>-no-dx.csv
long_format$cov_file4 <- paste0(
    "/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/",
    long_format$population, "_", long_format$region,
    "-no-dx.csv"
)

In [49]:
# Spot check: does the first row's file exist for each of the four covariate
# variants? Only the first path of each column is tested.
file.exists(long_format$cov_file[1])
file.exists(long_format$cov_file2[1])
file.exists(long_format$cov_file3[1])
file.exists(long_format$cov_file4[1])

In [50]:
# Display the column names of the table about to be written out
colnames(long_format)

In [51]:
# Write the long_format table to CSV in the working directory
fwrite(x = long_format, file = "09-OUT_matched_SNP_meth_cov_a2.csv")

## Inspect files

In [52]:
# Distinct full-covariate file paths referenced by long_format
files <- unique(long_format[["cov_file"]])

In [53]:
# Display the distinct covariate file paths
files

In [54]:
# List the files in the covariate directory, shown alongside `files` above
list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/")

In [55]:
# Print each covariate file: its path first, then the parsed table, then a
# two-newline separator.
for (file in files) {
    print(file)
    file_in <- fread(file)
    print(file_in)
    cat("\n", "\n", sep = "")
}

[1] "/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/AA_caud.csv"
         ID   genoPC1     genoPC2    genoPC3    methPC1    methPC2     methPC3
     <char>     <num>       <num>      <num>      <num>      <num>       <num>
  1: Br1003 0.0591231 -0.03980770  0.0205308 -157.04647   22.43150   -3.613153
  2: Br1004 0.0507512  0.01670030 -0.0235019 -207.97793   14.56387  -48.109417
  3: Br1007 0.0683202  0.00502359 -0.0595899  119.29338 -184.28704 -127.987516
  4: Br1017 0.0685061 -0.00844357 -0.0109324  148.35140  -89.71558  -97.742602
  5: Br1021 0.0596699  0.00367841  0.0479552  453.47100 -132.38714   35.349695
 ---                                                                          
160:  Br948 0.0639954  0.00695405 -0.0100903  413.58583 -263.70724    4.226894
161:  Br949 0.0712634 -0.02716360  0.0572070 -200.93232  265.80379   56.822449
162:  Br963 0.0120633  0.01905100 -0.0894283  -27.92241 -144.83283  -66.482063
163:  Br991 0.0436291  0.01022270 -0.0078481  -42

In [56]:
library(data.table)

# For every covariate file in `files`, look for a counterpart at the same path
# with "_a2" removed and compare the two tables. Both tables are printed,
# followed by a verdict. A missing counterpart is reported instead of being
# skipped silently, and when the tables differ the differences found by
# all.equal() are printed as well.
for(file in files){
    print(file)
    other_file <- gsub("_a2", "", file)
    if(!file.exists(other_file)){
        cat("No counterpart found at ", other_file, "; skipping comparison.\n\n", sep = "")
        next
    }

    file_in <- fread(file)
    other_file_in <- fread(other_file)

    # all.equal() returns TRUE when the tables match, otherwise a character
    # vector describing the differences
    are_identical <- all.equal(file_in, other_file_in)

    print(file_in)
    cat("\n\n")
    print(other_file_in)
    cat("\n\n")

    if(isTRUE(are_identical)) {
        cat("The data.tables are identical.\n\n")
    } else {
        cat("The data.tables are NOT identical.\n\n")
        # Show what differs rather than discarding the report
        print(are_identical)
        cat("\n\n")
    }
}


[1] "/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/AA_caud.csv"
         ID   genoPC1     genoPC2    genoPC3    methPC1    methPC2     methPC3
     <char>     <num>       <num>      <num>      <num>      <num>       <num>
  1: Br1003 0.0591231 -0.03980770  0.0205308 -157.04647   22.43150   -3.613153
  2: Br1004 0.0507512  0.01670030 -0.0235019 -207.97793   14.56387  -48.109417
  3: Br1007 0.0683202  0.00502359 -0.0595899  119.29338 -184.28704 -127.987516
  4: Br1017 0.0685061 -0.00844357 -0.0109324  148.35140  -89.71558  -97.742602
  5: Br1021 0.0596699  0.00367841  0.0479552  453.47100 -132.38714   35.349695
 ---                                                                          
160:  Br948 0.0639954  0.00695405 -0.0100903  413.58583 -263.70724    4.226894
161:  Br949 0.0712634 -0.02716360  0.0572070 -200.93232  265.80379   56.822449
162:  Br963 0.0120633  0.01905100 -0.0894283  -27.92241 -144.83283  -66.482063
163:  Br991 0.0436291  0.01022270 -0.0078481  -42