# Merge DNA methylation (DNAm) data

In [2]:

# Directory that the following cells scan for DNA methylation CSV files.
# The path is hard-coded to one specific filesystem layout, so it must be
# edited when the data lives elsewhere.
dnAm_dir <- "/dcs05/lieber/hanlab/mnagle/mwas/CpGWAS/scripts/aggregated_DNAm_data"  # Update if necessary

In [3]:
# Quick sanity check: show the first six entries found in the data directory
head(list.files(path = dnAm_dir), n = 6L)

In [None]:
# Abort early when the DNA methylation directory is missing
if (!dir.exists(dnAm_dir)) {
  stop(paste("DNA methylation data directory not found at path:", dnAm_dir))
}

# Collect the full paths of every file whose name ends in '1166.csv'
dnAm_files <- list.files(
  path = dnAm_dir,
  pattern = "1166\\.csv$",
  full.names = TRUE
)

# Report the matching files, one path per line
log_message("DNA Methylation Files Found:")
cat(paste(dnAm_files, collapse = "\n"), "\n\n")

# Load each DNA methylation file into a named list, one entry per
# <population>_<region> combination. The statistic columns are renamed with a
# <population>_<region>_ prefix so they remain distinguishable once the
# tables are merged on (chr, cg).
dnAm_list <- list()

# Expected filename format:
#   DNAm_meanvar_<Population>_<Region>_20241030095211_1166.csv
# The pattern and the required columns do not change between iterations, so
# they are defined once outside the loop.
pattern <- "DNAm_meanvar_(AA|EA|all)_(caud|hippo|dlpfc)_.*\\.csv$"
required_cols <- c("chr", "cg", "cor", "Mean_DNAm_Level", "Var_DNAm_Level")

for (file in dnAm_files) {

  # Validate the filename BEFORE reading, so that files which would be
  # skipped anyway are never loaded into memory.
  file_base <- basename(file)
  matches <- regexec(pattern, file_base)
  match <- regmatches(file_base, matches)[[1]]

  # regmatches() yields character(0) when the pattern does not match
  if (length(match) == 0) {
    warning(paste("Filename does not match expected pattern:", file_base))
    next
  }

  # match[1] is the full match; the capture groups follow
  population <- match[2]
  region <- match[3]

  log_message(paste("Loading DNA methylation file:", file_base))

  # Re-raise read failures with the offending path in the message
  dnAm_dt <- tryCatch({
    fread(file)
  }, error = function(e) {
    stop(paste("Failed to read DNA methylation file:", file,
               "\nError:", conditionMessage(e)))
  })

  # Fail with a message that names the file and the missing columns, rather
  # than letting setnames() raise a generic "not found" error.
  missing_cols <- setdiff(required_cols, names(dnAm_dt))
  if (length(missing_cols) > 0) {
    stop(paste0("DNA methylation file ", file_base,
                " is missing required column(s): ",
                paste(missing_cols, collapse = ", ")))
  }

  # Define new column names explicitly
  new_cor_name <- paste0(population, "_", region, "_cor")
  new_mean_name <- paste0(population, "_", region, "_Mean_DNAm_Level")
  new_var_name <- paste0(population, "_", region, "_Var_DNAm_Level")

  # Rename the statistic columns (setnames modifies dnAm_dt by reference)
  setnames(dnAm_dt, old = c("cor", "Mean_DNAm_Level", "Var_DNAm_Level"),
           new = c(new_cor_name, new_mean_name, new_var_name))

  # Keep only the join keys and the renamed statistic columns
  dnAm_dt <- dnAm_dt[, c("chr", "cg", new_cor_name, new_mean_name, new_var_name),
                     with = FALSE]

  # Two files can resolve to the same population/region (the filename pattern
  # ignores the timestamp part). The later file still wins, as before, but
  # the replacement is now reported instead of happening silently.
  list_key <- paste0(population, "_", region)
  if (list_key %in% names(dnAm_list)) {
    warning(paste("Multiple files found for", list_key,
                  "- replacing earlier data with:", file_base))
  }

  # Store in the list
  dnAm_list[[list_key]] <- dnAm_dt

  log_message(paste("Loaded and processed DNA methylation data for", population, region, "\n"))
}

# Merge the per-file tables into a single table keyed on (chr, cg)
if (length(dnAm_list) == 0) {
  # Nothing was loaded: leave a NULL placeholder and flag the situation
  dnAm_combined <- NULL
  warning("No DNA methylation data to combine.")
} else {
  # Fold the list pairwise with all = TRUE so that rows present in only some
  # of the tables are kept rather than dropped
  dnAm_combined <- tryCatch(
    Reduce(
      function(left, right) {
        merge(left, right, by = c("chr", "cg"), all = TRUE)
      },
      dnAm_list
    ),
    error = function(err) {
      stop(paste("Failed to combine DNA methylation data.\nError:", conditionMessage(err)))
    }
  )

  # Show the first rows of the merged result
  log_message("Combined DNA methylation data:")
  print(head(dnAm_combined, 5))
}