# Clarify my confusion with the outer loop

Get data ready

In [1]:
library(CpGWAS)
library(data.table)
library(glmnet)
library(e1071)
library(doParallel)
library(future.apply)
library(optparse)
library(pgenlibr)
library(bsseq)
library(tools)

Loading required package: Matrix

Loaded glmnet 4.1-8

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel

Loading required package: future

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following objects are masked from

## Prepare input arguments

In [None]:
# Choose where the run configuration comes from.
#
# Command-line options are parsed only when this code runs as a plain,
# non-interactive script. The previous test, Sys.getenv("RSTUDIO") == 0,
# compared a string with a number and was FALSE whenever the variable was
# unset (""), so the command-line branch could never be reached.
# NOTE(review): `jupyter.in_kernel` is assumed to be the option IRkernel sets
# inside a notebook kernel; confirm for the kernel in use.
if (!interactive() &&
    Sys.getenv("RSTUDIO") != "1" &&
    !isTRUE(getOption("jupyter.in_kernel"))) {

  # Define command line options.
  # optparse short flags must be "-" plus a single letter, so the chunk
  # bounds are exposed through their long flags only ("-c1"/"-c2" are rejected
  # by make_option()).
  option_list <- list(
    make_option(c("-o", "--outdir"), type = "character", default = "./output/",
                help = "Output directory, default is './output/'"),
    make_option("--chunk1", type = "integer", default = 1,
                help = "Starting methylation site index for processing"),
    make_option("--chunk2", type = "integer", default = NA,
                help = "Ending methylation site index for processing"),
    make_option(c("-s", "--snp_data_path"), type = "character", default = NULL,
                help = "Path to SNP data (required)"),
    make_option(c("-m", "--methylation_data_path"), type = "character", default = NULL,
                help = "Path to methylation data (required)")
  )

  # Parse options
  args <- parse_args(OptionParser(option_list = option_list))
} else {
  # Interactive / notebook session: fall back to hard-coded development inputs.
  args <- list(
    outdir = "./output/",
    chunk1 = 1000000,
    chunk2 = 1000100,
    snp_data_path = "/Users/michaelnagle/code/mwas/gwas/libd_chr1.pgen",
    methylation_data_path = "/Users/michaelnagle/code/mwas/pheno/dlpfc/out/chr1_AA.rda"
  )
}

# Create the output directory, including any missing parent directories.
if (!dir.exists(args$outdir)) {
  dir.create(args$outdir, recursive = TRUE)
}

# Check required arguments
if (is.null(args$snp_data_path) || is.null(args$methylation_data_path)) {
  stop("Paths to both SNP and methylation data are required.")
}

# Restore the saved methylation workspace into the current environment.
# NOTE(review): this is presumably what defines `BSobj2` used below; confirm
# against the contents of the .rda file.
load(args$methylation_data_path)

# Pt. 2: Initialize MethylationInput object -------------------------------

methInput <- new(
  "MethylationInput",
  BSseq_obj = BSobj2,
  snp_data_path = args$snp_data_path,
  args = args
)

# Pt. 3: Main loop to process SNP data for each methylation site ----------

# Record when processing starts.
start_time <- Sys.time()

# Window sizes defined for the main loop.
window_sizes <- c(1e3, 2e3, 5e3, 1e4, 2e4, 5e4, 1e5, 2e5, 5e5)

# Identifier of the form "<SNP file stem>-<methylation file stem>", built from
# the two input paths with directories and extensions stripped.
scaffoldIdentifier <- paste(
  tools::file_path_sans_ext(
    basename(c(args$snp_data_path, args$methylation_data_path))
  ),
  collapse = "-"
)

scaffold_name <- scaffoldIdentifier

## Load helper functions

In [None]:
# Subset and reorder the rows of a genotype matrix to follow a vector of IDs.
#
# Args:
#   geno: matrix (or data frame) whose row names are sample IDs.
#   genotype_IDs: vector of sample IDs to keep, in the desired output order.
#
# Returns:
#   The rows of `geno` named in `genotype_IDs`, in the order of `genotype_IDs`.
#   The result keeps its two dimensions even when `geno` has a single column.
#
# Stops when the retained rows and `genotype_IDs` do not correspond one-to-one
# (an ID is missing from `geno`, or the counts differ).
reorder_and_filter_geno <- function(geno, genotype_IDs) {
  # Keep only rows whose name is one of the accepted IDs.
  # drop = FALSE: without it a one-column (single-SNP) matrix collapses to a
  # vector, nrow() becomes NULL and the consistency check below breaks.
  idx_geno <- which(row.names(geno) %in% genotype_IDs)
  geno_filtered <- geno[idx_geno, , drop = FALSE]

  # The retained rows must correspond exactly to the requested IDs.
  if (length(genotype_IDs) != nrow(geno_filtered) ||
      !all(genotype_IDs %in% row.names(geno_filtered))) {
    stop("Row names do not match 100% between geno and genotype_IDs")
  }

  # Reorder the rows so they follow the order of genotype_IDs.
  match_indices <- match(genotype_IDs, row.names(geno_filtered))
  geno_filtered[match_indices, , drop = FALSE]
}

# Regress covariates out of each methylation column with lm() and return the
# residuals.
#
# Args:
#   methylations: matrix with one row per sample and one column per site.
#   cov: covariate matrix/data frame with one row per sample. A column named
#     "(Intercept)" is renamed to "Intercept" so it can appear in a formula.
#   n_benchmarks: optional number of leading columns to process; NULL (the
#     default) processes every column.
#
# Returns:
#   A matrix with nrow(methylations) rows and one column per processed site
#   holding the lm() residuals. Samples whose methylation value is NA get an NA
#   residual.
regress_out_cov <- function(methylations, cov, n_benchmarks = NULL) {
  print("We just entered regress_out_cov()")

  if (is.null(methylations)) {
    stop("Error: methylation data not found")
  }

  cat("Dimensions of methylations: ", dim(methylations), "\n")

  # Creating the model formula: y ~ <every covariate column>
  colnames(cov) <- gsub("\\(Intercept\\)", "Intercept", colnames(cov))
  cov <- as.data.frame(cov)
  model_formula <- as.formula(paste("y ~ ", paste(colnames(cov), collapse = " + ")))

  n_tests <- if (is.null(n_benchmarks)) ncol(methylations) else n_benchmarks
  residuals_matrix <- matrix(NA, nrow = nrow(methylations), ncol = n_tests)

  # seq_len() yields an empty loop when n_tests is 0 (1:0 would run twice).
  for (i in seq_len(n_tests)) {
    y <- methylations[, i]
    # na.exclude pads the residuals with NA for samples that were dropped, so
    # the vector always has one entry per row of the output matrix.
    lm_model <- lm(model_formula, data = cbind(y, cov), na.action = na.exclude)
    residuals_matrix[, i] <- residuals(lm_model)
  }

  cat("Residuals computed for ", n_tests, " tests.\n")
  residuals_matrix
}

# Regress covariates out of many methylation columns at once using matrix
# algebra, processing the columns in chunks through future_lapply().
#
# Args:
#   methylations: numeric matrix, one row per sample and one column per site.
#   cov_matrix: numeric covariate (design) matrix, one row per sample.
#   n_benchmarks: optional number of leading columns to process; NULL (the
#     default) processes every column. When supplied, the elapsed time is
#     printed as well.
#
# Returns:
#   Matrix of residuals (methylations minus the least-squares fit on
#   cov_matrix) for the processed columns. A chunk whose computation fails is
#   returned as NA after the error and the relevant dimensions are printed.
#
# Note: the future plan is set to `sequential` here, so despite the function
# name the chunks are evaluated one after another; the core count only decides
# how many chunks are formed.
regress_out_cov_parallel <- function(methylations, cov_matrix, n_benchmarks = NULL) {
  print("We just entered regress_out_cov_parallel()")

  # detectCores() may return NA and half of an odd count is fractional, so
  # normalise to a whole number of at least one.
  detected_cores <- detectCores()
  no_cores <- if (is.na(detected_cores)) 1L else max(1L, as.integer(detected_cores %/% 2L))
  plan(sequential)

  if (is.null(methylations)) {
    stop("Error: methylation data not found")
  }

  cat("Dimensions of methylations: ", dim(methylations), "\n")

  cat("Dimensions of cov_matrix: ", dim(cov_matrix), "\n")

  # (X'X)^{-1} X', obtained by solving the normal equations directly rather
  # than forming the explicit inverse.
  pseudoinv <- solve(crossprod(cov_matrix), t(cov_matrix))
  cat("Dimensions of pseudoinv: ", dim(pseudoinv), "\n")

  n_tests <- if (is.null(n_benchmarks)) ncol(methylations) else n_benchmarks
  if (n_tests < 1) {
    # Nothing to process: return a matrix with the right rows and no columns.
    return(methylations[, 0, drop = FALSE])
  }

  # Partition the column indices into contiguous groups. Splitting the index
  # vector (rather than computing start:end per core) never creates a group
  # with start > end, which previously selected columns in reverse or out of
  # range whenever the columns did not fill every chunk.
  n_chunks <- min(no_cores, n_tests)
  chunk_size <- ceiling(n_tests / n_chunks)
  column_groups <- split(seq_len(n_tests), ceiling(seq_len(n_tests) / chunk_size))
  chunks <- lapply(unname(column_groups), function(cols) {
    # drop = FALSE keeps single-column chunks as matrices.
    methylations[, cols, drop = FALSE]
  })

  residuals_computation <- function(chunk, cov_matrix, pseudoinv) {
    tryCatch({
      # Compute the fitted values for the entire matrix
      fitted_values <- cov_matrix %*% (pseudoinv %*% chunk)
      # Compute the residuals for the entire matrix
      chunk - fitted_values
    }, error = function(e) {
      cat("Error occurred: ", e$message, "\n")
      cat("Dimensions of cov_matrix: ", dim(cov_matrix), "\n")
      cat("Dimensions of pseudoinv: ", dim(pseudoinv), "\n")
      cat("Dimensions of chunk: ", dim(chunk), "\n")
      matrix(NA, nrow = nrow(chunk), ncol = ncol(chunk))
    })
  }

  start_time <- Sys.time()
  results <- future_lapply(chunks, residuals_computation, cov_matrix, pseudoinv)
  residuals_matrix <- do.call(cbind, results)

  if (!is.null(n_benchmarks)) {
    elapsed_time <- Sys.time() - start_time
    # format() keeps the time unit, which cat() alone would discard.
    cat("Elapsed time: ", format(elapsed_time), "\n")
  }

  residuals_matrix
}

# Build a numeric covariate matrix for the samples listed in `genotype_IDs`.
#
# Args:
#   dataFrame: table of sample covariates whose row names are sample IDs. When
#     it has a `brnum` column, that column must equal the row names.
#   colsToExclude: names of columns to discard before encoding.
#   genotype_IDs: vector of sample IDs to keep (required). The output rows
#     follow the order of this vector.
#
# Returns:
#   A matrix with one row per entry of `genotype_IDs`: the model.matrix()
#   encoding of the factor columns (including its "(Intercept)" column)
#   followed by the remaining columns.
#
# Stops when `genotype_IDs` is NULL, when the row names disagree with `brnum`,
# or when an ID is duplicated or has no matching row.
processCovariates <- function(dataFrame,
                              colsToExclude = c("ID.", "DNum", "BrNum",
                                                "brnum", "brnumerical"),
                              genotype_IDs = NULL) {

  if (is.null(genotype_IDs)) {
    stop(paste0("`processCovariates must receive a vector of genotype IDs",
                " with order matching methylation trait file."))
  }

  # Sanity check: ensure rownames match brnum.
  # `[[` is used instead of `$` so that a missing `brnum` cannot be partially
  # matched to another column (e.g. `brnumerical`); isTRUE() turns an NA
  # comparison into a clear failure instead of an `if` error.
  if ("brnum" %in% names(dataFrame) &&
      !isTRUE(all(rownames(dataFrame) == dataFrame[["brnum"]]))) {
    stop("Row names do not match 'brnum'")
  }

  # Exclude specific columns. drop = FALSE (here and below) keeps the table
  # two-dimensional even when a single column remains.
  dataFrame <- dataFrame[, !(names(dataFrame) %in% colsToExclude), drop = FALSE]

  # Exclude columns that take a single distinct value: they carry no
  # information and cannot be contrast-coded.
  single_factor <- vapply(
    seq_len(ncol(dataFrame)),
    function(j) length(levels(factor(dataFrame[[j]]))) == 1,
    logical(1)
  )
  if (any(single_factor)) {
    dataFrame <- dataFrame[, !single_factor, drop = FALSE]
  }

  # Select the requested samples in the requested order. For IDs that are
  # already sorted this gives the same rows as sorting the table by row name.
  row_idx <- match(genotype_IDs, rownames(dataFrame))
  if (anyNA(row_idx) || anyDuplicated(genotype_IDs) > 0) {
    stop("Mismatch with covariate data and genotype IDs")
  }
  dataFrame <- dataFrame[row_idx, , drop = FALSE]

  # Identify the factor columns
  factor_columns <- vapply(
    seq_len(ncol(dataFrame)),
    function(j) is.factor(dataFrame[[j]]),
    logical(1)
  )

  # One-hot encode the factor columns using model.matrix; with no factor
  # columns only the intercept column is produced.
  if (any(factor_columns)) {
    one_hot_encoded <- model.matrix(~ ., data = dataFrame[, factor_columns, drop = FALSE])
  } else {
    one_hot_encoded <- matrix(1, nrow = nrow(dataFrame), ncol = 1,
                              dimnames = list(rownames(dataFrame), "(Intercept)"))
  }

  # Combine the one-hot encoded columns with the remaining columns
  as.matrix(cbind(one_hot_encoded, dataFrame[, !factor_columns, drop = FALSE]))
}


## Load data

In [None]:

# Pt. 2: Prepare input data -----------------------------------------------

# Load methylation data and extract the smoothed per-base values, transposed
# so that rows are the samples and columns are the methylation sites.
suppressWarnings(load(args$methylation_data_path))
methylations <- t(as.matrix(getMeth(BSobj2, type = "smooth", what = "perBase")))
colnames(methylations) <- paste0("pos_", start(ranges(rowRanges(BSobj2))))
# # Extract only once then subset

scaffold_name <- tools::file_path_sans_ext(basename(scaffold_name))

# Derive the sibling .pgen/.pvar/.psam paths by swapping only the trailing
# extension. (Replacing every "pgen"/"pvar" substring would also rewrite a
# directory or file stem that happens to contain those letters.)
pgen_path <- sub("\\.(pgen|pvar)$", ".pgen", args$snp_data_path)
pvar_path <- sub("\\.(pgen|pvar)$", ".pvar", args$snp_data_path)
psam_path <- sub("\\.pvar$", ".psam", pvar_path)

pvar.1 <- pgenlibr::NewPvar(pvar_path)
pvar.2 <- fread(pvar_path)
pgen <- pgenlibr::NewPgen(pgen_path, pvar = pvar.1)
psam <- fread(psam_path)

# Samples present in both the genotype sample file and the methylation data,
# in sorted order.
psam_in_wgbs <- psam[psam$`#IID` %in% rownames(methylations)]
genotype_IDs <- psam_in_wgbs$`#IID`
genotype_IDs <- intersect(rownames(methylations), genotype_IDs)
genotype_IDs <- genotype_IDs[order(genotype_IDs)]


cov <- processCovariates(dataFrame = colData(BSobj2),
                         colsToExclude = c("ID.", "DNum", "brnum",
                                           "BrNum", "brnumerical"),
                         genotype_IDs = genotype_IDs)

# Select the shared samples from 'methylations' by name so that its rows are
# in exactly the same order as genotype_IDs. Filtering with %in% alone keeps
# the original row order, which silently misaligns the methylation values with
# the covariates and genotypes whenever that order is not already sorted.
# Note, it's possible not all found in methylations are also in SNP data.
methylations <- methylations[genotype_IDs, , drop = FALSE]
print("Do methylations and genotype_IDs match?")
print(all(rownames(methylations) == genotype_IDs))


## Extract methylation, SNPs for one test

In [None]:
# Window sizes available for the single-site walkthrough below.
wind <- c(
  1e3, 2e3, 5e3,
  1e4, 2e4, 5e4,
  1e5, 2e5, 5e5
)

In [None]:
# Index of the methylation site examined in this walkthrough (used below as a
# column index into `methylations`).
i <- 2000000


# Window size for this test: the ninth entry of `wind`.
w <- wind[9]

In [None]:
# Pull out the methylation values and the base position for site `i`.
# Column `i` of `methylations` holds one value per retained sample.
methylation <- methylations[, i]  # Extracting methylation for current site

# Get base position for the current methylation site

# Start coordinate of the i-th range in BSobj2.
meth_site_pos <- start(ranges(granges(BSobj2)))[i]

In [None]:
cat("Processing window size: ", w, "\n")

# Calculate window bounds
# NOTE(review): the window is not symmetric around the site; it spans
# [pos - w - 1, pos + w - 1]. Confirm whether the "- 1" on each bound is an
# intentional coordinate adjustment.
lower_bound <- meth_site_pos - w - 1
upper_bound <- meth_site_pos + w - 1

# Find SNP indices within the window
snp_indices <- which(pvar.2$POS >= lower_bound & pvar.2$POS <= upper_bound)

# This cell runs at top level, not inside a loop, so `next` is not valid here
# (it fails with an unrelated "no loop for break/next" error). Halt with an
# explicit message instead.
if (length(snp_indices) == 0) {
  stop("No SNPs found within ", w, " of methylation site position ",
       meth_site_pos, call. = FALSE)
}

# Get SNPs for desired indices
SNPs <- pgenlibr::ReadList(pgen, variant_subset = snp_indices)

# Label the genotype rows with the sample IDs from the .psam file.
rownames(SNPs) <- psam$`#IID`

# Keep only the shared samples, ordered like genotype_IDs.
SNPs <- reorder_and_filter_geno(geno = SNPs, genotype_IDs)

#cv.pred(geno, methylation, fold=5)

## Load original version of `glmnet.tune.alpha`

In [None]:
# Tune the elastic-net mixing parameter `alpha` by cross-validation, then fit
# a final glmnet model at the selected (alpha, lambda).
#
# Args:
#   X: predictor matrix, one row per sample.
#   y: response vector, one value per row of X. Samples with a missing
#     response are removed before fitting.
#   fold: number of cross-validation folds.
#
# Returns a list with:
#   model    - the glmnet fit at the selected alpha and lambda
#   features - data frame of predictors with non-zero coefficients
#              (v = name, coefs = value)
#   para     - one-row data frame with the selected cvm, lambda and alpha
#   cor      - correlation between y and the training-sample predictions,
#              or NA when the predictions have zero variance
#
# Side effect: calls set.seed(2023), which resets the global RNG state.
glmnet.tune.alpha <- function(X, y, fold = 5) {
  if (nrow(X) != length(y)) {
    stop("Number of samples is differerent")
  }

  print(paste0("Inside `glmnet.tune.alpha`, X and y have ", length(y), " samples"))

  # remove missing data; drop = FALSE keeps X a matrix in every case
  idx <- !is.na(y)
  y <- y[idx]
  X <- X[idx, , drop = FALSE]

  # tune alpha over 0, 0.1, ..., 1, using the same fold assignment for every
  # alpha so their cross-validated errors are comparable
  a <- seq(0, 1, 0.1)
  set.seed(2023)
  fold.id <- cut(sample(seq_along(y)), breaks = fold, labels = FALSE)

  print("Folds have this many samples: ")
  print(table(fold.id))

  tune <- foreach(ai = a, .combine = rbind) %dopar% {
    cv <- cv.glmnet(
      X,
      y,
      foldid = fold.id,
      type.measure = "mse",
      # Spelled out in full: the former `paralle = TRUE` only worked through
      # partial argument matching.
      parallel = TRUE,
      alpha = ai
    )
    # Cross-validated error at lambda.1se for this alpha
    data.frame(
      cvm = cv$cvm[cv$lambda == cv$lambda.1se],
      lambda = cv$lambda.1se,
      alpha = ai
    )
  }
  # Row with the smallest cross-validated error
  cv.opt <- tune[which.min(tune$cvm), ]

  # fit final model
  fit <- glmnet(
    X,
    y,
    lambda = cv.opt$lambda,
    alpha = cv.opt$alpha
  )

  # extract feature names and effects
  beta <- coef(fit)
  coef.names <- rownames(beta)[-1] # drop intercept
  coef.values <- beta[-1]

  # keep features with non-zero effects
  nonzero <- coef.values != 0
  fs <- data.frame(v = coef.names[nonzero], coefs = coef.values[nonzero])

  # train sample prediction
  predY <- predict(fit, X)
  if (var(predY) > 0) {
    r <- cor(y, predY)
  } else {
    r <- NA
  }
  return(list(model = fit, features = fs, para = cv.opt, cor = r))
}

## Go inside `cv.pred`

In [None]:
# Bind the inputs that the following cells use while stepping through the
# cross-validation code: predictors, response and number of folds.
X <- SNPs
y <- methylation
fold <- 5

In [None]:
# Step-by-step copy of the cross-validation body: shuffle the samples, split
# them into `fold` folds, and for each fold tune and fit on the remaining
# samples, then score the held-out samples.
set.seed(2018)
if (nrow(X) != length(y)) {
  stop("Number of observations is differerent")
}

# impute missing values with the mean of the affected column
if (anyNA(X)) {
  m <- colMeans(X, na.rm = TRUE)
  id <- which(apply(X, 2, anyNA))
  for (j in seq_along(id)) {
    X[is.na(X[, id[j]]), id[j]] <- m[id[j]]
  }
}

# segment dataset into n folds: shuffle the samples, then cut the shuffled
# order into `fold` consecutive groups
N <- nrow(X)
idx <- sample(seq_len(N))
X <- X[idx, , drop = FALSE]
y <- y[idx]
fold.id <- cut(seq_len(N), breaks = fold, labels = FALSE)

# cross validation: column 1 = correlation, column 2 = mean squared error
cv <- matrix(NA, nrow = fold, ncol = 2)
for (f in seq_len(fold)) {
  # Report the current fold. The former bare paste0("Fold: ", fold) was never
  # printed inside the loop and named the fold count, not the fold index.
  print(paste0("Fold: ", f))

  testIndexes <- which(fold.id == f)
  trainx <- X[-testIndexes, , drop = FALSE]
  trainy <- y[-testIndexes]
  testx <- X[testIndexes, , drop = FALSE]
  testy <- y[testIndexes]

  # Hyperparameters are tuned on the training part of this fold only.
  fit <- glmnet.tune.alpha(trainx, trainy, fold = 5)

  print(fit$para)

  pred <- predict(fit$model, testx)
  cv[f, 1] <- cor(pred, testy)
  cv[f, 2] <- mean((pred - testy)^2)

  cat("\n\n")
}
# Average correlation and MSE across folds
cvm <- apply(cv, 2, mean)

## Make a small change to `cv.pred`

Call `glmnet.tune.alpha` once, outside the loop, on the full dataset; extract the optimal `alpha` and `lambda` from that fit; then run the evaluation loop.

In [None]:
# Variant of the cross-validation body: tune alpha/lambda once on the full
# dataset, then reuse those fixed values for every fold of the evaluation loop.
set.seed(2018)
if (nrow(X) != length(y)) {
  stop("Number of observations is differerent")
}

# impute missing values with the mean of the affected column
if (anyNA(X)) {
  m <- colMeans(X, na.rm = TRUE)
  id <- which(apply(X, 2, anyNA))
  for (j in seq_along(id)) {
    X[is.na(X[, id[j]]), id[j]] <- m[id[j]]
  }
}

# segment dataset into n folds: shuffle the samples, then cut the shuffled
# order into `fold` consecutive groups
N <- nrow(X)
idx <- sample(seq_len(N))
X <- X[idx, , drop = FALSE]
y <- y[idx]
fold.id <- cut(seq_len(N), breaks = fold, labels = FALSE)

### Moved this line and changed trainx and trainy to X and Y
fit <- glmnet.tune.alpha(X, y, fold = 5)
best_lambda <- fit$para$lambda
best_alpha <- fit$para$alpha
print(fit$para)

# cross validation: column 1 = correlation, column 2 = mean squared error
# NOTE(review): the hyperparameters above were chosen using every sample,
# including those held out below, so these estimates are likely optimistic
# compared with tuning inside each fold.
cv <- matrix(NA, nrow = fold, ncol = 2)
for (f in seq_len(fold)) {
  print(paste0("Fold: ", f))

  testIndexes <- which(fold.id == f)
  trainx <- X[-testIndexes, , drop = FALSE]
  trainy <- y[-testIndexes]
  testx <- X[testIndexes, , drop = FALSE]
  testy <- y[testIndexes]

  # Fit on the training part of the fold with the fixed, pre-tuned
  # parameters. The loop previously called glmnet.tune.alpha() again here, so
  # best_lambda/best_alpha were never used and this cell behaved exactly like
  # the unmodified version.
  fold_fit <- glmnet(trainx, trainy, lambda = best_lambda, alpha = best_alpha)

  pred <- predict(fold_fit, testx)
  cv[f, 1] <- cor(pred, testy)
  cv[f, 2] <- mean((pred - testy)^2)

  cat("\n\n")
}
# Average correlation and MSE across folds
cvm <- apply(cv, 2, mean)