# Compare MWAS methods for sanity tests and troubleshooting

In this version, we use the full new SNP set with a selected window

## Be ready with matched up SNP and covariate files

In [1]:
library("glmnet")
library("e1071")
library("doParallel")
library("data.table")

Loading required package: Matrix

Loaded glmnet 4.1-8

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



In [2]:
df <- fread("09-OUT_matched_SNP_meth_cov_a2.csv")

In [3]:
df <- df[which(df$Chr == 1), ]

In [4]:
i <- 2

In [5]:
df[1, ]

Chr,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,subpopulation,brain_region,population,region,cov_file,cov_file2,cov_file3,cov_file4
<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
1,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr1.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr1_AA.rda,248918358,1069461,2202702,8982,AA,caud,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/AA_caud.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/AA_caud-no-meth.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/AA_caud-no-meth-no-dx.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/AA_caud-no-dx.csv


In [6]:
set.seed(2018)
wind <- c(10000)
# output directory
#outd <- "/dcl02/lieber/shan/shizhong/finemapping/GWAS/tags/scz3/mwas/chr22/1/"
outd <- "20-OUT_original_mwas_sanity_test/"

## Functions

In [7]:
stage1_mwas <- function(map3, snp3, cg, wind, p.residual, lambda.type = "min") {
    if (!lambda.type %in% c("min", "1se")) {
        stop("Invalid lambda type specified. Choose 'min' or '1se'.")
    }
    
    models.ea <- data.frame()
    models.all <- data.frame()

    for (k in 1:length(wind)) {
        print(paste0("Processing window size: ", wind[k]))
        
        for (i in 1:length(cg)) {
            cat(i, "\n")
            print(paste0("This cg is: ", cg[i]))

            range1 <- ifelse(cg[i] - wind[k] > 0, cg[i] - wind[k], 0)
            range2 <- cg[i] + wind[k]
            idx <- map3$POS > range1 & map3$POS < range2

            if (sum(idx) <= 1) next

            geno <- snp3[idx,]
            rownames(geno) <- map3$POS[idx]
            trainX <- t(geno)
            trainY <- p.residual[i,]

            fit <- tryCatch({
                elastic.net(trainX, trainY, lambda.type)
            }, error = function(e) { return(NULL) })

            if (is.null(fit) || nrow(fit) == 0) next

            fit$cg <- cg[i]
            models.all <- rbind(models.all, fit)

            trainX <- trainX[idx.ea,]
            if (sum(apply(trainX, 2, var) != 0) <= 1) next
            trainY <- trainY[idx.ea]

            fit <- tryCatch({
                elastic.net(trainX, trainY, lambda.type)
            }, error = function(e) { return(NULL) })

            if (is.null(fit) || nrow(fit) == 0) next

            fit$cg <- cg[i]
            models.ea <- rbind(models.ea, fit)
        }
    }

    results <- list(
        all = models.all[models.all$features != "(Intercept)", , drop = FALSE],
        ea = models.ea[models.ea$features != "(Intercept)", , drop = FALSE]
    )

    return(results)
}


In [8]:
# helper function to handle elastic net fitting
elastic.net <- function(trainX, trainY, lambda.type) {
    if (nrow(trainX) != length(trainY)) {
        stop("number of observations does not match")
    }

    set.seed(42)  # for reproducibility
    cv.fit <- cv.glmnet(
        trainX,
        trainY,
        nfold = 5,
        type.measure = "mse",
        parallel = TRUE,
        alpha = 0.5
    )

    chosen.lambda <- if (lambda.type == "min") cv.fit$lambda.min else cv.fit$lambda.1se
    yfit <- glmnet(trainX, trainY, lambda = chosen.lambda, alpha = 0.5)

    if (any(coef(yfit) != 0)) {
        coefs <- coef(yfit)
        idx <- which(coefs != 0)
        return(data.frame(
            features = names(coefs[idx]),
            coefs = coefs[idx]
        ))
    } else {
        return(data.frame())
    }
}


stage2_mwas <- function(results, snp.gwas2, map.1kg.eur2, outd) {
    if (!dir.exists(outd)) {
        dir.create(outd)
    }

    perform_mwas <- function(models, gwas_data, map_data) {
        cg2 <- unique(models$cg)
        mwas <- matrix(0, nrow = length(cg2), ncol = 2)
        colnames(mwas) <- c("z", "p")
        rownames(mwas) <- cg2

        for (i in seq_along(cg2)) {
            pos <- models[models$cg == cg2[i], "features"]
            gwas <- gwas_data$z[is.element(gwas_data$pos_hg38, pos)]
            weight <- models[models$cg == cg2[i], "coefs"]
            
            match_indices <- match(pos, map_data$POS)
            
            if (any(is.na(match_indices))) {
                cat("No match found for iteration", i, "\n")
                cat("pos:", pos, "\n")
                next
            }

            tryCatch({
                geno <- map_data[match_indices, , drop = FALSE]
                if (nrow(geno) == 0) {
                    cat("Skipping iteration", i, ": Genotype data 'geno' is empty for the given indices\n")
                    next
                }
                if (is.null(dim(geno))) stop("Genotype data 'geno' has no dimensions")
                if (!is.numeric(geno)) {
                    cat("Genotype data 'geno' class:", class(geno), "\n")
                    cat("Genotype data 'geno' structure:\n")
                    str(geno)
                    stop("Genotype data 'geno' is not numeric")
                }
                geno <- matrix(as.numeric(geno), nrow = nrow(geno), ncol = ncol(geno))
                if (any(is.na(geno))) stop("Genotype data 'geno' contains NA values")
                if (!all(is.finite(geno))) stop("Genotype data 'geno' contains non-finite values")
                mwas[i, ] <- MWAS(gwas, weight, t(geno))
            }, error = function(e) {
                cat("Error accessing genotype data at iteration:", i, "\n")
                cat("Error message:", e$message, "\n")
                cat("match_indices:", match_indices, "\n")
                stop("Stopping execution due to error.")
            })
        }
        return(mwas)
    }

    mwas.all <- perform_mwas(results$all, snp.gwas2, map.1kg.eur2)
    mwas.ea <- perform_mwas(results$ea, snp.gwas2, map.1kg.eur2)

    return(list(all = mwas.all, ea = mwas.ea))
}



In [9]:
MWAS <- function(gwas, weight, geno){
        z <- gwas %*% weight
        z.cor <- cor(geno)
        se <- sqrt(weight %*%  z.cor %*%  weight)
        z <- z/se
        p=pnorm(abs(z),lower.tail=F)*2
        return(c(z, p))
}

## Replace all old objects with new objects in same format

### Methylation data

In [10]:
suppressWarnings(library(bsseq))

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:data.table’:

    first, second


The following objects are masked from ‘package:Matrix’:

    expand, unname


The following object is masked from ‘package:utils’:

    findMatches


The

In [11]:
# load data for mwas
# load("./rda/caudate_mwas_data_chr22.rda")
load(df$methylation_data[i])

p <- getMeth(BSobj2)


rownames(p) <- start(BSobj2)

sites_to_test_pos <- c(73274305, 73274312, 73292330, 73307769, 73308571, 73419188, 73419830, 73420076)
sites_to_test <- which(start(BSobj2) %in% sites_to_test_pos)
#sites_to_test <- c(73274305, 73274312, 73292330, 73307769, 73308571, 73419188, 73419830, 73420076)
p <- p[sites_to_test, ]
cg <- as.numeric(rownames(p))

### covariates

In [12]:
covs <- fread(df$cov_file[i])
covs <- t(covs)
colnames(covs) <- covs[1, ]
covs <- covs[2:nrow(covs), ]
# transpose so we have same orientation as original code

### Regress methylation data over covariates

In [13]:
BSobj2$brnum <- gsub("Br0", "Br", BSobj2$brnum)
colnames(covs) <- gsub("Br0", "Br", colnames(covs))

In [14]:
mat <- match(BSobj2$brnum,colnames(covs)) 
covs <- t(covs[,mat])
p.residual=matrix(NA,dim(p)[1],dim(p)[2])

In [15]:
rownames(covs)[is.na(covs[, 'genoPC1'])] <- BSobj2$brnum[is.na(covs[, 'genoPC1'])]

In [16]:
colnames(p.residual) <- BSobj2$brnum

In [17]:
covs <- as.data.frame(covs)
# Convert all columns except Dx and Sex from character to numeric
cols_to_convert <- setdiff(names(covs), c("Dx", "Sex"))

for (col in cols_to_convert) {
  covs[[col]] <- as.numeric(covs[[col]])
}

# Print the modified data frame to check the conversion
#print(dat)


In [18]:
for (i in 1:dim(p)[1]) { # For each methylation site
    dat <- as.data.frame(cbind(y = p[i,], covs))
    
    # Check for rows with NAs (the ones for which we don't have covariate data)
    valid_rows <- complete.cases(dat)
    
    if (sum(valid_rows) > 0) {
        dat_valid <- dat[valid_rows,]
        model.res <- lm(y ~ ., data = dat_valid)
        
        # Store residuals in the corresponding positions
        p.residual[i, valid_rows] <- resid(model.res)
    }
}


# for(i in 1:dim(p)[1]){ # foro each methylation site
#         dat <- as.data.frame(cbind(p[i,],covs))
#         colnames(dat) <- c("y",paste0("x",1:ncol(covs)))
#         model.res <- lm(reformulate(paste0("x",1:ncol(covs)), "y"),dat)
#         p.residual[i,] = resid(model.res) 
# }

In [19]:
snp.gwas2 <- NULL

In [20]:
load("p1.rda")

### summary stats

In [21]:
library(data.table)
library(CpGWAS)

In [22]:
ss_path <- "/home/naglemi/mwas/gwas/gwas_stat_scz"

In [23]:
snp.gwas2 <- fread(ss_path, skip = 1, header = FALSE)
colnames(snp.gwas2) <- strsplit(readLines(ss_path, n = 1), "\t")[[1]]

In [24]:
snp.gwas2$z <- log(snp.gwas2$OR)/snp.gwas2$SE

In [25]:
snp.gwas2 <- snp.gwas2[, c(2, 1, 3, 3, 8, 4, 5, 20, 11)]

In [26]:
head(snp.gwas2, n = 1)

SNP,CHR,BP,BP,INFO,A1,A2,z,P
<chr>,<int>,<int>,<int>.1,<dbl>,<chr>,<chr>,<dbl>,<dbl>
rs62513865,8,100579985,100579985,0.963,C,T,0.7016221,0.4847


In [27]:
colnames(snp.gwas2)[1:5] <- c("snp", "chr", "pos_hg38", "pos_hg38", "info")

In [28]:
snp.gwas2 <- snp.gwas2[which(snp.gwas2$chr == 1 & snp.gwas2$pos_hg38 >= (73274305-10000) & snp.gwas2$pos_hg38 <= (73419830 + 10000)), ]

In [29]:
snp.gwas2 <- snp.gwas2[order(snp.gwas2$pos_hg38), ]

In [30]:
# built predition models
idx.ea <- BSobj2$race == "CAUC"

### SNPs in LIBD population

#### For reference, first load Shizhong's formatted SNPs on Chr7

In [31]:
snp2_sorted <- snp2[, order(names(snp2))]

In [32]:
colnames(snp2) <- gsub("Br0", "Br", colnames(snp2))

In [33]:
snp2 <- snp2[, colnames(snp2) %in% colnames(p.residual)]

In [34]:
snp2_positions <- stringr::str_split_fixed(rownames(snp2), ":", 3)[, 2]

#### Now let's load ours on Chr1

In [35]:
paths <- list(pvar_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr1.pvar",
              pgen_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr1.pgen",
              psam_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr1.psam")

my_SNPs <- loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

In [36]:
snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= 73274305-10000 & my_SNPs$pvar_dt$POS <= 73419830 + 10000)

In [37]:
snp3 <- pgenlibr::ReadList(my_SNPs$pgen,
                        variant_subset = snp_indices_of_interest)
colnames(snp3) <- my_SNPs$pvar_dt$ID[snp_indices_of_interest]
rownames(snp3) <- my_SNPs$psam$`#IID`

In [38]:
map3 <- data.frame(POS = stringr::str_split_fixed(colnames(snp3), ":", 3)[, 2])

In [39]:
snp3 <- t(snp3)

In [40]:
map3 <- data.frame(POS = stringr::str_split_fixed(rownames(snp3), ":", 3)[, 2])

In [41]:
dim(map3)

### SNPs in reference population

In [42]:
#snp.1kg.eur2

In [43]:
paths <- list(pvar_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.pvar",
              pgen_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.pgen",
              psam_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.psam")

my_SNPs <- loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

In [44]:
snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= 73274305-10000 & my_SNPs$pvar_dt$POS <= 73419830 + 10000)

In [45]:
snp.1kg.eur2 <- pgenlibr::ReadList(my_SNPs$pgen,
                        variant_subset = snp_indices_of_interest)
colnames(snp.1kg.eur2) <- my_SNPs$pvar_dt$ID[snp_indices_of_interest]
rownames(snp.1kg.eur2) <- my_SNPs$psam$`IID`

In [46]:
map.1kg.eur2 <- my_SNPs$pvar_dt

In [47]:
map.1kg.eur2 <- map.1kg.eur2[snp_indices_of_interest, ]

In [48]:
snp.1kg.eur2 <- t(snp.1kg.eur2)

### Set window size and any other parameters

In [49]:
wind <- 10000

Is 1se vs min for lambda the problem?

#### Final formatting steps to prepare for MWAS - sort samples

In [50]:
p.residual <- p.residual[, order(colnames(p.residual))]

In [51]:
snp3 <- snp3[, colnames(snp3) %in% colnames(p.residual)]

In [52]:
snp3 <- snp3[, order(colnames(snp3))]

## MWAS

### With `lambda.min`

In [53]:
lambda_type_selected <- "min"  # or "1se" depending on your preference

# Call the stage1_mwas function with the required parameters
results <- stage1_mwas(map3, snp3, cg, wind, p.residual, lambda_type_selected)

# Call to stage2_mwas using the results from stage1
processed_results <- stage2_mwas(results, snp.gwas2, map.1kg.eur2, "26C-OUT_sanity-testing")

# Access the MWAS results for all samples
mwas_all_results <- processed_results$all
print("MWAS results for all samples:")
print(mwas_all_results)

[1] "Processing window size: 10000"
1 
[1] "This cg is: 73274305"


“executing %dopar% sequentially: no parallel backend registered”


2 
[1] "This cg is: 73274312"
3 
[1] "This cg is: 73292330"
4 
[1] "This cg is: 73307769"
5 
[1] "This cg is: 73308571"
6 
[1] "This cg is: 73419188"
7 
[1] "This cg is: 73419830"
8 
[1] "This cg is: 73420076"
[1] "MWAS results for all samples:"
     z p


In [54]:
dim(mwas_all_results)

### Same thing with `lambda.1se`

In [55]:
lambda_type_selected <- "1se"  # or "1se" depending on your preference

# Call the stage1_mwas function with the required parameters
results <- stage1_mwas(map3, snp3, cg, wind, p.residual, lambda_type_selected)

models.ea <- models.ea[models.ea[,1] != "(Intercept)",]
models.all <- models.all[models.all[,1] != "(Intercept)",]

# Call to stage2_mwas using the results from stage1
processed_results <- stage2_mwas(results, snp.gwas2, map.1kg.eur2, "path/to/output/directory")

# Access the MWAS results for all samples
mwas_all_results <- processed_results$all
print("MWAS results for all samples:")
print(mwas_all_results)

[1] "Processing window size: 10000"
1 
[1] "This cg is: 73274305"
2 
[1] "This cg is: 73274312"
3 
[1] "This cg is: 73292330"
4 
[1] "This cg is: 73307769"
5 
[1] "This cg is: 73308571"
6 
[1] "This cg is: 73419188"
7 
[1] "This cg is: 73419830"
8 
[1] "This cg is: 73420076"


ERROR: Error in eval(expr, envir, enclos): object 'models.ea' not found
