# Compare MWAS methods for sanity tests and troubleshooting

In this notebooks, we have my code and Shizhong's side by side, and compare every variable so we can find out exactly where the discrepancy originates.

# Pick the regions we will test

In [1]:
library(data.table)

# prev_hits <- fread("
# Chr     pos        old_z       old_p
# 11   38247902        -27.1535308285104       2.30024742330298e-162
# 2    47933357        18.3327793004811        4.53147699327216e-75
# 7    1987910         10.1255085321387        4.25759742738181e-24
# 7    1987896         10.1255085321387        4.25759742738189e-24
# 7    1987797         10.0241523897721        1.19379483108027e-23
# 7    1987778         10.0105851568856        1.36940148731912e-23
# 12   2194742         -10.0072634920486       1.41615523554955e-23
# ")


In [2]:
prev_hits <- fread("
Chr     pos
1    73274305
1    73418161
1    73418205
1    73418313
1    73419188
1    73419830
")

In [3]:
#df <- fread("09.5-OUT_matched_SNP_meth_cov_chunked_EXPANSE_a2.csv")

In [4]:
df <- fread("09-OUT_matched_SNP_meth_cov_a2.csv")

## Try original code

In [5]:
###### model: learn elastic net model on training data 
######---------Input: trainX, trainY
######---------Return: selected features and coefficents

# original
elastic.net <- function(trainX,trainY){
    if(nrow(trainX)!=length(trainY)){
            stop("Number of observations is differerent")
    } 

    # optimize alpha---mixing parameter  
    a <- 0.5
    search <- foreach(ai = a, .combine = rbind) %dopar% {
        cv.fit <- cv.glmnet(
                        trainX,
                        trainY,
                        nfold = 5,
                        type.measure = "mse",
                        paralle = TRUE,
                        alpha = ai
                        )
        data.frame(
                        cvm = min(cv.fit$cvm),
                        lambda = cv.fit$lambda.min,
                        alpha = ai
                        )
        } 
    cv.opt <- search[search$cvm == min(search$cvm),] 

        # fit model by optimized alpha and lambda
        yfit = glmnet(
        trainX,
        trainY,
        lambda = cv.opt$lambda,
        alpha = cv.opt$alpha
                )       
        idf <- coef(yfit)
        idx <- which(idf != 0)
        selectf <- data.frame(
                features = idf@Dimnames[[1]][idx], 
                coefs = idf [idx]
        )
}

# modified to use lambda 1se and appropriate cvm
elastic.net <- function(trainX,trainY){
    if(nrow(trainX)!=length(trainY)){
            stop("Number of observations is differerent")
    } 

    # optimize alpha---mixing parameter  
    a <- 0.5
    search <- foreach(ai = a, .combine = rbind) %dopar% {
        set.seed(42)
        cv.fit <- cv.glmnet(
                        trainX,
                        trainY,
                        nfold = 5,
                        type.measure = "mse",
                        paralle = TRUE,
                        alpha = ai
                        )
        print(paste0("Dim of trainX: ", dim(trainX)))
        print(paste0("Len of trainY: ", length(trainY)))
        coef_matrix <- as.matrix(coef(cv.fit))

        non_zero_coefs <- coef_matrix[coef_matrix != 0, , drop = FALSE]
        print("Coefficients when fitting: ")
        print(non_zero_coefs)

        data.frame(
                        cvm = cv.fit$cvm[cv.fit$lambda == cv.fit$lambda.1se],
                        lambda = cv.fit$lambda.1se,
                        alpha = ai
                        )
        } 
    cv.opt <- search[search$cvm == min(search$cvm),] 

        # fit model by optimized alpha and lambda
    set.seed(42)
    yfit <- glmnet(
    trainX,
    trainY,
    lambda = cv.opt$lambda,
    alpha = cv.opt$alpha)

    coef_matrix <- as.matrix(coef(yfit))

    non_zero_coefs <- coef_matrix[coef_matrix != 0, , drop = FALSE]
    print("Coefficients when optimal: ")
    print(non_zero_coefs)

    idf <- coef(yfit)
    idx <- which(idf != 0)
    selectf <- data.frame(
            features = idf@Dimnames[[1]][idx], 
            coefs = idf [idx]
    )
}

MWAS <- function(gwas, weight, geno){
        z <- gwas %*% weight
        z.cor <- cor(geno)
        se <- sqrt(weight %*%  z.cor %*%  weight)
        z <- z/se
        p=pnorm(abs(z),lower.tail=F)*2
        return(c(z, p))
}

In [6]:
df <- df[which(df$Chr == 1), ]

In [7]:
i <- 2

In [8]:
library("glmnet")
library("e1071")
library("doParallel")

set.seed(2018)
wind <- c(5000,10000)
# output directory
#outd <- "/dcl02/lieber/shan/shizhong/finemapping/GWAS/tags/scz3/mwas/chr22/1/"
outd <- "20-OUT_original_mwas_sanity_test/"

Loading required package: Matrix

Loaded glmnet 4.1-8

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



## Replace all old objects with new objects in same format

In [9]:
#load("p1.rda", verbose = TRUE)

In [10]:
#p[1:10, 1:10]

### Methylation data

#### Previous version

In [11]:
suppressWarnings(library(bsseq))

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:Matrix’:

    expand, unname


The following objects are masked from ‘package:data.table’:

    first, second


The following object is masked from ‘package:utils’:

    findMatches


The

In [12]:
# load data for mwas
# load("./rda/caudate_mwas_data_chr22.rda")
load(df$methylation_data[i])

p <- getMeth(BSobj2)


rownames(p) <- start(BSobj2)

sites_to_test_pos <- c(73274305, 73274312, 73292330, 73307769, 73308571, 73419188, 73419830, 73420076)
sites_to_test <- which(start(BSobj2) %in% sites_to_test_pos)
#sites_to_test <- c(73274305, 73274312, 73292330, 73307769, 73308571, 73419188, 73419830, 73420076)
p <- p[sites_to_test, ]

In [13]:
p

<8 x 297> DelayedMatrix object of type "double":
              [,1]      [,2]      [,3] ...    [,296]    [,297]
73274305 0.8165381 0.8497022 0.8324623   . 0.8438266 0.8545318
73274312 0.8165612 0.8497333 0.8324705   . 0.8438781 0.8545820
73292330 0.8161550 0.8505156 0.9148587   . 0.8865503 0.8736273
73307769 0.8716738 0.8813357 0.8736131   . 0.8888626 0.8768323
73308571 0.8533571 0.8791877 0.8659200   . 0.8871417 0.8880064
73419188 0.8772179 0.8479983 0.8151479   . 0.8349730 0.8726653
73419830 0.8745607 0.8443300 0.8185331   . 0.8296466 0.8653615
73420076 0.8750428 0.8456053 0.8235227   . 0.8323508 0.8649822

In [14]:
# candidate cg
his_cg <- as.numeric(rownames(p))

# regress out covariates
#load("covs_for_meqtl.rda")

In [15]:
sites_to_test

#### cpgwas (also loading SNPs and covs as `methInput`)

In [16]:
library(CpGWAS)

In [17]:
#load(df$methylation_data[i])
methInput <- new("MethylationInput",
               BSseq_obj = BSobj2,
               snp_data_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr1.pgen",
               cov_path = df$cov_file[i],
               start_site = min(sites_to_test),
               end_site = max(sites_to_test),
               no_cores = 120)

“Row names not found in methylation matrix of BSseq object. Retrieving from $colData$brnum.”


In [18]:
names(attributes(methInput))

In [19]:
dim(methInput@methylations)

In [20]:
methInput@methylations[1:5, 1:5]

Unnamed: 0,pos_73274305,pos_73274312,pos_73274892,pos_73275060,pos_73275154
Br1003,0.004699246,0.004701542,0.004686685,0.004693386,0.00473482
Br1004,0.025714016,0.02574389,0.027493191,0.027756351,0.02790125
Br1007,-0.037249239,-0.037280282,-0.038909435,-0.039017383,-0.039011096
Br1016,0.029459027,0.029420813,0.026233422,0.025284769,0.024711038
Br1017,-0.003689116,-0.003700395,-0.004825404,-0.005209573,-0.005439233


In [92]:
methInput@methylations <- methInput@methylations[, methInput@methylations_positions %in% sites_to_test_pos]

In [95]:
methInput@methylations_positions <- methInput@methylations_positions[methInput@methylations_positions %in% sites_to_test_pos]

In [21]:
my_cg <- as.numeric(gsub("pos_", "", colnames(methInput@methylations)))

In [35]:
his_cg

In [36]:
my_cg

In [22]:
all(my_cg == his_cg)

“longer object length is not a multiple of shorter object length”


In [23]:
cg <- my_cg

### covariates (previous code)

In [24]:
covs <- fread(df$cov_file[i])
covs <- t(covs)
colnames(covs) <- covs[1, ]
covs <- covs[2:nrow(covs), ]
# transpose so we have same orientation as original code

### Regress methylation data over covariates

In [25]:
BSobj2$brnum <- gsub("Br0", "Br", BSobj2$brnum)
colnames(covs) <- gsub("Br0", "Br", colnames(covs))

In [26]:
mat <- match(BSobj2$brnum,colnames(covs)) 
covs <- t(covs[,mat])
p.residual=matrix(NA,dim(p)[1],dim(p)[2])

In [27]:
rownames(covs)[is.na(covs[, 'genoPC1'])] <- BSobj2$brnum[is.na(covs[, 'genoPC1'])]

In [28]:
colnames(p.residual) <- BSobj2$brnum

In [29]:
covs <- as.data.frame(covs)
# Convert all columns except Dx and Sex from character to numeric
cols_to_convert <- setdiff(names(covs), c("Dx", "Sex"))

for (col in cols_to_convert) {
  covs[[col]] <- as.numeric(covs[[col]])
}

# Print the modified data frame to check the conversion
#print(dat)


In [30]:
for (i in 1:dim(p)[1]) { # For each methylation site
    dat <- as.data.frame(cbind(y = p[i,], covs))
    
    # Check for rows with NAs (the ones for which we don't have covariate data)
    valid_rows <- complete.cases(dat)
    
    if (sum(valid_rows) > 0) {
        dat_valid <- dat[valid_rows,]
        model.res <- lm(y ~ ., data = dat_valid)
        
        # Store residuals in the corresponding positions
        p.residual[i, valid_rows] <- resid(model.res)
    }
}


# for(i in 1:dim(p)[1]){ # foro each methylation site
#         dat <- as.data.frame(cbind(p[i,],covs))
#         colnames(dat) <- c("y",paste0("x",1:ncol(covs)))
#         model.res <- lm(reformulate(paste0("x",1:ncol(covs)), "y"),dat)
#         p.residual[i,] = resid(model.res) 
# }

Check if result is same for me and old code

First we need to make sure columns are in same order

In [31]:
p.residual_tocompare <- p.residual

In [33]:
length(cg)

In [34]:
dim(p.residual_tocompare)

In [32]:
rownames(p.residual_tocompare) <- paste0("pos_", cg)

ERROR: Error in dimnames(x) <- dn: length of 'dimnames' [1] not equal to array extent


In [None]:
p.residual_tocompare <- t(p.residual_tocompare)

In [None]:
dim(p.residual_tocompare)

In [None]:
dim(methInput@methylations)

In [None]:
p.residual_tocompare <- p.residual_tocompare[order(rownames(p.residual_tocompare)), ]

In [None]:
is.data.frame(p.residual_tocompare)

In [None]:
is.data.frame(methInput@methylations)

In [None]:
names(attributes(p.residual_tocompare))

In [None]:
class(p.residual_tocompare)

In [None]:
class(methInput@methylations)

In [None]:
compare_matrices <- function(mat1, mat2) {
  diffs <- mat1 != mat2
  indices <- which(diffs, arr.ind = TRUE)
  rownames <- rownames(mat1)[indices[, 1]]
  colnames <- colnames(mat1)[indices[, 2]]
  differences <- mat1[indices] - mat2[indices]
  
  results <- data.table(
    row_index = indices[, 1],
    col_index = indices[, 2],
    rowname = rownames,
    colname = colnames,
    difference = differences
  )
  
  overall_mean_difference <- if (nrow(results) > 0) mean(abs(differences)) else 0
  percentage_differences <- (nrow(results) / (nrow(mat1) * ncol(mat1))) * 100
  largest_difference <- if (nrow(results) > 0) max(abs(differences)) else 0
  
  cat("Total differences:", nrow(results), "\n")
  cat("Percentage of values with differences:", percentage_differences, "%\n")
  cat("Largest difference:", largest_difference, "\n")
  cat("Overall mean difference:", overall_mean_difference, "\n\n")
  
  if (nrow(results) > 0) {
    cat("Differences found:\n")
    for (i in 1:nrow(results)) {
      cat("Row index:", results$row_index[i], 
          "Row name:", results$rowname[i], 
          "Column index:", results$col_index[i], 
          "Column name:", results$colname[i], 
          "Difference:", results$difference[i], "\n")
    }
  }
  
  # Round matrices to 3 decimal places and check again
  mat1_rounded <- round(mat1, 3)
  mat2_rounded <- round(mat2, 3)
  diffs_rounded <- mat1_rounded != mat2_rounded
  indices_rounded <- which(diffs_rounded, arr.ind = TRUE)
  rownames_rounded <- rownames(mat1_rounded)[indices_rounded[, 1]]
  colnames_rounded <- colnames(mat1_rounded)[indices_rounded[, 2]]
  differences_rounded <- mat1_rounded[indices_rounded] - mat2_rounded[indices_rounded]
  
  results_rounded <- data.table(
    row_index = indices_rounded[, 1],
    col_index = indices_rounded[, 2],
    rowname = rownames_rounded,
    colname = colnames_rounded,
    difference = differences_rounded
  )
  
  overall_mean_difference_rounded <- if (nrow(results_rounded) > 0) mean(abs(differences_rounded)) else 0
  percentage_differences_rounded <- (nrow(results_rounded) / (nrow(mat1_rounded) * ncol(mat1_rounded))) * 100
  largest_difference_rounded <- if (nrow(results_rounded) > 0) max(abs(differences_rounded)) else 0
  
  cat("\nAfter rounding to 3 decimal places:\n")
  cat("Total differences:", nrow(results_rounded), "\n")
  cat("Percentage of values with differences:", percentage_differences_rounded, "%\n")
  cat("Largest difference:", largest_difference_rounded, "\n")
  cat("Overall mean difference:", overall_mean_difference_rounded, "\n")
  
  if (nrow(results_rounded) > 0) {
    cat("Differences found:\n")
    for (i in 1:nrow(results_rounded)) {
      cat("Row index:", results_rounded$row_index[i], 
          "Row name:", results_rounded$rowname[i], 
          "Column index:", results_rounded$col_index[i], 
          "Column name:", results_rounded$colname[i], 
          "Difference:", results_rounded$difference[i], "\n")
    }
  }
}

#compare_matrices(methInput@methylations, p.residual_tocompare)


Let's round them down so they're identical

In [None]:
p.residual_tocompare <- round(p.residual_tocompare, 5)
methInput@methylations <- round(methInput@methylations, 5)

In [None]:
p.residual_tocompare[1:5, 1:5]

In [None]:
methInput@methylations[1:5, 1:5]

In [None]:
p.residual <- round(p.residual, 5)

In [None]:
snp.gwas2 <- NULL

In [None]:
load("p1.rda", verbose = TRUE)

In [None]:
# min(snp.gwas2$pos_hg38)
# max(snp.gwas2$pos_hg38)

In [None]:
# load("p1.rda")
# pos_we_got <- snp.gwas2$pos_hg38
# saveRDS(pos_we_got, "20-intermediate_positions_in_old_set.csv")

In [None]:
#pos_we_got <- readRDS("20-intermediate_positions_in_old_set.csv")

In [None]:
#pos_we_got

In [None]:
# head(snp.gwas2)

### summary stats

In [None]:
library(data.table)
library(CpGWAS)

In [None]:
ss_path <- "/home/naglemi/mwas/gwas/gwas_stat_scz"

In [None]:
snp.gwas2 <- fread(ss_path, skip = 1, header = FALSE)
colnames(snp.gwas2) <- strsplit(readLines(ss_path, n = 1), "\t")[[1]]

In [None]:
snp.gwas2$z <- log(snp.gwas2$OR)/snp.gwas2$SE

In [None]:
snp.gwas2 <- snp.gwas2[, c(2, 1, 3, 3, 8, 4, 5, 20, 11)]

In [None]:
head(snp.gwas2, n = 1)

In [None]:
colnames(snp.gwas2)[1:5] <- c("snp", "chr", "pos_hg38", "pos_hg38", "info")

In [None]:
snp.gwas2 <- snp.gwas2[which(snp.gwas2$chr == 1 & snp.gwas2$pos_hg38 >= (73274305-20000) & snp.gwas2$pos_hg38 <= (73419830 + 20000)), ]

In [None]:
dim(snp.gwas2)

In [None]:
snp.gwas2 <- snp.gwas2[order(snp.gwas2$pos_hg38), ]

In [None]:
head(snp.gwas2)

In [None]:
#snp.gwas2 <- snp.gwas2[which(snp.gwas2$pos_hg38 %in% pos_we_got), ]

In [None]:
dim(snp.gwas2)

In [None]:
# built predition models
idx.ea <- BSobj2$race == "CAUC"

### SNPs in LIBD population

#### For reference, first load Shizhong's formatted SNPs on Chr7

In [None]:
snp2_sorted <- snp2[, order(names(snp2))]

In [None]:
colnames(snp2) <- gsub("Br0", "Br", colnames(snp2))

In [None]:
snp2 <- snp2[, colnames(snp2) %in% colnames(p.residual)]

In [None]:
dim(snp2)

In [None]:
head(snp2)

In [None]:
snp2_positions <- stringr::str_split_fixed(rownames(snp2), ":", 3)[, 2]

#### Now let's load ours on Chr1

In [None]:
paths <- list(pvar_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr1.pvar",
              pgen_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr1.pgen",
              psam_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr1.psam")

my_SNPs <- loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

In [None]:
snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= 73274305-20000 & my_SNPs$pvar_dt$POS <= 73419830 + 20000)

In [None]:
snp3 <- pgenlibr::ReadList(my_SNPs$pgen,
                        variant_subset = snp_indices_of_interest)
colnames(snp3) <- my_SNPs$pvar_dt$ID[snp_indices_of_interest]
rownames(snp3) <- my_SNPs$psam$`#IID`

In [None]:
snp3[1:10, 1:10]

In [None]:
map3 <- data.frame(POS = stringr::str_split_fixed(colnames(snp3), ":", 3)[, 2])

In [None]:
snp3 <- t(snp3)

In [None]:
# snp3 <- snp3[which(map3$POS %in% pos_we_got), ]

In [None]:
map3 <- data.frame(POS = stringr::str_split_fixed(rownames(snp3), ":", 3)[, 2])

In [None]:
dim(map3)

In [None]:
head(map3)

In [None]:
dim(map3)

#### Make sure we're working with same SNPs in cpgwas (and check map3 object)

In [None]:
dim(methInput@pvar_dt[which(methInput@pvar_dt$POS >= 73274305-20000 & methInput@pvar_dt$POS <= 73419830 + 20000 &
                        methInput@pvar_dt$`#CHROM` == 1), ])

In [None]:
my_map3 <- data.frame(POS = methInput@pvar_dt$POS[which(methInput@pvar_dt$POS >= 73274305-20000 & methInput@pvar_dt$POS <= 73419830 + 20000 &
                        methInput@pvar_dt$`#CHROM` == 1)])

In [None]:
all(map3 == my_map3)

### SNPs in reference population

In [None]:
#snp.1kg.eur2

In [None]:
paths <- list(pvar_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.pvar",
              pgen_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.pgen",
              psam_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.psam")

my_SNPs <- loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= 73274305-20000 & my_SNPs$pvar_dt$POS <= 73419830 + 20000)

In [None]:
snp.1kg.eur2 <- pgenlibr::ReadList(my_SNPs$pgen,
                        variant_subset = snp_indices_of_interest)
colnames(snp.1kg.eur2) <- my_SNPs$pvar_dt$ID[snp_indices_of_interest]
rownames(snp.1kg.eur2) <- my_SNPs$psam$`IID`

In [None]:
dim(snp.1kg.eur2)

In [None]:
map.1kg.eur2 <- my_SNPs$pvar_dt

In [None]:
map.1kg.eur2 <- map.1kg.eur2[snp_indices_of_interest, ]

In [None]:
dim(map.1kg.eur2)

In [None]:
#map.1kg.eur2 <- data.frame(POS = stringr::str_split_fixed(colnames(snp.1kg.eur2), ":", 3)[, 2])

In [None]:
snp.1kg.eur2 <- t(snp.1kg.eur2)

In [None]:
# snp3 <- snp3[which(map3$POS %in% pos_we_got), ]

### Set window size and any other parameters

In [None]:
wind <- 20000

Is 1se vs min for lambda the problem?

## Stage 1

#### Shizhong's version

In [None]:
#head(map3)

In [None]:
#dim(map3)

In [None]:
#dim(snp3)

In [None]:
p.residual <- p.residual[, order(colnames(p.residual))]

In [None]:
#dim(snp3)

In [None]:
#p.residual

In [None]:
#snp3[1:10, 1:10]

In [None]:
snp3 <- snp3[, colnames(snp3) %in% colnames(p.residual)]

In [None]:
snp3 <- snp3[, order(colnames(snp3))]

In [None]:
#dim(snp3)

In [None]:
for(k in 1:length(wind)){
    models.ea <- c()
    models.all <- c()
    for(i in 1:length(cg)){
    #for(i in 1){
            cat(i,"\n")
            print(paste0("This cg is: ", cg[i]))
            range1 <- ifelse(cg[i] - wind[k] > 0,cg[i] - wind[k],0)
            range2 <- cg[i] + wind[k]
            idx <- map3$POS > range1 & map3$POS < range2
            # go to next cg if no snps within window
            if(sum(idx) <= 1){
                    next
            }
            geno <- snp3[idx,] # changed snp2 to snp3
            rownames(geno) <- map3$POS[idx]
            trainX <- t(geno)
            trainY <- p.residual[i,]
            fit <- elastic.net(trainX,trainY)
            fit <- tryCatch(
                    elastic.net(trainX,trainY),
                    error = function(e) {return ("err")})
            if(!is.data.frame(fit)){
                if(fit == "err"){
                    next
                }
            }
            if(nrow(fit) == 0) next

            fit$cg <- cg[i]
            models.all <- rbind(models.all,fit)
            # EA only
            trainX <- trainX[idx.ea,]
            if(sum(apply(trainX,2,var)!=0) <= 1){
                    next
            }
            trainY <- trainY[idx.ea]
            fit <- tryCatch(
                    elastic.net(trainX,trainY),
                    error = function(e) {return ("err")})
            if(!is.data.frame(fit)){
                if(fit == "err"){
                    next
                }
            }
            if(nrow(fit) == 0) next
            fit$cg <- cg[i]
            models.ea <- rbind(models.ea,fit)
    }
}

In [None]:
models.ea <- models.ea[models.ea[,1] != "(Intercept)",]
models.all <- models.all[models.all[,1] != "(Intercept)",]

In [None]:
models.ea

In [None]:
models.all

#### My version

In [None]:
scaffoldIdentifier <- "debugging_test_071624"

scaffold_models <- fit_MWAS_models(
  BSobj = BSobj2,
  methInput = methInput,
  window_sizes = c(10000),
  chunk1 = 1,
  chunk2 = length(methInput@methylations_positions),
  n_fold = 5,
  scaffoldIdentifier = scaffoldIdentifier,
  outdir = "20-OUT_debugging_test",
  verbose = FALSE,
  lambda_choice = "1se",
  alphas = 0.5,
  cores_per_alpha = "all",
  num_cores = 120,
  allow_inefficient_parallelization = FALSE,
  save_evaluation_results_each_fold = FALSE,
  save_glmnet_object = FALSE,
  cv_eval_mode = "dynamic",
  omit_folds_with_na_r = TRUE,
  maf = 0,
  na.action = "remove"
)

In [None]:
methScaff <- readRDS("20-OUT_debugging_test/debugging_test_071624.rds")

In [None]:
library(data.table)

my_models.all <- rbindlist(lapply(methScaff@models, function(model) {
  data.table(
    features = sub("chr1:", "", names(model@snpWeights)),
    coefs = as.numeric(model@snpWeights),
    cg = rep(model@methylationPosition, length(model@snpWeights))
  )
}), use.names = TRUE, fill = TRUE)

my_models.all$features <- stringr::str_split_fixed(my_models.all$features, ":", 3)[,1]
setorder(my_models.all, cg)
head(my_models.all)


In [None]:
dim(my_models.all)

In [None]:
dim(models.all)

In [None]:
dim(na.omit(models.all))

In [None]:
head(my_models.all)

In [None]:
head(models.all)

For testing purposes, although I don't yet know why we have more rows in my_models.all than models.all, let's subset so they match and we can do a proper comparison.

In [None]:
models.all$tag <- paste0(models.all$features, "to", models.all$cg)
my_models.all$tag <- paste0(my_models.all$features, "to", my_models.all$cg)

In [None]:
my_models.all <- my_models.all[which(my_models.all$tag %in% models.all$tag), ]

In [None]:
dim(my_models.all)

In [None]:
identical(models.all, my_models.all)

In [None]:
models.all_compare <- models.all[order(models.all$tag), ]
my_models.all_compare <- my_models.all[order(my_models.all$tag), ]
rownames(models.all_compare) <- NULL
rownames(my_models.all_compare) <- NULL


In [None]:
my_models.all_compare <- as.data.frame(my_models.all_compare)

In [None]:
identical(models.all_compare, my_models.all_compare)

In [None]:
head(models.all_compare)

In [None]:
head(my_models.all_compare)

In [None]:
models.all <- as.data.frame(my_models.all)

### Make sure my old RDS pretty much matches up with what we have in the `my_models.all_compare` object.

In [None]:
# df2 <- fread("12-OUT_matched_SNP_meth_cov_outputs.csv")

In [None]:
# df2 <- df2[which(df2$Chr == 1 &
#                  df2$population == "all" &
#                  df2$region == "caud"), ]

In [None]:
test_in <- readRDS("..//output_EXPANSE_a2_caud/libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-20240416-172011.rds")

In [None]:
names(attributes(test_in))

## Stage 2

#### Shizhong's version

In [None]:
if(!dir.exists(outd)) dir.create(outd)

In [None]:
# mwas by models of all samples
cg2 <- unique(models.all$cg)
mwas.all <- matrix(0,nrow=length(cg2),ncol=2)
for(i in 1:length(cg2)){
    pos <- models.all[models.all$cg == cg2[i],1]
    
    gwas <- snp.gwas2$z[is.element(snp.gwas2$pos_hg38, pos)]
    weight <- models.all[models.all$cg == cg2[i],2]
    geno <- snp.1kg.eur2[match(pos,map.1kg.eur2$POS),]
    mwas.all[i,] <- MWAS(gwas, weight, t(geno))
}
rownames(mwas.all) <- cg2
colnames(mwas.all) <- c("z","p")

# mwas by models of EA samples
cg2 <- unique(models.ea$cg)
mwas.ea <- matrix(0,nrow=length(cg2),ncol=2)
for(i in 1:length(cg2)){
    pos <- models.ea[models.ea$cg == cg2[i],1]
    gwas <- snp.gwas2$z[is.element(snp.gwas2$pos_hg38, pos)]
    weight <- models.ea[models.ea$cg == cg2[i],2]
    geno <- snp.1kg.eur2[match(pos,map.1kg.eur2$POS),]
    mwas.ea[i,] <- MWAS(gwas, weight, t(geno))
}
rownames(mwas.ea) <- cg2
colnames(mwas.ea) <- c("z","p")

# output models and mwas results
outf <- paste0(outd,"/models-a9-covnew.all.wind.",wind[k])
write.csv(models.all,outf)
outf <- paste0(outd,"/models-a9-covnew.ea.wind.",wind[k])
write.csv(models.ea,outf)
outf <- paste0(outd,"/mwas-a9-covnew.all.wind.",wind[k])
write.csv(mwas.all,outf)
outf <- paste0(outd,"/mwas-a9-covnew.ea.wind.",wind[k])
write.csv(mwas.ea,outf)

In [None]:
mwas.all

##### Very verbose

In [None]:
# # mwas by models of all samples
# cg2 <- unique(models.all$cg)
# mwas.all <- matrix(0, nrow = length(cg2), ncol = 2)
# cat("Dimensions of relevant objects:\n")
# cat("models.all:", dim(models.all), "\n")
# cat("snp.gwas2:", dim(snp.gwas2), "\n")
# cat("map.1kg.eur2:", dim(map.1kg.eur2), "\n")
# cat("snp.1kg.eur2:", dim(snp.1kg.eur2), "\n\n")

# for (i in 1:length(cg2)) {
#     pos <- models.all[models.all$cg == cg2[i], 1]
#     gwas <- snp.gwas2$z[is.element(snp.gwas2$pos_hg38, pos)]
#     weight <- models.all[models.all$cg == cg2[i], 2]
#     match_indices <- match(pos, map.1kg.eur2$POS)
    
#     cat("Iteration:", i, "\n")
#     cat("Current CG:", cg2[i], "\n")
#     cat("Positions:\n")
#     print(head(pos))
#     cat("GWAS Z-scores:\n")
#     print(head(gwas))
#     cat("Weights:\n")
#     print(head(weight))
#     cat("Matching Indices:\n")
#     print(head(match_indices))
    
#     tryCatch({
#         if (any(is.na(match_indices))) stop("NA values found in match_indices")
#         if (any(match_indices > nrow(snp.1kg.eur2))) stop("Out of bounds indices found")
#     }, error = function(e) {
#         cat("Error detected:", e$message, "\n")
#         cat("Dimensions of relevant objects at error detection:\n")
#         cat("models.all:", dim(models.all), "\n")
#         cat("snp.gwas2:", dim(snp.gwas2), "\n")
#         cat("map.1kg.eur2:", dim(map.1kg.eur2), "\n")
#         cat("snp.1kg.eur2:", dim(snp.1kg.eur2), "\n")
#         cat("Positions causing error:\n")
#         print(pos)
#         cat("Matching Indices causing error:\n")
#         print(match_indices)
#         stop("Stopping execution due to error.")
#     })
    
#     geno <- NULL
#     tryCatch({
#         geno <- snp.1kg.eur2[match_indices, ]
#     }, error = function(e) {
#         cat("Error accessing genotype data at iteration:", i, "\n")
#         cat("Error message:", e$message, "\n")
#         stop("Stopping execution due to error.")
#     })
    
#     cat("Genotype Data:\n")
#     print(head(geno))
#     tryCatch({
#         mwas.all[i, ] <- MWAS(gwas, weight, t(geno))
#         cat("MWAS Results (z, p):\n")
#         print(mwas.all[i, ])
#         cat("\n")
#     }, error = function(e) {
#         cat("Error performing MWAS at iteration:", i, "\n")
#         cat("Error message:", e$message, "\n")
#         stop("Stopping execution due to error.")
#     })
# }
# rownames(mwas.all) <- cg2
# colnames(mwas.all) <- c("z", "p")

# # mwas by models of EA samples
# cg2 <- unique(models.ea$cg)
# mwas.ea <- matrix(0, nrow = length(cg2), ncol = 2)
# cat("Dimensions of relevant objects:\n")
# cat("models.ea:", dim(models.ea), "\n")
# cat("snp.gwas2:", dim(snp.gwas2), "\n")
# cat("map.1kg.eur2:", dim(map.1kg.eur2), "\n")
# cat("snp.1kg.eur2:", dim(snp.1kg.eur2), "\n\n")

# for (i in 1:length(cg2)) {
#     pos <- models.ea[models.ea$cg == cg2[i], 1]
#     gwas <- snp.gwas2$z[is.element(snp.gwas2$pos_hg38, pos)]
#     weight <- models.ea[models.ea$cg == cg2[i], 2]
#     match_indices <- match(pos, map.1kg.eur2$POS)
    
#     cat("Iteration:", i, "\n")
#     cat("Current CG:", cg2[i], "\n")
#     cat("Positions:\n")
#     print(head(pos))
#     cat("GWAS Z-scores:\n")
#     print(head(gwas))
#     cat("Weights:\n")
#     print(head(weight))
#     cat("Matching Indices:\n")
#     print(head(match_indices))
    
#     tryCatch({
#         if (any(is.na(match_indices))) stop("NA values found in match_indices")
#         if (any(match_indices > nrow(snp.1kg.eur2))) stop("Out of bounds indices found")
#     }, error = function(e) {
#         cat("Error detected:", e$message, "\n")
#         cat("Dimensions of relevant objects at error detection:\n")
#         cat("models.ea:", dim(models.ea), "\n")
#         cat("snp.gwas2:", dim(snp.gwas2), "\n")
#         cat("map.1kg.eur2:", dim(map.1kg.eur2), "\n")
#         cat("snp.1kg.eur2:", dim(snp.1kg.eur2), "\n")
#         cat("Positions causing error:\n")
#         print(pos)
#         cat("Matching Indices causing error:\n")
#         print(match_indices)
#         stop("Stopping execution due to error.")
#     })
    
#     geno <- NULL
#     tryCatch({
#         geno <- snp.1kg.eur2[match_indices, ]
#     }, error = function(e) {
#         cat("Error accessing genotype data at iteration:", i, "\n")
#         cat("Error message:", e$message, "\n")
#         stop("Stopping execution due to error.")
#     })
    
#     cat("Genotype Data:\n")
#     print(head(geno))
#     tryCatch({
#         mwas.ea[i, ] <- MWAS(gwas, weight, t(geno))
#         cat("MWAS Results (z, p):\n")
#         print(mwas.ea[i, ])
#         cat("\n")
#     }, error = function(e) {
#         cat("Error performing MWAS at iteration:", i, "\n")
#         cat("Error message:", e$message, "\n")
#         stop("Stopping execution due to error.")
#     })
# }
# rownames(mwas.ea) <- cg2
# colnames(mwas.ea) <- c("z", "p")


#### My version

In [None]:
# # Script C: script_C.R
# library(CpGWAS)
# library(data.table)
# library(stringr)
# library(optparse)

# # Command line options
# option_list <- list(
#   make_option(c("-g", "--genome_file_index"), type = "integer", default = 1,
#               help = "Index of genome file to process"),
#   make_option(c("-d", "--data_file"), type = "character", default = "/expanse/lustre/projects/jhu152/naglemi/mwas/CpGWAS/scripts/12-OUT_matched_SNP_meth_cov_outputs.csv",
#               help = "Path to data file")
# )

# opt <- parse_args(OptionParser(option_list = option_list))

# # Load genome files
# genome_files <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas",
#                            pattern = "EUR", full.names = TRUE)
# genome_files <- genome_files[grepl("pvar", genome_files)]

# genome_files <- data.table(path = genome_files, Chr = NA)

# genome_files$Chr <- str_split_fixed(genome_files$path, "chr", 2)[, 2]
# genome_files$Chr <- gsub(".pvar", "", genome_files$Chr)

# genome_files$Chr <- as.integer(genome_files$Chr)
# genome_files <- genome_files[order(genome_files$Chr), ]

# df <- fread(opt$data_file)

In [None]:
summary_stats_list <- "/home/naglemi/mwas/gwas/gwas_stat_scz"

# Pre-load all summary stats files into a list and clean/standardize column names
summary_stats_data <- lapply(summary_stats_list, function(path) {
  stats <- suppressWarnings(data.table::fread(path))
  colnames(stats) <- gsub("#CHROM", "CHR", colnames(stats))
  clean_and_standardize_colnames(stats)
})

In [None]:
# print("Starting genome file processing")
# # Process the specified genome file
# g <- opt$genome_file_index
# print(paste("Processing genome file index:", g))

# paths <- list(
#   pvar_path = genome_files[g]$path,
#   pgen_path = gsub("pvar", "pgen", genome_files[g]$path),
#   psam_path = gsub("pvar", "psam", genome_files[g]$path)
# )

# my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

In [None]:
paths <- list(pvar_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.pvar",
              pgen_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.pgen",
              psam_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.psam")

my_SNPs <- loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= 73274305-20000 & my_SNPs$pvar_dt$POS <= 73419830 + 20000)

In [None]:
df <- fread("12-OUT_matched_SNP_meth_cov_outputs.csv")

In [None]:
df <- df[which(df$region == "caud" & df$population == "all"), ]

In [None]:
df <- df[which(df$chunk_start <= min(sites_to_test) &
         df$chunk_end >= max(sites_to_test)), ]

In [None]:
df <- df[which(df$Chr == "1")]

In [None]:
df

In [None]:
df <- df[1, ]

In [None]:
df_this_chr <- df

In [None]:
summary_stats_list

In [None]:
summary_stats_data <- lapply(summary_stats_data, function(stats) stats[`CHR` == 1])

print("Loaded SNP data")
print("Files for this Chr:")
print(nrow(df_this_chr))
for(j in 1:nrow(df_this_chr)){
  print(paste0("File number: ", j))
  if (grepl("empty", df_this_chr$path[j])) {
    message(paste0("no model for ", df_this_chr$path[j]))
    next
  }
  #
  my_rds <- tryCatch({
    readRDS(df_this_chr$path[j])
  }, error = function(e) {
    # Print an error message and skip this iteration
    message("ALERT!!! Error reading RDS file: ", e$message)
    return(NULL)  # Return NULL to signal failure
  })
    
  # Check if the readRDS call returned NULL (which indicates an error)
  if (is.null(my_rds)) {
    stop("oops")
    next  # Skip the rest of this loop iteration
  }

  print(paste("Loaded RDS file:", df_this_chr$path[j]))

  for (k in 1:length(summary_stats_list)) {
    print(paste0("k is ", k))
    outname <- "20-OUT_MWAS_debug_stage2_results.rds"
    #if(file.exists(outname)) next
    summary_stats <- summary_stats_data[[k]]
    print("head of summary stats before clean:")
    print(head(summary_stats))

    MWASmodels <- vector("list", length(my_rds@models))
    if (is.null(summary_stats)) {
      summary_stats <- suppressWarnings(fread(summary_stats_list[[k]]))
      summary_stats <- clean_and_standardize_colnames(summary_stats)
    }
    print("head of summary stats after clean:")
    print(head(summary_stats))
      
    for (i in seq_along(my_rds@models)) {
      #print(i)
      this_MethylationBase <- my_rds@models[[i]]
      SNP_split <- stringr::str_split_fixed(names(this_MethylationBase@snpWeights), ":", 4)
      SNP_split[, 1] <- gsub("chr", "", SNP_split[, 1])
      SNP_split_dt <- data.table::as.data.table(SNP_split)
      data.table::setnames(SNP_split_dt, c("chr", "post", "ref", "alt"))
      SNP_split_dt[, `:=`(chr = as.integer(chr), post = as.integer(post))]
      data.table::setkey(SNP_split_dt, chr, post)

      relevant_SNP_indices <- my_SNPs$pvar_dt[SNP_split_dt, on = .(`#CHROM` = chr, POS = post), which = TRUE, nomatch = 0]
      relevant_ids <- my_SNPs$pvar_dt$ID[relevant_SNP_indices]
      summary_stats_sub <- summary_stats[relevant_ids, nomatch = 0]

      if (!identical(summary_stats_sub$BP, SNP_split_dt$post)) {
        summary_stats_sub <- summary_stats_sub[order(summary_stats_sub$BP), ]
        if (!identical(summary_stats_sub$BP, SNP_split_dt$post)) {
          unmatched_positions <- !SNP_split_dt$post %in% summary_stats_sub$BP
          if (any(unmatched_positions)) {
            SNP_split_dt <- SNP_split_dt[!unmatched_positions, ]
            this_MethylationBase@snpWeights <- this_MethylationBase@snpWeights[!unmatched_positions]

            relevant_SNP_indices <- my_SNPs$pvar_dt[SNP_split_dt, on = .(`#CHROM` = chr, POS = post), which = TRUE, nomatch = 0]
            if (!identical(summary_stats_sub$BP, SNP_split_dt$post)) {
              stop("SNP order does not match even after removing unmatched positions. This should not happen. Code is broken.")
            }
          }
        }
      }

      if (!identical(SNP_split_dt$alt, summary_stats_sub$A2) | !identical(SNP_split_dt$ref, summary_stats_sub$A1)) {
        not_matching <- which(SNP_split_dt$alt != summary_stats_sub$A2)
        summary_stats_ref_flipped <- SNP_split_dt$ref[not_matching]
        summary_stats_alt_flipped <- SNP_split_dt$alt[not_matching]
        SNP_split_dt[not_matching, `:=`(ref = summary_stats_alt_flipped, alt = summary_stats_ref_flipped)]
        this_MethylationBase@snpWeights[not_matching] <- this_MethylationBase@snpWeights[not_matching] * -1
      }

      G <- pgenlibr::ReadList(my_SNPs$pgen, variant_subset = relevant_SNP_indices)
      #print(paste("Performing MWAS for model index:", i))
      mwas_out <- mwas(z = summary_stats_sub$BETA, w = this_MethylationBase@snpWeights, G = G)

      MWASmodels[[i]] <- mwas_out
    }

    results <- MWASresults(MWASmodels, paths$pvar_path, paths$pgen_path, paths$psam_path, summary_stats_list[[k]], df_this_chr$path[j])
    saveRDS(results, outname)
    print(paste("Saved results to:", outname))
  }
}

### Compare results

In [None]:
my_results <- readRDS("20-OUT_MWAS_debug_stage2_results.rds")

In [None]:
names(attributes(my_results))

In [None]:
length(my_results@MWASmodels)

In [None]:
my_results@MWASmodels[[4]]

In [None]:
trait <- "scz"
df$stage2_paths <- "20-OUT_MWAS_debug_stage2_results.rds"
#df$final_paths <- vector("list", length(df$stage2_paths))

In [None]:
vector("list", length(df$stage2_paths))

In [None]:
message("Processing trait: ", trait)
#df$final_paths <- paste0(df$stage2_paths, trait, "_results.rds")
output_file <- "20-OUT_MWAS_debug_stage2_results.csv"
header_written <- FALSE

In [None]:
i <- 1

In [None]:
df_this_chr

In [None]:
stage2_in <- my_results
stage1_in <- readRDS(df$path[i])

In [None]:
if (length(stage1_in@models) != length(stage2_in@MWASmodels)) {
    stop("Files don't match")
}

data_list <- vector("list", length(stage1_in@models))
for (j in seq_along(stage1_in@models)) {
    model1 <- stage1_in@models[[j]]
    model2 <- stage2_in@MWASmodels[[j]]

    data_list[[j]] <- data.table(
        z = model2["z"],
        p = model2["p"],
        n = model2["n"],
        pos = model1@methylationPosition,
        stats = stage2_in@summary_stats_path,
        scaff = stage1_in@scaffoldIdentifier
    )
}

combined_data <- rbindlist(data_list, use.names = TRUE, fill = TRUE)

# Write data incrementally
if (!header_written) {
    fwrite(combined_data, output_file)
    header_written <- TRUE
} else {
    fwrite(combined_data, output_file, append = TRUE)
}

In [None]:
combined_data[which(combined_data$pos %in% rownames(mwas.all)), ]

In [None]:
mwas.all

## Compare with results from CpGWAS

In [None]:
results <- fread("16a9par-OUT_stage2_MWAS_scz.csv")

In [None]:
smallerdata <- results[which(results$p < 10^-200), ]

In [None]:
dim(smallerdata)

In [None]:
dim(smallerdata)

In [None]:
table(smallerdata$population, smallerdata$region)

In [None]:
smallerdata[which(smallerdata$population == "all"), ]

In [None]:
smallerdata_matching_pos <- smallerdata[which(smallerdata$chr == 1 & smallerdata$pos >= 73418062 & smallerdata$pos <= 73418313), ]

In [None]:
smallerdata_matching_pos

In [None]:
smallerdata_matching_pos <- smallerdata[which(smallerdata$population == "all" & smallerdata$region == "caud"), ]

In [None]:
smallerdata_matching_pos

In [None]:
dim(results[which(results$p < 10^-200), ])

In [None]:
results_subset <- results[which(results$chr == 1 & results$pos >= 73418062 & results$pos <= 73418313), ]

In [None]:
results_subset

In [None]:
results_subset <- results_subset[which(results_subset$population == "all" & results_subset$region == "caud"), ]

head(results)