# Compare MWAS methods for sanity tests and troubleshooting

In this notebooks, we have my code and Shizhong's side by side, and compare every variable so we can find out exactly where the discrepancy originates.

# Pick the regions we will test

In [3]:
library(data.table)

# prev_hits <- fread("
# Chr     pos        old_z       old_p
# 11   38247902        -27.1535308285104       2.30024742330298e-162
# 2    47933357        18.3327793004811        4.53147699327216e-75
# 7    1987910         10.1255085321387        4.25759742738181e-24
# 7    1987896         10.1255085321387        4.25759742738189e-24
# 7    1987797         10.0241523897721        1.19379483108027e-23
# 7    1987778         10.0105851568856        1.36940148731912e-23
# 12   2194742         -10.0072634920486       1.41615523554955e-23
# ")


In [4]:
prev_hits <- fread("
Chr     pos
1    73274305
1    73418161
1    73418205
1    73418313
1    73419188
1    73419830
")

In [5]:
#df <- fread("09.5-OUT_matched_SNP_meth_cov_chunked_EXPANSE_a2.csv")

In [6]:
df <- fread("09-OUT_matched_SNP_meth_cov_a2.csv")

## Try original code

In [7]:
###### model: learn elastic net model on training data 
######---------Input: trainX, trainY
######---------Return: selected features and coefficents

# original
elastic.net <- function(trainX,trainY){
    if(nrow(trainX)!=length(trainY)){
            stop("Number of observations is differerent")
    } 

    # optimize alpha---mixing parameter  
    a <- 0.5
    search <- foreach(ai = a, .combine = rbind) %dopar% {
        cv.fit <- cv.glmnet(
                        trainX,
                        trainY,
                        nfold = 5,
                        type.measure = "mse",
                        paralle = TRUE,
                        alpha = ai
                        )
        data.frame(
                        cvm = min(cv.fit$cvm),
                        lambda = cv.fit$lambda.min,
                        alpha = ai
                        )
        } 
    cv.opt <- search[search$cvm == min(search$cvm),] 

        # fit model by optimized alpha and lambda
        yfit = glmnet(
        trainX,
        trainY,
        lambda = cv.opt$lambda,
        alpha = cv.opt$alpha
                )       
        idf <- coef(yfit)
        idx <- which(idf != 0)
        selectf <- data.frame(
                features = idf@Dimnames[[1]][idx], 
                coefs = idf [idx]
        )
}

# modified to use lambda 1se and appropriate cvm
elastic.net <- function(trainX,trainY){
    if(nrow(trainX)!=length(trainY)){
            stop("Number of observations is differerent")
    } 

    # optimize alpha---mixing parameter  
    a <- 0.5
    search <- foreach(ai = a, .combine = rbind) %dopar% {
        set.seed(42)
        cv.fit <- cv.glmnet(
                        trainX,
                        trainY,
                        nfold = 5,
                        type.measure = "mse",
                        paralle = TRUE,
                        alpha = ai
                        )
        print(paste0("Dim of trainX: ", dim(trainX)))
        print(paste0("Len of trainY: ", length(trainY)))
        coef_matrix <- as.matrix(coef(cv.fit))

        non_zero_coefs <- coef_matrix[coef_matrix != 0, , drop = FALSE]
        print("Coefficients when fitting: ")
        print(non_zero_coefs)

        data.frame(
                        cvm = cv.fit$cvm[cv.fit$lambda == cv.fit$lambda.1se],
                        lambda = cv.fit$lambda.1se,
                        alpha = ai
                        )
        } 
    cv.opt <- search[search$cvm == min(search$cvm),] 

        # fit model by optimized alpha and lambda
    set.seed(42)
    yfit <- glmnet(
    trainX,
    trainY,
    lambda = cv.opt$lambda,
    alpha = cv.opt$alpha)

    coef_matrix <- as.matrix(coef(yfit))

    non_zero_coefs <- coef_matrix[coef_matrix != 0, , drop = FALSE]
    print("Coefficients when optimal: ")
    print(non_zero_coefs)

    idf <- coef(yfit)
    idx <- which(idf != 0)
    selectf <- data.frame(
            features = idf@Dimnames[[1]][idx], 
            coefs = idf [idx]
    )
}

MWAS <- function(gwas, weight, geno){
        z <- gwas %*% weight
        z.cor <- cor(geno)
        se <- sqrt(weight %*%  z.cor %*%  weight)
        z <- z/se
        p=pnorm(abs(z),lower.tail=F)*2
        return(c(z, p))
}

In [8]:
df <- df[which(df$Chr == 1), ]

In [9]:
i <- 2

In [10]:
library("glmnet")
library("e1071")
library("doParallel")

set.seed(2018)
wind <- c(5000,10000)
# output directory
#outd <- "/dcl02/lieber/shan/shizhong/finemapping/GWAS/tags/scz3/mwas/chr22/1/"
outd <- "20-OUT_original_mwas_sanity_test/"

Loading required package: Matrix

Loaded glmnet 4.1-8

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



## Replace all old objects with new objects in same format

In [9]:
#load("p1.rda", verbose = TRUE)

In [10]:
#p[1:10, 1:10]

### Methylation data

#### Previous version

In [11]:
suppressWarnings(library(bsseq))

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:Matrix’:

    expand, unname


The following objects are masked from ‘package:data.table’:

    first, second


The following object is masked from ‘package:utils’:

    findMatches


The

In [12]:
# load data for mwas
# load("./rda/caudate_mwas_data_chr22.rda")
load(df$methylation_data[i])

p <- getMeth(BSobj2)


rownames(p) <- start(BSobj2)

sites_to_test <- which(start(BSobj2) >= (73418205 - 500) & start(BSobj2) <= (73418205 + 500))
p <- p[sites_to_test, ]

In [13]:
p

<5 x 297> DelayedMatrix object of type "double":
              [,1]      [,2]      [,3] ...    [,296]    [,297]
73418062 0.8984465 0.8777756 0.8431057   . 0.8812308 0.9056437
73418161 0.8961731 0.8746082 0.8396626   . 0.8765445 0.9024178
73418186 0.8956018 0.8738120 0.8388076   . 0.8753571 0.9015994
73418205 0.8951686 0.8732083 0.8381624   . 0.8744542 0.9009768
73418313 0.8927296 0.8698082 0.8345782   . 0.8693295 0.8974361

In [14]:
# candidate cg
his_cg <- as.numeric(rownames(p))

# regress out covariates
#load("covs_for_meqtl.rda")

In [15]:
sites_to_test

#### cpgwas (also loading SNPs and covs as `methInput`)

In [16]:
library(CpGWAS)

In [17]:
#load(df$methylation_data[i])
methInput <- new("MethylationInput",
               BSseq_obj = BSobj2,
               snp_data_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr1.pgen",
               cov_path = df$cov_file[i],
               start_site = min(sites_to_test),
               end_site = max(sites_to_test),
               no_cores = 120)

“Row names not found in methylation matrix of BSseq object. Retrieving from $colData$brnum.”


In [18]:
names(attributes(methInput))

In [19]:
dim(methInput@methylations)

In [20]:
methInput@methylations[1:5, 1:5]

Unnamed: 0,pos_73418062,pos_73418161,pos_73418186,pos_73418205,pos_73418313
Br1003,0.01990706,0.02246961,0.02312395,0.02362284,0.02647802
Br1004,-0.01983603,-0.01966969,-0.01961677,-0.01957356,-0.01927871
Br1007,0.02560136,0.02750728,0.02799101,0.02835905,0.03045297
Br1016,-0.01924838,-0.01925891,-0.0192511,-0.01924228,-0.01914366
Br1017,-0.03593285,-0.03794162,-0.03844075,-0.03881746,-0.04090786


In [21]:
my_cg <- as.numeric(gsub("pos_", "", colnames(methInput@methylations)))

In [22]:
all(my_cg == his_cg)

In [23]:
cg <- my_cg

### covariates (previous code)

In [24]:
covs <- fread(df$cov_file[i])
covs <- t(covs)
colnames(covs) <- covs[1, ]
covs <- covs[2:nrow(covs), ]
# transpose so we have same orientation as original code

### Regress methylation data over covariates

In [25]:
BSobj2$brnum <- gsub("Br0", "Br", BSobj2$brnum)
colnames(covs) <- gsub("Br0", "Br", colnames(covs))

In [26]:
mat <- match(BSobj2$brnum,colnames(covs)) 
covs <- t(covs[,mat])
p.residual=matrix(NA,dim(p)[1],dim(p)[2])

In [27]:
rownames(covs)[is.na(covs[, 'genoPC1'])] <- BSobj2$brnum[is.na(covs[, 'genoPC1'])]

In [28]:
colnames(p.residual) <- BSobj2$brnum

In [29]:
covs <- as.data.frame(covs)
# Convert all columns except Dx and Sex from character to numeric
cols_to_convert <- setdiff(names(covs), c("Dx", "Sex"))

for (col in cols_to_convert) {
  covs[[col]] <- as.numeric(covs[[col]])
}

# Print the modified data frame to check the conversion
#print(dat)


In [30]:
for (i in 1:dim(p)[1]) { # For each methylation site
    dat <- as.data.frame(cbind(y = p[i,], covs))
    
    # Check for rows with NAs (the ones for which we don't have covariate data)
    valid_rows <- complete.cases(dat)
    
    if (sum(valid_rows) > 0) {
        dat_valid <- dat[valid_rows,]
        model.res <- lm(y ~ ., data = dat_valid)
        
        # Store residuals in the corresponding positions
        p.residual[i, valid_rows] <- resid(model.res)
    }
}


# for(i in 1:dim(p)[1]){ # foro each methylation site
#         dat <- as.data.frame(cbind(p[i,],covs))
#         colnames(dat) <- c("y",paste0("x",1:ncol(covs)))
#         model.res <- lm(reformulate(paste0("x",1:ncol(covs)), "y"),dat)
#         p.residual[i,] = resid(model.res) 
# }

Check if result is same for me and old code

First we need to make sure columns are in same order

In [31]:
p.residual_tocompare <- p.residual

In [32]:
rownames(p.residual_tocompare) <- paste0("pos_", cg)

In [33]:
p.residual_tocompare <- t(p.residual_tocompare)

In [34]:
dim(p.residual_tocompare)

In [35]:
dim(methInput@methylations)

In [36]:
p.residual_tocompare <- p.residual_tocompare[order(rownames(p.residual_tocompare)), ]

In [37]:
is.data.frame(p.residual_tocompare)

In [38]:
is.data.frame(methInput@methylations)

In [39]:
names(attributes(p.residual_tocompare))

In [40]:
class(p.residual_tocompare)

In [41]:
class(methInput@methylations)

In [42]:
compare_matrices <- function(mat1, mat2) {
  diffs <- mat1 != mat2
  indices <- which(diffs, arr.ind = TRUE)
  rownames <- rownames(mat1)[indices[, 1]]
  colnames <- colnames(mat1)[indices[, 2]]
  differences <- mat1[indices] - mat2[indices]
  
  results <- data.table(
    row_index = indices[, 1],
    col_index = indices[, 2],
    rowname = rownames,
    colname = colnames,
    difference = differences
  )
  
  overall_mean_difference <- if (nrow(results) > 0) mean(abs(differences)) else 0
  percentage_differences <- (nrow(results) / (nrow(mat1) * ncol(mat1))) * 100
  largest_difference <- if (nrow(results) > 0) max(abs(differences)) else 0
  
  cat("Total differences:", nrow(results), "\n")
  cat("Percentage of values with differences:", percentage_differences, "%\n")
  cat("Largest difference:", largest_difference, "\n")
  cat("Overall mean difference:", overall_mean_difference, "\n\n")
  
  if (nrow(results) > 0) {
    cat("Differences found:\n")
    for (i in 1:nrow(results)) {
      cat("Row index:", results$row_index[i], 
          "Row name:", results$rowname[i], 
          "Column index:", results$col_index[i], 
          "Column name:", results$colname[i], 
          "Difference:", results$difference[i], "\n")
    }
  }
  
  # Round matrices to 3 decimal places and check again
  mat1_rounded <- round(mat1, 3)
  mat2_rounded <- round(mat2, 3)
  diffs_rounded <- mat1_rounded != mat2_rounded
  indices_rounded <- which(diffs_rounded, arr.ind = TRUE)
  rownames_rounded <- rownames(mat1_rounded)[indices_rounded[, 1]]
  colnames_rounded <- colnames(mat1_rounded)[indices_rounded[, 2]]
  differences_rounded <- mat1_rounded[indices_rounded] - mat2_rounded[indices_rounded]
  
  results_rounded <- data.table(
    row_index = indices_rounded[, 1],
    col_index = indices_rounded[, 2],
    rowname = rownames_rounded,
    colname = colnames_rounded,
    difference = differences_rounded
  )
  
  overall_mean_difference_rounded <- if (nrow(results_rounded) > 0) mean(abs(differences_rounded)) else 0
  percentage_differences_rounded <- (nrow(results_rounded) / (nrow(mat1_rounded) * ncol(mat1_rounded))) * 100
  largest_difference_rounded <- if (nrow(results_rounded) > 0) max(abs(differences_rounded)) else 0
  
  cat("\nAfter rounding to 3 decimal places:\n")
  cat("Total differences:", nrow(results_rounded), "\n")
  cat("Percentage of values with differences:", percentage_differences_rounded, "%\n")
  cat("Largest difference:", largest_difference_rounded, "\n")
  cat("Overall mean difference:", overall_mean_difference_rounded, "\n")
  
  if (nrow(results_rounded) > 0) {
    cat("Differences found:\n")
    for (i in 1:nrow(results_rounded)) {
      cat("Row index:", results_rounded$row_index[i], 
          "Row name:", results_rounded$rowname[i], 
          "Column index:", results_rounded$col_index[i], 
          "Column name:", results_rounded$colname[i], 
          "Difference:", results_rounded$difference[i], "\n")
    }
  }
}

#compare_matrices(methInput@methylations, p.residual_tocompare)


Let's round them down so they're identical

In [43]:
p.residual_tocompare <- round(p.residual_tocompare, 5)
methInput@methylations <- round(methInput@methylations, 5)

In [44]:
p.residual_tocompare[1:5, 1:5]

Unnamed: 0,pos_73418062,pos_73418161,pos_73418186,pos_73418205,pos_73418313
Br1003,0.01991,0.02247,0.02312,0.02362,0.02648
Br1004,-0.01984,-0.01967,-0.01962,-0.01957,-0.01928
Br1007,0.0256,0.02751,0.02799,0.02836,0.03045
Br1016,-0.01925,-0.01926,-0.01925,-0.01924,-0.01914
Br1017,-0.03593,-0.03794,-0.03844,-0.03882,-0.04091


In [45]:
methInput@methylations[1:5, 1:5]

Unnamed: 0,pos_73418062,pos_73418161,pos_73418186,pos_73418205,pos_73418313
Br1003,0.01991,0.02247,0.02312,0.02362,0.02648
Br1004,-0.01984,-0.01967,-0.01962,-0.01957,-0.01928
Br1007,0.0256,0.02751,0.02799,0.02836,0.03045
Br1016,-0.01925,-0.01926,-0.01925,-0.01924,-0.01914
Br1017,-0.03593,-0.03794,-0.03844,-0.03882,-0.04091


In [46]:
p.residual <- round(p.residual, 5)

In [47]:
snp.gwas2 <- NULL

In [48]:
load("p1.rda", verbose = TRUE)

Loading objects:
  snp.gwas2
  snp.1kg.eur2
  map.1kg.eur2
  snp2
  map2
  p
  BSsample


In [49]:
# min(snp.gwas2$pos_hg38)
# max(snp.gwas2$pos_hg38)

In [50]:
# load("p1.rda")
# pos_we_got <- snp.gwas2$pos_hg38
# saveRDS(pos_we_got, "20-intermediate_positions_in_old_set.csv")

In [51]:
#pos_we_got <- readRDS("20-intermediate_positions_in_old_set.csv")

In [52]:
#pos_we_got

In [53]:
# head(snp.gwas2)

### summary stats

In [54]:
library(data.table)
library(CpGWAS)

In [55]:
ss_path <- "/home/naglemi/mwas/gwas/gwas_stat_scz"

In [56]:
snp.gwas2 <- fread(ss_path, skip = 1, header = FALSE)
colnames(snp.gwas2) <- strsplit(readLines(ss_path, n = 1), "\t")[[1]]

In [57]:
snp.gwas2$z <- log(snp.gwas2$OR)/snp.gwas2$SE

In [58]:
snp.gwas2 <- snp.gwas2[, c(2, 1, 3, 3, 8, 4, 5, 20, 11)]

In [59]:
head(snp.gwas2, n = 1)

SNP,CHR,BP,BP,INFO,A1,A2,z,P
<chr>,<int>,<int>,<int>.1,<dbl>,<chr>,<chr>,<dbl>,<dbl>
rs62513865,8,100579985,100579985,0.963,C,T,0.7016221,0.4847


In [60]:
colnames(snp.gwas2)[1:5] <- c("snp", "chr", "pos_hg38", "pos_hg38", "info")

In [61]:
snp.gwas2 <- snp.gwas2[which(snp.gwas2$chr == 1 & snp.gwas2$pos_hg38 >= (73274305-10000) & snp.gwas2$pos_hg38 <= (73419830 + 10000)), ]

In [62]:
dim(snp.gwas2)

In [63]:
snp.gwas2 <- snp.gwas2[order(snp.gwas2$pos_hg38), ]

In [64]:
head(snp.gwas2)

snp,chr,pos_hg38,pos_hg38,info,A1,A2,z,P
<chr>,<int>,<int>,<int>.1,<dbl>,<chr>,<chr>,<dbl>,<dbl>
rs6672818,1,73265462,73265462,0.991,C,T,-7.24430537,5.189e-13
rs72676673,1,73267315,73267315,0.971,A,G,0.04124304,0.9672
rs61765637,1,73269720,73269720,0.993,G,C,7.25603083,4.623e-13
rs4571923,1,73270879,73270879,0.994,G,A,-7.41892929,1.37e-13
rs12759031,1,73271206,73271206,0.992,C,T,-7.22079453,5.844e-13
rs10890025,1,73272480,73272480,0.994,A,G,-7.25544375,4.328e-13


In [65]:
#snp.gwas2 <- snp.gwas2[which(snp.gwas2$pos_hg38 %in% pos_we_got), ]

In [66]:
dim(snp.gwas2)

In [67]:
# built predition models
idx.ea <- BSobj2$race == "CAUC"

### SNPs in LIBD population

#### For reference, first load Shizhong's formatted SNPs on Chr7

In [68]:
snp2_sorted <- snp2[, order(names(snp2))]

In [69]:
colnames(snp2) <- gsub("Br0", "Br", colnames(snp2))

In [70]:
snp2 <- snp2[, colnames(snp2) %in% colnames(p.residual)]

In [71]:
dim(snp2)

In [72]:
head(snp2)

Unnamed: 0_level_0,Br836,Br845,Br848,Br863,Br914,Br948,Br949,Br963,Br983,Br991,⋯,Br5373,Br5398,Br5422,Br5426,Br5460,Br5467,Br5475,Br5488,Br5584,Br5590
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr7:1963098:T:C,2.0,2,2.0,2,2.0,1.999,2.0,1.974,1.0,2,⋯,1.954,2,2,2,2,2,2,1,1,1
chr7:1963408:C:T,0.001,2,1.991,0,1.969,0.006,0.004,0.14,2.0,0,⋯,2.0,0,1,2,2,1,1,2,2,2
chr7:1964786:C:T,2.0,2,1.0,2,2.0,2.0,2.0,2.0,1.0,2,⋯,0.001,2,2,0,2,2,2,1,1,2
chr7:1966112:T:C,2.0,2,1.001,2,2.0,2.0,2.0,2.0,1.002,2,⋯,0.015,2,2,0,2,2,2,1,1,2
chr7:1973362:G:A,2.0,2,1.095,2,2.0,2.0,0.004,2.0,1.002,2,⋯,0.015,2,2,0,2,1,2,1,1,2
chr7:1975412:T:G,2.0,2,2.0,2,2.0,2.0,2.0,2.0,1.0,2,⋯,2.0,2,2,2,2,2,2,1,1,1


In [73]:
snp2_positions <- stringr::str_split_fixed(rownames(snp2), ":", 3)[, 2]

#### Now let's load ours on Chr1

In [83]:
paths <- list(pvar_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr1.pvar",
              pgen_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr1.pgen",
              psam_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr1.psam")

my_SNPs <- loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

In [75]:
snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= 73274305-10000 & my_SNPs$pvar_dt$POS <= 73419830 + 10000)

In [76]:
snp3 <- pgenlibr::ReadList(my_SNPs$pgen,
                        variant_subset = snp_indices_of_interest)
colnames(snp3) <- my_SNPs$pvar_dt$ID[snp_indices_of_interest]
rownames(snp3) <- my_SNPs$psam$`#IID`

In [77]:
snp3[1:10, 1:10]

Unnamed: 0,chr1:73265462:C:T,chr1:73267315:A:G,chr1:73269720:G:C,chr1:73270879:G:A,chr1:73271206:C:T,chr1:73272480:A:G,chr1:73273958:G:C,chr1:73276935:T:G,chr1:73277452:A:G,chr1:73278190:A:G
Br1602,0.0,0,0.9959717,0,0.0,0,0.0,0.0,0,0.9940186
Br1203,2.0,0,0.0,2,2.0,2,2.0,2.0,0,0.0
Br1214,2.0,0,0.0,2,2.0,2,2.0,2.0,0,0.0
Br2149,0.9970093,0,0.9959717,1,0.9970093,1,0.9979858,0.9979858,0,0.9940186
Br1016,0.0,0,1.9520264,0,0.0,0,0.0,0.0,0,1.9910278
Br1580,1.0,0,1.0,1,1.0,1,1.0,1.0,0,1.0
Br1646,2.0,0,0.0,2,2.0,2,2.0,2.0,0,0.0
Br1823,0.0,0,2.0,0,0.0,0,0.0,0.0,0,2.0
Br1696,2.0,0,0.0,2,2.0,2,2.0,2.0,0,0.0
Br1513,1.0,0,1.0,1,1.0,1,1.0,1.0,0,1.0


In [78]:
map3 <- data.frame(POS = stringr::str_split_fixed(colnames(snp3), ":", 3)[, 2])

In [79]:
snp3 <- t(snp3)

In [80]:
# snp3 <- snp3[which(map3$POS %in% pos_we_got), ]

In [81]:
map3 <- data.frame(POS = stringr::str_split_fixed(rownames(snp3), ":", 3)[, 2])

In [82]:
dim(map3)

In [86]:
head(map3)

Unnamed: 0_level_0,POS
Unnamed: 0_level_1,<chr>
1,73265462
2,73267315
3,73269720
4,73270879
5,73271206
6,73272480


In [87]:
dim(map3)

#### Make sure we're working with same SNPs in cpgwas (and check map3 object)

In [96]:
dim(methInput@pvar_dt[which(methInput@pvar_dt$POS >= 73274305-10000 & methInput@pvar_dt$POS <= 73419830 + 10000 &
                        methInput@pvar_dt$`#CHROM` == 1), ])

In [99]:
my_map3 <- data.frame(POS = methInput@pvar_dt$POS[which(methInput@pvar_dt$POS >= 73274305-10000 & methInput@pvar_dt$POS <= 73419830 + 10000 &
                        methInput@pvar_dt$`#CHROM` == 1)])

In [101]:
all(map3 == my_map3)

### SNPs in reference population

In [102]:
#snp.1kg.eur2

In [None]:
paths <- list(pvar_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.pvar",
              pgen_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.pgen",
              psam_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.psam")

my_SNPs <- loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= 73274305-10000 & my_SNPs$pvar_dt$POS <= 73419830 + 10000)

In [105]:
snp.1kg.eur2 <- pgenlibr::ReadList(my_SNPs$pgen,
                        variant_subset = snp_indices_of_interest)
colnames(snp.1kg.eur2) <- my_SNPs$pvar_dt$ID[snp_indices_of_interest]
rownames(snp.1kg.eur2) <- my_SNPs$psam$`IID`

In [106]:
dim(snp.1kg.eur2)

In [107]:
map.1kg.eur2 <- my_SNPs$pvar_dt

In [108]:
map.1kg.eur2 <- map.1kg.eur2[snp_indices_of_interest, ]

In [109]:
dim(map.1kg.eur2)

In [110]:
#map.1kg.eur2 <- data.frame(POS = stringr::str_split_fixed(colnames(snp.1kg.eur2), ":", 3)[, 2])

In [111]:
snp.1kg.eur2 <- t(snp.1kg.eur2)

In [112]:
# snp3 <- snp3[which(map3$POS %in% pos_we_got), ]

### Set window size and any other parameters

In [113]:
wind <- 10000

Is 1se vs min for lambda the problem?

## Stage 1

#### Shizhong's version

In [115]:
#head(map3)

In [116]:
#dim(map3)

In [117]:
#dim(snp3)

In [118]:
p.residual <- p.residual[, order(colnames(p.residual))]

In [119]:
#dim(snp3)

In [120]:
#p.residual

In [121]:
#snp3[1:10, 1:10]

In [122]:
snp3 <- snp3[, colnames(snp3) %in% colnames(p.residual)]

In [123]:
snp3 <- snp3[, order(colnames(snp3))]

In [124]:
#dim(snp3)

In [125]:
for(k in 1:length(wind)){
    models.ea <- c()
    models.all <- c()
    for(i in 1:length(cg)){
    #for(i in 1){
            cat(i,"\n")
            print(paste0("This cg is: ", cg[i]))
            range1 <- ifelse(cg[i] - wind[k] > 0,cg[i] - wind[k],0)
            range2 <- cg[i] + wind[k]
            idx <- map3$POS > range1 & map3$POS < range2
            # go to next cg if no snps within window
            if(sum(idx) <= 1){
                    next
            }
            geno <- snp3[idx,] # changed snp2 to snp3
            rownames(geno) <- map3$POS[idx]
            trainX <- t(geno)
            trainY <- p.residual[i,]
            fit <- elastic.net(trainX,trainY)
            fit <- tryCatch(
                    elastic.net(trainX,trainY),
                    error = function(e) {return ("err")})
            if(!is.data.frame(fit)){
                if(fit == "err"){
                    next
                }
            }
            if(nrow(fit) == 0) next

            fit$cg <- cg[i]
            models.all <- rbind(models.all,fit)
            # EA only
            trainX <- trainX[idx.ea,]
            if(sum(apply(trainX,2,var)!=0) <= 1){
                    next
            }
            trainY <- trainY[idx.ea]
            fit <- tryCatch(
                    elastic.net(trainX,trainY),
                    error = function(e) {return ("err")})
            if(!is.data.frame(fit)){
                if(fit == "err"){
                    next
                }
            }
            if(nrow(fit) == 0) next
            fit$cg <- cg[i]
            models.ea <- rbind(models.ea,fit)
    }
}

1 
[1] "This cg is: 73418062"


“executing %dopar% sequentially: no parallel backend registered”


[1] "Dim of trainX: 297" "Dim of trainX: 43" 
[1] "Len of trainY: 297"
[1] "Coefficients when fitting: "
                       s1
(Intercept)  2.383551e-02
73408072    -7.460869e-03
73416795    -4.897557e-03
73417197    -8.083936e-05
73419155    -4.908524e-03
73424910    -2.370414e-04
73426204    -9.676025e-04
73426737    -9.617348e-04
73426896    -9.460529e-04
73426930    -9.352965e-04
73427141    -8.828308e-04
[1] "Coefficients when optimal: "
                       s0
(Intercept)  0.0238309592
73408072    -0.0074849040
73416795    -0.0048863698
73417197    -0.0000624824
73419155    -0.0049141384
73424910    -0.0002555151
73426204    -0.0009385057
73426737    -0.0008991830
73426896    -0.0009113627
73426930    -0.0009672789
73427141    -0.0009502828
[1] "Dim of trainX: 297" "Dim of trainX: 43" 
[1] "Len of trainY: 297"
[1] "Coefficients when fitting: "
                       s1
(Intercept)  2.383551e-02
73408072    -7.460869e-03
73416795    -4.897557e-03
73417197    -8.083936e-05
73

In [126]:
models.ea <- models.ea[models.ea[,1] != "(Intercept)",]
models.all <- models.all[models.all[,1] != "(Intercept)",]

In [128]:
models.ea

Unnamed: 0_level_0,features,coefs,cg
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
2,73408072,-5.539013e-03,73418062
3,73411882,6.031568e-04,73418062
4,73416795,-5.473329e-03,73418062
5,73419155,-5.539376e-03,73418062
6,73425398,-4.337124e-05,73418062
7,73426204,-9.944934e-04,73418062
8,73426737,-9.286057e-04,73418062
9,73426896,-8.821380e-04,73418062
10,73426930,-9.025849e-04,73418062
11,73427141,-8.904469e-04,73418062


In [127]:
models.all

Unnamed: 0_level_0,features,coefs,cg
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
2,73408072,-0.007484904,73418062
3,73416795,-0.00488637,73418062
4,73417197,-6.24824e-05,73418062
5,73419155,-0.004914138,73418062
6,73424910,-0.0002555151,73418062
7,73426204,-0.0009385057,73418062
8,73426737,-0.000899183,73418062
9,73426896,-0.0009113627,73418062
10,73426930,-0.0009672789,73418062
11,73427141,-0.0009502828,73418062


#### My version

In [175]:
scaffoldIdentifier <- "debugging_test_071624"

scaffold_models <- fit_MWAS_models(
  BSobj = BSobj2,
  methInput = methInput,
  window_sizes = c(10000),
  chunk1 = 1,
  chunk2 = length(methInput@methylations_positions),
  n_fold = 5,
  scaffoldIdentifier = scaffoldIdentifier,
  outdir = "20-OUT_debugging_test",
  verbose = FALSE,
  lambda_choice = "1se",
  alphas = 0.5,
  cores_per_alpha = "all",
  num_cores = 120,
  allow_inefficient_parallelization = FALSE,
  save_evaluation_results_each_fold = FALSE,
  save_glmnet_object = FALSE,
  cv_eval_mode = "dynamic",
  omit_folds_with_na_r = TRUE,
  maf = 0,
  na.action = "remove"
)

MethylationScaff object saved to 20-OUT_debugging_test/debugging_test_071624.rds



In [176]:
methScaff <- readRDS("20-OUT_debugging_test/debugging_test_071624.rds")

In [177]:
library(data.table)

my_models.all <- rbindlist(lapply(methScaff@models, function(model) {
  data.table(
    features = sub("chr1:", "", names(model@snpWeights)),
    coefs = as.numeric(model@snpWeights),
    cg = rep(model@methylationPosition, length(model@snpWeights))
  )
}), use.names = TRUE, fill = TRUE)

my_models.all$features <- stringr::str_split_fixed(my_models.all$features, ":", 3)[,1]
setorder(my_models.all, cg)
head(my_models.all)


features,coefs,cg
<chr>,<dbl>,<int>
73408072,-0.007484904,73418062
73416795,-0.0048863698,73418062
73417197,-6.24824e-05,73418062
73419155,-0.0049141384,73418062
73424910,-0.0002555151,73418062
73426204,-0.0009385057,73418062


In [178]:
dim(my_models.all)

In [179]:
dim(models.all)

In [180]:
dim(na.omit(models.all))

In [181]:
head(my_models.all)

features,coefs,cg
<chr>,<dbl>,<int>
73408072,-0.007484904,73418062
73416795,-0.0048863698,73418062
73417197,-6.24824e-05,73418062
73419155,-0.0049141384,73418062
73424910,-0.0002555151,73418062
73426204,-0.0009385057,73418062


In [182]:
head(models.all)

Unnamed: 0_level_0,features,coefs,cg
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
2,73408072,-0.007484904,73418062
3,73416795,-0.0048863698,73418062
4,73417197,-6.24824e-05,73418062
5,73419155,-0.0049141384,73418062
6,73424910,-0.0002555151,73418062
7,73426204,-0.0009385057,73418062


For testing purposes, although I don't yet know why we have more rows in my_models.all than models.all, let's subset so they match and we can do a proper comparison.

In [185]:
models.all$tag <- paste0(models.all$features, "to", models.all$cg)
my_models.all$tag <- paste0(my_models.all$features, "to", my_models.all$cg)

In [186]:
my_models.all <- my_models.all[which(my_models.all$tag %in% models.all$tag), ]

In [187]:
dim(my_models.all)

In [188]:
identical(models.all, my_models.all)

In [190]:
models.all_compare <- models.all[order(models.all$tag), ]
my_models.all_compare <- my_models.all[order(my_models.all$tag), ]
rownames(models.all_compare) <- NULL
rownames(my_models.all_compare) <- NULL


In [195]:
my_models.all_compare <- as.data.frame(my_models.all_compare)

In [196]:
identical(models.all_compare, my_models.all_compare)

In [197]:
head(models.all_compare)

Unnamed: 0_level_0,features,coefs,cg,tag
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<chr>
1,73408072,-0.007484904,73418062,73408072to73418062
2,73409670,-3.515759e-05,73418313,73409670to73418313
3,73411882,0.0006545192,73418161,73411882to73418161
4,73411882,0.0006674926,73418186,73411882to73418186
5,73411882,0.0006772293,73418205,73411882to73418205
6,73411882,0.001147792,73418313,73411882to73418313


In [198]:
head(my_models.all_compare)

Unnamed: 0_level_0,features,coefs,cg,tag
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<chr>
1,73408072,-0.007484904,73418062,73408072to73418062
2,73409670,-0.0002345668,73418313,73409670to73418313
3,73411882,0.0014164816,73418161,73411882to73418161
4,73411882,0.0019004045,73418186,73411882to73418186
5,73411882,0.0018515422,73418205,73411882to73418205
6,73411882,0.0015695685,73418313,73411882to73418313


In [199]:
models.all <- as.data.frame(my_models.all)

### Make sure my old RDS pretty much matches up with what we have in the `my_models.all_compare` object.

In [23]:
# df2 <- fread("12-OUT_matched_SNP_meth_cov_outputs.csv")

In [24]:
# df2 <- df2[which(df2$Chr == 1 &
#                  df2$population == "all" &
#                  df2$region == "caud"), ]

In [28]:
test_in <- readRDS("..//output_EXPANSE_a2_caud/libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-20240416-172011.rds")

In [29]:
names(attributes(test_in))

In [30]:
test_in$models

Loading required package: CpGWAS



ERROR: Error in test_in$models: $ operator not defined for this S4 class


## Stage 2

#### Shizhong's version

In [None]:
if(!dir.exists(outd)) dir.create(outd)

In [200]:
# mwas by models of all samples
cg2 <- unique(models.all$cg)
mwas.all <- matrix(0,nrow=length(cg2),ncol=2)
for(i in 1:length(cg2)){
    pos <- models.all[models.all$cg == cg2[i],1]
    
    gwas <- snp.gwas2$z[is.element(snp.gwas2$pos_hg38, pos)]
    weight <- models.all[models.all$cg == cg2[i],2]
    geno <- snp.1kg.eur2[match(pos,map.1kg.eur2$POS),]
    mwas.all[i,] <- MWAS(gwas, weight, t(geno))
}
rownames(mwas.all) <- cg2
colnames(mwas.all) <- c("z","p")

# mwas by models of EA samples
cg2 <- unique(models.ea$cg)
mwas.ea <- matrix(0,nrow=length(cg2),ncol=2)
for(i in 1:length(cg2)){
    pos <- models.ea[models.ea$cg == cg2[i],1]
    gwas <- snp.gwas2$z[is.element(snp.gwas2$pos_hg38, pos)]
    weight <- models.ea[models.ea$cg == cg2[i],2]
    geno <- snp.1kg.eur2[match(pos,map.1kg.eur2$POS),]
    mwas.ea[i,] <- MWAS(gwas, weight, t(geno))
}
rownames(mwas.ea) <- cg2
colnames(mwas.ea) <- c("z","p")

# output models and mwas results
outf <- paste0(outd,"/models-a9-covnew.all.wind.",wind[k])
write.csv(models.all,outf)
outf <- paste0(outd,"/models-a9-covnew.ea.wind.",wind[k])
write.csv(models.ea,outf)
outf <- paste0(outd,"/mwas-a9-covnew.all.wind.",wind[k])
write.csv(mwas.all,outf)
outf <- paste0(outd,"/mwas-a9-covnew.ea.wind.",wind[k])
write.csv(mwas.ea,outf)

In [201]:
mwas.all

Unnamed: 0,z,p
73418062,-12.62631,1.512075e-36
73418161,-15.6353,4.184865e-55
73418186,-17.13924,7.563337e-66
73418205,-17.14866,6.43223e-66
73418313,-19.04349,7.439452e-81


##### Very verbose

In [None]:
# # mwas by models of all samples
# cg2 <- unique(models.all$cg)
# mwas.all <- matrix(0, nrow = length(cg2), ncol = 2)
# cat("Dimensions of relevant objects:\n")
# cat("models.all:", dim(models.all), "\n")
# cat("snp.gwas2:", dim(snp.gwas2), "\n")
# cat("map.1kg.eur2:", dim(map.1kg.eur2), "\n")
# cat("snp.1kg.eur2:", dim(snp.1kg.eur2), "\n\n")

# for (i in 1:length(cg2)) {
#     pos <- models.all[models.all$cg == cg2[i], 1]
#     gwas <- snp.gwas2$z[is.element(snp.gwas2$pos_hg38, pos)]
#     weight <- models.all[models.all$cg == cg2[i], 2]
#     match_indices <- match(pos, map.1kg.eur2$POS)
    
#     cat("Iteration:", i, "\n")
#     cat("Current CG:", cg2[i], "\n")
#     cat("Positions:\n")
#     print(head(pos))
#     cat("GWAS Z-scores:\n")
#     print(head(gwas))
#     cat("Weights:\n")
#     print(head(weight))
#     cat("Matching Indices:\n")
#     print(head(match_indices))
    
#     tryCatch({
#         if (any(is.na(match_indices))) stop("NA values found in match_indices")
#         if (any(match_indices > nrow(snp.1kg.eur2))) stop("Out of bounds indices found")
#     }, error = function(e) {
#         cat("Error detected:", e$message, "\n")
#         cat("Dimensions of relevant objects at error detection:\n")
#         cat("models.all:", dim(models.all), "\n")
#         cat("snp.gwas2:", dim(snp.gwas2), "\n")
#         cat("map.1kg.eur2:", dim(map.1kg.eur2), "\n")
#         cat("snp.1kg.eur2:", dim(snp.1kg.eur2), "\n")
#         cat("Positions causing error:\n")
#         print(pos)
#         cat("Matching Indices causing error:\n")
#         print(match_indices)
#         stop("Stopping execution due to error.")
#     })
    
#     geno <- NULL
#     tryCatch({
#         geno <- snp.1kg.eur2[match_indices, ]
#     }, error = function(e) {
#         cat("Error accessing genotype data at iteration:", i, "\n")
#         cat("Error message:", e$message, "\n")
#         stop("Stopping execution due to error.")
#     })
    
#     cat("Genotype Data:\n")
#     print(head(geno))
#     tryCatch({
#         mwas.all[i, ] <- MWAS(gwas, weight, t(geno))
#         cat("MWAS Results (z, p):\n")
#         print(mwas.all[i, ])
#         cat("\n")
#     }, error = function(e) {
#         cat("Error performing MWAS at iteration:", i, "\n")
#         cat("Error message:", e$message, "\n")
#         stop("Stopping execution due to error.")
#     })
# }
# rownames(mwas.all) <- cg2
# colnames(mwas.all) <- c("z", "p")

# # mwas by models of EA samples
# cg2 <- unique(models.ea$cg)
# mwas.ea <- matrix(0, nrow = length(cg2), ncol = 2)
# cat("Dimensions of relevant objects:\n")
# cat("models.ea:", dim(models.ea), "\n")
# cat("snp.gwas2:", dim(snp.gwas2), "\n")
# cat("map.1kg.eur2:", dim(map.1kg.eur2), "\n")
# cat("snp.1kg.eur2:", dim(snp.1kg.eur2), "\n\n")

# for (i in 1:length(cg2)) {
#     pos <- models.ea[models.ea$cg == cg2[i], 1]
#     gwas <- snp.gwas2$z[is.element(snp.gwas2$pos_hg38, pos)]
#     weight <- models.ea[models.ea$cg == cg2[i], 2]
#     match_indices <- match(pos, map.1kg.eur2$POS)
    
#     cat("Iteration:", i, "\n")
#     cat("Current CG:", cg2[i], "\n")
#     cat("Positions:\n")
#     print(head(pos))
#     cat("GWAS Z-scores:\n")
#     print(head(gwas))
#     cat("Weights:\n")
#     print(head(weight))
#     cat("Matching Indices:\n")
#     print(head(match_indices))
    
#     tryCatch({
#         if (any(is.na(match_indices))) stop("NA values found in match_indices")
#         if (any(match_indices > nrow(snp.1kg.eur2))) stop("Out of bounds indices found")
#     }, error = function(e) {
#         cat("Error detected:", e$message, "\n")
#         cat("Dimensions of relevant objects at error detection:\n")
#         cat("models.ea:", dim(models.ea), "\n")
#         cat("snp.gwas2:", dim(snp.gwas2), "\n")
#         cat("map.1kg.eur2:", dim(map.1kg.eur2), "\n")
#         cat("snp.1kg.eur2:", dim(snp.1kg.eur2), "\n")
#         cat("Positions causing error:\n")
#         print(pos)
#         cat("Matching Indices causing error:\n")
#         print(match_indices)
#         stop("Stopping execution due to error.")
#     })
    
#     geno <- NULL
#     tryCatch({
#         geno <- snp.1kg.eur2[match_indices, ]
#     }, error = function(e) {
#         cat("Error accessing genotype data at iteration:", i, "\n")
#         cat("Error message:", e$message, "\n")
#         stop("Stopping execution due to error.")
#     })
    
#     cat("Genotype Data:\n")
#     print(head(geno))
#     tryCatch({
#         mwas.ea[i, ] <- MWAS(gwas, weight, t(geno))
#         cat("MWAS Results (z, p):\n")
#         print(mwas.ea[i, ])
#         cat("\n")
#     }, error = function(e) {
#         cat("Error performing MWAS at iteration:", i, "\n")
#         cat("Error message:", e$message, "\n")
#         stop("Stopping execution due to error.")
#     })
# }
# rownames(mwas.ea) <- cg2
# colnames(mwas.ea) <- c("z", "p")


#### My version

In [None]:
# # Script C: script_C.R
# library(CpGWAS)
# library(data.table)
# library(stringr)
# library(optparse)

# # Command line options
# option_list <- list(
#   make_option(c("-g", "--genome_file_index"), type = "integer", default = 1,
#               help = "Index of genome file to process"),
#   make_option(c("-d", "--data_file"), type = "character", default = "/expanse/lustre/projects/jhu152/naglemi/mwas/CpGWAS/scripts/12-OUT_matched_SNP_meth_cov_outputs.csv",
#               help = "Path to data file")
# )

# opt <- parse_args(OptionParser(option_list = option_list))

# # Load genome files
# genome_files <- list.files("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas",
#                            pattern = "EUR", full.names = TRUE)
# genome_files <- genome_files[grepl("pvar", genome_files)]

# genome_files <- data.table(path = genome_files, Chr = NA)

# genome_files$Chr <- str_split_fixed(genome_files$path, "chr", 2)[, 2]
# genome_files$Chr <- gsub(".pvar", "", genome_files$Chr)

# genome_files$Chr <- as.integer(genome_files$Chr)
# genome_files <- genome_files[order(genome_files$Chr), ]

# df <- fread(opt$data_file)

In [202]:
summary_stats_list <- "/home/naglemi/mwas/gwas/gwas_stat_scz"

# Pre-load all summary stats files into a list and clean/standardize column names
summary_stats_data <- lapply(summary_stats_list, function(path) {
  stats <- suppressWarnings(data.table::fread(path))
  colnames(stats) <- gsub("#CHROM", "CHR", colnames(stats))
  clean_and_standardize_colnames(stats)
})

In [None]:
# print("Starting genome file processing")
# # Process the specified genome file
# g <- opt$genome_file_index
# print(paste("Processing genome file index:", g))

# paths <- list(
#   pvar_path = genome_files[g]$path,
#   pgen_path = gsub("pvar", "pgen", genome_files[g]$path),
#   psam_path = gsub("pvar", "psam", genome_files[g]$path)
# )

# my_SNPs <- CpGWAS::loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

In [269]:
paths <- list(pvar_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.pvar",
              pgen_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.pgen",
              psam_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.psam")

my_SNPs <- loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= 73274305-10000 & my_SNPs$pvar_dt$POS <= 73419830 + 10000)

In [270]:
df <- fread("12-OUT_matched_SNP_meth_cov_outputs.csv")

In [271]:
df <- df[which(df$region == "caud" & df$population == "all"), ]

In [273]:
df <- df[which(df$chunk_start <= min(sites_to_test) &
         df$chunk_end >= max(sites_to_test)), ]

In [274]:
df <- df[which(df$Chr == "1")]

In [275]:
df

Chr,population,region,chunk_start,chunk_end,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,cov_file,modified_methylation_data,path
<int>,<chr>,<chr>,<int>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>
1,all,caud,908982,928981,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/caud/out/chr1_all.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_caud.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_all_908982-928981.rds,..//output_EXPANSE_a2_caud/libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-20240416-172011.rds
1,all,caud,908982,928981,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/caud/out/chr1_all.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_caud.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_all_908982-928981.rds,..//output_EXPANSE_a2_caud/libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-145818.rds


In [278]:
df <- df[1, ]

In [279]:
df_this_chr <- df

In [282]:
summary_stats_list

In [285]:
length(my_rds@models)

In [286]:
my_rds@models[[1]]

An object of class "MethylationBase"
Slot "methylationPosition":
[1] 73190890

Slot "windowSize":
[1] 10000

Slot "n_SNPs":
[1] 46

Slot "glmnetModel":
NULL

Slot "snpWeights":
chr1:73181248:C:T chr1:73185521:C:A chr1:73185836:A:T chr1:73186439:T:C 
    -0.0005646050     -0.0016379649      0.0019968620     -0.0016128907 
chr1:73192402:G:C chr1:73192879:T:C chr1:73195978:T:C chr1:73196486:A:G 
    -0.0008903797     -0.0017562081     -0.0009321344     -0.0009785899 
chr1:73197497:G:A chr1:73197772:G:A chr1:73198926:T:C chr1:73200161:A:G 
    -0.0009888311     -0.0010172362     -0.0010242944     -0.0017517977 

Slot "intercept":
         s0 
0.007864249 

Slot "alpha":
[1] 0.5

Slot "lambda":
[1] 0.004858786

Slot "evaluation_results":
        cor         mse 
0.570290973 0.000319468 

Slot "cv_eval_mode":
[1] "dynamic"

Slot "full_model_metrics":
           r          mse 
0.5841064481 0.0002993808 


In [284]:
my_rds

In [291]:
seq_along(my_rds@models)

In [294]:
summary_stats_data <- lapply(summary_stats_data, function(stats) stats[`CHR` == 1])

print("Loaded SNP data")
print("Files for this Chr:")
print(nrow(df_this_chr))
for(j in 1:nrow(df_this_chr)){
  print(paste0("File number: ", j))
  if (grepl("empty", df_this_chr$path[j])) {
    message(paste0("no model for ", df_this_chr$path[j]))
    next
  }
  #
  my_rds <- tryCatch({
    readRDS(df_this_chr$path[j])
  }, error = function(e) {
    # Print an error message and skip this iteration
    message("ALERT!!! Error reading RDS file: ", e$message)
    return(NULL)  # Return NULL to signal failure
  })
    
  # Check if the readRDS call returned NULL (which indicates an error)
  if (is.null(my_rds)) {
    stop("oops")
    next  # Skip the rest of this loop iteration
  }

  print(paste("Loaded RDS file:", df_this_chr$path[j]))

  for (k in 1:length(summary_stats_list)) {
    print(paste0("k is ", k))
    outname <- "20-OUT_MWAS_debug_stage2_results.rds"
    #if(file.exists(outname)) next
    summary_stats <- summary_stats_data[[k]]
    print("head of summary stats before clean:")
    print(head(summary_stats))

    MWASmodels <- vector("list", length(my_rds@models))
    if (is.null(summary_stats)) {
      summary_stats <- suppressWarnings(fread(summary_stats_list[[k]]))
      summary_stats <- clean_and_standardize_colnames(summary_stats)
    }
    print("head of summary stats after clean:")
    print(head(summary_stats))
      
    for (i in seq_along(my_rds@models)) {
      #print(i)
      this_MethylationBase <- my_rds@models[[i]]
      SNP_split <- stringr::str_split_fixed(names(this_MethylationBase@snpWeights), ":", 4)
      SNP_split[, 1] <- gsub("chr", "", SNP_split[, 1])
      SNP_split_dt <- data.table::as.data.table(SNP_split)
      data.table::setnames(SNP_split_dt, c("chr", "post", "ref", "alt"))
      SNP_split_dt[, `:=`(chr = as.integer(chr), post = as.integer(post))]
      data.table::setkey(SNP_split_dt, chr, post)

      relevant_SNP_indices <- my_SNPs$pvar_dt[SNP_split_dt, on = .(`#CHROM` = chr, POS = post), which = TRUE, nomatch = 0]
      relevant_ids <- my_SNPs$pvar_dt$ID[relevant_SNP_indices]
      summary_stats_sub <- summary_stats[relevant_ids, nomatch = 0]

      if (!identical(summary_stats_sub$BP, SNP_split_dt$post)) {
        summary_stats_sub <- summary_stats_sub[order(summary_stats_sub$BP), ]
        if (!identical(summary_stats_sub$BP, SNP_split_dt$post)) {
          unmatched_positions <- !SNP_split_dt$post %in% summary_stats_sub$BP
          if (any(unmatched_positions)) {
            SNP_split_dt <- SNP_split_dt[!unmatched_positions, ]
            this_MethylationBase@snpWeights <- this_MethylationBase@snpWeights[!unmatched_positions]

            relevant_SNP_indices <- my_SNPs$pvar_dt[SNP_split_dt, on = .(`#CHROM` = chr, POS = post), which = TRUE, nomatch = 0]
            if (!identical(summary_stats_sub$BP, SNP_split_dt$post)) {
              stop("SNP order does not match even after removing unmatched positions. This should not happen. Code is broken.")
            }
          }
        }
      }

      if (!identical(SNP_split_dt$alt, summary_stats_sub$A2) | !identical(SNP_split_dt$ref, summary_stats_sub$A1)) {
        not_matching <- which(SNP_split_dt$alt != summary_stats_sub$A2)
        summary_stats_ref_flipped <- SNP_split_dt$ref[not_matching]
        summary_stats_alt_flipped <- SNP_split_dt$alt[not_matching]
        SNP_split_dt[not_matching, `:=`(ref = summary_stats_alt_flipped, alt = summary_stats_ref_flipped)]
        this_MethylationBase@snpWeights[not_matching] <- this_MethylationBase@snpWeights[not_matching] * -1
      }

      G <- pgenlibr::ReadList(my_SNPs$pgen, variant_subset = relevant_SNP_indices)
      #print(paste("Performing MWAS for model index:", i))
      mwas_out <- mwas(z = summary_stats_sub$BETA, w = this_MethylationBase@snpWeights, G = G)

      MWASmodels[[i]] <- mwas_out
    }

    results <- MWASresults(MWASmodels, paths$pvar_path, paths$pgen_path, paths$psam_path, summary_stats_list[[k]], df_this_chr$path[j])
    saveRDS(results, outname)
    print(paste("Saved results to:", outname))
  }
}

[1] "Loaded SNP data"
[1] "Files for this Chr:"
[1] 1
[1] "File number: 1"
[1] "Loaded RDS file: ..//output_EXPANSE_a2_caud/libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-20240416-172011.rds"
[1] "k is 1"
[1] "head of summary stats before clean:"
Key: <SNP>
     CHR             SNP        BP     A1     A2 FRQ_A_53386 FRQ_U_77258  INFO
   <int>          <char>     <int> <char> <char>       <num>       <num> <num>
1:     1  1:10013014_A_C   9952956      A      C       0.922       0.921 0.992
2:     1 1:101070597_A_C 100605041      A      C       0.926       0.928 0.993
3:     1 1:101600749_C_A 101135193      C      A       0.887       0.889 0.978
4:     1 1:102023297_A_C 101557741      A      C       0.348       0.347 0.985
5:     1 1:102432899_G_T 101967343      G      T       0.825       0.829 0.974
6:     1 1:102962040_C_A 102496484      C      A       0.917       0.919 0.969
        OR     SE      P   ngt
     <num>  <num>  <num> <int>
1: 1.01918 

### Compare results

In [295]:
my_results <- readRDS("20-OUT_MWAS_debug_stage2_results.rds")

In [296]:
names(attributes(my_results))

In [297]:
length(my_results@MWASmodels)

In [298]:
my_results@MWASmodels[[4]]

In [299]:
trait <- "scz"
df$stage2_paths <- "20-OUT_MWAS_debug_stage2_results.rds"
#df$final_paths <- vector("list", length(df$stage2_paths))

In [300]:
vector("list", length(df$stage2_paths))

In [301]:
message("Processing trait: ", trait)
#df$final_paths <- paste0(df$stage2_paths, trait, "_results.rds")
output_file <- "20-OUT_MWAS_debug_stage2_results.csv"
header_written <- FALSE

Processing trait: scz



In [303]:
i <- 1

In [305]:
df_this_chr

Chr,population,region,chunk_start,chunk_end,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,cov_file,modified_methylation_data,path
<int>,<chr>,<chr>,<int>,<dbl>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>
1,all,caud,908982,928981,/dcs04/lieber/statsgen/shizhong/michael/mwas/gwas//libd_chr1.pgen,/dcs04/lieber/statsgen/shizhong/michael/mwas/pheno/caud/out/chr1_all.rda,248918358,1069461,2202702,8982,/dcs04/lieber/statsgen/mnagle/mwas/full_covariates/all_caud.csv,/dcs04/lieber/statsgen/mnagle/mwas/pheno/caud/out/chr1_all_908982-928981.rds,..//output_EXPANSE_a2_caud/libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-20240416-172011.rds


In [306]:
stage2_in <- my_results
stage1_in <- readRDS(df$path[i])

In [307]:
if (length(stage1_in@models) != length(stage2_in@MWASmodels)) {
    stop("Files don't match")
}

data_list <- vector("list", length(stage1_in@models))
for (j in seq_along(stage1_in@models)) {
    model1 <- stage1_in@models[[j]]
    model2 <- stage2_in@MWASmodels[[j]]

    data_list[[j]] <- data.table(
        z = model2["z"],
        p = model2["p"],
        n = model2["n"],
        pos = model1@methylationPosition,
        stats = stage2_in@summary_stats_path,
        scaff = stage1_in@scaffoldIdentifier
    )
}

combined_data <- rbindlist(data_list, use.names = TRUE, fill = TRUE)

# Write data incrementally
if (!header_written) {
    fwrite(combined_data, output_file)
    header_written <- TRUE
} else {
    fwrite(combined_data, output_file, append = TRUE)
}

In [309]:
combined_data[which(combined_data$pos %in% rownames(mwas.all)), ]

z,p,n,pos,stats,scaff
<dbl>,<dbl>,<dbl>,<int>,<chr>,<chr>
-12.21219,2.675782e-34,10,73418062,/home/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-20240416-172011
-15.00357,6.957044e-51,9,73418161,/home/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-20240416-172011
-15.01511,5.846412e-51,9,73418186,/home/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-20240416-172011
-19.95268,1.420689e-88,18,73418205,/home/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-20240416-172011
-18.37019,2.276177e-75,18,73418313,/home/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-908982-928981-dynamic-1corestotal-allcorepera-20240416-172011


In [310]:
mwas.all

Unnamed: 0,z,p
73418062,-12.62631,1.512075e-36
73418161,-15.6353,4.184865e-55
73418186,-17.13924,7.563337e-66
73418205,-17.14866,6.43223e-66
73418313,-19.04349,7.439452e-81


## Compare with results from CpGWAS

In [313]:
results <- fread("16a9par-OUT_stage2_MWAS_scz.csv")

In [321]:
smallerdata <- results[which(results$p < 10^-200), ]

In [322]:
dim(smallerdata)

In [329]:
dim(smallerdata)

In [330]:
table(smallerdata$population, smallerdata$region)

     
      caud dlpfc hippo
  AA   373   235   206
  all  533   370   239
  EA  1434   761   768

In [331]:
smallerdata[which(smallerdata$population == "all"), ]

z,p,n,chr,pos,population,region,stats,scaff
<dbl>,<dbl>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>
31.19838,1.120726e-213,10,1,71738191,all,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-888982-908981-dynamic-1corestotal-allcorepera-20240416-171910
34.78845,3.636349e-265,11,1,71738219,all,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-888982-908981-dynamic-1corestotal-allcorepera-20240416-171910
39.97068,0.000000e+00,10,1,71738231,all,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-888982-908981-dynamic-1corestotal-allcorepera-20240416-171910
-30.82684,1.145005e-208,9,1,72287393,all,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-888982-908981-dynamic-1corestotal-allcorepera-20240416-171910
30.29021,1.542576e-201,12,1,72981992,all,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-888982-908981-dynamic-1corestotal-allcorepera-20240416-171910
30.44355,1.457952e-203,11,1,72982091,all,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-888982-908981-dynamic-1corestotal-allcorepera-20240416-171910
30.39450,6.492708e-203,11,1,72982096,all,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-888982-908981-dynamic-1corestotal-allcorepera-20240416-171910
30.58453,1.965619e-205,58,1,73178470,all,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-888982-908981-dynamic-1corestotal-allcorepera-20240416-171910
32.12254,2.136178e-226,53,1,73178530,all,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-888982-908981-dynamic-1corestotal-allcorepera-20240416-171910
30.88364,1.980943e-209,52,1,73178618,all,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_all-libd_chr1-chr1_all-888982-908981-dynamic-1corestotal-allcorepera-20240416-171910


In [325]:
smallerdata_matching_pos <- smallerdata[which(smallerdata$chr == 1 & smallerdata$pos >= 73418062 & smallerdata$pos <= 73418313), ]

In [326]:
smallerdata_matching_pos

z,p,n,chr,pos,population,region,stats,scaff
<dbl>,<dbl>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>
-40.22723,0.0,18,1,73418161,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-20240415-134429
-37.49298,1.1985559999999999e-307,18,1,73418205,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-20240415-134429
-43.28514,0.0,18,1,73418313,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-20240415-134429
-38.16869,0.0,18,1,73418205,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-130545
-45.25395,0.0,18,1,73418313,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-130545


In [317]:
smallerdata_matching_pos <- smallerdata[which(smallerdata$population == "all" & smallerdata$region == "caud"), ]

In [320]:
smallerdata_matching_pos

z,p,n,chr,pos,population,region,stats,scaff
<dbl>,<dbl>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>


In [312]:
dim(results[which(results$p < 10^-200), ])

ERROR: Error in eval(expr, envir, enclos): object 'bigdata' not found


In [327]:
results_subset <- results[which(results$chr == 1 & results$pos >= 73418062 & results$pos <= 73418313), ]

In [328]:
results_subset

z,p,n,chr,pos,population,region,stats,scaff
<dbl>,<dbl>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>
-14.014607,1.268991e-44,8,1,73418062,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-20240415-134429
-40.227232,0.000000e+00,18,1,73418161,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-20240415-134429
-27.257689,1.347302e-163,16,1,73418186,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-20240415-134429
-37.492982,1.198556e-307,18,1,73418205,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-20240415-134429
-43.285137,0.000000e+00,18,1,73418313,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-20240415-134429
-14.117190,2.976244e-45,8,1,73418062,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-130545
-19.968476,1.035656e-88,8,1,73418161,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-130545
-20.094272,8.282009e-90,8,1,73418186,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-130545
-38.168687,0.000000e+00,18,1,73418205,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-130545
-45.253948,0.000000e+00,18,1,73418313,AA,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/gwas_stat_scz,libd_chr1-chr1_AA-libd_chr1-chr1_AA-908982-928981-dynamic-1corestotal-allcorepera-caud-20240510-130545


In [None]:
results_subset <- results_subset[which(results_subset$population == "all" & results_subset$region == "caud"), ]

head(results)