# Compare MWAS methods for sanity tests and troubleshooting

In this version, we use the full new SNP set with a selected window

# Pick the regions we will test

In [1]:
library(data.table)

# prev_hits <- fread("
# Chr     pos        old_z       old_p
# 11   38247902        -27.1535308285104       2.30024742330298e-162
# 2    47933357        18.3327793004811        4.53147699327216e-75
# 7    1987910         10.1255085321387        4.25759742738181e-24
# 7    1987896         10.1255085321387        4.25759742738189e-24
# 7    1987797         10.0241523897721        1.19379483108027e-23
# 7    1987778         10.0105851568856        1.36940148731912e-23
# 12   2194742         -10.0072634920486       1.41615523554955e-23
# ")


In [2]:
prev_hits <- fread("
Chr     pos
1    73274305
1    73418161
1    73418205
1    73418313
1    73419188
1    73419830
")

In [3]:
#df <- fread("09.5-OUT_matched_SNP_meth_cov_chunked_EXPANSE_a2.csv")

In [4]:
df <- fread("09-OUT_matched_SNP_meth_cov_a2.csv")

## Try original code

In [5]:
###### model: learn elastic net model on training data 
######---------Input: trainX, trainY
######---------Return: selected features and coefficents

# original
elastic.net <- function(trainX, trainY) {
    # Learn an elastic-net model of trainY on trainX (original version, using
    # lambda.min and the minimum cross-validated error).
    #
    # NOTE: the "modified" elastic.net defined further down in this cell has
    # the same name and replaces this definition once the cell has run.
    #
    # Args:
    #   trainX: predictor matrix, one row per observation.
    #   trainY: response vector, one value per row of trainX.
    # Returns:
    #   data.frame with columns `features` (coefficient names, including
    #   "(Intercept)") and `coefs` (their values) for every non-zero
    #   coefficient of the final fit.
    # Raises:
    #   An error when nrow(trainX) and length(trainY) differ.
    if (nrow(trainX) != length(trainY)) {
        stop("Number of observations is different")
    }

    # optimize alpha---mixing parameter (only alpha = 0.5 is searched)
    a <- 0.5
    search <- foreach(ai = a, .combine = rbind) %dopar% {
        # Argument names are spelled out in full (nfolds, parallel) instead of
        # relying on partial argument matching.
        cv.fit <- cv.glmnet(
            trainX,
            trainY,
            nfolds = 5,
            type.measure = "mse",
            parallel = TRUE,
            alpha = ai
        )
        data.frame(
            cvm = min(cv.fit$cvm),
            lambda = cv.fit$lambda.min,
            alpha = ai
        )
    }
    # which.min() always yields exactly one row, even if several candidates tie.
    cv.opt <- search[which.min(search$cvm), ]

    # fit model by optimized alpha and lambda
    yfit <- glmnet(
        trainX,
        trainY,
        lambda = cv.opt$lambda,
        alpha = cv.opt$alpha
    )
    idf <- coef(yfit)
    idx <- which(idf != 0)
    data.frame(
        features = rownames(idf)[idx],
        coefs = idf[idx]
    )
}

# modified to use lambda 1se and appropriate cvm
elastic.net <- function(trainX, trainY, coef_tol = 0) {
    # Learn an elastic-net model of trainY on trainX, choosing lambda by the
    # one-standard-error rule (lambda.1se) of 5-fold cross-validation.
    #
    # Args:
    #   trainX:   predictor matrix, one row per observation.
    #   trainY:   response vector, one value per row of trainX.
    #   coef_tol: coefficients with absolute value <= coef_tol are treated as
    #             zero and not returned. The default 0 keeps every coefficient
    #             that is not exactly zero (the historical behaviour).
    # Returns:
    #   data.frame with columns `features` (coefficient names, including
    #   "(Intercept)") and `coefs` (their values) for the selected
    #   coefficients of the final fit.
    # Raises:
    #   An error when nrow(trainX) and length(trainY) differ.
    # Side effects:
    #   Resets the RNG seed to 42 and prints diagnostic output.
    if (nrow(trainX) != length(trainY)) {
        stop("Number of observations is different")
    }

    # optimize alpha---mixing parameter (only alpha = 0.5 is searched)
    a <- 0.5
    search <- foreach(ai = a, .combine = rbind) %dopar% {
        set.seed(42)
        # Argument names are spelled out in full (nfolds, parallel) instead of
        # relying on partial argument matching.
        cv.fit <- cv.glmnet(
            trainX,
            trainY,
            nfolds = 5,
            type.measure = "mse",
            parallel = TRUE,
            alpha = ai
        )
        # Collapse the two dimensions into one string; paste0() over dim()
        # would otherwise print one string per dimension.
        print(paste0("Dim of trainX: ", paste(dim(trainX), collapse = " x ")))
        print(paste0("Len of trainY: ", length(trainY)))
        coef_matrix <- as.matrix(coef(cv.fit))

        non_zero_coefs <- coef_matrix[coef_matrix != 0, , drop = FALSE]
        print("Coefficients when fitting: ")
        print(non_zero_coefs)

        data.frame(
            # cross-validated error at lambda.1se, not the global minimum
            cvm = cv.fit$cvm[match(cv.fit$lambda.1se, cv.fit$lambda)],
            lambda = cv.fit$lambda.1se,
            alpha = ai
        )
    }
    # which.min() always yields exactly one row, even if several candidates tie.
    cv.opt <- search[which.min(search$cvm), ]

    # fit model by optimized alpha and lambda
    # NOTE(review): the refit below passes a single lambda value to glmnet().
    # The captured output shows coefficients of order 1e-18 that the
    # cross-validated fit reports as zero; pass coef_tol > 0 to drop them.
    set.seed(42)
    yfit <- glmnet(
        trainX,
        trainY,
        lambda = cv.opt$lambda,
        alpha = cv.opt$alpha
    )

    coef_matrix <- as.matrix(coef(yfit))

    non_zero_coefs <- coef_matrix[coef_matrix != 0, , drop = FALSE]
    print("Coefficients when optimal: ")
    print(non_zero_coefs)

    coef_vec <- coef_matrix[, 1]
    idx <- which(abs(coef_vec) > coef_tol)
    data.frame(
        features = names(coef_vec)[idx],
        coefs = unname(coef_vec[idx])
    )
}

MWAS <- function(gwas, weight, geno) {
    # Weighted combination of GWAS z-scores into one association statistic.
    #
    # Args:
    #   gwas:   numeric vector of GWAS z-scores, one per SNP.
    #   weight: numeric vector of model weights, one per SNP, in the same
    #           order as `gwas`.
    #   geno:   genotype matrix with one column per SNP (same order); only its
    #           column-wise correlation matrix is used.
    # Returns:
    #   numeric vector c(z, p): the weighted z-score divided by
    #   sqrt(weight' * cor(geno) * weight), and its two-sided normal p-value.
    # Raises:
    #   An error when gwas, weight and the columns of geno disagree in number.
    gwas <- as.numeric(gwas)
    weight <- as.numeric(weight)
    geno <- as.matrix(geno)
    if (length(gwas) != length(weight) || ncol(geno) != length(weight)) {
        stop("gwas, weight and the columns of geno must have the same length")
    }

    z.cor <- cor(geno)
    se <- sqrt(drop(weight %*% z.cor %*% weight))
    z <- sum(gwas * weight) / se
    p <- pnorm(abs(z), lower.tail = FALSE) * 2
    c(z, p)
}

In [6]:
df <- df[which(df$Chr == 1), ]

In [7]:
i <- 2

In [8]:
library("glmnet")
library("e1071")
library("doParallel")

set.seed(2018)
wind <- c(5000,10000)
# output directory
#outd <- "/dcl02/lieber/shan/shizhong/finemapping/GWAS/tags/scz3/mwas/chr22/1/"
outd <- "20-OUT_original_mwas_sanity_test/"

Loading required package: Matrix

Loaded glmnet 4.1-8

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



## Replace all old objects with new objects in same format

In [9]:
#load("p1.rda", verbose = TRUE)

In [10]:
#p[1:10, 1:10]

### Methylation data

In [11]:
suppressWarnings(library(bsseq))

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:Matrix’:

    expand, unname


The following objects are masked from ‘package:data.table’:

    first, second


The following object is masked from ‘package:utils’:

    findMatches


The

In [12]:
# load data for mwas
# load("./rda/caudate_mwas_data_chr22.rda")
# Load the objects stored in the file named in row i of df$methylation_data.
# BSobj2 is used below but not created in any visible cell, so it presumably
# comes from this file -- confirm.
load(df$methylation_data[i])

# Methylation values from BSobj2: one row per site, one column per sample.
p <- getMeth(BSobj2)


# Label each row with the start coordinate of its site.
rownames(p) <- start(BSobj2)

# Keep only the hand-picked test sites; positions absent from BSobj2 are
# silently dropped by %in%.
sites_to_test_pos <- c(73274305, 73274312, 73292330, 73307769, 73308571, 73419188, 73419830, 73420076)
sites_to_test <- which(start(BSobj2) %in% sites_to_test_pos)
#sites_to_test <- c(73274305, 73274312, 73292330, 73307769, 73308571, 73419188, 73419830, 73420076)
p <- p[sites_to_test, ]
# Candidate CpG positions, recovered as numbers from the row names.
cg <- as.numeric(rownames(p))

In [13]:
# # load data for mwas
# # load("./rda/caudate_mwas_data_chr22.rda")
# load(df$methylation_data[i])

# p <- getMeth(BSobj2)


# rownames(p) <- start(BSobj2)

# sites_to_test <- which(start(BSobj2) >= (73418205 - 500) & start(BSobj2) <= (73418205 + 500))
# p <- p[sites_to_test, ]

# # candidate cg
# cg <- as.numeric(rownames(p))

# # regress out covariates
# #load("covs_for_meqtl.rda")

### covariates

In [14]:
# Read the covariate table named in row i of df$cov_file.
covs <- fread(df$cov_file[i])
# t() turns the table into a matrix; a later cell converts its columns from
# character to numeric.
covs <- t(covs)
# After transposing, the first row is promoted to column names and then
# dropped from the data. Later cells match these names against BSobj2$brnum,
# so they are presumably sample IDs -- confirm.
colnames(covs) <- covs[1, ]
covs <- covs[2:nrow(covs), ]
# transpose so we have same orientation as original code

### Regress methylation data over covariates

In [15]:
BSobj2$brnum <- gsub("Br0", "Br", BSobj2$brnum)
colnames(covs) <- gsub("Br0", "Br", colnames(covs))

In [16]:
# Column index in covs for every sample of BSobj2; samples with no covariate
# column get NA here.
mat <- match(BSobj2$brnum,colnames(covs)) 
# Reorder the covariate columns into BSobj2 sample order and transpose, giving
# one row per sample. An NA index produces an all-NA row for that sample.
covs <- t(covs[,mat])
# Residual matrix with the same shape as p (sites x samples), pre-filled with NA.
p.residual=matrix(NA,dim(p)[1],dim(p)[2])

In [17]:
rownames(covs)[is.na(covs[, 'genoPC1'])] <- BSobj2$brnum[is.na(covs[, 'genoPC1'])]

In [18]:
colnames(p.residual) <- BSobj2$brnum

In [19]:
# Turn the covariate matrix into a data frame, then coerce every column other
# than Dx and Sex from character to numeric in one vectorised assignment.
covs <- as.data.frame(covs)
cols_to_convert <- setdiff(names(covs), c("Dx", "Sex"))
covs[cols_to_convert] <- lapply(covs[cols_to_convert], as.numeric)

# Print the modified data frame to check the conversion
#print(dat)


In [20]:
# Regress each methylation site on the covariates and store the residuals.
# The loop index is called `site` rather than `i` so that the global `i`
# (used by earlier cells to pick the row of `df`) is not overwritten, and
# seq_len() keeps the loop from running when p has no rows.
for (site in seq_len(nrow(p))) { # For each methylation site
    dat <- as.data.frame(cbind(y = p[site, ], covs))

    # Check for rows with NAs (the ones for which we don't have covariate data)
    valid_rows <- complete.cases(dat)

    if (any(valid_rows)) {
        dat_valid <- dat[valid_rows, , drop = FALSE]
        model.res <- lm(y ~ ., data = dat_valid)

        # Store residuals in the corresponding positions; samples without
        # complete data keep their NA.
        p.residual[site, valid_rows] <- resid(model.res)
    }
}


# for(i in 1:dim(p)[1]){ # foro each methylation site
#         dat <- as.data.frame(cbind(p[i,],covs))
#         colnames(dat) <- c("y",paste0("x",1:ncol(covs)))
#         model.res <- lm(reformulate(paste0("x",1:ncol(covs)), "y"),dat)
#         p.residual[i,] = resid(model.res) 
# }

In [21]:
snp.gwas2 <- NULL

In [22]:
load("p1.rda")

In [23]:
# min(snp.gwas2$pos_hg38)
# max(snp.gwas2$pos_hg38)

In [24]:
# load("p1.rda")
# pos_we_got <- snp.gwas2$pos_hg38
# saveRDS(pos_we_got, "20-intermediate_positions_in_old_set.csv")

In [25]:
#pos_we_got <- readRDS("20-intermediate_positions_in_old_set.csv")

In [26]:
#pos_we_got

In [27]:
# head(snp.gwas2)

### summary stats

In [28]:
library(data.table)
library(CpGWAS)

In [29]:
ss_path <- "/home/naglemi/mwas/gwas/gwas_stat_scz"

In [30]:
snp.gwas2 <- fread(ss_path, skip = 1, header = FALSE)
colnames(snp.gwas2) <- strsplit(readLines(ss_path, n = 1), "\t")[[1]]

In [31]:
snp.gwas2$z <- log(snp.gwas2$OR)/snp.gwas2$SE

In [32]:
snp.gwas2 <- snp.gwas2[, c(2, 1, 3, 3, 8, 4, 5, 20, 11)]

In [33]:
head(snp.gwas2, n = 1)

SNP,CHR,BP,BP,INFO,A1,A2,z,P
<chr>,<int>,<int>,<int>.1,<dbl>,<chr>,<chr>,<dbl>,<dbl>
rs62513865,8,100579985,100579985,0.963,C,T,0.7016221,0.4847


In [34]:
colnames(snp.gwas2)[1:5] <- c("snp", "chr", "pos_hg38", "pos_hg38", "info")

In [35]:
snp.gwas2 <- snp.gwas2[which(snp.gwas2$chr == 1 & snp.gwas2$pos_hg38 >= (73274305-10000) & snp.gwas2$pos_hg38 <= (73419830 + 10000)), ]

In [36]:
dim(snp.gwas2)

In [37]:
snp.gwas2 <- snp.gwas2[order(snp.gwas2$pos_hg38), ]

In [38]:
head(snp.gwas2)

snp,chr,pos_hg38,pos_hg38,info,A1,A2,z,P
<chr>,<int>,<int>,<int>.1,<dbl>,<chr>,<chr>,<dbl>,<dbl>
rs6672818,1,73265462,73265462,0.991,C,T,-7.24430537,5.189e-13
rs72676673,1,73267315,73267315,0.971,A,G,0.04124304,0.9672
rs61765637,1,73269720,73269720,0.993,G,C,7.25603083,4.623e-13
rs4571923,1,73270879,73270879,0.994,G,A,-7.41892929,1.37e-13
rs12759031,1,73271206,73271206,0.992,C,T,-7.22079453,5.844e-13
rs10890025,1,73272480,73272480,0.994,A,G,-7.25544375,4.328e-13


In [39]:
#snp.gwas2 <- snp.gwas2[which(snp.gwas2$pos_hg38 %in% pos_we_got), ]

In [40]:
dim(snp.gwas2)

In [41]:
# build prediction models
# idx.ea flags the samples whose race is "CAUC", in BSobj2 sample order.
# NOTE(review): later cells reorder p.residual and snp3 by sorted column name;
# using idx.ea positionally on those objects assumes BSobj2 is already in that
# order -- confirm.
idx.ea <- BSobj2$race == "CAUC"

### SNPs in LIBD population

#### For reference, first load Shizhong's formatted SNPs on Chr7

In [42]:
snp2_sorted <- snp2[, order(names(snp2))]

In [43]:
colnames(snp2) <- gsub("Br0", "Br", colnames(snp2))

In [44]:
snp2 <- snp2[, colnames(snp2) %in% colnames(p.residual)]

In [45]:
dim(snp2)

In [46]:
head(snp2)

Unnamed: 0_level_0,Br836,Br845,Br848,Br863,Br914,Br948,Br949,Br963,Br983,Br991,⋯,Br5373,Br5398,Br5422,Br5426,Br5460,Br5467,Br5475,Br5488,Br5584,Br5590
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr7:1963098:T:C,2.0,2,2.0,2,2.0,1.999,2.0,1.974,1.0,2,⋯,1.954,2,2,2,2,2,2,1,1,1
chr7:1963408:C:T,0.001,2,1.991,0,1.969,0.006,0.004,0.14,2.0,0,⋯,2.0,0,1,2,2,1,1,2,2,2
chr7:1964786:C:T,2.0,2,1.0,2,2.0,2.0,2.0,2.0,1.0,2,⋯,0.001,2,2,0,2,2,2,1,1,2
chr7:1966112:T:C,2.0,2,1.001,2,2.0,2.0,2.0,2.0,1.002,2,⋯,0.015,2,2,0,2,2,2,1,1,2
chr7:1973362:G:A,2.0,2,1.095,2,2.0,2.0,0.004,2.0,1.002,2,⋯,0.015,2,2,0,2,1,2,1,1,2
chr7:1975412:T:G,2.0,2,2.0,2,2.0,2.0,2.0,2.0,1.0,2,⋯,2.0,2,2,2,2,2,2,1,1,1


In [47]:
snp2_positions <- stringr::str_split_fixed(rownames(snp2), ":", 3)[, 2]

#### Now let's load ours on Chr1

In [48]:
paths <- list(pvar_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr1.pvar",
              pgen_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr1.pgen",
              psam_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr1.psam")

my_SNPs <- loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

In [49]:
snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= 73274305-10000 & my_SNPs$pvar_dt$POS <= 73419830 + 10000)

In [50]:
snp3 <- pgenlibr::ReadList(my_SNPs$pgen,
                        variant_subset = snp_indices_of_interest)
colnames(snp3) <- my_SNPs$pvar_dt$ID[snp_indices_of_interest]
rownames(snp3) <- my_SNPs$psam$`#IID`

In [51]:
snp3[1:10, 1:10]

Unnamed: 0,chr1:73265462:C:T,chr1:73267315:A:G,chr1:73269720:G:C,chr1:73270879:G:A,chr1:73271206:C:T,chr1:73272480:A:G,chr1:73273958:G:C,chr1:73276935:T:G,chr1:73277452:A:G,chr1:73278190:A:G
Br1602,0.0,0,0.9959717,0,0.0,0,0.0,0.0,0,0.9940186
Br1203,2.0,0,0.0,2,2.0,2,2.0,2.0,0,0.0
Br1214,2.0,0,0.0,2,2.0,2,2.0,2.0,0,0.0
Br2149,0.9970093,0,0.9959717,1,0.9970093,1,0.9979858,0.9979858,0,0.9940186
Br1016,0.0,0,1.9520264,0,0.0,0,0.0,0.0,0,1.9910278
Br1580,1.0,0,1.0,1,1.0,1,1.0,1.0,0,1.0
Br1646,2.0,0,0.0,2,2.0,2,2.0,2.0,0,0.0
Br1823,0.0,0,2.0,0,0.0,0,0.0,0.0,0,2.0
Br1696,2.0,0,0.0,2,2.0,2,2.0,2.0,0,0.0
Br1513,1.0,0,1.0,1,1.0,1,1.0,1.0,0,1.0


In [52]:
map3 <- data.frame(POS = stringr::str_split_fixed(colnames(snp3), ":", 3)[, 2])

In [53]:
snp3 <- t(snp3)

In [54]:
# snp3 <- snp3[which(map3$POS %in% pos_we_got), ]

In [55]:
map3 <- data.frame(POS = stringr::str_split_fixed(rownames(snp3), ":", 3)[, 2])

In [56]:
dim(map3)

### SNPs in reference population

In [57]:
#snp.1kg.eur2

In [58]:
paths <- list(pvar_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.pvar",
              pgen_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.pgen",
              psam_path = "/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr1.psam")

my_SNPs <- loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

In [59]:
snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= 73274305-10000 & my_SNPs$pvar_dt$POS <= 73419830 + 10000)

In [60]:
snp.1kg.eur2 <- pgenlibr::ReadList(my_SNPs$pgen,
                        variant_subset = snp_indices_of_interest)
colnames(snp.1kg.eur2) <- my_SNPs$pvar_dt$ID[snp_indices_of_interest]
rownames(snp.1kg.eur2) <- my_SNPs$psam$`IID`

In [61]:
dim(snp.1kg.eur2)

In [62]:
map.1kg.eur2 <- my_SNPs$pvar_dt

In [63]:
map.1kg.eur2 <- map.1kg.eur2[snp_indices_of_interest, ]

In [64]:
dim(map.1kg.eur2)

In [65]:
#map.1kg.eur2 <- data.frame(POS = stringr::str_split_fixed(colnames(snp.1kg.eur2), ":", 3)[, 2])

In [66]:
snp.1kg.eur2 <- t(snp.1kg.eur2)

In [67]:
# snp3 <- snp3[which(map3$POS %in% pos_we_got), ]

### Set window size and any other parameters

In [68]:
wind <- 10000

Is 1se vs min for lambda the problem?

## Run for all

In [69]:
head(map3)

Unnamed: 0_level_0,POS
Unnamed: 0_level_1,<chr>
1,73265462
2,73267315
3,73269720
4,73270879
5,73271206
6,73272480


In [70]:
dim(map3)

In [71]:
dim(snp3)

In [72]:
p.residual <- p.residual[, order(colnames(p.residual))]

In [73]:
dim(snp3)

In [74]:
p.residual

Br1003,Br1004,Br1007,Br1016,Br1017,Br1021,Br1023,Br1030,Br1033,Br1034,⋯,Br845,Br848,Br863,Br914,Br948,Br949,Br963,Br983,Br991,Br993
0.004699246,0.02571402,-0.03724924,0.029459027,-0.003689116,0.03377052,0.010897282,-0.009638877,-0.028965553,-0.001248259,⋯,0.0041717893,-0.022545095,-0.0005289453,0.03346497,-0.0078625663,0.009086433,-0.0474169627,-0.00521712,-0.0240964879,-0.009464661
0.004701542,0.02574389,-0.03728028,0.029420813,-0.003700395,0.03373959,0.010895697,-0.009658648,-0.028948593,-0.001249709,⋯,0.0041184984,-0.022551546,-0.000483395,0.03349055,-0.0078640494,0.009052419,-0.0474524174,-0.005237978,-0.024070654,-0.009446197
-0.05041026,0.02960383,0.03674256,0.048401104,0.01651267,0.04541931,-0.041216132,-0.003618215,-0.002491255,0.018839625,⋯,-0.0006592129,-0.007265386,-0.0104376271,0.0279516,0.0062000869,0.052417442,-0.0737256959,-0.007831388,-0.0005480618,0.024678804
0.016037111,-0.00222813,-0.02921014,-0.002843074,0.010354481,0.01613772,0.003823707,-0.007465657,0.007413381,-0.006660194,⋯,-0.0088318007,-0.002604848,0.0188877623,0.01197726,0.0212200515,-0.022467554,0.0003630071,-0.00332052,0.0193907791,0.041295768
0.020975304,-8.592978e-05,-0.01648887,-0.001757054,0.013341716,0.01247528,0.009722023,-0.021927559,0.016224245,-0.002048185,⋯,-0.0128737803,-0.001210844,0.0214527827,-0.00175747,0.0209985984,-0.030339553,0.0092963478,-0.004999894,0.0258559245,0.050084181
0.047460153,-0.0139074,0.04520569,-0.015065382,-0.051236116,0.01709664,-0.082008236,-0.050388733,0.031388948,-0.037438313,⋯,-0.0495807555,0.015542011,0.0222914604,-0.06693339,-0.0049046015,-0.006569342,0.0127675911,0.005541765,-0.0133311479,0.05582426
0.053531233,-0.009067518,0.04867378,-0.008760808,-0.048625371,0.01574857,-0.079057388,-0.05600788,0.028576964,-0.028542524,⋯,-0.05288765,0.011478709,0.0236355439,-0.0751457,0.0002879324,-0.004131188,0.0102234824,0.005422597,-0.0139477531,0.052970266
0.053073097,-0.007994718,0.0472402,-0.006036562,-0.046269791,0.01400002,-0.074617677,-0.054920575,0.025522718,-0.023414089,⋯,-0.0516101785,0.009058243,0.0233499051,-0.07468453,0.0025589482,-0.003132686,0.009175958,0.004971472,-0.0141279005,0.049531009


In [75]:
snp3[1:10, 1:10]

Unnamed: 0,Br1602,Br1203,Br1214,Br2149,Br1016,Br1580,Br1646,Br1823,Br1696,Br1513
chr1:73265462:C:T,0.0,2,2,0.9970093,0.0,1,2,0,2,1
chr1:73267315:A:G,0.0,0,0,0.0,0.0,0,0,0,0,0
chr1:73269720:G:C,0.9959717,0,0,0.9959717,1.952026,1,0,2,0,1
chr1:73270879:G:A,0.0,2,2,1.0,0.0,1,2,0,2,1
chr1:73271206:C:T,0.0,2,2,0.9970093,0.0,1,2,0,2,1
chr1:73272480:A:G,0.0,2,2,1.0,0.0,1,2,0,2,1
chr1:73273958:G:C,0.0,2,2,0.9979858,0.0,1,2,0,2,1
chr1:73276935:T:G,0.0,2,2,0.9979858,0.0,1,2,0,2,1
chr1:73277452:A:G,0.0,0,0,0.0,0.0,0,0,0,0,0
chr1:73278190:A:G,0.9940186,0,0,0.9940186,1.991028,1,0,2,0,1


In [76]:
snp3 <- snp3[, colnames(snp3) %in% colnames(p.residual)]

In [77]:
snp3 <- snp3[, order(colnames(snp3))]

In [78]:
dim(snp3)

In [79]:
# Build elastic-net prediction models for every candidate CpG, once with all
# samples (models.all) and once with the "CAUC" samples only (models.ea).
#
# map3$POS is built with str_split_fixed() and is therefore character;
# convert it once so the window test is a numeric comparison, not a string one.
snp_pos <- as.integer(map3$POS)
for (k in seq_along(wind)) {
    models.ea <- c()
    models.all <- c()
    for (i in seq_along(cg)) {
        cat(i, "\n")
        print(paste0("This cg is: ", cg[i]))
        # SNP window around the CpG, clipped at 0
        range1 <- max(cg[i] - wind[k], 0)
        range2 <- cg[i] + wind[k]
        idx <- snp_pos > range1 & snp_pos < range2
        # go to next cg if no snps within window
        if (sum(idx) <= 1) {
            next
        }
        geno <- snp3[idx, ] # changed snp2 to snp3
        rownames(geno) <- map3$POS[idx]
        trainX <- t(geno)
        trainY <- p.residual[i, ]

        # All samples. elastic.net is called once, inside tryCatch, so a
        # failed fit skips this CpG instead of aborting the loop.
        fit <- tryCatch(
            elastic.net(trainX, trainY),
            error = function(e) "err"
        )
        if (!is.data.frame(fit) || nrow(fit) == 0) next

        fit$cg <- cg[i]
        models.all <- rbind(models.all, fit)

        # EA only. idx.ea is in BSobj2 sample order while the rows of trainX
        # follow the sorted sample names, so the flag is looked up by name
        # (assumes BSobj2$brnum values are unique -- confirm).
        ea_rows <- which(idx.ea[match(rownames(trainX), BSobj2$brnum)])
        trainX <- trainX[ea_rows, , drop = FALSE]
        if (sum(apply(trainX, 2, var) != 0, na.rm = TRUE) <= 1) {
            next
        }
        trainY <- trainY[ea_rows]
        fit <- tryCatch(
            elastic.net(trainX, trainY),
            error = function(e) "err"
        )
        if (!is.data.frame(fit) || nrow(fit) == 0) next

        fit$cg <- cg[i]
        models.ea <- rbind(models.ea, fit)
    }
}

1 
[1] "This cg is: 73274305"


“executing %dopar% sequentially: no parallel backend registered”


[1] "Dim of trainX: 297" "Dim of trainX: 14" 
[1] "Len of trainY: 297"
[1] "Coefficients when fitting: "
                      s1
(Intercept) 2.981556e-19
[1] "Coefficients when optimal: "
                      s0
(Intercept) 2.981556e-19
[1] "Dim of trainX: 297" "Dim of trainX: 14" 
[1] "Len of trainY: 297"
[1] "Coefficients when fitting: "
                      s1
(Intercept) 2.981556e-19
[1] "Coefficients when optimal: "
                      s0
(Intercept) 2.981556e-19
[1] "Dim of trainX: 133" "Dim of trainX: 14" 
[1] "Len of trainY: 133"
[1] "Coefficients when fitting: "
                       s1
(Intercept) -0.0006844528
[1] "Coefficients when optimal: "
                       s0
(Intercept) -6.844528e-04
73276935    -2.969554e-18
2 
[1] "This cg is: 73274312"
[1] "Dim of trainX: 297" "Dim of trainX: 14" 
[1] "Len of trainY: 297"
[1] "Coefficients when fitting: "
                       s1
(Intercept) -3.049319e-19
[1] "Coefficients when optimal: "
                       s0
(Inter

In [80]:
models.ea <- models.ea[models.ea[,1] != "(Intercept)",]
models.all <- models.all[models.all[,1] != "(Intercept)",]

In [81]:
if(!dir.exists(outd)) dir.create(outd)

In [82]:
head(models.all)

Unnamed: 0_level_0,features,coefs,cg
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
4,73283600,0.001231042,73292330
5,73284739,1.749503e-05,73292330
6,73284884,0.001593309,73292330
7,73286326,-0.003118988,73292330
8,73289846,-0.003040319,73292330
9,73300354,0.0006361423,73292330


In [83]:
dim(models.all)

In [84]:
models.all

Unnamed: 0_level_0,features,coefs,cg
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
4,73283600,1.231042e-03,73292330
5,73284739,1.749503e-05,73292330
6,73284884,1.593309e-03,73292330
7,73286326,-3.118988e-03,73292330
8,73289846,-3.040319e-03,73292330
9,73300354,6.361423e-04,73292330
10,73300437,5.912558e-04,73292330
11,73300479,6.168964e-04,73292330
13,73317255,-2.386654e-18,73307769
15,73299420,1.714432e-06,73308571


In [85]:
head(snp.1kg.eur2)

Unnamed: 0,HG00096,HG00097,HG00099,HG00101,HG00102,HG00103,HG00105,HG00107,HG00108,HG00109,⋯,NA20814,NA20815,NA20818,NA20819,NA20821,NA20822,NA20826,NA20827,NA20828,NA20832
rs6672818,1,0,2,1,1,2,2,1,2,1,⋯,0,0,1,0,0,0,1,1,0,1
rs72676673,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,1,0
rs61765637,1,0,2,1,1,2,2,1,2,1,⋯,0,0,1,0,0,0,1,1,0,1
rs4571923,1,0,2,1,1,2,2,1,2,1,⋯,0,0,1,0,0,0,1,1,1,1
rs12759031,1,0,2,1,1,2,2,1,2,1,⋯,0,0,1,0,0,0,1,1,0,1
rs10890025,1,0,2,1,1,2,2,1,2,1,⋯,0,0,1,0,0,0,1,1,0,1


In [86]:
head(map.1kg.eur2)

#CHROM,POS,ID
<int>,<int>,<chr>
1,73265462,rs6672818
1,73267315,rs72676673
1,73269720,rs61765637
1,73270879,rs4571923
1,73271206,rs12759031
1,73272480,rs10890025


In [90]:
length(geno)

In [102]:
# mwas by models of all samples
cg2 <- unique(models.all$cg)
mwas.all <- matrix(0, nrow = length(cg2), ncol = 2)
cat("Dimensions of relevant objects:\n")
cat("models.all:", dim(models.all), "\n")
cat("snp.gwas2:", dim(snp.gwas2), "\n")
cat("map.1kg.eur2:", dim(map.1kg.eur2), "\n")
cat("snp.1kg.eur2:", dim(snp.1kg.eur2), "\n\n")

for (i in 1:length(cg2)) {
    pos <- models.all[models.all$cg == cg2[i], 1]
    gwas <- snp.gwas2$z[is.element(snp.gwas2$pos_hg38, pos)]
    weight <- models.all[models.all$cg == cg2[i], 2]
    match_indices <- match(pos, map.1kg.eur2$POS)
    
    cat("Iteration:", i, "\n")
    cat("Current CG:", cg2[i], "\n")
    cat("Positions:\n")
    print(head(pos))
    cat("GWAS Z-scores:\n")
    print(head(gwas))
    cat("Weights:\n")
    print(head(weight))
    cat("Matching Indices:\n")
    print(head(match_indices))
    
    tryCatch({
        if (any(is.na(match_indices))) stop("NA values found in match_indices")
        if (any(match_indices > nrow(snp.1kg.eur2))) stop("Out of bounds indices found")
    }, error = function(e) {
        cat("Error detected:", e$message, "\n")
        cat("Dimensions of relevant objects at error detection:\n")
        cat("models.all:", dim(models.all), "\n")
        cat("snp.gwas2:", dim(snp.gwas2), "\n")
        cat("map.1kg.eur2:", dim(map.1kg.eur2), "\n")
        cat("snp.1kg.eur2:", dim(snp.1kg.eur2), "\n")
        cat("Positions causing error:\n")
        print(pos)
        cat("Matching Indices causing error:\n")
        print(match_indices)
        stop("Stopping execution due to error.")
    })
    
    geno <- NULL
    tryCatch({
        geno <- snp.1kg.eur2[match_indices, , drop = FALSE]
    }, error = function(e) {
        cat("Error accessing genotype data at iteration:", i, "\n")
        cat("Error message:", e$message, "\n")
        stop("Stopping execution due to error.")
    })
    
    cat("Genotype Data:\n")
    genorow <- min(nrow(geno), 10)
    genocol <- min(ncol(geno), 10)
    print(geno[1:genorow, 1:genocol])
    tryCatch({
        mwas.all[i, ] <- MWAS(gwas, weight, t(geno))
        cat("MWAS Results (z, p):\n")
        print(mwas.all[i, ])
        cat("\n")
    }, error = function(e) {
        cat("Error performing MWAS at iteration:", i, "\n")
        cat("Error message:", e$message, "\n")
        stop("Stopping execution due to error.")
    })
}
rownames(mwas.all) <- cg2
colnames(mwas.all) <- c("z", "p")

Dimensions of relevant objects:
models.all: 98 3 
snp.gwas2: 321 9 
map.1kg.eur2: 321 3 
snp.1kg.eur2: 321 489 

Iteration: 1 
Current CG: 73292330 
Positions:
[1] "73283600" "73284739" "73284884" "73286326" "73289846" "73300354"
GWAS Z-scores:
[1]  7.290984  7.301905  7.233087 -7.267821 -7.267821  7.174068
Weights:
[1]  1.231042e-03  1.749503e-05  1.593309e-03 -3.118988e-03 -3.040319e-03
[6]  6.361423e-04
Matching Indices:
[1] 13 15 16 18 19 30
Genotype Data:
           HG00096 HG00097 HG00099 HG00101 HG00102 HG00103 HG00105 HG00107
rs11210195       1       0       2       1       1       2       2       1
rs12044218       1       0       2       1       1       2       2       1
rs11210196       1       0       2       1       1       2       2       1
rs12142515       1       0       2       1       1       2       2       1
rs7549372        1       0       2       1       1       2       2       1
rs7555507        1       0       2       1       1       2       2       1
rs7522217 

In [103]:
# mwas by models of EA samples
cg2 <- unique(models.ea$cg)
mwas.ea <- matrix(0, nrow = length(cg2), ncol = 2)
cat("Dimensions of relevant objects:\n")
cat("models.ea:", dim(models.ea), "\n")
cat("snp.gwas2:", dim(snp.gwas2), "\n")
cat("map.1kg.eur2:", dim(map.1kg.eur2), "\n")
cat("snp.1kg.eur2:", dim(snp.1kg.eur2), "\n\n")

for (i in 1:length(cg2)) {
    pos <- models.ea[models.ea$cg == cg2[i], 1]
    gwas <- snp.gwas2$z[is.element(snp.gwas2$pos_hg38, pos)]
    weight <- models.ea[models.ea$cg == cg2[i], 2]
    match_indices <- match(pos, map.1kg.eur2$POS)
    
    cat("Iteration:", i, "\n")
    cat("Current CG:", cg2[i], "\n")
    cat("Positions:\n")
    print(head(pos))
    cat("GWAS Z-scores:\n")
    print(head(gwas))
    cat("Weights:\n")
    print(head(weight))
    cat("Matching Indices:\n")
    print(head(match_indices))
    
    tryCatch({
        if (any(is.na(match_indices))) stop("NA values found in match_indices")
        if (any(match_indices > nrow(snp.1kg.eur2))) stop("Out of bounds indices found")
    }, error = function(e) {
        cat("Error detected:", e$message, "\n")
        cat("Dimensions of relevant objects at error detection:\n")
        cat("models.ea:", dim(models.ea), "\n")
        cat("snp.gwas2:", dim(snp.gwas2), "\n")
        cat("map.1kg.eur2:", dim(map.1kg.eur2), "\n")
        cat("snp.1kg.eur2:", dim(snp.1kg.eur2), "\n")
        cat("Positions causing error:\n")
        print(pos)
        cat("Matching Indices causing error:\n")
        print(match_indices)
        stop("Stopping execution due to error.")
    })
    
    geno <- NULL
    tryCatch({
        geno <- snp.1kg.eur2[match_indices, , drop = FALSE]
    }, error = function(e) {
        cat("Error accessing genotype data at iteration:", i, "\n")
        cat("Error message:", e$message, "\n")
        stop("Stopping execution due to error.")
    })
    
    cat("Genotype Data:\n")
    genorow <- min(nrow(geno), 10)
    genocol <- min(ncol(geno), 10)
    print(geno[1:genorow, 1:genocol])
    tryCatch({
        mwas.ea[i, ] <- MWAS(gwas, weight, t(geno))
        cat("MWAS Results (z, p):\n")
        print(mwas.ea[i, ])
        cat("\n")
    }, error = function(e) {
        cat("Error performing MWAS at iteration:", i, "\n")
        cat("Error message:", e$message, "\n")
        stop("Stopping execution due to error.")
    })
}
rownames(mwas.ea) <- cg2
colnames(mwas.ea) <- c("z", "p")

Dimensions of relevant objects:
models.ea: 99 3 
snp.gwas2: 321 9 
map.1kg.eur2: 321 3 
snp.1kg.eur2: 321 489 

Iteration: 1 
Current CG: 73274305 
Positions:
[1] "73276935"
GWAS Z-scores:
[1] -7.244305
Weights:
[1] -2.969554e-18
Matching Indices:
[1] 8
Genotype Data:
HG00096 HG00097 HG00099 HG00101 HG00102 HG00103 HG00105 HG00107 HG00108 HG00109 
      1       0       2       1       1       2       2       1       2       1 
MWAS Results (z, p):
[1] 7.244305e+00 4.346611e-13

Iteration: 2 
Current CG: 73274312 
Positions:
[1] "73276935"
GWAS Z-scores:
[1] -7.244305
Weights:
[1] -2.971137e-18
Matching Indices:
[1] 8
Genotype Data:
HG00096 HG00097 HG00099 HG00101 HG00102 HG00103 HG00105 HG00107 HG00108 HG00109 
      1       0       2       1       1       2       2       1       2       1 
MWAS Results (z, p):
[1] 7.244305e+00 4.346611e-13

Iteration: 3 
Current CG: 73292330 
Positions:
[1] "73283600" "73284739" "73284884" "73286018" "73286326" "73289846"
GWAS Z-scores:
[1]  7.290984 

In [94]:
genorow

In [95]:
genocol

In [96]:
geno[1:genorow, ]

ERROR: Error in geno[1:genorow, ]: incorrect number of dimensions


In [97]:
geno

In [None]:
mwas.ea

In [None]:
# mwas by models of all samples
# Compact version of the MWAS step without diagnostics. For each CpG the GWAS
# z-scores are looked up with match() so they line up with the model weights,
# and drop = FALSE keeps the genotype block a matrix for single-SNP models.
cg2 <- unique(models.all$cg)
mwas.all <- matrix(0, nrow = length(cg2), ncol = 2)
for (i in seq_along(cg2)) {
    in_model <- models.all$cg == cg2[i]
    pos <- models.all[in_model, 1]
    weight <- models.all[in_model, 2]
    gwas <- snp.gwas2$z[match(pos, snp.gwas2$pos_hg38)]
    geno <- snp.1kg.eur2[match(pos, map.1kg.eur2$POS), , drop = FALSE]
    mwas.all[i, ] <- MWAS(gwas, weight, t(geno))
}
rownames(mwas.all) <- cg2
colnames(mwas.all) <- c("z", "p")

# mwas by models of EA samples
cg2 <- unique(models.ea$cg)
mwas.ea <- matrix(0, nrow = length(cg2), ncol = 2)
for (i in seq_along(cg2)) {
    in_model <- models.ea$cg == cg2[i]
    pos <- models.ea[in_model, 1]
    weight <- models.ea[in_model, 2]
    gwas <- snp.gwas2$z[match(pos, snp.gwas2$pos_hg38)]
    geno <- snp.1kg.eur2[match(pos, map.1kg.eur2$POS), , drop = FALSE]
    mwas.ea[i, ] <- MWAS(gwas, weight, t(geno))
}
rownames(mwas.ea) <- cg2
colnames(mwas.ea) <- c("z", "p")

# output models and mwas results
# NOTE(review): `k` is not set in this cell; it is whatever value the
# model-building loop left behind -- confirm it indexes the intended window.
outf <- paste0(outd, "/models-a8-covnew.all.wind.", wind[k])
write.csv(models.all, outf)
outf <- paste0(outd, "/models-a8-covnew.ea.wind.", wind[k])
write.csv(models.ea, outf)
outf <- paste0(outd, "/mwas-a8-covnew.all.wind.", wind[k])
write.csv(mwas.all, outf)
outf <- paste0(outd, "/mwas-a8-covnew.ea.wind.", wind[k])
write.csv(mwas.ea, outf)

In [None]:
mwas.all

We get the same results with the old SNPs.

## Compare with results from CpGWAS

In [None]:
# results <- fread("16a9par-OUT_stage2_MWAS_scz.csv")

# results <- results[which(results$chr == 7 & results$pos >= 1987413 & results$pos <= 1988332), ]

# results <- results[which(results$population == "EA" & results$region == "caud"), ]

# head(results)