# Compare MWAS methods for sanity tests and troubleshooting

In version e, we unify the elastic.net functions and make sure 1se and 1min produce different results.

In version f, we try old covariates with all new data for everything else

In version g, we do another quick sanity test with "all" sample methylation data instead of AA.

In version h, we try old covariates, old BSseq data, new everything else. Since we're using old BSseq data, we must change back to Chr 7 for the selected peak for which we have sample data.

In [1]:
chr <- 7 # code set up for old bsseq chr 7, or anything with new bsseq

In [2]:
filter_snps <- FALSE # speed things up by pre-filtering whole chromosomes to desired regions

## Be ready with matched up SNP and covariate files

In [3]:
library("glmnet")
library("e1071")
library("doParallel")
library("data.table")

Loading required package: Matrix

Loaded glmnet 4.1-8

Loading required package: foreach

Loading required package: iterators

Loading required package: parallel



In [4]:
df <- fread("09-OUT_matched_SNP_meth_cov_a2.csv")

In [5]:
df <- df[which(df$Chr == chr), ]

In [6]:
df <- df[which(df$population == "all"), ]
df <- df[which(df$brain_region == "caud"), ]

In [7]:
df

Chr,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,subpopulation,brain_region,population,region,cov_file,cov_file2,cov_file3,cov_file4
<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
7,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr7.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr7_all.rda,159334659,49742,1490198,1,all,caud,all,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud-no-meth.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud-no-meth-no-dx.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud-no-dx.csv


In [8]:
i <- 1

In [9]:
df[1, ]

Chr,SNP_data,methylation_data,last_meth_value_with_SNP_coverage,first_meth_value_with_SNP_coverage,last_meth_index_with_SNP_coverage,first_meth_index_with_SNP_coverage,subpopulation,brain_region,population,region,cov_file,cov_file2,cov_file3,cov_file4
<int>,<chr>,<chr>,<int>,<int>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
7,/expanse/lustre/projects/jhu152/naglemi/mwas/gwas//libd_chr7.pgen,/expanse/lustre/projects/jhu152/naglemi/mwas/pheno/caud/out/chr7_all.rda,159334659,49742,1490198,1,all,caud,all,caud,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud-no-meth.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud-no-meth-no-dx.csv,/expanse/lustre/projects/jhu152/naglemi/mwas/full_covariates_a2/all_caud-no-dx.csv


In [10]:
set.seed(2018)
wind <- c(10000)
# output directory
#outd <- "/dcl02/lieber/shan/shizhong/finemapping/GWAS/tags/scz3/mwas/chr22/1/"
outd <- "20-OUT_original_mwas_sanity_test/"

## Functions

### Shizhong's original

In [11]:
###### model: learn elastic net model on training data 
######---------Input: trainX, trainY
######---------Return: selected features and coefficents

# original
# elastic.net <- function(trainX,trainY){
#     if(nrow(trainX)!=length(trainY)){
#             stop("Number of observations is differerent")
#     } 

#     # optimize alpha---mixing parameter  
#     a <- 0.5
#     search <- foreach(ai = a, .combine = rbind) %dopar% {
#         cv.fit <- cv.glmnet(
#                         trainX,
#                         trainY,
#                         nfold = 5,
#                         type.measure = "mse",
#                         paralle = TRUE,
#                         alpha = ai
#                         )
#         data.frame(
#                         cvm = min(cv.fit$cvm),
#                         lambda = cv.fit$lambda.min,
#                         alpha = ai
#                         )
#         } 
#     cv.opt <- search[search$cvm == min(search$cvm),] 

#         # fit model by optimized alpha and lambda
#         yfit = glmnet(
#         trainX,
#         trainY,
#         lambda = cv.opt$lambda,
#         alpha = cv.opt$alpha
#                 )       
#         idf <- coef(yfit)
#         idx <- which(idf != 0)
#         selectf <- data.frame(
#                 features = idf@Dimnames[[1]][idx], 
#                 coefs = idf [idx]
#         )
# }

MWAS <- function(gwas, weight, geno){
        z <- gwas %*% weight
        z.cor <- cor(geno)
        se <- sqrt(weight %*%  z.cor %*%  weight)
        z <- z/se
        p=pnorm(abs(z),lower.tail=F)*2
        return(c(z, p))
}

#### Modified `elastic.net` to use `lambda.1se`

In [12]:
# modified to use lambda 1se and appropriate cvm
elastic.net <- function(trainX,trainY, lambda.choice = "1se"){
    if(nrow(trainX)!=length(trainY)){
            stop("Number of observations is differerent")
    } 

    # optimize alpha---mixing parameter  
    a <- 0.5
    search <- foreach(ai = a, .combine = rbind) %dopar% {
        #set.seed(42)
        cv.fit <- cv.glmnet(
                        trainX,
                        trainY,
                        nfold = 5,
                        type.measure = "mse",
                        parallel = TRUE,
                        alpha = ai
                        )
        #print(paste0("Dim of trainX: ", dim(trainX)))
        #print(paste0("Len of trainY: ", length(trainY)))
        coef_matrix <- as.matrix(coef(cv.fit))

        if(lambda.choice == "1se"){
            chosen_lambda <- cv.fit$lambda.1se
            chosen_cvm <- cv.fit$cvm[cv.fit$lambda == cv.fit$lambda.1se]
        }
        if(lambda.choice == "min"){
            chosen_lambda <- cv.fit$lambda.min
            chosen_cvm <- min(cv.fit$cvm)
        }


        data.frame(
                        cvm = chosen_cvm,
                        lambda = chosen_lambda,
                        alpha = ai
                        )
        } 
    cv.opt <- search[search$cvm == min(search$cvm),] 

        # fit model by optimized alpha and lambda
    #set.seed(42)
    yfit <- glmnet(
    trainX,
    trainY,
    lambda = cv.opt$lambda,
    alpha = cv.opt$alpha)

    idf <- coef(yfit)
    idx <- which(idf != 0)
    selectf <- data.frame(
            features = idf@Dimnames[[1]][idx], 
            coefs = idf [idx]
    )
}

## Replace all old objects with new objects in same format

### Methylation data

In [13]:
suppressWarnings(library(bsseq))

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: GenomicRanges

Loading required package: stats4

Loading required package: S4Vectors


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:data.table’:

    first, second


The following objects are masked from ‘package:Matrix’:

    expand, unname


The following object is masked from ‘package:utils’:

    findMatches


The

In [14]:
# # The code in this block is for NEW methylation data, which can be for any chromosome or portion

# # load data for mwas
# # load("./rda/caudate_mwas_data_chr22.rda")
load(df$methylation_data[i])

p <- getMeth(BSobj2)


rownames(p) <- start(BSobj2)

# These are the same sites from our test BSsample object on chr 1, and we're extracting these sites from
# the new bsseq object for the whole chromosome.
sites_to_test_pos <-
c(1980077, 1980101, 1980129, 1980136, 1980145, 1980179, 1980183, 1980193, 1980205, 1980239, 
1980247, 1980287, 1980291, 1980303, 1980320, 1980326, 1980355, 1980373, 1980390, 1980396, 
1980424, 1980430, 1980448, 1980458, 1980464, 1980496, 1980577, 1980581, 1980642, 1980661, 
1980787, 1980833, 1980852, 1980860, 1980866, 1980869, 1980893, 1980896, 1980905, 1980933, 
1980948, 1980952, 1980997, 1981035, 1981057, 1981111, 1981113, 1981123, 1981186, 1981200, 
1981213, 1981219, 1981246, 1981253, 1981262, 1981286, 1981328, 1981352, 1981359, 1981386, 
1981410, 1981427, 1981449, 1981479, 1981518, 1981608, 1981613, 1981675, 1981682, 1981958, 
1982004, 1982017, 1982035, 1982064, 1982079, 1982246, 1982264, 1982311, 1982357, 1982397, 
1982428, 1982488, 1982495, 1982498, 1982500, 1982541, 1982564, 1982583, 1982587, 1982648, 
1982740, 1982933, 1983048, 1983077, 1983083, 1983085, 1983105, 1983115, 1983133, 1983135, 
1983137, 1983139, 1983141, 1983143, 1983145, 1983147, 1983149, 1983151, 1983153, 1983296, 
1983405, 1983484, 1983612, 1983632, 1983694, 1983703, 1983733, 1983747, 1984150, 1984336, 
1984422, 1984473, 1984518, 1984556, 1984603, 1984623, 1984705, 1984833, 1984895, 1984908, 
1984972, 1985070, 1985172, 1985187, 1985193, 1985231, 1985257, 1985261, 1985292, 1985304, 
1985311, 1985342, 1985365, 1985419, 1985445, 1985468, 1985486, 1985521, 1985572, 1985585, 
1985591, 1985721, 1985744, 1985751, 1985759, 1985767, 1985825, 1985848, 1985857, 1985872, 
1985878, 1985883, 1985906, 1985911, 1985971, 1986087, 1986093, 1986164, 1986194, 1986214, 
1986256, 1986268, 1986331, 1986354, 1986377, 1986395, 1986407, 1986410, 1986417, 1986424, 
1986443, 1986450, 1986462, 1986465, 1986484, 1986486, 1986495, 1986497, 1986500, 1986519, 
1986521, 1986530, 1986532, 1986535, 1986554, 1986556, 1986583, 1986603, 1986624, 1986652, 
1986733, 1986784, 1986909, 1986957, 1987071, 1987106, 1987187, 1987226, 1987253, 1987269, 
1987319, 1987330, 1987338, 1987363, 1987365, 1987388, 1987395, 1987403, 1987407, 1987413, 
1987418, 1987434, 1987449, 1987458, 1987468, 1987492, 1987511, 1987527, 1987565, 1987604, 
1987616, 1987627, 1987648, 1987656, 1987659, 1987665, 1987669, 1987680, 1987684, 1987688, 
1987697, 1987705, 1987719, 1987725, 1987732, 1987778, 1987797, 1987808, 1987853, 1987858, 
1987861, 1987896, 1987910, 1987930, 1988010, 1988039, 1988059, 1988066, 1988078, 1988129, 
1988133, 1988141, 1988171, 1988201, 1988216, 1988292, 1988307, 1988321, 1988325, 1988332, 
1988401, 1988423, 1988546, 1988580, 1988612, 1988630, 1988634, 1988636, 1988655, 1988662, 
1988680, 1988708, 1988718, 1988745, 1988748, 1988776, 1988785, 1988847, 1988874, 1989119, 
1989169, 1989183, 1989201, 1989209, 1989218, 1989223, 1989241, 1989249, 1989258, 1989263, 
1989275, 1989298, 1989308, 1989332, 1989394, 1989442, 1989451, 1989482, 1989497, 1989500, 
1989503, 1989510, 1989527, 1989534, 1989550, 1989605, 1989607, 1989617, 1989622, 1989628, 
1989642, 1989658, 1989674, 1989690, 1989754, 1989761, 1989767, 1989793, 1989924, 1989957)

sites_to_test <- which(start(BSobj2) %in% sites_to_test_pos)
p <- p[sites_to_test, ]
cg <- as.numeric(rownames(p))

Subset our p object to the same sites as the old one, compare values

In [15]:
# # Old dataset
# load("BSsample.rda", verbose = TRUE) # we get the @colData attribute as a data.frame here
# load("p1.rda", verbose = TRUE) # and the p matrix from getMeth() here

# BSobj2 <- BSsample
# cg <- as.numeric(rownames(p)) # This line same whether we use old or new version
# sites_to_test_pos <- cg

### covariates

In [16]:
# The code in this block is for NEW covariate data

# covs <- fread(df$cov_file[i])
# covs <- t(covs)
# colnames(covs) <- covs[1, ]
# covs <- covs[2:nrow(covs), ]
# # transpose so we have same orientation as original code

In [17]:
# This is the OLD covariate data

load("covs_for_meqtl.rda")

### Regress methylation data over covariates

In [18]:
BSobj2$brnum <- gsub("Br0", "Br", BSobj2$brnum)
colnames(covs) <- gsub("Br0", "Br", colnames(covs))

In [19]:
mat <- match(BSobj2$brnum,colnames(covs)) 
covs <- t(covs[,mat])
p.residual=matrix(NA,dim(p)[1],dim(p)[2])

In [20]:
# This is something we only need to do for new covariates
if("genoPC1" %in% colnames(covs)){
    rownames(covs)[is.na(covs[, 'genoPC1'])] <- BSobj2$brnum[is.na(covs[, 'genoPC1'])]
}

In [21]:
colnames(p.residual) <- BSobj2$brnum

In [22]:
covs <- as.data.frame(covs)
# Convert all columns except Dx and Sex from character to numeric
cols_to_convert <- setdiff(names(covs), c("Dx", "Sex"))

for (col in cols_to_convert) {
  covs[[col]] <- as.numeric(covs[[col]])
}

# Print the modified data frame to check the conversion
#print(dat)


In [23]:
for (i in 1:dim(p)[1]) { # For each methylation site
    dat <- as.data.frame(cbind(y = p[i,], covs))
    
    # Check for rows with NAs (the ones for which we don't have covariate data)
    valid_rows <- complete.cases(dat)
    
    if (sum(valid_rows) > 0) {
        dat_valid <- dat[valid_rows,]
        model.res <- lm(y ~ ., data = dat_valid)
        
        # Store residuals in the corresponding positions
        p.residual[i, valid_rows] <- resid(model.res)
    }
}


# for(i in 1:dim(p)[1]){ # foro each methylation site
#         dat <- as.data.frame(cbind(p[i,],covs))
#         colnames(dat) <- c("y",paste0("x",1:ncol(covs)))
#         model.res <- lm(reformulate(paste0("x",1:ncol(covs)), "y"),dat)
#         p.residual[i,] = resid(model.res) 
# }

In [24]:
dim(p.residual)

In [25]:
p.residual[1:8, 1:8]

Br1122,Br2285,Br1764,Br1464,Br5062,Br1446,Br1503,Br1946
0.002232965,-0.004849795,-0.01679492,-0.04202224,-0.003789381,0.009639331,-0.01622287,0.03343785
0.001787295,-0.005130747,-0.01797141,-0.04142438,-0.00243269,0.00897174,-0.01519726,0.03414512
0.001243924,-0.005327983,-0.0191454,-0.04065603,-0.0009236619,0.00816776,-0.01415377,0.03493359
0.001100681,-0.00535655,-0.01942087,-0.04042885,-0.0005403275,0.00796446,-0.01390613,0.03510781
0.0009117989,-0.005380756,-0.01976715,-0.04011588,-4.259886e-05,0.007708162,-0.01359378,0.03531592
0.0001538429,-0.005328695,-0.02101351,-0.03875007,0.00186969,0.006802326,-0.01247287,0.03595717
6.046079e-05,-0.00530526,-0.02115441,-0.0385723,0.002096595,0.006702691,-0.01234632,0.03601842
-0.0001760729,-0.005228496,-0.02150147,-0.03811384,0.002664634,0.006460093,-0.012034,0.03615868


In [26]:
snp.gwas2 <- NULL

In [27]:
#load("p1.rda", verbose = TRUE)

### summary stats

In [28]:
library(data.table)
library(CpGWAS)

In [29]:
ss_path <- "/home/naglemi/mwas/gwas/gwas_stat_scz"

In [30]:
snp.gwas2 <- fread(ss_path, skip = 1, header = FALSE)
colnames(snp.gwas2) <- strsplit(readLines(ss_path, n = 1), "\t")[[1]]

In [31]:
snp.gwas2$z <- log(snp.gwas2$OR)/snp.gwas2$SE

In [32]:
snp.gwas2 <- snp.gwas2[, c(2, 1, 3, 3, 8, 4, 5, 20, 11)]

In [33]:
head(snp.gwas2, n = 1)

SNP,CHR,BP,BP,INFO,A1,A2,z,P
<chr>,<int>,<int>,<int>.1,<dbl>,<chr>,<chr>,<dbl>,<dbl>
rs62513865,8,100579985,100579985,0.963,C,T,0.7016221,0.4847


In [34]:
colnames(snp.gwas2)[1:5] <- c("snp", "chr", "pos_hg38", "pos_hg38", "info")

In [35]:
CHR <- chr # to avoid R df local env mixing variables of columns name and subset variable

In [36]:
snp.gwas2 <- snp.gwas2[which(snp.gwas2$chr == CHR), ]

In [37]:
if(filter_snps == TRUE){
    snp.gwas2 <- snp.gwas2[which(snp.gwas2$pos_hg38 >= (min(sites_to_test_pos)-10000) & snp.gwas2$pos_hg38 <= (max(sites_to_test_pos) + 10000)), ]
}

In [38]:
snp.gwas2 <- snp.gwas2[order(snp.gwas2$pos_hg38), ]

In [39]:
# built predition models
idx.ea <- BSobj2$race == "CAUC"

In [40]:
levels(factor(snp.gwas2$chr))

### SNPs in LIBD population

#### Old dataset

#### New dataset prep: For reference, first load Shizhong's formatted SNPs on Chr7

In [41]:
# load("p1.rda", verbose = TRUE)

# snp2_sorted <- snp2[, order(names(snp2))]

# colnames(snp2) <- gsub("Br0", "Br", colnames(snp2))

# snp2 <- snp2[, colnames(snp2) %in% colnames(p.residual)]

# snp2_positions <- stringr::str_split_fixed(rownames(snp2), ":", 3)[, 2]

#### New dataset: Now let's load ours on Chr1

In [42]:
paths <- list(
  pvar_path = paste0("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr", chr, ".pvar"),
  pgen_path = paste0("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr", chr, ".pgen"),
  psam_path = paste0("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/libd_chr", chr, ".psam")
)

my_SNPs <- loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

In [43]:
filter_snps <- TRUE

In [44]:
if(filter_snps == TRUE){
    snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= min(sites_to_test_pos)-100000 & my_SNPs$pvar_dt$POS <= max(sites_to_test_pos) + 100000)
}

In [45]:
if(filter_snps == TRUE){
    snp3 <- pgenlibr::ReadList(my_SNPs$pgen,
                               variant_subset = snp_indices_of_interest)
    colnames(snp3) <- my_SNPs$pvar_dt$ID[snp_indices_of_interest]
} else {
    snp3 <- pgenlibr::ReadList(my_SNPs$pgen)
    colnames(snp3) <- my_SNPs$pvar_dt$ID
}

In [46]:
snp3[1:10, 1:10]

chr7:1880535:G:A,chr7:1880720:G:A,chr7:1881190:C:T,chr7:1881432:G:A,chr7:1881726:C:T,chr7:1881793:G:C,chr7:1883749:T:C,chr7:1884059:C:T,chr7:1884597:C:T,chr7:1885530:C:G
0.9849854,0.0,1.0,0.9860229,0,0.9860229,1.0,0.9840088,0,0.992981
0.0,2.0,0.0,0.0,2,0.0,2.0,0.0,0,2.0
1.0,0.0,1.0,1.0,0,1.0,1.0,1.0,0,1.0
0.0,0.99798584,0.0,0.0,1,0.0,1.0,0.0,0,0.9979858
1.0,0.0,1.0,1.0,0,1.0,1.0,1.0,0,1.0
0.0,0.999023438,0.0,0.0,1,0.0,1.0,0.0,0,0.9990234
0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0,0.0
1.9160156,0.0,1.929993,1.9160156,0,1.914978,1.999023,1.9550171,0,1.9680176
1.9949951,0.002990723,2.0,1.9949951,0,1.9949951,2.0,1.9940186,0,1.9949951
1.0,0.0,1.0,1.0,0,1.0,1.0,1.0,0,1.0


In [47]:
rownames(snp3) <- my_SNPs$psam$`#IID`

In [48]:
map3 <- data.frame(POS = stringr::str_split_fixed(colnames(snp3), ":", 3)[, 2])

In [49]:
snp3 <- t(snp3)

In [50]:
map3 <- data.frame(POS = stringr::str_split_fixed(rownames(snp3), ":", 3)[, 2])

In [51]:
dim(map3)

### SNPs in reference population

In [52]:
#snp.1kg.eur2

In [53]:
paths <- list(
  pvar_path = paste0("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr", chr, ".pvar"),
  pgen_path = paste0("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr", chr, ".pgen"),
  psam_path = paste0("/expanse/lustre/projects/jhu152/naglemi/mwas/gwas/ref_EUR_chr", chr, ".psam")
)

my_SNPs <- loadSNPData(paths$pvar_path, paths$pgen_path, paths$psam_path)

In [54]:
if(filter_snps == TRUE){
    snp_indices_of_interest <- which(my_SNPs$pvar_dt$POS >= min(sites_to_test_pos)-10000 & my_SNPs$pvar_dt$POS <= max(sites_to_test_pos) + 10000)
}

In [55]:
snp.1kg.eur2 <- pgenlibr::ReadList(my_SNPs$pgen,
                        variant_subset = snp_indices_of_interest)
colnames(snp.1kg.eur2) <- my_SNPs$pvar_dt$ID[snp_indices_of_interest]
rownames(snp.1kg.eur2) <- my_SNPs$psam$`IID`

In [56]:
map.1kg.eur2 <- my_SNPs$pvar_dt

In [57]:
map.1kg.eur2 <- map.1kg.eur2[snp_indices_of_interest, ]

In [58]:
snp.1kg.eur2 <- t(snp.1kg.eur2)

### Set window size and any other parameters

In [59]:
wind <- 10000

Is 1se vs min for lambda the problem?

#### Final formatting steps to prepare for MWAS - sort samples

In [60]:
p.residual <- p.residual[, order(colnames(p.residual))]

In [61]:
snp3 <- snp3[, colnames(snp3) %in% colnames(p.residual)]

In [62]:
snp3 <- snp3[, order(colnames(snp3))]

## MWAS

### With `lambda.1se`

#### Stage 1

Quick debug: A version to see why we end up with empty `models.aa` and `models.ea`

In [63]:
set.seed(42)
for(k in 1:length(wind)){
    models.ea <- c()
    models.all <- c()
    for(i in 1:length(cg)){
    #for(i in 1){
            #cat(i,"\n")
            #print(paste0("This cg is: ", cg[i]))
            range1 <- ifelse(cg[i] - wind[k] > 0,cg[i] - wind[k],0)
            range2 <- cg[i] + wind[k]
            idx <- map3$POS > range1 & map3$POS < range2
            # go to next cg if no snps within window
            if(sum(idx) <= 1){
                    next
            }
            geno <- snp3[idx,] # changed snp2 to snp3
            rownames(geno) <- map3$POS[idx]
            trainX <- t(geno)
            trainY <- p.residual[i,]
            #fit <- elastic.net(trainX,trainY, "1se")
            fit <- tryCatch(
                    elastic.net(trainX,trainY, "1se"),
                    error = function(e) {return ("err")})
            if(!is.data.frame(fit)){
                if(fit == "err"){
                    next
                }
            }
            if(nrow(fit) == 0) next

            fit$cg <- cg[i]
            models.all <- rbind(models.all,fit)
            # EA only
            trainX <- trainX[idx.ea,]
            if(sum(apply(trainX,2,var)!=0) <= 1){
                    next
            }
            trainY <- trainY[idx.ea]
            fit <- tryCatch(
                    elastic.net(trainX,trainY, "1se"),
                    error = function(e) {return ("err")})
            if(!is.data.frame(fit)){
                if(fit == "err"){
                    next
                }
            }
            if(nrow(fit) == 0) next
            fit$cg <- cg[i]
            models.ea <- rbind(models.ea,fit)
    }
}

“executing %dopar% sequentially: no parallel backend registered”


In [64]:
models.ea <- models.ea[models.ea[,1] != "(Intercept)",]
models.all <- models.all[models.all[,1] != "(Intercept)",]

#### Stage 2

In [65]:
if(!dir.exists(outd)) dir.create(outd)

In [66]:
set.seed(42)
# mwas by models of all samples
cg2 <- unique(models.all$cg)
mwas.all <- matrix(0,nrow=length(cg2),ncol=2)

for(i in 1:length(cg2)){
    pos <- models.all[models.all$cg == cg2[i],1]
    gwas <- snp.gwas2$z[is.element(snp.gwas2$pos_hg38, pos)]
    weight <- models.all[models.all$cg == cg2[i],2]
    #geno <- snp.1kg.eur2[match(pos,map.1kg.eur2$POS),]

    match_indices <- match(pos, map.1kg.eur2$POS)

    tryCatch({
        geno <- snp.1kg.eur2[match_indices, , drop = FALSE]
    }, error = function(e) {
        cat("Error accessing genotype data at iteration:", i, "\n")
        cat("Error message:", e$message, "\n")
        stop("Stopping execution due to error.")
    })

    tryCatch({
        mwas.all[i,] <- MWAS(gwas, weight, t(geno))
    }, error = function(e) {
        cat("Error at iteration:", i, "\n")
        cat("cg2[i]:", cg2[i], "\n")
        cat("gwas:\n")
        print(gwas)
        cat("weight:\n")
        print(weight)
        stop(e)
    })
}

rownames(mwas.all) <- cg2
colnames(mwas.all) <- c("z","p")

In [67]:
i

In [68]:
pos

In [69]:
gwas

In [70]:
length(pos)

In [71]:
length(gwas)

In [72]:
models.all[models.all$cg == cg2[i], ]

Unnamed: 0_level_0,features,coefs,cg
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>
1737,1996464,3.818457e-18,1989957


In [73]:
snp.gwas2[is.element(snp.gwas2$pos_hg38, pos), ]

snp,chr,pos_hg38,pos_hg38,info,A1,A2,z,P
<chr>,<int>,<int>,<int>.1,<dbl>,<chr>,<chr>,<dbl>,<dbl>
rs6953187,7,1996464,1996464,0.995,A,G,-1.779363,0.07505


In [74]:
# Why do we get non-conformable arguments for iteration 57?

In [75]:
weight

In [76]:
length(weight)

In [77]:
gwas

In [78]:
length(gwas)

In [79]:
head(gwas)

In [80]:
head(map3)

Unnamed: 0_level_0,POS
Unnamed: 0_level_1,<chr>
1,1880535
2,1880720
3,1881190
4,1881432
5,1881726
6,1881793


In [81]:
head(geno)

Unnamed: 0,HG00096,HG00097,HG00099,HG00101,HG00102,HG00103,HG00105,HG00107,HG00108,HG00109,⋯,NA20814,NA20815,NA20818,NA20819,NA20821,NA20822,NA20826,NA20827,NA20828,NA20832
rs6953187,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [82]:
# mwas by models of EA samples
cg2 <- unique(models.ea$cg)
mwas.ea <- matrix(0,nrow=length(cg2),ncol=2)
for(i in 1:length(cg2)){
        pos <- models.ea[models.ea$cg == cg2[i],1]
        gwas <- snp.gwas2$z[is.element(snp.gwas2$pos_hg38, pos)]
        weight <- models.ea[models.ea$cg == cg2[i],2]
        #geno <- snp.1kg.eur2[match(pos,map.1kg.eur2$POS),]

        match_indices <- match(pos, map.1kg.eur2$POS)

        tryCatch({
            geno <- snp.1kg.eur2[match_indices, , drop = FALSE]
        }, error = function(e) {
            cat("Error accessing genotype data at iteration:", i, "\n")
            cat("Error message:", e$message, "\n")
            stop("Stopping execution due to error.")
        })    
    
        mwas.ea[i,] <- MWAS(gwas, weight, t(geno))
}
rownames(mwas.ea) <- cg2
colnames(mwas.ea) <- c("z","p")

In [83]:
head(mwas.all)

Unnamed: 0,z,p
1980077,-9.855348,6.499098000000001e-23
1980101,-9.825174,8.772250000000001e-23
1980129,-9.887751,4.7047730000000005e-23
1980136,-9.850409,6.826528e-23
1980145,-9.889617,4.6179170000000006e-23
1980179,-9.89292,4.468029e-23


In [84]:
mwas.all.1se <- mwas.all

## Compare

In [91]:
class(mwas.all.1se)

In [90]:
mwas.all.1se$p

ERROR: Error in mwas.all.1se$p: $ operator is invalid for atomic vectors


In [85]:
library(ggplot2)

data <- data.frame(
  p_1se = as.data.frame(mwas.all.1se)$p,
  p_min = mwas.all.min$p
)

ggplot(data, aes(x = log(p_1se), y = log(p_min))) +
  geom_point(alpha = 0.6) +
  labs(x = "Log of p-values (lambda.1se)",
       y = "Log of p-values (lambda.min)",
       title = "Scattergram of MWAS p-values",
       subtitle = "Comparing lambda.1se and lambda.min") +
  theme_minimal()

ERROR: Error in mwas.all.1se$p: $ operator is invalid for atomic vectors


In [92]:
head(mwas.all.1se)

Unnamed: 0,z,p
1980077,-9.855348,6.499098000000001e-23
1980101,-9.825174,8.772250000000001e-23
1980129,-9.887751,4.7047730000000005e-23
1980136,-9.850409,6.826528e-23
1980145,-9.889617,4.6179170000000006e-23
1980179,-9.89292,4.468029e-23


In [93]:
head(mwas.all.min)

ERROR: Error in h(simpleError(msg, call)): error in evaluating the argument 'x' in selecting a method for function 'head': object 'mwas.all.min' not found
